Crawling Craigslist with Python (not Scrapy)

75 Views Asked by At

I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please help me fix the code below? Please don't suggest Scrapy.

This is the URL: https://chicago.craigslist.org/

First I extract the job categories, then the job listings, then the job details; I have also written code to crawl to the next page.

import re
import requests
import csv
from html import unescape
def get_page_content(url):
    """Fetch *url* over HTTP and return the response body as text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The decoded response body (``str``).

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # A timeout is essential: without one, a stalled connection would
    # hang the entire crawl indefinitely (requests has no default timeout).
    response = requests.get(url, timeout=30)
    return response.text

def get_category_list(content, start=90, end=121):
    """Extract job-category links from the homepage HTML.

    Args:
        content: Raw HTML of the Craigslist homepage.
        start, end: Slice bounds applied to the full regex match list.
            The defaults (90:121) reproduce the original behavior of
            keeping only the jobs section of the sidebar.

    Returns:
        A list of ``(relative_url, category_name)`` tuples.

    NOTE(review): the slice bounds are position-dependent and will break
    if Craigslist reorders its sidebar — confirm against the live page.
    Uses the module-level compiled pattern ``category_pat``.
    """
    return category_pat.findall(content)[start:end]

def get_next_page(content):
    """Return the absolute URL of the next results page, or ``None``.

    Looks for the "next page" pagination link in *content* using the
    module-level pattern ``next_page_pat``.
    """
    matches = next_page_pat.findall(content)
    if matches:
        # The pattern captures a site-relative path; prefix the host.
        return 'https://chicago.craigslist.org/' + matches[0]
    return None

def get_job_list(content):
    """Return ``(job_url, job_name)`` tuples scraped from a listing page.

    Uses the module-level compiled pattern ``job_list_pat``.
    """
    return job_list_pat.findall(content)

def get_job_details(content):
    """Return the posting-body description from a job page, or ``''``.

    Applies the module-level pattern ``desc_pat`` to *content* and takes
    the first match; an empty string signals that no description was found.
    """
    matches = desc_pat.findall(content)
    return str(matches[0]) if matches else ''


def scrape_job_info(job_info, category_name):
    """Download one job posting and print its details.

    Args:
        job_info: ``(job_url, job_name)`` tuple as produced by
            :func:`get_job_list`.
        category_name: Human-readable category the job belongs to.

    Returns:
        The assembled job dict (also printed), so callers may collect
        results instead of relying on stdout.
    """
    job_url, job_name = job_info
    # Listing titles come HTML-escaped (&amp; etc.); decode them.
    job_name = unescape(job_name)

    job_dict = {'jobname': job_name, 'category': category_name}
    job_dict['JOBURL'] = job_url

    print('scraping', job_name)

    # BUG FIX: the original called get_category_list(job_url), which ran
    # the category regex over the URL *string* instead of downloading the
    # posting page — so the description was always empty. Fetch the page
    # content first, then extract the description from it.
    content = get_page_content(job_url)

    job_dict['Description'] = get_job_details(content)

    print(job_dict)
    return job_dict


def crawl_category(category_name, category_url):
    """Walk every results page of one category, scraping each job found.

    Follows the "next page" link until no further page exists.
    """
    page_url = category_url
    while page_url is not None:
        print(page_url)
        page_html = get_page_content(page_url)
        jobs = get_job_list(page_html)
        print(jobs)

        for job in jobs:
            scrape_job_info(job, category_name)

        # None means the pagination link was absent — we are done.
        page_url = get_next_page(page_html)


def crawl_website():
    """Entry point: crawl every job category on the Chicago homepage."""
    base = 'https://chicago.craigslist.org'
    homepage = get_page_content(base)

    # Each entry is (relative_url, category_name); build the absolute URL.
    for rel_url, name in get_category_list(homepage):
        crawl_category(name, base + rel_url)


if __name__ == '__main__':
    # Compiled patterns used by the helper functions above.  Because this
    # guard runs at module scope, the names are module-level and visible
    # to those functions — but only when the file is run as a script.
    # NOTE(review): importing this module would leave the patterns
    # undefined; consider moving these to the top of the file.
    #
    # FIX: removed an unused requests.get() of the homepage that the
    # original performed here — crawl_website() downloads it again anyway,
    # so the extra request was pure waste.
    category_pat = re.compile(r'<li><a href=\"(\/d\/[\w\-]+\/\w+\/\w+)\".+txt\">([\w\-\+\s+\/\<]+)<sup class')

    next_page_pat = re.compile(
        r'<a href=\"\/(.*)\" class=\"button next\" title=\"next\s+page\">next &gt; <\/a>\s+<span class=\"button next\" title=\"next page\">\s+next &gt;\s+<\/span>\s+<\/span>\s+<\/div>\s+<\/div>\s+.+\s+.+')

    job_list_pat = re.compile(r'<a href=\"(https://\w+\.craigslist.org/chc\/.+html)\".+hdrlnk\">([\w\s*]+)</a>')
    desc_pat = re.compile(r'<\/div>\s*<section id=\"postingbody\">.+html\"><\/div>\s*<\/div>(.+)<\/section><ul')
    # img_pat is currently unreferenced by the crawl; kept for future use.
    img_pat = re.compile(r'<img src=\"(.*jpg)\" title')

    crawl_website()
0

There are 0 best solutions below