Scrape Glassdoor data with Selenium


I'm scraping review data from Glassdoor. The code worked fine when I first finished it and tested it, but once I use it for large-scale scraping, a Cloudflare verification always appears. How can I get past it? Here is my code:

import csv
import time
import random
import logging
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

DEFAULT_URL = 'https://www.glassdoor.com/Reviews/index.htm?overall_rating_low=3.5&page=1&locId=1140588&locType=C'
COMPANY_NAMES = ['Super Microsoft Technology', 'NICE Holdings']

USERNAME = 'username'
PASSWORD = 'password'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(lineno)d:%(filename)s(%(process)d) - %(message)s')
ch.setFormatter(formatter)

def sign_in():
    # Log in through Glassdoor's inline login form (uses the global `browser`).
    url = 'https://www.glassdoor.com/profile/login_input.htm'
    browser.get(url)

    email_field = browser.find_element('css selector', 'input[autocomplete="username"]')
    email_field.send_keys(USERNAME)
    time.sleep(3)
    next_button = browser.find_element('xpath', '//*[@id="InlineLoginModule"]/div/div[1]/div/div/div/div/form/div[2]/button')
    next_button.click()

    time.sleep(3)

    password_field = browser.find_element('css selector', 'input[autocomplete="current-password"]')
    submit_btn = browser.find_element('xpath', '//*[@id="InlineLoginModule"]/div/div[1]/div/div/div/div/form/div[2]/button')

    password_field.send_keys(PASSWORD)
    submit_btn.click()

    time.sleep(3)
    browser.get(DEFAULT_URL)
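
def wait_out_cloudflare(timeout=180):
    # My attempt at handling the Cloudflare check (not a guaranteed bypass).
    # Assumption: the interstitial titles the page "Just a moment...", so I
    # poll the title and wait (or solve the challenge by hand) until it clears.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if 'just a moment' not in browser.title.lower():
            return True
        time.sleep(5)
    return False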

def navigate_to_reviews(company_name):
    # Search for the company, open its reviews page, and page through every review.
    browser.get(DEFAULT_URL)

    time.sleep(random.uniform(1, 3))

    search_field = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#companyAutocomplete-companyDiscover-employerSearch')))
    search_field.send_keys(company_name)

    search_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.gd-ui-button[data-test="company-search-button"]')))
    search_button.click()

    time.sleep(random.uniform(1, 3))

    search_results = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.d-flex.flex-column.my-std.mb-sm-0.css-1b46kjl a')))

    if search_results:
        first_result = search_results[0]
        first_result.click()

        time.sleep(random.uniform(1, 3))

        WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@data-test="ei-nav-reviews-link"]')))

        reviews_cell = browser.find_element('xpath', '//a[@data-test="ei-nav-reviews-link"]')
        reviews_path = reviews_cell.get_attribute('href')

        browser.get(reviews_path)
        time.sleep(random.uniform(1, 3))

        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[starts-with(@id, "empReview")]')))

        reviews = []

        while True:
            review_elements = browser.find_elements('xpath', '//*[starts-with(@id, "empReview")]')
            for review_element in review_elements:
                review = extract_pros_cons(review_element)
                reviews.append(review)

            next_button = browser.find_elements('xpath', '//button[@data-test="next-page" and not(@disabled)]')
            if not next_button:
                break

            next_button[0].click()
            time.sleep(random.uniform(1, 3))
            WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '//*[starts-with(@id, "empReview")]')))

        return reviews

    else:
        print("No search results found.")
        return []

def get_browser():
    # Configure an incognito Edge session with a spoofed Chrome user agent.
    logger.info('Configuring browser')
    edge_options = wd.EdgeOptions()
    edge_options.add_argument('log-level=3')
    edge_options.add_argument('--incognito')
    edge_options.add_argument('--disable-gpu')
    edge_options.add_argument('--no-sandbox')
    edge_options.add_argument('--disable-dev-shm-usage')
    edge_options.add_argument('--disable-extensions')
    edge_options.add_argument('--disable-setuid-sandbox')
    edge_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36')
    browser = wd.Edge(options=edge_options)
    return browser
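
def get_stealth_browser():
    # Alternative driver factory I have been experimenting with. Assumption:
    # undetected-chromedriver (pip install undetected-chromedriver) patches
    # enough browser fingerprints to get past the check. Note that it drives
    # Chrome rather than Edge, so this swaps browsers entirely.
    import undetected_chromedriver as uc
    opts = uc.ChromeOptions()
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    return uc.Chrome(options=opts)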

def extract_pros_cons(review_element):
    # Pull pros, cons, date, employee status, and overall rating from one review card.
    pros_element = review_element.find_element('css selector', '.review-details_pro__rMvtX span[data-test="review-text-pros"]')
    cons_element = review_element.find_element('css selector', '.review-details_con__9IvnD span[data-test="review-text-cons"]')
    date_element = review_element.find_element('css selector', '.timestamp_reviewDate__fBGY6')
    employee_status_element = review_element.find_element('css selector', '.review-details_employeeDetails__LuKJ7')
    overall_rating_element = review_element.find_element('css selector', '.review-details_overallRating__Rxhdr')

    show_more_button = review_element.find_elements('css selector', '.review-details_showMoreButton__x_JZx button')

    if show_more_button:
        show_more_button[0].click()
        time.sleep(random.uniform(1, 3))

    pros_text = pros_element.text.strip()
    cons_text = cons_element.text.strip()
    date_text = date_element.text.strip()
    employee_status_text = employee_status_element.text.strip()
    overall_rating_text = overall_rating_element.text.strip()

    return {
        "pros": pros_text,
        "cons": cons_text,
        "date": date_text,
        "employee_status": employee_status_text,
        "overall_rating": float(overall_rating_text)
    }

def write_to_csv(company_name, reviews):
    fieldnames = ['pros', 'cons', 'date', 'employee_status', 'overall_rating']

    filename = f"{company_name}.csv"
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for review in reviews:
            writer.writerow(review)

if __name__ == "__main__":
    browser = get_browser()
    sign_in()

    for company_name in COMPANY_NAMES:
        reviews = navigate_to_reviews(company_name)
        write_to_csv(company_name, reviews)

    browser.quit()

Some of the selectors may look odd, but they do pass my tests. How can I get past the verification so I can scrape reviews for about 1,000 companies?
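
For what it's worth, this is the kind of throttled retry loop I was planning to swap into the main block for the big run (just a sketch: the delay values are guesses, and wait_out_cloudflare is the helper I added after sign_in):

for company_name in COMPANY_NAMES:
    for attempt in range(3):
        try:
            reviews = navigate_to_reviews(company_name)
            write_to_csv(company_name, reviews)
            break
        except Exception:
            logger.exception('Scrape failed for %s (attempt %d)', company_name, attempt + 1)
            # A failure here often means the Cloudflare interstitial is up;
            # wait it out, then back off harder before retrying.
            if not wait_out_cloudflare():
                time.sleep(60 * (attempt + 1))
    # Long pause between companies to keep the request rate low.
    time.sleep(random.uniform(10, 30))

Even with this, the challenge still shows up once the volume grows, so I'd appreciate pointers on what actually works.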

Thanks a lot for any help!
