How to change the webdriver_manager cache path to a custom path in a Cloud Functions environment

1k Views Asked by At

I'm trying to create a headless web scraper that runs as a Cloud Function. I use Selenium to drive Chrome, with the driver binary provided by webdriver_manager.

Can you please tell me how to change the webdriver_manager cache path (wdm.cachePath) to suit this virtual environment? Below are my code and the error I'm getting.

import os
import logging
# selenium 4
# webdriver_manager settings, exported before webdriver_manager is imported
# further down.
# NOTE(review): the meanings below follow the webdriver_manager README as
# best understood — confirm against the installed version.
# GH_TOKEN: presumably a GitHub API token (this value looks like a placeholder).
os.environ['GH_TOKEN'] = "gkjkjhjkhjhkjhuihjhgjhg"
# WDM_LOG: set to the numeric value of logging.NOTSET, rendered as a string.
os.environ['WDM_LOG'] = str(logging.NOTSET)
# WDM_LOCAL: presumably makes the driver cache live under the project
# directory instead of the home directory — TODO confirm.
os.environ['WDM_LOCAL'] = '1'
# WDM_SSL_VERIFY: presumably disables SSL verification for driver downloads.
os.environ['WDM_SSL_VERIFY'] = '0'

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def hello_world(request):
    """Start headless Chrome and, if the request names a URL, load a test page.

    A Chrome driver is created first, using a binary installed by
    ChromeDriverManager. If the JSON body of the request contains a ``url``
    key, the handler opens https://www.geeksforgeeks.org, prints the page
    title and closes the browser window.

    Args:
        request (flask.Request): HTTP request object.
    Returns:
        ``'Success!'`` when the JSON body contains a ``url`` key, otherwise
        ``'Not run'``. Either value can be turned into a Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    # Options lets us configure Chrome; headless means it runs without a UI.
    options = Options()
    print("options")
    options.headless = True
    # This is the statement the traceback points at: install() ends up in
    # os.makedirs while saving the downloaded driver to the cache.
    # The raw string keeps both backslashes of every pair, and the path is a
    # relative, Windows-style one.
    # NOTE(review): presumably this location is not writable inside the Cloud
    # Functions runtime — confirm which directory is writable there.
    # NOTE(review): "2.26" is presumably the requested driver version — confirm.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager("2.26", cache_valid_range=1, path = r".\\temp\\Drivers").install()
), options=options)

    print("driver was initiated")

    # Earlier attempts, kept for reference:
    # driver = webdriver.Chrome(options=options)
    # driver = webdriver.Chrome(ChromeDriverManager(path = r"/temp/data").install())

    request_json = request.get_json()

    if request_json and 'url' in request_json:
        # NOTE(review): `url` is read but never used; the page loaded below is
        # hard-coded.
        url = request_json['url']
        driver.get('https://www.geeksforgeeks.org')
        print(driver.title)
        driver.close()
        return f'Success!'
    else:
        # NOTE(review): the driver created above is not closed on this path.
        return f'Not run'
    

Error logs -

Traceback (most recent call last): File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 2525, in wsgi_app response = self.full_dispatch_request() File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1822, in full_dispatch_request rv = self.handle_user_exception(e) File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1820, in full_dispatch_request rv = self.dispatch_request() File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1796, in dispatch_request return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args) File "/layers/google.python.pip/pip/lib/python3.10/site-packages/functions_framework/__init__.py", line 98, in view_func return function(request._get_current_object()) File "/workspace/main.py", line 28, in hello_world driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager("2.26", cache_valid_range=1, path = r".\\temp\\Drivers").install() File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/chrome.py", line 39, in install driver_path = self._get_driver_path(self.driver) File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/manager.py", line 31, in _get_driver_path binary_path = self.driver_cache.save_file_to_cache(driver, file) File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/driver_cache.py", line 45, in save_file_to_cache archive = save_file(file, path) File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/utils.py", line 38, in save_file os.makedirs(directory, exist_ok=True) File "/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs makedirs(head, exist_ok=exist_ok) File "/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs makedirs(head, exist_ok=exist_ok) File 
"/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs makedirs(head, exist_ok=exist_ok)

I think the error occurs because webdriver_manager tries to save the driver to its cache at some static path. I have already tried changing the path setting using

path = r".\\temp\\Drivers"

How to do it correctly?

1

There is 1 best solution below

1
On

So I figured this out...

import os
import logging
# selenium 4

# Set webdriver_manager's WDM_LOG to the numeric value of logging.NOTSET
# before webdriver_manager is imported below.
# NOTE(review): presumably this silences webdriver_manager's log output — confirm.
os.environ['WDM_LOG'] = str(logging.NOTSET)

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome

# XPath of the scrollable pane that holds the reviews.
# NOTE(review): presumably the Google Maps reviews panel — confirm; these
# absolute paths break whenever the page layout changes.
_REVIEWS_PANE_XPATH = (
    '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
)
# XPath of the element whose text starts with the total number of reviews.
_REVIEW_COUNT_XPATH = (
    '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
    '/div[2]/div/div[2]/div[2]'
)


def _collect_reviews(driver, url):
    """Load *url*, scroll the reviews pane and return the review texts found."""
    # NOTE(review): `url` comes straight from the request body; restrict it to
    # the expected host if this function is reachable by untrusted callers.
    driver.get(url)
    # Crude wait so that the page content has a chance to load.
    time.sleep(3)

    scrollable_div = driver.find_element('xpath', _REVIEWS_PANE_XPATH)
    # The first space-separated token of the text is parsed as the review
    # count, with thousands separators stripped ("1,234 ..." -> 1234).
    count_text = driver.find_element('xpath', _REVIEW_COUNT_XPATH).text
    total_reviews = int(count_text.split(' ')[0].replace(',', ''))
    time.sleep(3)
    print(total_reviews)

    # Scroll to the bottom of the pane once per review, capped at 500 rounds.
    # NOTE(review): 500 rounds at 1.5 s each is 750 s of sleeping — check this
    # against the function's configured timeout.
    for _ in range(min(total_reviews, 500)):
        driver.execute_script(
            'arguments[0].scrollTop = arguments[0].scrollHeight',
            scrollable_div,
        )
        time.sleep(1.5)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [span.text for span in soup.find_all('span', class_='wiI7pd')]


def hello_world(request):
    """Scrape review texts from the page named in the request's JSON body.

    The request is validated before any browser is started, so a request
    without a usable ``url`` costs nothing. Otherwise headless Chrome is
    launched with a driver installed by ChromeDriverManager (default cache
    location), the reviews are collected and printed, and the browser is
    always shut down again — also when scraping raises.

    Args:
        request (flask.Request): HTTP request object whose JSON body is
            expected to be an object with a ``url`` key.
    Returns:
        ``'Success!'`` after the page was scraped, or ``'Not run'`` when the
        body is missing, is not valid JSON, or has no ``url`` key. Either
        value can be turned into a Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    Raises:
        Any exception Selenium raises while driving the page (for example
        when one of the XPath lookups matches nothing) or ``ValueError`` when
        the review count cannot be parsed; the browser is quit first.
    """
    # silent=True turns a missing or malformed JSON body into None instead of
    # an HTTP error, so such requests fall through to 'Not run'.
    request_json = request.get_json(silent=True)
    if not (isinstance(request_json, dict) and 'url' in request_json):
        return 'Not run'
    url = request_json['url']

    opts = Options()
    opts.add_experimental_option("detach", True)
    # Pass the flag directly: the `Options.headless` setter is deprecated in
    # Selenium 4.8 and removed in 4.13, where assigning it silently does
    # nothing and Chrome would try to start with a UI.
    opts.add_argument("--headless")

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager(cache_valid_range=1).install()),
        options=opts,
    )
    print("driver was initiated")
    try:
        restaurant__reviews = _collect_reviews(driver, url)
        print(restaurant__reviews)
    finally:
        # quit() ends the whole session (browser and chromedriver process);
        # close() would only close the current window.
        driver.quit()
    return 'Success!'