How do I connect my code to ChromeDriver in Airflow? Below is the code I have written. If anything is wrong, please correct it and point out the shortcomings clearly.
/dags/etl_pipeline_task_1.py
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from airflow.decorators import dag, task
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
import requests
from webdriver_manager.chrome import ChromeDriverManager

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2024, 3, 17),
    'email': ['[email protected]'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
}


@dag(
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
)
def task_ichsan():
    @task
    def export():
        url = "https://www.tokopedia.com/search?st=&q=hp%20samsung&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&navsource="

        # Chrome has to run headless inside the container (there is no display);
        # --no-sandbox and --disable-dev-shm-usage avoid common crashes in Docker.
        options = webdriver.ChromeOptions()
        options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # Selenium 4 no longer accepts the driver path as a positional argument,
        # so the path from webdriver-manager has to be wrapped in a Service.
        # Note: webdriver-manager only downloads chromedriver; Google Chrome
        # itself must also be installed in the container for this to work.
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()

        # find_all replaces the deprecated findAll
        containers = soup.find_all("div", attrs={'class': 'css-jza1fo'}, limit=1)
        print(containers)
        print("Selesai")

    scraped_data = export()


dag_run = task_ichsan()
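If it helps with debugging, my understanding is that the DAG can also be run once without the scheduler by adding something like this at the bottom of the file above (I believe DAG.test() exists from Airflow 2.5 onwards, but please correct me if I have that wrong):

# For local debugging only, not part of the scheduled DAG:
# running the file directly executes the whole DAG once in-process.
if __name__ == "__main__":
    dag_run.test()

Then running the file with plain python (wherever Chrome is actually available) should execute the export task once.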
Dockerfile
FROM apache/airflow:2.7.2
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN pip install --user --upgrade pip
RUN pip install requests
RUN pip install bs4
RUN pip install pandas
RUN pip install apache-airflow-providers-postgres
RUN pip install webdriver_manager
RUN pip install selenium
requirements.txt
apache-airflow==2.7.2
When I built and ran the image from this Dockerfile (which installs Selenium), Airflow reported this error:
Broken DAG: [/opt/airflow/dags/etl_pipeline_task1.py] Traceback (most recent call last):
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/opt/airflow/dags/etl_pipeline_task1.py", line 12, in <module>
from selenium import webdriver
ModuleNotFoundError: No module named 'selenium'
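I have not figured out why selenium is missing even though the Dockerfile installs it. To see which environment Airflow is really running, I was thinking of dropping a throwaway DAG like the sketch below into /dags (this is just my own debugging idea; the dag id debug_environment is made up):

from datetime import datetime

from airflow.decorators import dag, task


@dag(start_date=datetime(2024, 3, 17), schedule_interval=None, catchup=False)
def debug_environment():
    @task
    def show_packages():
        # Print the interpreter path and whether selenium is importable,
        # so the container's real environment shows up in the task log.
        import sys
        print("Python executable:", sys.executable)
        try:
            import selenium
            print("selenium version:", selenium.__version__)
        except ImportError as exc:
            print("selenium is NOT installed:", exc)

    show_packages()


debug_environment()

If selenium shows up there but my scraping DAG still breaks, then at least I know the problem is somewhere else.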
Lastly, I'm still confused about how to connect the code to ChromeDriver in Airflow. Please explain, good people :)
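The closest I have gotten to an answer is the sketch below: instead of downloading chromedriver inside the Airflow container, run the browser in a separate container (for example the selenium/standalone-chrome image) and let the task talk to it over the network with webdriver.Remote. The host name selenium-chrome and port 4444 are placeholders for a docker-compose service I have not actually written yet. Is this the right direction?

from selenium import webdriver

# Minimal sketch: the Airflow task drives a browser that runs in another
# container instead of launching Chrome itself. "selenium-chrome" is an
# assumed docker-compose service name for the selenium/standalone-chrome
# image, which serves the WebDriver endpoint on port 4444 by default.
options = webdriver.ChromeOptions()
driver = webdriver.Remote(
    command_executor="http://selenium-chrome:4444/wd/hub",
    options=options,
)
driver.get("https://www.tokopedia.com/")
print(driver.title)
driver.quit()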
I just want to learn to become an expert in data