I am trying to run a simple long-running task using Redis Queue (RQ), but every time I get a timeout error, even though I increased the timeout value in job = q.enqueue(run_scraper, temp_file, job_timeout=16600).
No matter what value I set, the job still fails with a timeout error.
Traceback:
01:17:18 Traceback (most recent call last):
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/worker.py", line 1061, in perform_job
rv = job.perform()
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/job.py", line 821, in perform
self._result = self._execute()
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/job.py", line 847, in _execute
coro_result = loop.run_until_complete(result)
File "/usr/lib/python3.9/asyncio/base_events.py", line 634, in run_until_complete
self.run_forever()
File "/usr/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
self._run_once()
File "/usr/lib/python3.9/asyncio/base_events.py", line 1869, in _run_once
event_list = self._selector.select(timeout)
File "/usr/lib/python3.9/selectors.py", line 469, in select
fd_event_list = self._selector.poll(timeout, max_ev)
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/timeouts.py", line 63, in handle_death_penalty
raise self._exception('Task exceeded maximum timeout value '
rq.timeouts.JobTimeoutException: Task exceeded maximum timeout value (16600 seconds)
Traceback (most recent call last):
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/worker.py", line 1061, in perform_job
rv = job.perform()
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/job.py", line 821, in perform
self._result = self._execute()
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/job.py", line 847, in _execute
coro_result = loop.run_until_complete(result)
File "/usr/lib/python3.9/asyncio/base_events.py", line 634, in run_until_complete
self.run_forever()
File "/usr/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
self._run_once()
File "/usr/lib/python3.9/asyncio/base_events.py", line 1869, in _run_once
event_list = self._selector.select(timeout)
File "/usr/lib/python3.9/selectors.py", line 469, in select
fd_event_list = self._selector.poll(timeout, max_ev)
File "/home/zerox/fp-google-search/venv/lib/python3.9/site-packages/rq/timeouts.py", line 63, in handle_death_penalty
raise self._exception('Task exceeded maximum timeout value '
rq.timeouts.JobTimeoutException: Task exceeded maximum timeout value (16600 seconds)
FastAPI code:
import fastapi as _fastapi
from fastapi.responses import HTMLResponse, FileResponse, Response
from starlette.requests import Request
from starlette.templating import Jinja2Templates
import shutil
import os
import json
from rq import Queue
from rq.job import Job
from redis import Redis
from scraper import run_scraper
from utils import clean_file, csv_writer
# Application object and the shared handles used by the route handlers.
app = _fastapi.FastAPI()

# Redis connection (local instance, database 0) that backs the job queue.
r = Redis(host="localhost", port=6379, db=0)

# RQ queue on which scraping jobs are enqueued.
q = Queue(connection=r)

# Template loader rooted at the "templates" directory.
templates = Jinja2Templates("templates")
@app.get("/")
def index(request: Request):
    """Render the ``index.html`` template for the incoming request."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
@app.post("/api/v1/scraped_csv")
async def extract_ads(csv_file: _fastapi.UploadFile = _fastapi.File(...)):
    """Store the uploaded CSV on disk and enqueue a scraping job for it.

    Args:
        csv_file: The uploaded file taken from the multipart request body.

    Returns:
        A dict with a status message and the id of the enqueued job, which
        can be passed to the ``/progress/{job_id}`` endpoint.
    """
    # NOTE(review): every upload is written to the same temp/temp<ext> path,
    # so a second upload overwrites the file of a job that is still queued.
    saved_path = _save_file_to_disk(csv_file, path="temp", save_as="temp")

    # The job is allowed to run for up to 16600 seconds before RQ aborts it.
    queued_job = q.enqueue(run_scraper, saved_path, job_timeout=16600)

    return {"message": "Scraping has been started", "job_id": queued_job.id}
@app.get("/progress/{job_id}")
def progress(job_id):
    """Report the state of a scraping job, or return its CSV once finished.

    Args:
        job_id: Id of the RQ job, as returned by the upload endpoint.

    Returns:
        A ``FileResponse`` with the cleaned CSV when the job has finished;
        otherwise a dict whose ``message`` says whether the job failed or is
        still running.

    Raises:
        fastapi.HTTPException: 404 when no job with ``job_id`` exists in Redis.
    """
    # Imported locally so the handler needs no new module-level import.
    from rq.exceptions import NoSuchJobError

    try:
        job = Job.fetch(job_id, connection=r)
    except NoSuchJobError:
        # An unknown or expired id is a client error, not a server crash.
        raise _fastapi.HTTPException(
            status_code=404, detail="Job not found."
        ) from None

    if job.is_finished:
        # NOTE(review): assumes ``clean_file`` (from utils) is the path of the
        # CSV written by the scraper -- confirm against utils.
        csv_path = os.path.abspath(clean_file)
        return FileResponse(path=csv_path, media_type="text/csv", filename=clean_file)

    if job.is_failed:
        # Without this branch a failed job (e.g. one killed by a
        # JobTimeoutException) would be reported as running forever.
        return {"message": "Scraper failed."}

    return {"message": "Scraper is running."}
def _save_file_to_disk(uploaded_file, path=".", save_as="default"):
    """Copy an uploaded file to disk and return the path it was written to.

    Args:
        uploaded_file: Object exposing ``filename`` (used only for its
            extension) and ``file`` (a readable binary file object).
        path: Directory to write into; created if it does not exist.
        save_as: Base name of the saved file, without extension.

    Returns:
        The path of the written file: ``path/save_as`` plus the extension
        taken from the uploaded file's name.
    """
    # A missing filename is treated as "no extension" instead of raising.
    extension = os.path.splitext(uploaded_file.filename or "")[-1]

    # Create the target directory up front so open() cannot fail on a missing
    # folder; an empty path means the current directory and needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)

    temp_file = os.path.join(path, save_as + extension)
    with open(temp_file, "wb") as buffer:
        shutil.copyfileobj(uploaded_file.file, buffer)
    return temp_file
I am new to integrating Redis Queue with scraping, so any guidance on solving or handling the timeout error would be much appreciated.
It turns out python-rq does not support asyncio directly, so I had to wrap the asyncio function in a synchronous function (for example, a plain def that calls asyncio.run(...) on the coroutine) to get it up and running. (Source)