Since crawling takes time, I am trying to build a progress bar on the front end by tracking the number of items my spider has crawled. The item count is compared against a cap, and once the cap is reached a CloseSpider exception is raised.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from ..items import LinkItem


class MySpider(CrawlSpider):
    name = "crawl"
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def __init__(self, allowed_domains=None, start_urls=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.allowed_domains = [allowed_domains]
        self.start_urls = [start_urls]
        self.item_count = 0
        self.max_item_count = 200
        self.loading_unit = self.max_item_count / 100  # one update per percent of the cap
        self.loading_progress = 0

    def update_loading(self):
        if self.item_count % self.loading_unit == 0:
            self.loading_progress = int(self.item_count / self.max_item_count * 100)
            if self.loading_progress == 100:
                self.loading_progress = 99

    def parse_item(self, response):
        if self.item_count >= self.max_item_count:
            raise CloseSpider("Reached maximum item count")
        content_type = response.headers.get('Content-Type', b'').decode('utf-8')
        if not content_type.startswith('text/html'):
            return
        if "english" in response.url:
            self.item_count += 1
            self.update_loading()
            link_item = LinkItem()
            link_item["url"] = response.url
            link_item["keyword"] = "english"
            print(f"{self.item_count}. Found url: {response.url} with keyword english")
            yield link_item
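For reference, LinkItem only holds the two fields used above; it looks roughly like this:

# items.py (simplified)
import scrapy


class LinkItem(scrapy.Item):
    url = scrapy.Field()
    keyword = scrapy.Field()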
The problem is that I haven't been able to send this loading progress to the client side. I've tried calling an emit function from the main app.
The spider modification:
    def update_loading(self):
        from app import emit_loading_progress
        if self.item_count % self.loading_unit == 0:
            self.loading_progress = int(self.item_count / self.max_item_count * 100)
            if self.loading_progress == 100:
                self.loading_progress = 99
            emit_loading_progress(self.loading_progress)
And the app.py:
from flask import Flask, render_template
from flask_socketio import SocketIO
import scrapy.crawler as crawler
from crawl.crawl.spiders.crawl import MySpider
from dotenv import load_dotenv
import os
from forms import YourForm
from multiprocessing import Process, Queue
from twisted.internet import reactor
import traceback
from scrapy.utils.project import get_project_settings
import pandas as pd

load_dotenv()

app = Flask(__name__)
app.config['SECRET_KEY'] = os.getenv('SECRET_KEY')
socketio = SocketIO(app)


def f(q, allowed_domains, start_urls):
    try:
        settings = get_project_settings()
        runner = crawler.CrawlerRunner(settings)
        deferred = runner.crawl(MySpider, allowed_domains=allowed_domains, start_urls=start_urls)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)


def run_spider(allowed_domains, start_urls):
    q = Queue()
    p = Process(target=f, args=(q, allowed_domains, start_urls))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        raise result


@socketio.on('submit')
def handle_submit(allowed_domains, start_urls):
    run_spider(allowed_domains, start_urls)
    emit_result()


def emit_result():
    file_path = os.path.join(os.path.dirname(__file__), 'output', 'links.jsonl')
    if os.path.exists(file_path):
        data = pd.read_json(path_or_buf=file_path, lines=True)
        json_data = data.to_dict(orient='records')
        socketio.emit('spider_closed', json_data)
        print("SPIDER CLOSED")


def emit_loading_progress(progress):
    socketio.emit('update_loading', progress)


@app.route('/')
def index():
    form = YourForm()
    return render_template('index.html', form=form)


@socketio.on_error()
def handle_error(e):
    print("An error occurred:", e)
    print(traceback.format_exc())


if __name__ == '__main__':
    socketio.run(app, debug=True, log_output=True)
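For completeness, the output/links.jsonl file that emit_result reads is produced by the feed export in the Scrapy project settings, something like:

# settings.py (roughly)
FEEDS = {
    'output/links.jsonl': {'format': 'jsonlines', 'overwrite': True},
}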
I was expecting the client to receive the update_loading event, but it never does.
What can I do to track the spider's progress and send real-time updates to the client?
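In case it helps, the direction I have been toying with (completely untested; progress_q, forward_progress and the extra parameters are just placeholder names) is to give the spider process a second multiprocessing Queue, have update_loading put the percentage on that queue instead of importing anything from app.py, and forward the values to the browser from the Flask process, roughly by replacing f and run_spider with:

def f(q, allowed_domains, start_urls, progress_q):
    try:
        settings = get_project_settings()
        runner = crawler.CrawlerRunner(settings)
        # Scrapy forwards extra keyword arguments to the spider's __init__,
        # so the spider could keep the queue as self.progress_queue and call
        # self.progress_queue.put(self.loading_progress) inside update_loading()
        # instead of calling emit_loading_progress.
        deferred = runner.crawl(MySpider, allowed_domains=allowed_domains,
                                start_urls=start_urls, progress_queue=progress_q)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
    finally:
        progress_q.put(None)  # sentinel so the forwarding task knows the crawl is over


def run_spider(allowed_domains, start_urls):
    q = Queue()
    progress_q = Queue()

    def forward_progress():
        # Runs in the Flask process, so this socketio instance is the one
        # the browser is actually connected to.
        while True:
            progress = progress_q.get()
            if progress is None:
                break
            socketio.emit('update_loading', progress)

    socketio.start_background_task(forward_progress)

    p = Process(target=f, args=(q, allowed_domains, start_urls, progress_q))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        raise result

I'm not sure whether blocking on progress_q.get() inside start_background_task plays nicely with whatever async mode Flask-SocketIO picks, which is part of why I'm asking.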