Scrapyd spiders are finished but are still shown as Running in the web UI as well as in listjobs.json
I have deployed Scrapyd as a Docker container on Google Cloud Run. When I run the container locally, everything works fine. But when I deploy the same container on Google Cloud Run, spider jobs are not removed from the Running queue. The jobs finish, but they are never removed from the queue. Any thoughts?
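The stuck jobs are also visible through Scrapyd's listjobs.json API. A minimal check (the host, port and project name below are placeholders for my deployment) keeps listing the finished jobs under running:

import json
from urllib.request import urlopen

# Query Scrapyd's job listing; "myproject" is a placeholder project name.
with urlopen("http://localhost:6800/listjobs.json?project=myproject") as resp:
    data = json.load(resp)

# On Cloud Run the finished jobs never move out of "running".
print("running :", [job["id"] for job in data["running"]])
print("finished:", [job["id"] for job in data["finished"]])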
There are 2 answers below.
I met the same issue when I deployed Scrapyd on Cloud Run. The cause seems to be that the child process exits, but the processEnded method is never invoked; adding a processExited method that frees the slot solved it.
I temporarily worked around it by using my own launcher:
import sys
from datetime import datetime
from multiprocessing import cpu_count

from twisted.application.service import Service
from twisted.internet import reactor, defer, protocol, error
from twisted.python import log

from scrapyd import __version__
from scrapyd.interfaces import IPoller, IEnvironment, IJobStorage
from scrapyd.utils import get_crawl_args, native_stringify_dict


class Launcher(Service):

    name = 'launcher'

    def __init__(self, config, app):
        self.processes = {}
        self.finished = app.getComponent(IJobStorage)
        self.max_proc = self._get_max_proc(config)
        self.runner = config.get('runner', 'scrapyd.runner')
        self.app = app

    def startService(self):
        for slot in range(self.max_proc):
            self._wait_for_project(slot)
        log.msg(
            format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r',
            version=__version__,
            max_proc=self.max_proc,
            runner=self.runner,
            system='Launcher')

    def _wait_for_project(self, slot):
        poller = self.app.getComponent(IPoller)
        poller.next().addCallback(self._spawn_process, slot)

    def _spawn_process(self, message, slot):
        msg = native_stringify_dict(message, keys_only=False)
        project = msg['_project']
        args = [sys.executable, '-m', self.runner, 'crawl']
        args += get_crawl_args(msg)
        e = self.app.getComponent(IEnvironment)
        env = e.get_environment(msg, slot)
        env = native_stringify_dict(env, keys_only=False)
        pp = ScrapyProcessProtocol(slot, project, msg['_spider'],
                                   msg['_job'], env)
        pp.deferred.addBoth(self._process_finished, slot)
        reactor.spawnProcess(pp, sys.executable, args=args, env=env)
        self.processes[slot] = pp

    def _process_finished(self, _, slot):
        process = self.processes.pop(slot)
        process.end_time = datetime.now()
        self.finished.add(process)
        self._wait_for_project(slot)

    def _get_max_proc(self, config):
        max_proc = config.getint('max_proc', 0)
        if not max_proc:
            try:
                cpus = cpu_count()
            except NotImplementedError:
                cpus = 1
            max_proc = cpus * config.getint('max_proc_per_cpu', 4)
        return max_proc


class ScrapyProcessProtocol(protocol.ProcessProtocol):

    def __init__(self, slot, project, spider, job, env):
        self.slot = slot
        self.pid = None
        self.project = project
        self.spider = spider
        self.job = job
        self.start_time = datetime.now()
        self.end_time = None
        self.env = env
        self.logfile = env.get('SCRAPY_LOG_FILE')
        self.itemsfile = env.get('SCRAPY_FEED_URI')
        self.deferred = defer.Deferred()

    def outReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stdout" % self.pid)

    def errReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stderr" % self.pid)

    def connectionMade(self):
        self.pid = self.transport.pid
        self.log("Process started: ")

    def processEnded(self, status):
        if isinstance(status.value, error.ProcessDone):
            self.log("Process finished: ")
        else:
            self.log("Process died: exitstatus=%r " % status.value.exitCode)
        self.deferred.callback(self)

    # On Cloud Run, processEnded is not invoked, but processExited is;
    # forward it to processEnded so the slot is released.
    def processExited(self, status):
        self.processEnded(status)

    def log(self, action):
        fmt = '%(action)s project=%(project)r spider=%(spider)r job=%(job)r pid=%(pid)r log=%(log)r items=%(items)r'
        log.msg(format=fmt, action=action, project=self.project, spider=self.spider,
                job=self.job, pid=self.pid, log=self.logfile, items=self.itemsfile)
Finally, point the scrapyd.conf file at the custom launcher:
launcher = {your launcher module}.Launcher
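A minimal sketch of the relevant section of scrapyd.conf, assuming the Launcher class above lives in a module named my_scrapyd.launcher (the module path is just a placeholder):

[scrapyd]
# Replace my_scrapyd.launcher with the module that contains the Launcher class above.
launcher = my_scrapyd.launcher.Launcher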
As mentioned on GitHub:
Close the webdriver in the spider's close function, as in the sketch below:
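A minimal sketch of that pattern, assuming a Selenium webdriver stored on the spider (the spider name, start URL and driver setup are illustrative, not the original snippet):

import scrapy
from selenium import webdriver

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The driver is created once per spider run.
        self.driver = webdriver.Chrome()

    def parse(self, response):
        # ... parse the page, possibly via self.driver ...
        pass

    def closed(self, reason):
        # Scrapy calls this when the spider finishes; quitting the driver here
        # ensures no browser process keeps the Scrapyd job alive.
        self.driver.quit()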
For more information, you can refer to the link and documentation.