Splash (+Scrapy) does not render web page correctly


I'm using Scrapy + Splash and I have problems downloading this page: http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText=

It seems that Splash cannot execute the JavaScript correctly. Here is a stripped-down, working, self-contained version of my program (sorry if it is not stripped down as much as it could be).

# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import sys
import io
import os
import base64

def saveFile(ss, fileNameExt, folderName):
    f = open(folderName + '/' + fileNameExt, 'w')
    f.write(ss)
    f.close()
    return fileNameExt

def savePng(png_bytes, fileNameExt, folderName):
    f = open( folderName +'/' + fileNameExt, 'wb')
    f.write(png_bytes)
    f.close()
    return fileNameExt

def savePageOriginalInFolder(response, folderName, chiave='pag1'):
    fileName = "site.html"
    testo = response.data[chiave].decode('utf8')       
    return saveFile(testo, fileName, folderName)

def savePagePng(response, folderName, pngDataName):
    fileName = 'site.png'
    if hasattr(response, 'data'):
        png_bytes = base64.b64decode(response.data[pngDataName])
        return savePng(png_bytes, fileName, folderName)

class GenericoSpider(scrapy.Spider):
    name = 'provaAbb'

    def asSplashRequest(self, url, callback, id_elenco="no_id", id_sessione="no_id_sessione"):
        return SplashRequest(
                    url = url,
                    endpoint='execute',
                    args={'lua_source': self.script, 'id_elenco': id_elenco, 'id_sessione': id_sessione},
                    callback=callback,
                )

    outDir = name  # take the output folder name from the spider name
    db_name = ""

    def start_requests(self):   
        sito = 'http://new.abb.com/jobs/it/center#JobCountry=IT&JobCity=any&JobFunction=any&JobRole=any&JobText='
        yield self.asSplashRequest(sito, self.parse_list, 'id_mio_elenco')

    script = """
    function main(splash)
      local url = splash.args.url
      splash:set_viewport_size(1280, 2500)      
      splash:init_cookies(splash.args.cookies)
      assert(splash:go(url))
      assert(splash:wait(10))
      return {
        url  = splash:url(),
        pag1 = splash:html(),
        png1  = splash:png(),
        id_elenco = splash.args.id_elenco,
        id_sessione = splash.args.id_sessione,

        cookies = splash:get_cookies(),
        tt = splash.args
      }
    end
    """
    def parse_list(self, response):
        for ss in response.data:
            if len(ss) >= 4:
                if ss[0:3] == 'pag':
                    fileName = savePageOriginalInFolder(response, self.outDir, ss)
                elif ss[0:3] == 'png':
                    fileName = savePagePng(response, self.outDir, ss)

Part of my settings.py (the Splash server address is shown right after this block):

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
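
Not shown above is the Splash server address that scrapy-splash requires; in my settings it points at a local Splash instance (the address below is only illustrative, not my actual value):

SPLASH_URL = 'http://localhost:8050'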

Result: as you can see in the screenshot of the rendered page, there is still the loading spinner in the list area and the page numbers are not loaded (increasing the wait time in the Lua script, as shown below, did not solve the problem).
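
For reference, this is roughly what the longer-wait variant of the script looked like; the 30-second value is just one of the values I tried, and the rest of main() is identical to the script above:

    script = """
    function main(splash)
      splash:set_viewport_size(1280, 2500)
      splash:init_cookies(splash.args.cookies)
      assert(splash:go(splash.args.url))
      -- the only change: a longer wait (the value is illustrative)
      assert(splash:wait(30))
      return {
        url  = splash:url(),
        pag1 = splash:html(),
        png1 = splash:png(),
      }
    end
    """

Even with the longer wait, the returned HTML and PNG still show the spinner instead of the job list.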
