I'm very new to Python and to coding in general. I'm trying to make a web crawler that scrapes the data from an Overwatch player page (e.g. https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526). I tried using Portia, and it worked in the cloud, but I couldn't get it working when I exported it as Scrapy code. Here is a screenshot of my Portia spider.
Here is the code of my spider (exported from Portia as Scrapy code), owData.py:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity
from scrapy.spiders import Rule
from utils.spiders import BasePortiaSpider
from utils.starturls import FeedGenerator, FragmentGenerator
from utils.processors import Item, Field, Text, Number, Price, Date, Url, \
    Image, Regex
from items import PortiaItem


class Owdata(BasePortiaSpider):
    name = 'owData'
    allowed_domains = [u'playoverwatch.com']
    start_urls = \
        [u'https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526']
    rules = [Rule(LinkExtractor(allow=(), deny='.*'),
                  callback='parse_item', follow=True)]
    items = [[]]
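That items = [[]] line looks suspicious to me, since nothing in it describes what to extract. Based on exported examples I've seen for portia2code, I would have expected it to contain nested Item/Field definitions, something vaguely like the sketch below. The selectors and field layout here are only my guess (I made them up, Portia did not generate them for me), so I'm not even sure this is the right shape:

# My guess at a populated items definition for a portia2code export.
# The CSS selectors are placeholders I invented, not the real page structure.
items = [
    [
        Item(CareerOverviewOverwatch1Item, None, 'table tbody', [
            Field('field1', 'tr td:nth-child(1) *::text', []),
            Field('melee_final_blows', 'tr td:nth-child(2) *::text', []),
        ])
    ]
]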
And here is my items.py code:
from __future__ import absolute_import

import scrapy
from collections import defaultdict
from scrapy.loader.processors import Join, MapCompose, Identity
from w3lib.html import remove_tags
from .utils.processors import Text, Number, Price, Date, Url, Image


class PortiaItem(scrapy.Item):
    fields = defaultdict(
        lambda: scrapy.Field(
            input_processor=Identity(),
            output_processor=Identity()
        )
    )

    def __setitem__(self, key, value):
        self._values[key] = value

    def __repr__(self):
        data = str(self)
        if not data:
            return '%s' % self.__class__.__name__
        return '%s(%s)' % (self.__class__.__name__, data)

    def __str__(self):
        if not self._values:
            return ''
        string = super(PortiaItem, self).__repr__()
        return string


class CareerOverviewOverwatch1Item(PortiaItem):
    field1 = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    melee_final_blows = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    table = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    tr = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
When I run my spider with:
scrapy crawl owData -o data.csv
I just get an empty data.csv file. I'm guessing there's something wrong with my items? I think the XPath should just be //tbody, but again, I know next to nothing about Python, XPath or Scrapy...
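Would a plain Scrapy spider (skipping Portia completely) be a reasonable way to test that XPath? Something like the sketch below is what I had in mind; the //tbody/tr selector and the two-column assumption are just guesses at the page structure, not something I've verified:

import scrapy


class OwStatsSpider(scrapy.Spider):
    # Minimal spider with no Portia machinery; the selectors are guesses
    # at the career page's stat tables, not confirmed against the real HTML.
    name = 'owStats'
    allowed_domains = ['playoverwatch.com']
    start_urls = ['https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526']

    def parse(self, response):
        # Each stats table row is assumed to hold a stat name and a value.
        for row in response.xpath('//tbody/tr'):
            cells = row.xpath('td//text()').extract()
            if len(cells) >= 2:
                yield {'stat': cells[0], 'value': cells[1]}

Running that with scrapy crawl owStats -o data.csv should at least tell me whether the table data is actually in the downloaded HTML or loaded later by JavaScript.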