Scrapy webscraping an overwatch profile page

185 Views Asked by At

I'm very new to Python, and to coding in general. I'm trying to make a web crawler that scrapes the data from an Overwatch player page (e.g. https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526). I tried using Portia, and it worked in the cloud, but I couldn't get it working when I exported it as Scrapy code. Here is a screenshot of my Portia spider.

Here is the code of my spider (exported from portia as scrapy): owData.py

from __future__ import absolute_import



   #!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity
from scrapy.spiders import Rule

from utils.spiders import BasePortiaSpider
from utils.starturls import FeedGenerator, FragmentGenerator
# Image and Regex belong to this import; left on a line of their own they form
# a bare tuple expression that references two names nothing has bound yet.
from utils.processors import (Item, Field, Text, Number, Price, Date, Url,
                              Image, Regex)
from items import PortiaItem


class Owdata(BasePortiaSpider):
    """Spider named 'owData' that starts from one playoverwatch.com career page."""

    name = 'owData'
    allowed_domains = [u'playoverwatch.com']
    start_urls = [
        u'https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526',
    ]
    # deny='.*' matches every URL, so this rule never extracts a link to
    # follow; only the start URL itself is requested.
    rules = [
        Rule(LinkExtractor(allow=(), deny='.*'),
             callback='parse_item', follow=True),
    ]
    # Declared on the class next to `rules`; at module level it would only be
    # an unrelated global.
    # NOTE(review): presumably BasePortiaSpider reads `self.items` for its
    # extraction definitions -- confirm against utils/spiders.py.
    # NOTE(review): the single inner list is empty, so no item definitions
    # are declared here.
    items = [[]]

And here is my items.py code:

 from __future__ import absolute_import

import scrapy
from collections import defaultdict
from scrapy.loader.processors import Join, MapCompose, Identity
from w3lib.html import remove_tags
from .utils.processors import Text, Number, Price, Date, Url, Image


class PortiaItem(scrapy.Item):
    """Base item whose field table supplies a default Field for any key.

    The three special methods below must live inside the class body: they
    take ``self`` and call ``super(PortiaItem, self)``, so as module-level
    functions they would never be used by item instances.
    """

    # Looking up a key that was never declared yields a Field whose input
    # and output processors are both Identity().
    fields = defaultdict(
        lambda: scrapy.Field(
            input_processor=Identity(),
            output_processor=Identity()
        )
    )

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` directly in ``self._values``."""
        # NOTE(review): presumably this bypasses the base class's check that
        # the key is a declared field -- confirm against scrapy.Item.
        self._values[key] = value

    def __repr__(self):
        """Return ``ClassName(data)``, or just ``ClassName`` when empty."""
        data = str(self)
        if not data:
            return self.__class__.__name__
        return '%s(%s)' % (self.__class__.__name__, data)

    def __str__(self):
        """Return '' when no values are set, else the base class repr."""
        if not self._values:
            return ''
        return super(PortiaItem, self).__repr__()


class CareerOverviewOverwatch1Item(PortiaItem):
    """Item declaring the fields field1, melee_final_blows, table and tr."""

    # Every field uses the same pair: Text() on input, Join() on output.
    field1 = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    melee_final_blows = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    table = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )
    # NOTE(review): the pasted listing breaks off inside this call; only the
    # closing parenthesis is restored. Further fields may have followed.
    tr = scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )

When I run my spider using:

scrapy crawl owData -o data.csv

I just get an empty data.csv file. I'm guessing there's something wrong with my items? I think the XPath line should just be //tbody, but again, I know nothing about Python, XPath or Scrapy...

0

There are 0 best solutions below