When I try to run my code I run into the issue below. I have defined a sitemap-based request for this scraping task, but it still does not work. Does anyone know how to deal with this issue in Python? How is the sitemap important in this case? Thanks in advance.
import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
process_singular_item,
process_date_item,
process_array_item,
process_plural_texts,
process_external_links,
process_article_text)
# XPath selectors for extracting article fields from CNN pages.
# Each constant is a list of candidate XPaths tried in order by the
# process_* helpers from content.item_functions.
HEADER_XPATH = ['//h1[@class="article-title"]//text()']
# Two byline variants observed on CNN pages.
AUTHOR_XPATH = ['//span[@class="cnnbyline"]//text()',
'//span[@class="byline"]//text()']
PUBDATE_XPATH = ['//span[@class="cnnDateStamp"]//text()']
# Empty selectors: tags/category are not scraped for this site.
TAGS_XPATH = ['']
CATEGORY_XPATH = ['']
# Article body text; two layout variants.
TEXT = ['//div[@id="storytext"]//text()',
'//div[@id="storycontent"]//p//text()']
# Links embedded within the story body.
INTERLINKS = ['//span[@class="inStoryHeading"]//a/@href']
# strptime/strftime format used by process_date_item for pubdate.
DATE_FORMAT_STRING = '%Y-%m-%d'
class CNNnewsSpider(ModSitemapSpider):
    """Sitemap-driven spider for CNN news articles.

    Discovers article URLs from CNN's news sitemap and extracts
    structured fields into a ContentItems item via the shared
    process_* helpers.
    """
    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    # BUG FIX: in the pasted code, parse() was defined at module level
    # (indentation was lost), so the class never overrode Spider.parse
    # and the default implementation raised NotImplementedError.
    # It must be indented inside the class as a method.
    def parse(self, response):
        """Extract one article from *response*.

        Returns a single-element list containing the populated
        ContentItems, as Scrapy expects an iterable of items.
        """
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        # Store the article's host (e.g. "edition.cnn.com") as its source.
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        return [item]
This is the traceback output I get:
File "/home/nik/project/lib/python3.5/site- packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50- years-joseph/index.html> from <GET http://www.cnn.com/2016/10/15/opinions/the- black-panthers-heirs-after-50-years-joseph/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/africa/montreal-climate-change-hfc- kigali/index.html> from <GET http://www.cnn.com/2016/10/15/africa/montreal- climate-change-hfc-kigali/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html> from <GET http://www.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html>
2016-10-17 18:48:04 [scrapy] ERROR: Spider error processing <GET http://edition.cnn.com/2016/10/15/politics/donald-trump-hillary-clinton-drug- test/index.html> (referer: http://edition.cnn.com/sitemaps/sitemap-news.xml)
Traceback (most recent call last):
File "/home/nik/project/lib/python3.5/site- packages/twisted/internet/defer.py", line 587, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/nik/project/lib/python3.5/site- packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
The exception is being thrown because your class `CNNnewsSpider` does not override the method `parse()` from `scrapy.BaseSpider`. Although you are defining a `parse()` method in the code you pasted, it is not being included in `CNNnewsSpider` because of indentation: instead, it is being defined as a standalone module-level function. You need to fix your indentation as follows: