Throw a NotImplementedError in Python

2.1k Views Asked by At

When I try to run my code I run into the error shown below. I have defined a real-time request for this scraper, but it still does not work. Does anyone know how to deal with this issue in Python? How important is the sitemap in this case? Thanks in advance.

import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
                                process_singular_item,
                                process_date_item,
                                process_array_item,
                                process_plural_texts,
                                process_external_links,
                                process_article_text)

# Extraction settings for the spider below. Every *_XPATH / TEXT / INTERLINKS
# value is a list of XPath strings that is handed to one of the process_*
# helpers from content.item_functions.

# Article headline.
HEADER_XPATH = [
    '//h1[@class="article-title"]//text()',
]

# Byline; two alternative class names are listed.
AUTHOR_XPATH = [
    '//span[@class="cnnbyline"]//text()',
    '//span[@class="byline"]//text()',
]

# Publication date stamp.
PUBDATE_XPATH = [
    '//span[@class="cnnDateStamp"]//text()',
]

# NOTE(review): tags and category hold a single empty string, i.e. no real
# XPath is configured for them here -- confirm how the helpers treat ''.
TAGS_XPATH = [
    '',
]
CATEGORY_XPATH = [
    '',
]

# Article body text; two alternative containers are listed.
TEXT = [
    '//div[@id="storytext"]//text()',
    '//div[@id="storycontent"]//p//text()',
]

# href attributes of links found under "inStoryHeading" spans.
INTERLINKS = [
    '//span[@class="inStoryHeading"]//a/@href',
]

# Year-month-day pattern passed to process_date_item together with
# PUBDATE_XPATH.
DATE_FORMAT_STRING = '%Y-%m-%d'


class CNNnewsSpider(ModSitemapSpider):
    """Spider settings for CNN: name, allowed domain and sitemap URL.

    NOTE(review): this class body contains only the three attributes below;
    no ``parse`` method is defined inside it.
    """

    # Identifier of this spider.
    name = 'cnn'
    # Domain list for the spider (presumably used by the base class to
    # restrict crawling -- ModSitemapSpider is not visible here).
    allowed_domains = ["cnn.com"]
    # Sitemap URL(s) the spider starts from.
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]


# NOTE(review): this ``def`` starts at column 0, so it is a module-level
# function that merely takes a parameter called ``self``; it is not a method
# of CNNnewsSpider.
def parse(self, response):
    """Build one ContentItems record from ``response``.

    Each field is filled by a process_* helper from content.item_functions,
    given the matching module-level XPath list; ``resource`` and ``link`` are
    taken directly from ``response.url``.

    Args:
        self: forwarded unchanged as the first argument of every process_*
            helper (presumably the spider instance -- confirm).
        response: object exposing ``.url``; also forwarded to the helpers.

    Returns:
        A list containing the single populated item.
    """
    items = []
    item = ContentItems()
    item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
    # Host part of the page URL, e.g. the site the article came from.
    item['resource'] = urlparse(response.url).hostname
    item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
    item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
    item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
    item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
    item['article_text'] = process_article_text(self, response, TEXT)
    item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
    item['link'] = response.url
    items.append(item)
    return items

This is the output I get:

File "/home/nik/project/lib/python3.5/site-packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-years-joseph/index.html> from <GET http://www.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-years-joseph/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-kigali/index.html> from <GET http://www.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-kigali/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html> from <GET http://www.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html>
2016-10-17 18:48:04 [scrapy] ERROR: Spider error processing <GET http://edition.cnn.com/2016/10/15/politics/donald-trump-hillary-clinton-drug-test/index.html> (referer: http://edition.cnn.com/sitemaps/sitemap-news.xml)
Traceback (most recent call last):
File "/home/nik/project/lib/python3.5/site-packages/twisted/internet/defer.py", line 587, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/nik/project/lib/python3.5/site-packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
1

There is 1 best solution below

5
On

The exception is thrown because your class CNNnewsSpider does not override the parse() method of Scrapy's base spider class (scrapy.Spider, formerly BaseSpider), whose default implementation simply raises NotImplementedError. Although you define a parse() function in the code you pasted, it is not part of CNNnewsSpider because of its indentation: it starts at column 0, so it is defined as a standalone module-level function instead of a method. You need to fix your indentation as follows:

class CNNnewsSpider(ModSitemapSpider):
    """CNN spider whose ``parse`` callback is defined inside the class."""

    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    # Indented one level so that it belongs to the class body and therefore
    # overrides the parse() inherited from the base spider.
    def parse(self, response):
        """Populate one ContentItems record from ``response``.

        Fields are filled in a fixed order by the process_* helpers, using
        the module-level XPath lists; ``resource`` and ``link`` come from
        ``response.url``.

        Returns:
            A one-element list holding the populated item.
        """
        article = ContentItems()
        article['title'] = process_singular_item(
            self, response, HEADER_XPATH, single=True)
        article['resource'] = urlparse(response.url).hostname
        article['author'] = process_array_item(
            self, response, AUTHOR_XPATH, single=False)
        article['pubdate'] = process_date_item(
            self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        article['tags'] = process_plural_texts(
            self, response, TAGS_XPATH, single=False)
        article['category'] = process_array_item(
            self, response, CATEGORY_XPATH, single=False)
        article['article_text'] = process_article_text(self, response, TEXT)
        article['external_links'] = process_external_links(
            self, response, INTERLINKS, single=False)
        article['link'] = response.url
        return [article]