Newspaper3k: filter out bad URLs while extracting

389 Views Asked by At

With some help ;) I have managed to scrape titles and content from CNN news website and put this in a .csv file.

Now the list of URLs (which was extracted with another script) contains some bad URLs. The code for this is really simple, as it just scans through the website and returns all URLs. Therefore the list has some bad URLs (e.g. http://cnn.com/date/2021-10-17). Rather than searching this list and removing those bad URLs manually, I was wondering if this could be resolved by changing my code to skip bad URLs and continue with the next one, and so on.

example code:

import csv
from newspaper import Config
from newspaper import Article
from os.path import exists

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html', 'http://cnn.com/date/2021-10-17', 'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']
# the above normally would be where I refer to the .csv file with URLs

# Output file and column layout, shared by every iteration.
CSV_PATH = 'cnn_extraction_results.csv'
FIELDNAMES = ['article title', 'article text']

for url in urls:
    # Download and parse one article; title/text become one CSV row.
    article = Article(url, config=config)
    article.download()
    article.parse()
    article_meta_data = article.meta_data

    # Append mode creates the file if it does not exist yet, so a single
    # code path replaces the previous duplicated 'w'/'a' branches; the
    # header is written only when the file is first created.
    write_header = not exists(CSV_PATH)
    with open(CSV_PATH, 'a', newline='') as file:
        writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=FIELDNAMES)
        if write_header:
            writer.writeheader()
        writer.writerow({'article title': article.title,
                         'article text': article.text})
1

There is 1 best solution below

Answered by "Life is complex" — BEST ANSWER

Try this:

import csv
from os.path import exists
from newspaper import Config
from newspaper import Article
from newspaper import ArticleException

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html',
        'http://cnn.com/date/2021-10-17',
        'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']

# Output file and column layout, shared by every iteration.
CSV_PATH = 'cnn_extraction_results.csv'
FIELDNAMES = ['article title', 'article text']

for url in urls:
    # Keep the try body limited to the network/parse steps that can
    # actually raise ArticleException; a bad URL is reported and skipped
    # so the remaining URLs are still processed.
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
    except ArticleException:
        print('***FAILED TO DOWNLOAD***', url)
        continue

    article_meta_data = article.meta_data

    # Append mode creates the file if it does not exist yet, so a single
    # code path replaces the previous duplicated 'w'/'a' branches; the
    # header is written only when the file is first created.
    write_header = not exists(CSV_PATH)
    with open(CSV_PATH, 'a', newline='') as file:
        writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=FIELDNAMES)
        if write_header:
            writer.writeheader()
        writer.writerow({'article title': article.title,
                         'article text': article.text})