I am scraping products from Target using Scrapy, which has worked well for other retailers such as Amazon and Walmart. With Target, however, I hit a problem: even though the product page renders correctly in the browser, Target's server responds to the page request with HTTP status 404.
Investigation in the browser's Inspection -> Network tab also reveals a 404 status.
I attempted to use Scrapy Playwright to load the website in a browser and then scrape, but the issue persists.
For a clearer understanding, please refer to this brief video: Issue Link
Here is the code I used.
import datetime
import re
from scrapy.http import HtmlResponse
from .base_redis_spider import BaseRedisSpider
from ..enums.globals import RETAILERS, Component
class TargetSearchProductSpider(BaseRedisSpider):
    """Redis-fed spider that scrapes Target product pages through Playwright.

    NOTE(review): Target appears to return HTTP 404 on the product document
    itself even when the page renders in a browser (the product data is
    loaded client-side) — presumably why 404 is whitelisted in
    ``HTTPERROR_ALLOWED_CODES`` below so the callback still runs. Confirm
    against a live request.

    Fixes applied to the original:
      * ``parse`` used ``await`` inside a plain ``def`` — a SyntaxError;
        it is now ``async def`` (Scrapy supports async callbacks).
      * ``parse``/``errback`` read the meta key ``'playwright_browser'``,
        which scrapy-playwright never sets; with
        ``playwright_include_page: True`` the page object is exposed under
        ``'playwright_page'``.
      * Added the ``TWISTED_REACTOR`` setting that scrapy-playwright
        requires to operate at all.
    """

    custom_settings = {
        # Pipelines: stamp each item with a timestamp (400), then persist
        # it to Redis (501).
        'ITEM_PIPELINES': {
            "scrapers.pipelines.timestamp_pipeline.TimeStampPipeline": 400,
            "scrapers.pipelines.redis_item_store_pipeline.RedisItemStoragePipeline": 501,
        },
        # Route both schemes through scrapy-playwright so pages are
        # rendered in a real browser before parsing.
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # scrapy-playwright only works with the asyncio-based Twisted
        # reactor; without this setting the download handler cannot start.
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_BROWSER_TYPE": "firefox",
        # Target answers 404 on product documents; allow the callback to
        # run anyway instead of routing the response to HttpError.
        "HTTPERROR_ALLOWED_CODES": [404],
        "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": False},
    }
    name = "target"

    def __init__(self, *args, **kwargs):
        """Initialize retailer/environment/component state and Redis keys.

        Required kwargs: ``environment`` and ``component``. A missing kwarg
        now raises a clear ``KeyError`` instead of the opaque
        ``AttributeError`` that ``kwargs.get(...).upper()`` produced.
        """
        super().__init__(*args, **kwargs)
        self.retailer = RETAILERS.TARGET
        self.environment = kwargs['environment'].upper()
        self.component = kwargs['component'].upper()
        self.spider_key = self.generate_key()  # e.g. 'target_retail_production'
        self.redis_key = f"{self.spider_key}:start_urls"
        # Number of URLs fetched from Redis per attempt.
        self.redis_batch_size = 1
        # Max idle time (seconds) before the spider stops polling Redis
        # and shuts down.
        self.max_idle_time = 60

    def generate_key(self):
        """Return the spider key, e.g. ``'target_retail_production'``."""
        return f"{self.retailer.lower()}_{self.component.lower()}_{self.environment.lower()}"

    def modify_request(self, request):
        """Tag a Redis-sourced request for Playwright rendering.

        ``playwright_include_page`` makes scrapy-playwright attach the
        Playwright page object to ``response.meta['playwright_page']``,
        which the callback/errback must close themselves.
        """
        return request.replace(
            meta={'playwright': True, 'playwright_include_page': True},
            callback=self.parse,
            errback=self.errback,
        )

    async def parse(self, response, **kwargs):
        """Parse a Playwright-rendered product page and yield items."""
        # The Playwright page object (not 'playwright_browser' — that meta
        # key does not exist in scrapy-playwright).
        page = response.meta['playwright_page']
        # ...other logic ...
        # Release the page; with playwright_include_page=True this is the
        # spider's responsibility, otherwise pages leak.
        await page.close()
        yield {}

    async def errback(self, failure):
        """Log the failure and release the Playwright page on error."""
        self.logger.error(repr(failure))
        # Same meta key as in parse(): scrapy-playwright stores the page
        # under 'playwright_page'.
        page = failure.request.meta['playwright_page']
        await page.close()