I am trying to scrape furniture products from the Amazon website using the BeautifulSoup library. At first I tried scraping the site without any headers and got nothing; when I printed my soup it contained the message "to discuss automated access to amazon data please contact [email protected]", so Amazon had blocked me from scraping. I then added the headers `{'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}` and it worked fine, but only two or three times; after that I was blocked from scraping Amazon again. As I am a newbie to web scraping, I don't know what the solution to my problem is.
My code runs successfully the first couple of times and the scraped values get written to the CSV, but from the third run onward it returns nothing and the CSV file is empty too. I suspect this is the headers again. I have already tried solutions from different Stack Overflow questions, but they did not work for me. I also added proxies to my code (roughly as in the sketch after the code below), but that did not work either. Here is my code:
```python
from bs4 import BeautifulSoup
import requests


# Function to extract Product Title
def get_title(soup):
    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id": 'productTitle'})
        # Inner NavigableString Object
        title_value = title.string
        # Title as a string value
        title_string = title_value.strip()
        # # Printing types of values for efficient understanding
        # print(type(title))
        # print(type(title_value))
        # print(type(title_string))
        # print()
    except AttributeError:
        title_string = ""
    return title_string

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'id': 'priceblock_ourprice'}).string.strip()
    except AttributeError:
        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id': 'priceblock_dealprice'}).string.strip()
        except:
            price = ""
    return price

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("i", attrs={'class': 'a-icon a-icon-star a-star-4-5'}).string.strip()
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class': 'a-icon-alt'}).string.strip()
        except:
            rating = ""
    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).string.strip()
    except AttributeError:
        review_count = ""
    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id': 'availability'})
        available = available.find("span").string.strip()
    except AttributeError:
        available = "Not Available"
    return available

if __name__ == '__main__':
    # Headers for request
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    # The webpage URL
    URL = "https://www.amazon.com/s?k=furniture&crid=3C1AP0SFA5J8Y&sprefix=furniture%2Caps%2C389&ref=nb_sb_noss_1s"
    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS)
    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "lxml")
    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    # Store the links
    links_list = []
    # Loop for extracting links from Tag Objects
    for link in links:
        links_list.append(link.get('href'))
    # Loop for extracting product details from each link
    for link in links_list:
        File = open("product_record.csv", "a")
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "lxml")
        # Function calls to display all necessary product information
        print("Product Title =", get_title(new_soup))
        File.write(f"{get_title(new_soup)},")
        print("Product Price =", get_price(new_soup))
        File.write(f"{get_price(new_soup)},")
        print("Product Rating =", get_rating(new_soup))
        File.write(f"{get_rating(new_soup)},")
        print("Number of Product Reviews =", get_review_count(new_soup))
        File.write(f"{get_review_count(new_soup)},")
        print("Availability =", get_availability(new_soup))
        File.write(f"{get_availability(new_soup)},")
        File.close()
        print()
        print()
```
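
For reference, this is roughly how I added the proxies (a minimal sketch of what I tried; the proxy address below is just a placeholder, not the one I actually used):

```python
import requests

# Placeholder proxy address; I substituted a real HTTP/HTTPS proxy here
PROXIES = {
    "http": "http://111.111.111.111:8080",
    "https": "http://111.111.111.111:8080",
}

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Same search request as above, but routed through the proxy
webpage = requests.get(
    "https://www.amazon.com/s?k=furniture&crid=3C1AP0SFA5J8Y&sprefix=furniture%2Caps%2C389&ref=nb_sb_noss_1s",
    headers=HEADERS,
    proxies=PROXIES,
    timeout=30,
)
print(webpage.status_code)
```

Even with this, the result was the same: it worked once or twice and then I was blocked again.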