Python BeautifulSoup only scrapes the 1st page of Amazon reviews — does anyone know how to extract all of them, ideally without a proxy?

134 Views Asked by At

I am using BeautifulSoup on Python to try to scrape all Amazon reviews for a product, but it only extracts the 1st page (9 reviews). This code seems to have worked in the past for other users; maybe I need a proxy to avoid being blocked when changing pages? Thanks in advance!

I am using Python 3 on Colab for this test.

import requests

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import logging

# Browser-like request headers: Amazon serves a CAPTCHA/robot page to clients
# that look like scripts, so we mimic a real Chrome-on-ChromeOS browser.
headers = {
    "authority": "www.amazon.com",
    "pragma": "no-cache",
    "cache-control": "no-cache",
    "dnt": "1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-dest": "document",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
}

# Review-list URLs for one product (ASIN B07JXRWJ8D), pages 1-3.
# Pages 2+ use the `pageNumber` query parameter.
URLS = [
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2",
 "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_getr_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3"
]


def get_page_html(page_url: str, timeout: float = 30.0) -> str:
    """Fetch *page_url* with the module-level browser headers and return the body.

    Args:
        page_url: Fully-qualified URL of the review page to download.
        timeout: Seconds to wait for the server before giving up; the original
            code had no timeout, so a stalled connection would hang forever.

    Returns:
        The response body as text. NOTE(review): no HTTP-status check is done,
        so a blocked/CAPTCHA page is returned as-is, just like the original.
    """
    resp = requests.get(page_url, headers=headers, timeout=timeout)
    return resp.text


def get_reviews_from_html(page_html: str) -> BeautifulSoup:
    """Parse a review-page HTML document and return every review container.

    Each review on the page lives in a ``div.a-section.celwidget`` element;
    the returned list holds one soup node per review.
    """
    parsed = BeautifulSoup(page_html, "lxml")
    return parsed.find_all("div", {"class": "a-section celwidget"})


def get_review_date(soup_object: BeautifulSoup) -> str:
    """Extract the review's date line (e.g. "Reviewed in … on …") from one review node.

    Stripped for consistency with the other field extractors
    (get_review_text, get_review_header, ...), which all strip their text.
    """
    date_string = soup_object.find("span", {"class": "review-date"}).get_text()
    return date_string.strip()


def get_review_text(soup_object: BeautifulSoup) -> str:
    """Return the body text of one review, with surrounding whitespace removed."""
    body_span = soup_object.find(
        "span", {"class": "a-size-base review-text review-text-content"}
    )
    return body_span.get_text().strip()


def get_review_header(soup_object: BeautifulSoup) -> str:
    """Return the review's title line, stripped of surrounding whitespace."""
    title_anchor = soup_object.find(
        "a",
        {
            "class": "a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold"
        },
    )
    return title_anchor.get_text().strip()


def get_number_stars(soup_object: BeautifulSoup) -> str:
    """Return the star-rating label (e.g. "5.0 out of 5 stars") for one review."""
    rating_span = soup_object.find("span", {"class": "a-icon-alt"})
    return rating_span.get_text().strip()


def get_product_name(soup_object: BeautifulSoup) -> str:
    """Return the product/variant name shown on one review, stripped."""
    name_anchor = soup_object.find(
        "a", {"class": "a-size-mini a-link-normal a-color-secondary"}
    )
    return name_anchor.get_text().strip()


def orchestrate_data_gathering(single_review: BeautifulSoup) -> dict:
    """Run every field extractor over one review node and bundle the results.

    Returns a flat dict with one key per extracted field; key names are the
    column names of the final DataFrame.
    """
    record = {}
    record["review_text"] = get_review_text(single_review)
    record["review_date"] = get_review_date(single_review)
    record["review_title"] = get_review_header(single_review)
    record["review_stars"] = get_number_stars(single_review)
    record["review_flavor"] = get_product_name(single_review)
    return record


if __name__ == '__main__':
    # Scrape every URL, flatten the per-page reviews into one record list,
    # and save the result as a timestamped Excel file.
    logging.basicConfig(level=logging.INFO)
    all_results = []

    for u in URLS:
        logging.info(u)
        html = get_page_html(u)
        reviews = get_reviews_from_html(html)
        # One flat record per review found on this page.
        all_results.extend(orchestrate_data_gathering(rev) for rev in reviews)

    out = pd.DataFrame.from_records(all_results)
    # Lazy %-style args so formatting only happens if the level is enabled.
    logging.info("%s Is the shape of the dataframe", out.shape[0])
    # BUG FIX: the original format '%Y-%m-%d-%m' repeated the MONTH twice;
    # use hour and minute so runs on the same day get distinct filenames.
    save_name = f"{datetime.now().strftime('%Y-%m-%d-%H-%M')}.xlsx"
    logging.info("saving to %s", save_name)
    out.to_excel(save_name)
1

There is 1 solution below

1
Andrej Kesely On

You can try to use their Review pagination Ajax API to load more pages:

import re
from ast import literal_eval

import requests
from bs4 import BeautifulSoup

# Amazon's internal review-pagination AJAX endpoint — the same one the
# "Next page" button POSTs to in the browser.
url = "https://www.amazon.com/hz/reviews-render/ajax/reviews/get/ref=cm_cr_arp_d_paging_btm_next_2"

# A realistic User-Agent so the request isn't rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
}

# Form fields mirroring what the browser sends; most filters are left empty.
# `pageNumber` is overwritten inside the loop below.
payload = {
    "sortBy": "",
    "reviewerType": "all_reviews",
    "formatType": "",
    "mediaType": "",
    "filterByStar": "",
    "filterByAge": "",
    "pageNumber": "1",
    "filterByLanguage": "",
    "filterByKeyword": "",
    "shouldAppend": "undefined",
    "deviceType": "desktop",
    "canShowIntHeader": "undefined",
    "reftag": "cm_cr_arp_d_paging_btm_next_2",
    "pageSize": "10",
    "asin": "B07JXRWJ8D",  # <--- change product asin here
    "scope": "reviewsAjax0",
}


for page in range(1, 4):  # <--- change number of pages here
    payload["pageNumber"] = page

    # POST the form payload; the response is a stream of JS fragments, each
    # containing review HTML as a double-quoted (escaped) string literal.
    t = requests.post(url, data=payload, headers=headers).text

    # Pull out every quoted "<div id=...</div>" chunk, then use literal_eval
    # to decode the string-literal escapes back into raw HTML before parsing.
    # NOTE(review): this relies on Amazon's current AJAX response format —
    # if the markup ever stops matching the regex, the soup will be empty.
    soup = BeautifulSoup(
        "\n".join(map(literal_eval, re.findall(r'"<div id=.*?</div>"', t))),
        "html.parser",
    )

    # Each review carries data-hook="review"; print author name and body text.
    for r in soup.select('[data-hook="review"]'):
        print(r.select_one(".a-profile-name").text.strip())
        print(r.select_one('[data-hook="review-body"]').text.strip())
        print()

Prints:


...

Kindle Customer
We have 4 of these throughout my house now and with the app they’re so easy to use.The they haven’t raised my electric bill at all yet and they’re so quiet. The wall installation is perfect for smaller rooms and children. Never having to worry about them pushing them over, and they have a child lock on the screen. So convenient

Josh
Heats up my work trailer perfectly without having to worry about a electric heater constantly being on and being able to have it set to a exact temp.