import asyncio
import json
import httpx
from nested_lookup import nested_lookup
from parsel import Selector
# create HTTPX client with headers that resemble a web browser
client = httpx.AsyncClient(
    http2=True,
    follow_redirects=True,
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    },
)
def parse_nextjs(html: str) -> dict:
    """extract the Next.js page cache (JSON) embedded in the HTML"""
    selector = Selector(html)
    data = selector.css("script#__NEXT_DATA__::text").get()
    if not data:
        # fall back to the query cache script, which is a JS assignment
        # rather than pure JSON; keep only the right-hand side of the "="
        data = selector.css("script[data-name=query]::text").get()
        data = data.split("=", 1)[-1].strip().strip(";")
    data = json.loads(data)
    return data
async def scrape_product(url: str) -> dict:
    """scrape a single StockX product page for product data"""
    response = await client.get(url)
    assert response.status_code == 200
    data = parse_nextjs(response.text)
    # extract all product datasets from the page cache
    products = nested_lookup("product", data)
    # find the dataset whose urlKey matches the current page URL
    # (guard against non-dict entries and missing urlKey values)
    try:
        product = next(
            p for p in products
            if isinstance(p, dict) and p.get("urlKey") and p["urlKey"] in str(response.url)
        )
    except StopIteration:
        raise ValueError("Could not find product dataset in page cache", response)
    return product
# example use:
url = "https://stockx.com/amiri-skel-top-low-white-black-white"
print(asyncio.run(scrape_product(url)))
I don't get what's wrong here. If anyone can tell me what's incorrect, I'd really appreciate it. I expect a number/price to be returned, but instead I'm getting errors left, right, and center. I've rebooted, uninstalled and reinstalled everything, and can't find anything on Google.
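For context, this is roughly how I planned to pull a price out of the product dict that scrape_product returns. The key names here (lastSale, lowestAsk, highestBid) are my guesses at what the cache contains and may not match the actual page data:

from nested_lookup import nested_lookup

def find_prices(product: dict) -> dict:
    """dig price-like fields out of the nested product dict (key names are guesses)"""
    prices = {}
    for key in ("lastSale", "lowestAsk", "highestBid"):
        values = nested_lookup(key, product)
        if values:
            prices[key] = values[0]
    return prices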
I had the same error. Looking at the parser.py file in the cssselect 1.1.0 package, the PseudoElement variable wasn't initialised, hence the error. This has been fixed in cssselect 1.2.0, so I would advise updating the package and trying again. It worked for me after the update.
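To check which cssselect version you have installed (this assumes Python 3.8+, where importlib.metadata is in the standard library):

from importlib.metadata import version

# want 1.2.0 or newer for the PseudoElement fix
print(version("cssselect"))

If it reports something older than 1.2.0, run pip install --upgrade cssselect and re-run the scraper.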