I am trying to parse a Google Shopping page, and I want to do it faster than Selenium. I stumbled across requests_html and it's been working pretty well. I have almost everything I need from it, except for one element that it isn't parsing from the page. If you go to this Google Shopping page, you will notice that you can hover over some of the product images and see a second one. I am parsing the information from each product, but when it comes to the two images, for some reason requests_html is only retrieving the second (hovered) image and not the first (main) one. I have attached my code below. I have been trying to find a good way to represent the output of requests_html to show what it IS retrieving, but I haven't found a way to print it in a readable manner. To my knowledge, requests_html can render JavaScript on pages, so in my case it is just weird that it is getting everything but the first image. I have viewed the 'inspect' part of the page to get the HTML, and the class of the image's div that I am trying to get is '.gOenxf'. Why is requests_html not rendering the first image of each product?
# Caller side: iterate over whatever google_initiate() returns for this
# request. The per-item parsing is elided in this snippet.
for google_post in google_initiate(request):
# parse what is needed
def google_initiate(request):
    """Fetch the Google Shopping results page and return the result elements.

    A ``SearchForm`` is built from ``request.POST`` and validated. When it is
    valid, the hard-coded search URL is downloaded with an ``HTMLSession`` and
    every element matching ``.sh-dgr__gr-auto.sh-dgr__grid-result`` is
    selected.

    Args:
        request: The incoming request; only ``request.POST`` is read here.

    Returns:
        The list of matching elements, or an empty list when the form is not
        valid, so that callers can always iterate over the result.
    """
    form = SearchForm(request.POST or None)
    if not form.is_valid():
        # Nothing to search for; return an iterable instead of failing on an
        # unbound local at the return statement.
        return []

    url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'
    # The context managers close the response and the session even when the
    # request or the parsing raises.
    with HTMLSession() as session, session.get(url) as response:
        # NOTE(review): markup injected by JavaScript is only present after
        # response.html.render() has been called; confirm whether the missing
        # elements need it before enabling it here.
        print(response.html)  # debug output
        return response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
UPDATE:
import requests
from requests_html import HTMLSession
# Seller names for which the secondary-website field is left blank.
_SKIPPED_SELLERS = (
    'Amazon',
    'eBay',
    'Walmart',
    'AliExpress',
    'Craigslist',
    'Facebook Marketplace',
    'Oodle',
)
# Prefix of the redirect links found on the '.shntl' anchors.
_REDIRECT_PREFIX = '/url?url='

# Build one tuple per result element and append it to google_final_postings.
# Every field is best-effort: a missing element yields an empty / 'n/a' value
# instead of aborting the whole loop.
for google_post in google_initiate(request):
    post_website = 'Google'
    post_parse_page = google_initiate.google_parse_page

    # find(..., first=True) returns None when nothing matches, hence the
    # AttributeError handlers below.
    try:
        post_title = google_post.find('.Xjkr3b', first=True).text
    except AttributeError:
        post_title = ''

    # Read the href from the element's attributes rather than slicing it out
    # of the element's repr string.
    try:
        post_url = 'https://www.google.com' + google_post.find('.xCpuod', first=True).attrs['href']
    except (AttributeError, KeyError):
        post_url = ''

    try:
        post_second_website = google_post.find('.aULzUe.IuHnof', first=True).text
    except AttributeError:
        post_second_website = ''
    else:
        if any(seller in post_second_website for seller in _SKIPPED_SELLERS):
            post_second_website = ''

    try:
        redirect_href = google_post.find('.shntl', first=True).attrs['href']
    except (AttributeError, KeyError):
        post_second_url = ''
    else:
        if redirect_href.startswith(_REDIRECT_PREFIX):
            redirect_href = redirect_href[len(_REDIRECT_PREFIX):]
        # Keep only the part before the first '%' character.
        post_second_url = redirect_href.split('%')[0]

    # Secondary (hover) image: first <img> whose data-image-src holds an
    # 'encrypted' URL.
    post_second_image_url = NO_IMAGE
    for image in google_post.find('img'):
        hover_src = image.attrs.get('data-image-src', '')
        if 'encrypted' in hover_src:
            post_second_image_url = hover_src
            break

    try:
        post_price = google_post.find('.a8Pemb.OFFNJ', first=True).text.split()[0]
        # Pad the price to two decimal places.
        if '.' not in post_price:
            post_price += '.00'
        else:
            decimals = post_price.split('.')[1]
            # A negative repeat count yields '', so longer fractions are
            # left untouched.
            post_price += '0' * (2 - len(decimals))
        # Numeric sort key: drop thousands separators and the leading '$'.
        post_sort_by = float(post_price.replace(',', '').split('$')[1])
    except (AttributeError, IndexError, ValueError):
        post_price = 'n/a'
        post_sort_by = ''

    try:
        post_rating = google_post.find('.Rsc7Yb', first=True).text
    except AttributeError:
        post_rating = ''

    try:
        post_rating_quantity = google_post.find('.NzUzee', first=True).text.split()[1]
    except (AttributeError, IndexError):
        post_rating_quantity = ''

    # Main image: look for <img> elements at or below each '.gOenxf' match,
    # because an element's repr only shows its own attributes and would hide
    # a child image's src.
    # NOTE(review): if the src is filled in by JavaScript it will only be
    # present after response.html.render() - confirm against the live page.
    post_image_url = NO_IMAGE
    for container in google_post.find('.gOenxf'):
        for image in container.find('img'):
            main_src = image.attrs.get('src', '')
            if 'encrypted' in main_src:
                post_image_url = main_src
                break
        if post_image_url != NO_IMAGE:
            break

    google_final_postings.append((post_title, post_url, post_price, post_image_url, post_rating, post_rating_quantity, post_website, post_second_website, post_second_url, post_second_image_url, post_parse_page, post_sort_by))
def google_initiate(request):
    """Fetch the Google Shopping results page and return the result elements.

    A ``SearchForm`` is built from ``request.POST`` and validated. When it is
    valid, the hard-coded search URL is downloaded with an ``HTMLSession`` and
    every element matching ``.sh-dgr__gr-auto.sh-dgr__grid-result`` is
    selected.

    Args:
        request: The incoming request; only ``request.POST`` is read here.

    Returns:
        The list of matching elements, or an empty list when the form is not
        valid, so that callers can always iterate over the result.
    """
    form = SearchForm(request.POST or None)
    if not form.is_valid():
        # Nothing to search for; return an iterable instead of failing on an
        # unbound local at the return statement.
        return []

    url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'
    # The context managers close the response and the session even when the
    # request or the parsing raises.
    with HTMLSession() as session, session.get(url) as response:
        # NOTE(review): markup injected by JavaScript is only present after
        # response.html.render() has been called; confirm whether the missing
        # elements need it before enabling it here.
        # The selector must be one unbroken string literal.
        google_parsed = response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
        print(google_parsed)  # debug output
        return google_parsed
Assuming the content in question is dynamically injected by JavaScript, you need to call
response.html.render() before seeking the element. See the example in the official docs.