I'm trying to scrape a website, but the products don't have an href.
I'm using Selenium to navigate the site and BeautifulSoup to digest the results, but I don't know how to open every product to get more data.
You can use Wallapop's API to get this data. There are many ways to handle it; I suggest using the Scrapy framework. Here is the code to get the item URLs (and save each item's data as a JSON file) using Scrapy:
import json

import scrapy


class WallapopSpider(scrapy.Spider):
    name = "wallapop"
    search_url = "https://api.wallapop.com/api/v3/general/search?keywords={keywords}&latitude={latitude}&longitude={longitude}&start={start}&items_count={items_count}&filters_source=search_box&order_by={order_by}"
    item_url = "https://api.wallapop.com/api/v3/items/"

    def start_requests(self):
        meta = {
            "keywords": "monitor",
            "latitude": 39.46895,
            "longitude": -0.37686,
            "start": 0,
            "items_count": 40,
            "order_by": "most_relevance",
        }
        yield scrapy.Request(
            url=self.search_url.format(**meta), callback=self.parse_search
        )

    def parse_search(self, response):
        # parsing the response into a dict
        dict_data = json.loads(response.text)
        # iterating over the search results
        for item in dict_data["search_objects"]:
            yield scrapy.Request(
                url=self.item_url + item["id"], callback=self.parse_item
            )

    def parse_item(self, response):
        # parsing the response into a dict
        item_data = json.loads(response.text)
        print(item_data["share_url"])
        print(item_data["description"])
        # saving the item data on a json file
        with open(f"{item_data['id']}.json", "w") as f:
            json.dump(item_data, f)
I found both API endpoints by inspecting the network traffic in the browser dev tools. I'm using Valencia's latitude and longitude values and "monitor" as the keyword. The required parameters to run the query are keywords, latitude and longitude.
I can also suggest running the crawler on a spider management solution like estela.
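If you don't have a full Scrapy project set up, you can also run the spider straight from a script with Scrapy's CrawlerProcess (the scrapy runspider CLI command works as well). A minimal sketch, assuming the spider class above is defined in the same file:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(WallapopSpider)  # the spider class defined above
process.start()  # blocks until the crawl finishes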
While I think the API is more efficient, if you want a selenium+bs4 solution, here is an example.
Using these functions:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin
## scroll to an element and click [targetEl can be an element or a selector] ##
def scrollClick(driverX, targetEl, maxWait=5, scroll2Top=False, printErr=True):
    try:
        xWait = WebDriverWait(driverX, maxWait)
        if isinstance(targetEl, str):  # if a selector was passed, resolve it to an element
            xWait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, targetEl)))
            targetEl = driverX.find_element(By.CSS_SELECTOR, targetEl)
        xWait.until(EC.element_to_be_clickable(targetEl))
        driverX.execute_script('''
            arguments[0].scrollIntoView(arguments[1]);
        ''', targetEl, bool(scroll2Top))  ## execute js to scroll
        targetEl.click()
    except Exception as e:
        if printErr: print(repr(e), '\nFailed to click', targetEl)
## find a nextSibling of refEl that matches selector [if specified by sel] ##
def selectNextSib(driverX, refEl, sel=False, printError=False):
    sel = sel.strip() if isinstance(sel, str) and sel.strip() else False
    try:  ## execute js code to find next card
        return driverX.execute_script('''
            var sibling = arguments[0].nextElementSibling;
            while (sibling && arguments[1]) {
                if (sibling.matches(arguments[1])) break;
                sibling = sibling.nextElementSibling; }
            return sibling;''', refEl, sel)
    except Exception as e:
        if printError: print(f'Error finding next "{sel}":', repr(e))
## [bs4] extract text or attribute from a tag inside tagSoup ##
def selectGet(tagSoup, selector='', ta='', defaultVal=None):
    el = tagSoup.select_one(selector) if selector else tagSoup
    if el is None: return defaultVal
    return el.get(ta, defaultVal) if ta else el.get_text(' ', strip=True)
## parse product page html and extract product details ##
def getProductDetails(prodPgHtml: str, prodUrl=None):
    pSoup = BeautifulSoup(prodPgHtml.encode('utf-8'), 'lxml')
    detsDiv = pSoup.select_one('div.detail-item')
    detKeys = ['category_id', 'is_bulky', 'is_bumped',
               'is_free_shipping_allowed', 'item_id', 'item_uuid',
               'main_image_thumbnail', 'mine', 'sell_price',
               'seller_user_id', 'subcategory_id', 'title']
    ## copy the data-* attributes of the detail container
    ## [removeprefix needs Python 3.9+; lstrip('data-') would mangle 'title' into 'itle']
    pDets = {} if detsDiv is None else {
        k.removeprefix('data-').replace('-', '_'): v
        for k, v in sorted(detsDiv.attrs.items(), key=lambda x: x[0])
        if k.removeprefix('data-').replace('-', '_') in detKeys
    }
    pDets['description'] = selectGet(pSoup, 'div.card-product-detail-top>p')
    pDets['date_posted'] = selectGet(pSoup, 'div[class$="published"]')
    pDets['views_count'] = selectGet(pSoup, 'i.ico-eye+span')
    pDets['likes_count'] = selectGet(pSoup, 'i.ico-coounter_favourites+span')
    pDets['seller_name'] = selectGet(pSoup, 'h2.card-user-detail-name')
    uLink = selectGet(pSoup, 'a.card-user-right[href]', 'href')
    if uLink: pDets['seller_link'] = urljoin(prodUrl, uLink)
    ### EXTRACT ANY OTHER DETAILS YOU WANT ###
    pDets['product_link'] = prodUrl
    return pDets
you can loop through the cards on the results page, clicking on each one to open it in a new tab and scrape the product details:
kSearch, maxItems = 'monitor', 1500  ## adjust as preferred
url = f'https://es.wallapop.com/app/search?keywords={"+".join(kSearch.split())}'
url = f'{url}&filters_source=search_box&latitude=39.46895&longitude=-0.37686'

browser = webdriver.Chrome()
browser.get(url)
browser.maximize_window()
scrollClick(browser, 'button[id="onetrust-accept-btn-handler"]')  ## accept cookies
scrollClick(browser, 'tsl-button[id="btn-load-more"]')  ## load more [then ∞-scroll]

itemCt, scrapedLinks, products = 0, [], []  ## initiate
itemSel, nextItem = 'a.ItemCardList__item[title]', None
try: nextItem = browser.find_element(By.CSS_SELECTOR, itemSel)  ## first card
except Exception as e: print('No items found:', repr(e))

while nextItem:
    itemCt += 1  # counter
    cpHtml, cpTxt = '', ''  # clear/initiate
    resultsTab = browser.current_window_handle  # to go back
    try:  # click card -> open new tab -> scrape product details
        cpHtml, cpTxt = nextItem.get_attribute('outerHTML'), nextItem.text
        scrollClick(browser, nextItem)  ## click current card
        # add wait ?
        browser.switch_to.window(browser.window_handles[1])  ## go to 2nd tab
        WebDriverWait(browser, 5).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.detail-item')))  ## wait to load details
        pLink = browser.current_url  ## product URL
        if pLink not in scrapedLinks:  # skip duplicates [just in case]
            products.append(getProductDetails(browser.page_source, pLink))
            scrapedLinks.append(pLink)
    except Exception as e:
        print('!', [itemCt], ' '.join(cpTxt.split()), repr(e))  ## print error
        pSoup = BeautifulSoup(cpHtml.encode('utf-8'), 'lxml')
        products.append({
            'title': selectGet(pSoup, 'a[title]', 'title'),  # title attr of the card link
            'price': selectGet(pSoup, 'span.ItemCard__price'),
            'errorMsg': f'{type(e)} {e}'
        })  ## [ make do with info in card ]
    try:  # close all tabs other than results tab
        for w in browser.window_handles:
            if w != resultsTab:
                browser.switch_to.window(w)
                browser.close()
        browser.switch_to.window(resultsTab)
    except Exception as e:
        print('Failed to restore results-tab-only window:', repr(e))
        break
    # print('', end=f"\r[{itemCt} of {maxItems}] {' '.join(cpTxt.split())} {repr(e)}")
    if isinstance(maxItems, int):
        if maxItems < itemCt: break
    nextItem = selectNextSib(browser, nextItem, itemSel)  # get next result card
Some notes:
The second scrollClick call is for the "Load More" button; it only needs to be clicked once, and after that more results load as you scroll down.
scrapedLinks doesn't seem to be necessary, as there don't appear to be any duplicates to filter out. [It's just a habit, since some sites do have duplicates, especially if there's pagination...]
With maxItems set to None [or a really high number], it should keep scrolling and scraping until either it runs out of result cards or an error breaks the loop.
products should be a list of dictionaries, each containing details about a different product.
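Each entry should look something like this sketch (keys as produced by getProductDetails; the values are just placeholders, not real data):

# illustrative shape only - placeholder values
{
    'item_id': '...', 'item_uuid': '...', 'category_id': '...', 'subcategory_id': '...',
    'title': '...', 'sell_price': '...', 'main_image_thumbnail': '...',
    'is_bulky': '...', 'is_bumped': '...', 'is_free_shipping_allowed': '...', 'mine': '...',
    'seller_user_id': '...', 'description': '...', 'date_posted': '...',
    'views_count': '...', 'likes_count': '...', 'seller_name': '...',
    'seller_link': '...', 'product_link': '...'
}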
Btw, you can also save the results with something like pandas' .to_csv:
# import pandas as pd
pd.DataFrame(products).to_csv('monitors.csv', index=False)
You can use their Ajax API to download the information about the items (one of the fields is the URL).
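A minimal sketch of that idea with requests, reusing the two endpoints shown in the Scrapy answer above (extra headers may be needed if the API rejects plain requests):

import requests

search_url = ("https://api.wallapop.com/api/v3/general/search"
              "?keywords=monitor&latitude=39.46895&longitude=-0.37686"
              "&filters_source=search_box&start=0&items_count=40")
search_data = requests.get(search_url).json()
for result in search_data["search_objects"]:
    # the per-item endpoint returns share_url, i.e. the product URL
    item_data = requests.get("https://api.wallapop.com/api/v3/items/" + result["id"]).json()
    print(item_data["share_url"])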
EDIT: To get the products from the next pages, you can add the start= parameter to the URL.
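For example, a sketch building on the search URL above (stepping start by 40 to match items_count is an assumption):

# paginate by increasing the start offset
base_url = ("https://api.wallapop.com/api/v3/general/search"
            "?keywords=monitor&latitude=39.46895&longitude=-0.37686"
            "&filters_source=search_box&items_count=40")
for start in range(0, 200, 40):
    page_url = f"{base_url}&start={start}"
    # fetch page_url (with requests or scrapy.Request) and parse "search_objects" as before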