Could you please help me? I've been trying to scrape this webpage: https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_propiedad=1%2C2&selectedTool=list#10/-33.4379/-70.6505 And it's been a total pain.
The webpage's URL doesn't change when clicking and interacting with the elements, and there's no real info in the DOM HTML body, so my strategy is to interact with the page directly. When I click the img element of each property, a new tab opens with the info I need to scrape. So I was thinking of opening the page, starting on results page 1, clicking all the elements, and parsing all the new tabs (for now, as you'll see in the code, I'm just getting the URLs, then closing each tab as it is parsed). Then I click the button with the class "paginator-btn-right" to get the next results and iterate.
The code I'm currently working on is here:
import scrapy
from scrapy_playwright.page import PageMethod
class GoplaceItSpider(scrapy.Spider):
    """Spider for goplaceit.com map listings.

    The site renders its results client-side, so we drive a Playwright
    page: wait for the listing container, click each property thumbnail
    (each click opens a new tab), harvest the tab URLs, and close the
    tabs as they are consumed.
    """

    name = "goplaceit"
    allowed_domains = ["goplaceit.com"]
    start_urls = [
        "https://www.goplaceit.com/cl/mapa?id_modalidad=1&tipo_propiedad=1%2C2&selectedTool=list"
    ]

    # Wait until the listing container is attached before parsing.
    method_initial_wait_for_selector = PageMethod(
        "wait_for_selector", "div#gpi-property-list-container", timeout=10000
    )

    def start_requests(self):
        # BUG FIX: callback/errback are keyword arguments of
        # scrapy.Request, NOT meta keys. Placed inside meta they were
        # silently ignored, so the custom errback never ran and Scrapy
        # fell back to the default callback resolution.
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            errback=self.errback,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[self.method_initial_wait_for_selector],
            ),
        )

    async def parse(self, response):
        """Click every property thumbnail and yield the URL of each tab it opens."""
        page = response.meta["playwright_page"]
        try:
            elements = await page.query_selector_all(
                "div#gpi-property-list-container > div:nth-child(3) div > div:nth-child(1) img"
            )
            # Each click opens the property detail in a new tab.
            # NOTE(review): clicks open tabs asynchronously — if tabs are
            # missed, wait for the context "page" event per click instead.
            for element in elements:
                await element.click()

            # BUG FIX: BrowserContext.pages is a property (a plain list),
            # not a coroutine — `await page.context.pages()` raised
            # TypeError. Also skip the main results page so we don't
            # close it, and read the URL BEFORE closing the tab.
            for new_tab in page.context.pages:
                if new_tab is page:
                    continue
                url = new_tab.url
                await new_tab.close()
                yield {"url": url}
        finally:
            # Always release the Playwright page so it doesn't leak.
            await page.close()

    async def errback(self, failure):
        """On request failure, close the Playwright page if one was attached."""
        # A failure before the page is created leaves no page in meta;
        # .get() avoids a secondary KeyError inside the error handler.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()