WARN PuppeteerCrawler: Reclaiming failed request back to the list or queue. waiting for selector .show-more-less-html__markup
failed: timeout 30000ms exceeded {"id":"nWhJU8Vn62Xzbr3","url":"https://www.somesite.com/view/result_search-card","retryCount":1}
import { RequestQueue, PuppeteerCrawler, ProxyConfiguration } from "crawlee";
// Tracks which search-results page is currently being scraped (1-based).
let currentPage = 1;

/**
 * Scrapes a job board: list pages (no label) yield job cards, and each card's
 * link is re-enqueued with label "JD" so the detail page's full description
 * can be extracted. Results are accumulated in `jobsData` and written to
 * `resultFile` when the crawl finishes.
 *
 * NOTE(review): `proxy`, `proxies`, `jobsData`, `queryword`, `location`,
 * `pages`, `url`, `getURL`, and `resultFile` are assumed to be defined
 * elsewhere in this module — confirm they are in scope.
 */
(async () => {
    const requestQueue = await RequestQueue.open();

    const crawler = new PuppeteerCrawler({
        requestQueue,
        headless: true,
        maxRequestsPerMinute: 20,
        maxConcurrency: 5,
        proxyConfiguration: proxy ? new ProxyConfiguration({ proxyUrls: proxies }) : undefined,
        preNavigationHooks: [
            async ({ request }, gotoOptions) => {
                // BUG FIX: the original forced "domcontentloaded" for EVERY page.
                // Job-detail ("JD") pages render `.show-more-less-html__markup`
                // after the initial DOM event, which is why waitForSelector was
                // timing out there. Wait for the network to settle on detail
                // pages; list pages only need the DOM.
                gotoOptions.waitUntil = request.label === "JD" ? "networkidle2" : "domcontentloaded";
            },
        ],
        async requestHandler({ page, request }) {
            console.log("Requesting: " + request.url);

            if (request.label === "JD") {
                // --- Job-detail page: grab the full description. ---
                console.log("JOB: " + request.url);
                const resultsSelector = '.show-more-less-html__markup';
                // Give slow (possibly proxied) detail pages more headroom than
                // the 30 s default that was being exceeded.
                await page.waitForSelector(resultsSelector, { timeout: 60000 });

                const jobDetails = await page.evaluate((selector, userData) => {
                    const markup = document.querySelector(selector);
                    return {
                        ...userData,
                        // Guard against a null match instead of throwing.
                        description: markup ? markup.innerHTML.trim() : "",
                        hiringOrganization: {
                            name: userData.hiringOrganization.name,
                            description: ""
                        },
                        type: "",
                        industry: ""
                    };
                }, resultsSelector, request.userData);

                jobsData.push(jobDetails);
            } else {
                // --- Search-results page: collect one card per <li>. ---
                const resultsSelector = 'li';
                await page.waitForSelector(resultsSelector);

                const jobsList = await page.evaluate((selector, query, city) => {
                    // BUG FIX: the original read `element.textcontent`
                    // (lowercase "c"), which is always undefined — the DOM
                    // property is `textContent`.
                    const getElementText = (element) =>
                        element ? (element.textContent || element.innerText || "") : "";

                    return [...document.querySelectorAll(selector)].map((item, index) => {
                        const title = item.querySelector(".base-search-card__title");
                        const subtitle = item.querySelector(".base-search-card__subtitle");
                        const loc = item.querySelector(".job-search-card__location");
                        const listdate = item.querySelector(".job-search-card__listdate");
                        const newlistdate = item.querySelector(".job-search-card__listdate--new");
                        const link = item.querySelector(".base-card__full-link");
                        return {
                            index,
                            title: getElementText(title),
                            subtitle: getElementText(subtitle),
                            description: "",
                            location: getElementText(loc),
                            listdate: getElementText(listdate) || getElementText(newlistdate),
                            link: link && link.href ? link.href : "",
                            queryword: query,
                            hiringOrganization: {
                                name: getElementText(subtitle),
                                description: ""
                            },
                            type: "",
                            industry: "",
                            city
                        };
                    });
                }, resultsSelector, queryword, location);

                console.log("[info]: Found", jobsList.length, "jobs on page", currentPage);

                // Testing with one particular link only; guard against an
                // empty list page, which previously crashed on jobsList[0].
                if (jobsList.length > 0 && jobsList[0].link) {
                    await requestQueue.addRequests([
                        { url: jobsList[0].link, label: "JD", userData: jobsList[0] },
                    ]);
                }

                if (Number(currentPage) < Number(pages)) {
                    console.log("[info] Getting page " + ++currentPage);
                }
            }
        }
    });

    console.log('[info]: Starting Crawler ' + url);

    // Build one list-page URL per requested page; fall back to page 1.
    let urls = [];
    for (let index = 0; index < pages; index++) {
        urls.push(getURL(index + 1));
    }
    if (urls.length < 1) {
        urls = [getURL(1)];
    }

    await crawler.run(urls);

    console.log("Saving JOBs Data");
    writeFileSync(resultFile, JSON.stringify(jobsData, null, 4));
})();
In the code above, the `else` block works perfectly as expected, but the `if (request.label === "JD")` block does not. A list of URLs is passed to `await crawler.run(urls);`. Each of those list pages contains a further link, which is collected via `link: link && link.href ? link.href : ""`. That URL is then enqueued with `await requestQueue.addRequests([{ url: jobsList[0].link, label: "JD", userData: jobsList[0] }]);`. The crawler should navigate to that page and evaluate it in the `if` block as expected, but the `if` block never succeeds (it times out waiting for the selector). Please give me some hints.
"crawlee": "^3.0.4",
"puppeteer": "^15.1.1",