I am trying to scrape the content of a page using the Apify SDK, and the following code works nicely. But how can I force headless mode with the Apify SDK, as I would with puppeteer.launch({headless: true})?
Code for your reference:
/**
 * Crawls pages starting at the hard-coded start URL and collects, for every
 * page handled, its URL, its title, and the length of its HTML content.
 *
 * NOTE: this sets `process.env.APIFY_LOCAL_STORAGE_DIR`, which is process-wide
 * state; overlapping calls with different `number` values will overwrite each
 * other's setting.
 *
 * @param {string|number} number - Run identifier; used as the suffix of the
 *   local storage directory and passed as-is to `Apify.openRequestQueue`.
 * @returns {Promise<{links: string[], title: string[], content: number[]}>}
 *   Parallel arrays: index i of each array describes the i-th handled page.
 */
async function scrape(number) {
  const output = { links: [], title: [], content: [] };
  const URL = "https://somepage/";

  process.env.APIFY_LOCAL_STORAGE_DIR = `/someappfolder/apify_storage/run_${number}`;

  const requestQueue = await Apify.openRequestQueue(number);
  await requestQueue.addRequest({ url: URL });

  // Only links that extend the start URL are enqueued.
  const pseudoUrls = [new Apify.PseudoUrl(URL + "[.*]")];

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    handlePageFunction: async ({ request, page }) => {
      // Query the page once per value; page.content() serialises the whole
      // document, so calling it repeatedly is wasted work.
      const title = await page.title();
      const html = await page.content();
      const save = { url: request.url, title, content: html.length };

      output.links.push(save.url);
      output.title.push(save.title);
      output.content.push(save.content);

      // Hook point: `save` is the per-page record to forward to an external
      // sink (the original code had a disabled `sendToAirtable(save)` call here).
      console.log(`URL: ${request.url}`);

      await Apify.utils.enqueueLinks({
        page,
        selector: 'a',
        pseudoUrls,
        requestQueue,
      });
    },
    maxRequestsPerCrawl: 10,
    maxConcurrency: 10,
    minConcurrency: 2,
  });

  await crawler.run();
  return output;
}
Add
launchPuppeteerOptions: { headless: true }
on the same level as requestQueue
https://sdk.apify.com/docs/typedefs/launch-puppeteer-options#docsNav
EDIT: This no longer works. The latest docs are at https://crawlee.dev/api/browser-crawler/interface/BrowserLaunchContext#launchOptions