My package.json:
"dependencies": {
"chrome-aws-lambda": "^10.1.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-core": "^10.1.0"
}
I'm unable to load the full site: it stops while loading and gives a null error. Configuration (AWS Lambda):
- Nodejs - 14.0
- Memory - 2000MB
- Ephemeral storage - 1024MB
I don't know what I have done wrong with the Lambda configuration. I uploaded the code to S3 and then used it in the main function:
Code :
const pupeteerExtra = require('puppeteer-extra');
const pupeteerExtraPluginStealth = require('puppeteer-extra-plugin-stealth');
const { DOMParser } = require('domino');
const chromium = require('chrome-aws-lambda');
/**
 * Polls the page until the length of its serialized HTML stops changing.
 *
 * The page is sampled once per poll interval (1000 ms). Rendering is treated
 * as finished once the HTML length has matched the previous sample for three
 * consecutive polls. A length of 0 never counts as stable, so an empty page
 * is polled until the budget runs out. When the budget is exhausted the
 * function simply returns; it does not throw.
 *
 * @param {object} page - Puppeteer page; `content`, `evaluate` and
 *   `waitForTimeout` are called on it.
 * @param {number} [timeout=30000] - Total polling budget in milliseconds.
 * @returns {Promise<void>}
 */
const waitTillHTMLRendered = async (page, timeout = 30000) => {
  const pollIntervalMs = 1000;
  const requiredStablePolls = 3;
  const pollLimit = timeout / pollIntervalMs;

  let previousSize = 0;
  let stablePolls = 0;

  for (let poll = 1; poll <= pollLimit; poll += 1) {
    const markup = await page.content();
    const currentSize = markup.length;
    // The body size is only reported in the log line; it does not drive the loop.
    const bodySize = await page.evaluate(() => document.body.innerHTML.length);
    console.log('last: ', previousSize, ' <> curr: ', currentSize, " body html size: ", bodySize);

    const isUnchanged = previousSize !== 0 && currentSize === previousSize;
    stablePolls = isUnchanged ? stablePolls + 1 : 0; // any change restarts the streak

    if (stablePolls >= requiredStablePolls) {
      console.log("Page rendered fully..");
      return;
    }

    previousSize = currentSize;
    await page.waitForTimeout(pollIntervalMs);
  }
};
/**
 * Extracts the flight results from the current document.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page: it must stay self-contained and may only use browser
 * globals (`document`), never variables from the Node.js module scope.
 *
 * @returns {string|false} `false` when the page heading reads
 *   'No flights available'; otherwise a JSON string of
 *   `{ classCabin: string[], scrapedData: object[] }`.
 */
function scrapeFlightResults() {
  const notAvailable = 'No flights available';
  // Trimmed text of a node, or `fallback` when the node was not found.
  const textOf = (node, fallback) => (node ? node.textContent.trim() : fallback);

  // A missing <h1> is treated as "heading is not the no-flights banner"
  // instead of throwing on `null.textContent`.
  if (textOf(document.querySelector('h1'), '') === notAvailable) {
    return false;
  }

  const classCabin = Array.from(document.querySelectorAll('.cabin-heading')).map((heading) =>
    textOf(heading, ''),
  );

  const scrapedData = [];
  const rows = Array.from(document.querySelectorAll('.upsell-row.stop-over.ng-star-inserted'));
  for (const row of rows) {
    const cabinsContainer = row.querySelector('.cabins-container.ng-star-inserted');
    if (!cabinsContainer) {
      // Rows without a cabins container are left out of the result. The
      // previous code reached the same outcome by accident (the error was
      // lost inside an async forEach callback).
      continue;
    }

    // Query the live container directly; serializing it to outerHTML and
    // re-parsing it with DOMParser yields the same cells.
    const data = Array.from(cabinsContainer.querySelectorAll('kilo-cabin-cell-pres')).map(
      (cabin) => ({
        seatLeft: textOf(cabin.querySelector('.seat-text.ng-star-inserted'), ''),
        points: textOf(cabin.querySelector('.points-total'), ''),
        cash: textOf(cabin.querySelector('.remaining-cash'), ''),
        mixedCabin: textOf(cabin.querySelector('.mixed-cabin.good.ng-star-inserted'), ''),
      }),
    );

    const operatingAirlineElement = row.querySelector('.operating-airline-icon');
    const layoverElements = Array.from(
      row.querySelectorAll('.connection-time.mat-caption.ng-star-inserted'),
    );

    scrapedData.push({
      departureTime: textOf(row.querySelector('.departure-time'), 'Not available'),
      arrivalTime: textOf(row.querySelector('.arrival-time'), 'Not available'),
      duration: textOf(row.querySelector('.flight-summary.ng-star-inserted'), 'Not available'),
      layover: layoverElements.map((layoverElement) => layoverElement.textContent.trim()),
      operatingAirline: operatingAirlineElement
        ? operatingAirlineElement.getAttribute('alt')
        : 'Not available',
      specificClass: textOf(row.querySelector('.cabin-text'), 'Not available'),
      data,
    });
  }

  return JSON.stringify({ classCabin, scrapedData });
}

/**
 * Scrapes flight availability from an already-loaded results page.
 *
 * First waits for the dialog title `#mat-dialog-title-0 > span`. If it shows
 * up it is clicked and nothing is scraped; if the wait (or the click) fails,
 * the results are scraped from the page.
 *
 * @param {object} page - Puppeteer page positioned on the results URL.
 * @param {number} [dialogTimeoutMs=30000] - Upper bound for the dialog wait.
 *   Passed explicitly so the wait stays bounded even when the caller has set
 *   `page.setDefaultTimeout(0)`, which would otherwise make it wait forever.
 * @returns {Promise<string|false|undefined>} JSON string of the scraped data,
 *   `false` when the page reports no flights, or `undefined` when the dialog
 *   appeared and was clicked.
 */
async function getData(page, dialogTimeoutMs = 30000) {
  try {
    const dialogTitle = await page.waitForSelector('#mat-dialog-title-0 > span', {
      timeout: dialogTimeoutMs,
    });
    await dialogTitle.click();
    // NOTE(review): when the dialog is present nothing is scraped and the
    // caller receives undefined. That matches the previous behaviour; confirm
    // whether the results should be scraped after dismissing the dialog.
    return undefined;
  } catch (error) {
    // Expected path when no dialog is shown within the timeout.
    console.log('Dialog not handled, scraping results:', error.message);
  }

  const data = await page.evaluate(scrapeFlightResults);
  console.log(data);
  return data;
}
exports.handler = async (event, context, callback) => {
const url = 'https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=INT';
try {
console.log(chromium.headless)
pupeteerExtra.use(pupeteerExtraPluginStealth());
// const args = chromium.args.filter(item => item !== '--use-gl=swiftshader');
// console.log(args)
const browser = await pupeteerExtra.launch({
args: [
'--disable-gpu',
'--disable-dev-shm-usage',
'--disable-setuid-sandbox',
'--no-first-run',
'--no-sandbox',
'--no-zygote',
'--single-process', // <- this one doesn't works in Windows
],
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: true,
ignoreHTTPSErrors: true,
});
const page = await browser.newPage();
page.setDefaultNavigationTimeout(0);
page.setDefaultTimeout(0);
await page.goto(url, { waitUntil: 'networkidle0' });
await waitTillHTMLRendered(page)
const pageTitle = await getData(page);
console.log('vande ma')
return pageTitle
// return callback(null, pageTitle);
} catch (error) {
console.log('Error at test.js:', error.message)
}
};
I have tried layers, but I think I have done something wrong:
- Layer 1 - all node modules of the required packages in package.json, excluding domino
- Layer 2 - domino node modules and all files
- Layer 3 - main handler file
This is the link I'm trying to scrape to get flight details: https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=INT