How to configure AWS Lambda to work with Puppeteer in detail, as I'm a beginner in Lambda as well as Puppeteer

290 Views Asked by At

My package.json:

"dependencies": {
"chrome-aws-lambda": "^10.1.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-core": "^10.1.0"

}

I'm unable to load the full site: it stops on loading and gives a null error. Configuration (AWS Lambda):

  1. Nodejs - 14.0
  2. Memory - 2000MB
  3. Ephemeral storage - 1024MB. I don't know what I have done wrong with the configuration of Lambda. I uploaded the code to S3; the main function is below:

Code :

    const pupeteerExtra = require('puppeteer-extra');
    const pupeteerExtraPluginStealth = require('puppeteer-extra-plugin-stealth');
    const { DOMParser } = require('domino');
    const chromium = require('chrome-aws-lambda');

    /**
     * Polls the page until the length of `page.content()` stops changing.
     *
     * The page counts as rendered once the content length is non-zero and has
     * been identical to the previous poll for three polls in a row. Polling
     * happens once per second and stops after `timeout / 1000` polls even if
     * the size never settles.
     *
     * @param {object} page - Puppeteer page; `content`, `evaluate` and
     *   `waitForTimeout` are called on it.
     * @param {number} [timeout=30000] - Overall polling budget in milliseconds.
     * @returns {Promise<void>} Resolves when the size is stable or the budget is spent.
     */
    const waitTillHTMLRendered = async (page, timeout = 30000) => {
        const pollIntervalMs = 1000;
        const requiredStablePolls = 3;
        const maxPolls = timeout / pollIntervalMs;

        let previousSize = 0;
        let stablePolls = 0;

        for (let poll = 1; poll <= maxPolls; poll += 1) {
            const markup = await page.content();
            const currentSize = markup.length;

            // The body size is only reported in the log; it does not take part in the decision.
            const bodySize = await page.evaluate(() => document.body.innerHTML.length);

            console.log('last: ', previousSize, ' <> curr: ', currentSize, " body html size: ", bodySize);

            // A previous size of 0 means there is nothing to compare against yet.
            const isUnchanged = previousSize !== 0 && currentSize === previousSize;
            stablePolls = isUnchanged ? stablePolls + 1 : 0;

            if (stablePolls >= requiredStablePolls) {
                console.log("Page rendered fully..");
                return;
            }

            previousSize = currentSize;
            await page.waitForTimeout(pollIntervalMs);
        }
    };


    /**
     * Dismisses the optional dialog (best effort) and scrapes the flight rows
     * from the page.
     *
     * The scrape now always runs. Previously it only ran when waiting for the
     * dialog threw, so a successfully dismissed dialog produced `undefined`,
     * and a missing `h1` or cabins container raised a "cannot read property of
     * null" error inside the page.
     *
     * @param {object} page - Puppeteer page; `waitForSelector` and `evaluate`
     *   are called on it.
     * @param {number} [dialogTimeoutMs=5000] - How long to wait for the dialog
     *   before scraping anyway. Passed explicitly so the wait stays bounded
     *   even when the page's default timeout is disabled.
     * @returns {Promise<string|false>} `false` when the page heading reads
     *   'No flights available'; otherwise a JSON string of
     *   `{ classCabin: string[], scrapedData: object[] }`.
     */
    async function getData(page, dialogTimeoutMs = 5000) {
        try {
            const cancel = await page.waitForSelector('#mat-dialog-title-0 > span', { timeout: dialogTimeoutMs });
            await cancel.click();
        } catch (error) {
            // The dialog is optional: a failure here must not prevent scraping.
            console.log('Dialog not dismissed:', error.message);
        }

        const data = await page.evaluate(() => {
            const notAvailable = 'No flights available';

            // Defined inside the callback on purpose: only this function body is
            // handed to page.evaluate, so helpers from the outer scope cannot be used.
            const textOf = (node, fallback) => (node ? node.textContent.trim() : fallback);

            const head = textOf(document.querySelector('h1'), '');
            if (head === notAvailable) {
                return false;
            }

            const classCabin = Array.from(
                document.querySelectorAll('.cabin-heading'),
                (heading) => heading.textContent.trim()
            );

            const rows = Array.from(document.querySelectorAll('.upsell-row.stop-over.ng-star-inserted'));

            const scrapedData = rows.map((row) => {
                // Query the live container directly instead of re-parsing its outerHTML.
                const cabinsContainer = row.querySelector('.cabins-container.ng-star-inserted');
                const cabinCells = cabinsContainer
                    ? Array.from(cabinsContainer.querySelectorAll('kilo-cabin-cell-pres'))
                    : [];

                const data = cabinCells.map((cabin) => ({
                    seatLeft: textOf(cabin.querySelector('.seat-text.ng-star-inserted'), ''),
                    points: textOf(cabin.querySelector('.points-total'), ''),
                    cash: textOf(cabin.querySelector('.remaining-cash'), ''),
                    mixedCabin: textOf(cabin.querySelector('.mixed-cabin.good.ng-star-inserted'), ''),
                }));

                const operatingAirlineElement = row.querySelector('.operating-airline-icon');

                return {
                    departureTime: textOf(row.querySelector('.departure-time'), 'Not available'),
                    arrivalTime: textOf(row.querySelector('.arrival-time'), 'Not available'),
                    duration: textOf(row.querySelector('.flight-summary.ng-star-inserted'), 'Not available'),
                    layover: Array.from(
                        row.querySelectorAll('.connection-time.mat-caption.ng-star-inserted'),
                        (layoverElement) => layoverElement.textContent.trim()
                    ),
                    operatingAirline: operatingAirlineElement
                        ? operatingAirlineElement.getAttribute('alt')
                        : 'Not available',
                    specificClass: textOf(row.querySelector('.cabin-text'), 'Not available'),
                    data,
                };
            });

            return JSON.stringify({ classCabin, scrapedData });
        });

        console.log(data);
        return data;
    }


     exports.handler = async (event, context, callback) => {
    const url = 'https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=INT';
    try {
        console.log(chromium.headless)
        pupeteerExtra.use(pupeteerExtraPluginStealth());
        // const args = chromium.args.filter(item => item !== '--use-gl=swiftshader');
        // console.log(args)
        const browser = await pupeteerExtra.launch({
             args: [
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-setuid-sandbox',
            '--no-first-run',
            '--no-sandbox',
            '--no-zygote',
            '--single-process', // <- this one doesn't works in Windows
        ],
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: true,
            ignoreHTTPSErrors: true,
        });
        const page = await browser.newPage();
        page.setDefaultNavigationTimeout(0);
        page.setDefaultTimeout(0);
        await page.goto(url, { waitUntil: 'networkidle0' });
       await waitTillHTMLRendered(page)
       const pageTitle =  await getData(page);
       console.log('vande ma')
        return pageTitle
        // return callback(null, pageTitle);
    } catch (error) {
        console.log('Error at test.js:', error.message)
    }
};

 


T

I have tried layers, but I think I have done something wrong. Layer 1 - all node modules of the required packages in package.json, excluding domino. Layer 2 - domino node modules and all files. Layer - main handler file. This is the link I'm trying to scrape to get flight details: https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=IN

0

There are 0 best solutions below