Headless browser not scraping as much data (debugging)

47 Views Asked by At

I am trying to figure out the exact reason why my headless program retrieves less data than the graphical one.

You can find the repository here and to run the code you must have a TikTok account. This is because if you load cookies into the browser it gets rid of popups and makes the program easier to write.

Once cloned, run node cookieLoader.js and sign in to your TikTok account, then press enter; after that you can run the main program.

Then try this command (headless by default)

node index.js -m undertimeslopper

If the repository no longer exists or you don't want to clone it you can follow these code snippets instead. This snippet will create your tiktok cookies once you login to the site and press enter.

const readline = require('readline');
const { exit } = require("process");
const fs = require('fs');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

// Apply the stealth plugin to avoid being detected as a bot
puppeteer.use(StealthPlugin());

/**
 * Cookie capture script.
 *
 * Launches a visible (non-headless) browser on https://www.tiktok.com, waits
 * for the user to press enter on stdin, then writes the page's cookies to
 * ./cookies.json (pretty-printed JSON) and exits the process.
 */
(async () => {
  // Named `rl` so it does not shadow the module-level `readline` import.
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  const browser = await puppeteer.launch({ headless: false });

  // Open a new page
  const page = await browser.newPage();

  // Navigate to your desired URL
  await page.goto('https://www.tiktok.com');

  rl.question(`Press enter button to save your cookies\n`, async () => {
    rl.close();
    try {
      const cookies = await page.cookies();
      console.log(cookies);
      // writeFileSync is synchronous; there is nothing to await.
      fs.writeFileSync('./cookies.json', JSON.stringify(cookies, null, 2));
    } catch (err) {
      console.error(err);
      process.exitCode = 1;
    } finally {
      // Shut the browser down cleanly instead of killing it via exit().
      await browser.close().catch((err) => console.error(err));
    }
    exit();
  });
})().catch((err) => {
  // Launch/navigation failures would otherwise be unhandled rejections.
  console.error(err);
  exit(1);
});

Then you can run the actual program with this snippet.

const chalk = require("chalk");
const fs = require("fs");
const puppeteer = require("puppeteer");
const { exit } = require("process");
const path = require("path");


/**
 * Loads the cookies stored in `cookies.json` (next to this script) into the
 * given page.
 *
 * @param {object} page - Page object exposing `setCookie(...cookies)`.
 * @returns {Promise<void>} Resolves once the cookies have been set.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
const loadCookie = async (page) => {
    // readFileSync is synchronous, so it is not awaited; read as text so
    // JSON.parse receives a string rather than a Buffer.
    const cookieJson = fs.readFileSync(path.join(__dirname, 'cookies.json'), 'utf8');
    const cookies = JSON.parse(cookieJson);
    await page.setCookie(...cookies);
};

/**
 * Builds the TikTok profile URL for a username.
 *
 * If the username already contains an "@" it is appended as-is; otherwise an
 * "@" is inserted in front of it.
 *
 * @param {string} username - TikTok handle, with or without "@".
 * @returns {string} The profile URL.
 */
const generateUrlProfile = (username) => {
    const siteRoot = "https://www.tiktok.com/";
    const handle = username.includes("@") ? username : `@${username}`;
    return `${siteRoot}${handle}`;
};

/**
 * Collects the post URLs listed on a TikTok profile page.
 *
 * Opens the profile in Puppeteer with the saved cookies, then repeatedly
 * scrolls to the bottom of the page and gathers every anchor whose href
 * contains "/video/" or "/photo/" until the page height stops growing for
 * 10 seconds.
 *
 * @param {string} username - TikTok handle, with or without "@".
 * @param {object} [options]
 * @param {boolean} [options.headless=true] - Passed to `puppeteer.launch`;
 *   set to false to watch the browser while debugging.
 * @returns {Promise<string[]>} De-duplicated post URLs, with "photo"
 *   replaced by "video" in each URL.
 */
const getListVideoByUsername = async (username, { headless = true } = {}) => {
    // The original called an undefined `sleep`; the resulting ReferenceError
    // was swallowed by an empty catch, so the settle delay and the "Refresh"
    // click below never actually ran.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    const baseUrl = generateUrlProfile(username);

    const browser = await puppeteer.launch({ headless });

    try {
        const page = await browser.newPage();

        // Skip image downloads; only the anchors' hrefs are needed.
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            if (request.resourceType() === 'image') request.abort();
            else request.continue();
        });

        await loadCookie(page);
        // Awaited so the user agent is in place before navigation starts.
        await page.setUserAgent(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4182.0 Safari/537.36"
        );
        await page.goto(baseUrl).catch((err) => {
            console.error(err);
            exit();
        });
        await page.keyboard.press('Escape');
        const delay_milliseconds = 3000 + 500;
        const delay_after_load = 1000;

        await page.keyboard.press('Escape');

        try {
            await sleep(delay_milliseconds);

            // Click the "Refresh" button if the page shows one.
            const xpathSelector = "//button[contains(text(),'Refresh')]";
            await page.evaluate((xpath) => {
                const xpathResult = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                const element = xpathResult.singleNodeValue;
                if (element) {
                    element.click();
                }
            }, xpathSelector);

            await sleep(delay_after_load);
        } catch (error) {
            // Best effort: a failed refresh click should not abort the scrape,
            // but it should be visible when debugging.
            console.error(error);
        }

        await page.keyboard.press('Escape');
        let listVideo = [];
        console.log(chalk.green("[*] Getting list video from: " + username));
        let loop = true;

        while (loop) {
            listVideo = await page.evaluate(() => {
                const hrefs = Array.from(document.querySelectorAll('a'), (item) => item.href)
                    .filter((href) => href.includes('/video/') || href.includes('/photo/'));
                // De-duplicate first, then normalise photo URLs (same order as before).
                return [...new Set(hrefs)].map((item) => item.replace('photo', 'video'));
            });

            console.log(chalk.green(`[*] ${listVideo.length} video found`));
            // Declared locally; the original leaked this as an implicit global.
            const previousHeight = await page.evaluate("document.body.scrollHeight").catch(() => undefined);
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)").catch(() => undefined);

            // A timeout here means scrolling loaded nothing new: stop looping.
            await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`, { timeout: 10000 })
                .catch(() => {
                    console.log(chalk.red("[X] No more video found"));
                    console.log(chalk.green(`[*] Total video found: ${listVideo.length}`));
                    loop = false;
                });
            await sleep(1000);
        }
        return listVideo;
    } finally {
        // Always release the browser, even when a step above throws.
        await browser.close();
    }
};

// Entry point: run the scraper and report failures instead of leaving the
// returned promise floating.
(async () => {
    try {
        await getListVideoByUsername('undertimeslopper'); // or any valid tiktok username
    } catch (err) {
        console.error(err);
        process.exitCode = 1;
    }
})();

The output on my machine is

[*] Getting list video from: undertimeslopper
[*] 35 video found
[*] 69 video found
[*] 69 video found
[X] No more video found

but after I go to line 5 in getListVideoByUsername and change headless: true to headless: false the output is

[*] Getting list video from: undertimeslopper
[*] 35 video found
[*] 69 video found
[*] 102 video found
[*] 137 video found
[*] 158 video found
[X] No more video found

As we can observe, the graphical program performed as intended, scraping all of the user's videos, whilst the headless one only got 69.

This is the core of the problem: I intend to run this script headlessly on a server, and if I can't get all the videos it's worthless.

You don't have to run the code to help me. Essentially I am just looking for ways to debug and see what a headless browser is doing but I included the instructions and output as supplementary information.

1

There is 1 best solution below

0
jgore200377 On

This issue was caused either by a problem in the puppeteer library's cache or simply by the version of the library. I upgraded puppeteer, changing the dependency version from

"puppeteer": "^13.7.0",

to

"puppeteer": "^22.5.0",

The issue was resolved and the program ran as intended.