How to add server-side delay to Javascript for loop?


I'm fiddling around with using Node.js to scrape data from an e-commerce site. I use Request to retrieve the DOM of the page and Cheerio to do server-side DOM selection.

const cheerio = require('cheerio');
const request = require('request');

// takes a URL, scrapes the page, and returns an object with the data
let scrapePage = (url) => {

    return new Promise((resolve, reject) => {

        request(url, (error, resp, body) => {

            if(error){
                reject(error);
            };

            let $ = cheerio.load(body); 
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        });

    });

};

// Runs scrapePage in a loop
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs

for( let i = 0; i < arrayOfURLs.length; i++){
    scrapePage(arrayOfURLs[i])
        .then((obj) => {
            //write to a file
        })
        .catch((error) => {
        })
};

The problem is that the server I send requests to sometimes sends back blank data, I'm assuming because I'm sending too many requests without any kind of pause. Due to the async nature of JS, I'm having a hard time figuring out how to add an effective delay between each iteration of the loop. It's not enough to just add a setTimeout in a synchronous fashion, because setTimeout is itself async, and I'm running this on the server so there's no window object.

EDIT

The code above is a simplified version of what I'm working on. The entire code is this:

app.js

const fs = require('fs');
const path = 'urls.txt';
const path2 = 'results.txt';
const scraper = require('./scraper');

let scrapePage = (url) => {
    scraper.scrapePage(url)
        .then((obj) => {
            // console.log('obj from the scraper with Promises was received');
            // console.log(obj);
            // console.log('writing obj to a file');
            fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
                if(error){
                    console.log(error);
                } else {
                    // console.log('Successfully wrote to ' + path2);
                }
            })
        })
        .catch((error) => {
            console.log('There was an error scraping obj: ');
            console.log(error);
        })  
}

fs.readFile(path, 'utf8', (err, data) => {

  if (err){
    throw err;
  };

  var urlArray = JSON.parse(data);

  // this returns an Unexpected Identifier error    
  // const results = await Promise.all(urlArray.map(scrapePage));

  // this returns an Unexpected Token Function error
  // async function scrapePages(){
  //    const results = await Promise.all(urlArray.map(scrapePage));
  // };

});

scraper.js

const request = require('request');
const cheerio = require('cheerio');

exports.scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if(error){
                reject(error);
            };

            let $ = cheerio.load(body); 
            let $url = url;

            let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        })
    })
}

There are 3 answers below.

Answer 1 (9 votes)

Looks to me like you aren't waiting for your promises to resolve before you send the server response. You could eliminate the for loop entirely by using async / await, e.g.

const results = await Promise.all(arrayOfURLs.map(scrapePage));
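Note that await is only valid inside an async function, and async/await needs a Node version that supports it (which is likely why the commented-out attempts in the question's edit throw Unexpected Identifier / Unexpected Token errors). A minimal sketch of how the one-liner above could be wrapped — scrapeAll is just an illustrative name, not something from the original code:

// await must live inside an async function
async function scrapeAll(){
    const results = await Promise.all(arrayOfURLs.map(scrapePage));
    // results holds the resolved objects in the same order as arrayOfURLs
    return results;
}

scrapeAll()
    .then((results) => {
        // write results to a file
    })
    .catch((error) => {
        console.log(error);
    });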
Answer 2 (0 votes)

If you want no more than a certain number of active connections at a time, you could use throttle. If you want no more than a certain number of requests started per second, you could use throttlePeriod.

Promise.all will never call your resolve handler if even one request fails, so you could catch each request's error and resolve with a Fail object instead:

const Fail = function(details){ this.details = details; };
// throttle / throttlePeriod are the helper functions mentioned above (not defined in this snippet)
const max10 = throttle(10)(scrapePage); // max 10 active connections
// const twoPerSecond = throttlePeriod(2, 1000)(scrapePage); // start no more than 2 per second
Promise.all(
  arrayOfURLs.map(
    url =>
      max10(url)
        .catch(err => new Fail([err, url])) // a failed scrape resolves to a Fail value
  )
)
.then(
  results => {
    const successes =
      results.filter(
        result => (result && result.constructor) !== Fail
      );
    const failed =
      results.filter(
        result => (result && result.constructor) === Fail
      );
  }
);
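throttle and throttlePeriod above are helpers referenced by this answer but not defined in the snippet. Purely for illustration, a hand-rolled limiter that keeps at most a fixed number of scrapes in flight could look roughly like this — limitConcurrency is a hypothetical name, not a library API:

// Run task(item) for every item, keeping at most `limit` promises in flight.
// Sketch only, using plain Promises and the Fail marker from above.
function limitConcurrency(items, limit, task){
    const results = [];
    let index = 0;

    function next(){
        if(index >= items.length) return Promise.resolve();
        const current = index++;
        return task(items[current])
            .then((value) => { results[current] = value; })
            .catch((err) => { results[current] = new Fail([err, items[current]]); })
            .then(next); // start the next item once this one settles
    }

    // launch `limit` workers that each pull from the shared index
    const workers = [];
    for(let i = 0; i < Math.min(limit, items.length); i++){
        workers.push(next());
    }
    return Promise.all(workers).then(() => results);
}

// usage: at most 10 scrapes active at any time
// limitConcurrency(arrayOfURLs, 10, scrapePage).then(results => { /* split into successes / failed as above */ });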
Answer 3 (1 vote)

const cheerio = require('cheerio');
const request = require('request');

let scrapePage = (url) => {

    return new Promise((resolve, reject) => {

        request(url, (error, resp, body) => {

            if(error){
                reject(error);
                return;
            }

            if(!body) {
                reject('Empty Body');
                return;
            }

            let $ = cheerio.load(body);

            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        });

    });
};

function processUrl(url){
    scrapePage(url)
        .then((obj) => {
            // write to a file
            if(arrayOfURLs.length > 0)
                processUrl(arrayOfURLs.pop());
        })
        .catch((error) => {
            // the failed URL goes back into the queue to be retried later
            arrayOfURLs.unshift(url);
            if(arrayOfURLs.length > 0)  // put this in a finally block
                processUrl(arrayOfURLs.pop());
        });
}
processUrl(arrayOfURLs.pop());

Here we use the arrayOfURLs array as a queue: if we get an error or a blank page, we push the URL back onto the array so it gets retried. That way every URL is processed one at a time, in sequence.
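If the server still objects to back-to-back requests, you can also insert a fixed pause between iterations by wrapping setTimeout in a Promise (setTimeout exists in Node, no window object required). This is only a sketch on top of the recursive approach above; the delay helper and the 1000 ms value are illustrative, not part of the original answer:

// Promise-based pause built on setTimeout, which Node provides without a window object
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

function processUrl(url){
    scrapePage(url)
        .then((obj) => {
            // write to a file here
        })
        .catch((error) => {
            arrayOfURLs.unshift(url); // retry the failed URL later
        })
        .then(() => delay(1000)) // wait 1 second before starting the next request
        .then(() => {
            if(arrayOfURLs.length > 0)
                processUrl(arrayOfURLs.pop());
        });
}

processUrl(arrayOfURLs.pop());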