How can I add a delay between requests in Promise.map using cheerio and Node.js?

270 Views Asked by At

I have following code:

The urls array has 5000 different URLs. When I try to fetch and scrape these URLs, I run into a 500 error, so I decided to add some delay between each request. I added {concurrency: 1}, but nothing changed.

const requestPromise = require('request-promise');
const Promise = require('bluebird');
const cheerio = require('cheerio');

// Build the list of page URLs to scrape:
// https://mywebsite.com/1 through https://mywebsite.com/250.
// NOTE(review): `urls` and `p` are not declared in this snippet — presumably
// declared earlier in the asker's file; confirm.
for (var i=1; i<=250; i++)
{

p="https://mywebsite.com/" + i.toString() 

 // i starts at 1, so i-1 is the zero-based slot in urls.
 urls[i-1]= p 

   
}

// Request every URL, then extract the text of each `.txtSearch1` element per page
// and write the accumulated results to data.json.
// `Promise` is Bluebird here (see the require at the top of the snippet).
// No options object is passed to this first Promise.map call, so no concurrency
// limit applies to the requests themselves.
Promise.map(urls, requestPromise)
  // Second mapping stage: receives each page's HTML together with its index in urls.
  // The {concurrency: 1} option at the end of this call applies to this stage only,
  // not to the requests issued above.
  .map((htmlOnePage, index) => {


    const $ = cheerio.load(htmlOnePage);

    // Collect the text of every element matching `.txtSearch1`, with line breaks removed.
    $('.txtSearch1').each(function () { 
        var h="";
        h=$(this).text()
        h= h.replace(/(\r\n|\n|\r)/gm, "")

        html44.push (h)


})
    // Store this page's texts under its URL, then reset the buffer for the next page.
    // NOTE(review): `html44`, `shareTuple` and `fs` are not declared in this snippet —
    // presumably defined earlier in the asker's file; confirm.
    shareTuple[urls[index]] = html44;
    html44=[]


   // Rewrites the whole data.json file synchronously after every processed page.
   fs.writeFileSync( "data.json", JSON.stringify(  shareTuple ) )
    
     
  }, {concurrency: 1})
  
  // No fulfilment handler is passed here.
  .then ()
  // Logs the failure; the message does not include which URL failed.
  .catch((e) => console.log('We encountered an error' + e));
 

How can I add some random delay between each request here? I have to use my own code, so I need a solution that is a modification of my code.

Update:

I learned from the answers, but one point remains in this question: how can I detect which URL causes the 500 error and skip it? How can I find out which URL ran into the 500 error?

2

There are 2 best solutions below

0
derpirscher On BEST ANSWER

You seem to have a bit of a problem with which parameters you are passing to what function. Currently you do as follows

Promise.map(urls, requestPromise)
  .map((htmlOnePage, index) => { ...}, { concurrency: 1})
  .then(...)

which has multiple issues, so I'm quite wondering how that would even run without throwing Syntax errors ...

  1. you are not passing your options { concurrency: 1} to Promise.map but to the latter Array.map (where they are ignored)

  2. Promise.map does return a Promise, which does not have a .map()

  3. Array.map does not return a promise, so you can't call then() on it ...

  4. You are (synchronously) writing into the very same data.json file for each of the returned values. You probably want to go through the results first and then just write the file once everything is finished.

The proper code would be something like this

import { promises as fs } from "fs"; //provides promise based fs operations

// Request the URLs with the concurrency option passed to Promise.map itself,
// then process all responses and write data.json a single time.
// `Promise.map` with an options argument is Bluebird's API, not native Promise.
Promise.map(urls, requestPromise, { concurrency: 1})
  // `values` holds the resolved results of all requests.
  .then(values => {
    values.map((htmlOnePage, index) => {
      const $ = cheerio.load(htmlOnePage);
      // The `...` below is a placeholder for the per-page extraction code
      // (the part that computes `h`); it is not valid JavaScript as written.
      ...
      html44.push (h)
    })
    // NOTE(review): `html44` is not declared in this snippet — presumably an
    // array defined by the caller; confirm.
    let sharetuple = html44;
    // `fs` is the promise-based API imported above, so writeFile returns a
    // promise that the chain waits on.
    return fs.writeFile("data.json", JSON.stringify(sharetuple));
  })
  .catch((e) => console.log('We encountered an error' + e));

I don't know, whether cheerio is something async as well. I suppose not. If yes you have to handle that accordingly ...

EDIT

If you still think, you need a delay you can add it as follows (but I think, you should address the issue on the backend, if you have access to it)

/**
 * Waits 100 ms and then issues the request for the given URL.
 *
 * @param {string} url - Address handed to requestPromise once the pause is over.
 * @returns {Promise<*>} Settles with whatever requestPromise(url) settles with.
 */
function delayedRequest(url) {
  // Fixed pause before the request is started.
  const pause = new Promise((resolve) => {
    setTimeout(resolve, 100);
  });
  return pause.then(() => requestPromise(url));
}

and then call

// Same flow as the previous snippet, but each request goes through
// delayedRequest, which waits before calling requestPromise.
// `Promise.map` with an options argument is Bluebird's API, not native Promise.
Promise.map(urls, delayedRequest, { concurrency: 1})
  // `values` holds the resolved results of all requests.
  .then(values => {
    values.map((htmlOnePage, index) => {
      const $ = cheerio.load(htmlOnePage);
      // The `...` below is a placeholder for the per-page extraction code
      // (the part that computes `h`); it is not valid JavaScript as written.
      ...
      html44.push (h)
    })
    // NOTE(review): `html44` is not declared in this snippet — presumably an
    // array defined by the caller; confirm.
    let sharetuple = html44;
    // Assumes `fs` is the promise-based fs API, so the chain waits for the write.
    return fs.writeFile("data.json", JSON.stringify(sharetuple));
  })
  .catch((e) => console.log('We encountered an error' + e));

But you could also ditch Bluebird completely and do it with the built-in async await of JS

/**
 * Fetches the given URLs one after another, pausing 100 ms before each
 * request, collects the text of every `.txtSearch1` element and writes the
 * collected texts to data.json once all pages have been processed.
 *
 * Relies on names from the surrounding file: `requestPromise`, `cheerio`,
 * the promise-based `fs`, and the `html44` array the texts are pushed into.
 *
 * @param {Iterable<string>} urls - Page addresses to fetch, in order.
 * @returns {Promise<void>} Resolves after data.json has been written; rejects
 *   with the first request or write error.
 */
async function scraper(urls) {
  for (const u of urls) {
    // Fixed pause so the server is not hit with back-to-back requests.
    await new Promise((resolve) => setTimeout(resolve, 100));
    // Use the loop variable `u`; there is no `url` binding in this function.
    const res = await requestPromise(u);
    // Extraction step, mirroring the question's code: text of each
    // `.txtSearch1` element with line breaks stripped.
    const $ = cheerio.load(res);
    $(".txtSearch1").each(function () {
      const h = $(this).text().replace(/(\r\n|\n|\r)/gm, "");
      html44.push(h);
    });
  }
  await fs.writeFile("data.json", JSON.stringify(html44));
}
4
edvard chen On

What your second .map call does is wait until all requests, which are sent in parallel, have resolved, and then do another round of mapping with your HTML-processing callback.

Although I think derpirscher's suggestion should work, I give mine here

// Fetch, parse and persist one URL at a time. The delay is part of the mapper's
// returned promise, so with concurrency 1 the next request only starts after
// the 5 s pause has elapsed.
// `Promise.map` with a concurrency option is Bluebird's API, not native Promise.
Promise.map(
  urls,
  (url) => {
    return requestPromise(url).then((htmlOnePage) => {
      const $ = cheerio.load(htmlOnePage);
      // Per-page buffer: text of every `.txtSearch1` element, line breaks removed.
      const html44 = [];
      $(".txtSearch1").each(function () {
        const h = $(this).text().replace(/(\r\n|\n|\r)/gm, "");
        html44.push(h);
      });
      // Key the results by URL, as the question's code does. Assigning the
      // array to shareTuple itself would discard all earlier pages, leaving
      // data.json with only the most recent page.
      // NOTE(review): assumes `shareTuple` is an object declared by the caller.
      shareTuple[url] = html44;
      // Written after every page, so progress survives a later failure.
      fs.writeFileSync("data.json", JSON.stringify(shareTuple));
      // delay 5s
      return new Promise((resolve) => setTimeout(resolve, 5e3));
    });
  },
  {
    concurrency: 1,
  }
).catch((e) => console.log("We encountered an error" + e));