Can't get text from a div

51 Views Asked by At

I want to get the content of the div mw-content-text from some Wikipedia pages (this is just an example to learn Node.js). I have written this:

var fetch       = require('node-fetch');
var cheerio = require('cheerio');
var fs = require('fs');
// Wikipedia pages whose main content text should be extracted.
var vv = [
'https://en.wikipedia.org/wiki/Ben_Silbermann',
'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];
// bo collects the extracted text of each page; $ holds the most recently loaded cheerio document.
var bo=[],
    $;

// Start one request per URL; each .then callback runs later, once its response has arrived.
vv.forEach((t)=>{
 fetch(t)
  .then(res => res.text())
  .then((body) => {
    // Parse the HTML and read the text of the element with id "mw-content-text".
    $ = cheerio.load(body);
    var finded = $('#mw-content-text').text();
    bo.push(finded);
 });
});
// NOTE: this line executes right after the requests are started, before any .then callback has pushed into bo.
console.log(bo);

If I output body, it contains a string with the whole HTML page (so this step is OK). If I output $, it contains a collection, but I'm not sure whether it's populated. I'm using the Node.js command prompt, but it looks like it's not the right tool for inspecting it; any advice on that too?

Anyway, the variable bo ends up as an empty array.

1

There is 1 best solution below

2
On BEST ANSWER

The issue here is that we're logging bo before the fetch call is complete. I'd suggest using the async/await syntax to ensure we wait for all the gets to return, then we can log the result.

You could follow with some more processing like removing empty lines, whitespace etc, but that shouldn't be too hard.

var fetch   = require('node-fetch');
var cheerio = require('cheerio');

// Wikipedia pages to download; getDivcontent() reads this list.
var vv = [
    'https://en.wikipedia.org/wiki/Ben_Silbermann',
    'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];


/**
 * Downloads each page and extracts the text of one element per page.
 *
 * All requests are started together and awaited with Promise.all, so the
 * resolved array is in the same order as `urls`.
 *
 * @param {string[]} [urls=vv] - Page URLs to download; defaults to the module-level list.
 * @param {string} [selector='#mw-content-text'] - Selector handed to cheerio for the element to read.
 * @returns {Promise<string[]>} The text of the selected element for each URL.
 * @throws {Error} If any request fails or answers with a non-OK HTTP status.
 */
async function getDivcontent(urls = vv, selector = '#mw-content-text') {
    const promises = urls.map(async (url) => {
        const res = await fetch(url);
        // fetch only rejects on network failures; without this check an HTTP
        // error page would be parsed as if it were the requested article.
        if (!res.ok) {
            throw new Error(`Request for ${url} failed with status ${res.status}`);
        }
        const body = await res.text();
        const $ = cheerio.load(body);
        return $(selector).text();
    });
    return Promise.all(promises);
}

/**
 * Waits for getDivcontent() to finish and prints what it resolved to,
 * prefixed with "Result:", as a single console line.
 */
async function test() {
    const texts = await getDivcontent();
    // The + operator coerces the resolved value to a string for the log line.
    const message = "Result:" + texts;
    console.log(message);
}

// Run the demo; test() is async, so this call returns a promise that is not awaited here.
test();