I am trying to learn JS/Puppeteer by building a simple web scraper to scrape book info for educational purposes. I am trying to get the web scraper to fill UPC numbers from a CSV file into the search bar of a book website. I managed to get the web scraper to scrape the website if I use a single UPC number.
But I have a CSV with a list of UPCs and would love for the web scraper:
- to read the CSV file,
- grab the UPC from first line,
- search for the UPC on website,
- scrape the information,
- grab the UPC from 2nd line,
- repeat 3, 4
Sample CSV:
DATE,QUANTITY,NAME,CODECONTENT,CODETYPE
2021-10-13 20:16:44 +1100,1,"Book 1","9781250035288",9
2021-10-13 20:16:40 +1100,1,"Book 2","9781847245601",9
2021-10-13 20:16:35 +1100,1,"Book 3","9780007149247",9
2021-10-13 20:16:30 +1100,1,"Book 4","9780749958084",9
2021-10-13 20:16:26 +1100,1,"Book 5","9781405920384",9
This is my code so far. I am stuck at the async function for the CSV parser, where it's giving me an undefined result when I do a
console.log(allupcs);
Plus I am not sure how to get the
await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');
to accept the UPCs
See code below:
const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');
/**
 * Reads Book_Bulk.csv and collects the CODECONTENT column (the UPC) of every row.
 *
 * The stream is wrapped in a Promise so the result can be awaited. Without
 * that, the function returns before any 'data' event has fired and the caller
 * only ever sees `undefined`.
 *
 * @returns {Promise<string[]>} Resolves with the UPCs in file order once the
 *   whole file has been parsed; rejects if the file cannot be read or parsed.
 */
function getupcs() {
  return new Promise((resolve, reject) => {
    const upcData = [];
    const input = fs.createReadStream('Book_Bulk.csv');

    // pipe() does not forward errors, so an unreadable file has to be
    // handled on the source stream itself.
    input.on('error', reject);

    input
      // The sample file is comma separated, which is the parser's default.
      .pipe(parse())
      .on('data', (csvrow) => {
        // Keep the UPC as text: page.type() needs a string, and a numeric
        // conversion would drop leading zeros.
        upcData.push(csvrow.CODECONTENT);
      })
      .on('end', () => {
        console.log(upcData);
        resolve(upcData);
      })
      .on('error', reject);
  });
}
/**
 * Opens bookdepository.com in a visible browser, searches for one hard-coded
 * UPC and logs the book's title, author, genre, format, publisher, year and
 * price.
 *
 * @returns {Promise<void>} Settles after the values are logged and the browser
 *   has been closed; rejects when a required selector never appears.
 */
async function main() {
  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized'] });
  try {
    const page = await browser.newPage();

    // Waits for `selector` to appear, then returns the innerText of its first match.
    const readText = async (selector, options) => {
      await page.waitForSelector(selector, options);
      return page.$eval(selector, (el) => el.innerText);
    };

    await page.goto('https://www.bookdepository.com/');
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', '9781509847556');
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

    // Title
    await page.waitForSelector('.item-info h1');
    const title = await page.$eval('.item-info h1', (h1) => h1.textContent);
    // Author
    const author = await readText('div.author-info.hidden-md > span > a > span');
    // Genre
    const genre = await readText('.active a');
    // Format
    const format = await readText('.item-info li');
    // Publisher
    const publisher = await readText('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');
    // Year: only the last four characters of the date text are kept.
    const year = await readText('div.biblio-wrap > div > ul > li:nth-child(3) > span');
    const newyear = year.slice(-4);

    // Price: prefer the sale price, then the first price span, then the list
    // price. These are tried one after another; putting the sale-price lookup
    // in a `finally` block would make it run unconditionally and fail on every
    // book that has no sale price.
    const optionalPriceSelectors = [
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
    ];
    let price;
    for (const selector of optionalPriceSelectors) {
      try {
        price = await readText(selector, { timeout: 1000 });
        break;
      } catch {
        // This price element is not on the page; fall through to the next one.
      }
    }
    if (price === undefined) {
      // Last resort; a timeout here propagates to the caller.
      price = await readText('p.list-price');
    }
    const newprice = price.slice(-6);

    console.log(title);
    console.log(author);
    console.log(genre);
    console.log(format);
    console.log(publisher);
    console.log(newyear);
    console.log(newprice);
  } finally {
    // Always release the browser, even when a selector times out.
    await browser.close();
  }
}
// Report a failed run explicitly instead of leaving the promise rejection unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Updated: with code from Answer
const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');
/**
 * Searches bookdepository.com for one UPC and scrapes the resulting product page.
 *
 * @param {object} page - Puppeteer page to drive; it is navigated to the site's
 *   home page first, so whatever it currently shows is discarded.
 * @param {string|number} upc - Code typed into the site's search box. Numbers
 *   are converted to text because page.type() only accepts a string.
 * @returns {Promise<{title: string, author: string, genre: string, format: string, publisher: string, year: string, price: string}>}
 *   The scraped fields. `year` is the last 4 characters of the publication-date
 *   text and `price` is the last 6 characters of the price text.
 * @throws Rejects when a required selector never appears on the page.
 */
async function getpageData(page, upc) {
  // Waits for `selector` to appear, then returns the innerText of its first match.
  const readText = async (selector, options) => {
    await page.waitForSelector(selector, options);
    return page.$eval(selector, (el) => el.innerText);
  };

  await page.goto('https://www.bookdepository.com/');
  await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', String(upc));
  await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

  // Title
  await page.waitForSelector('.item-info h1');
  const title = await page.$eval('.item-info h1', (h1) => h1.textContent);
  // Author
  const author = await readText('div.author-info.hidden-md > span > a > span');
  // Genre
  const genre = await readText('.active a');
  // Format
  const format = await readText('.item-info li');
  // Publisher
  const publisher = await readText('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');
  // Year: only the last four characters of the date text are kept.
  const year = await readText('div.biblio-wrap > div > ul > li:nth-child(3) > span');
  const newyear = year.slice(-4);

  // Price: prefer the sale price, then the first price span, then the list
  // price. These are tried one after another; putting the sale-price lookup in
  // a `finally` block would make it run unconditionally and reject for every
  // book that has no sale price.
  const optionalPriceSelectors = [
    'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
    'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
  ];
  let price;
  for (const selector of optionalPriceSelectors) {
    try {
      price = await readText(selector, { timeout: 1000 });
      break;
    } catch {
      // This price element is not on the page; fall through to the next one.
    }
  }
  if (price === undefined) {
    // Last resort; a timeout here propagates to the caller.
    price = await readText('p.list-price');
  }
  const newprice = price.slice(-6);

  return {
    title: title,
    author: author,
    genre: genre,
    format: format,
    publisher: publisher,
    year: newyear,
    price: newprice,
  };
}
/**
 * Reads a CSV file and resolves with the CODECONTENT value (the UPC) of each row.
 *
 * @param {string} filename - Path of the CSV file to read.
 * @param {string} [delimiter=','] - Column separator, handed to the parser as
 *   its `separator` option.
 * @param {string} [encoding='utf-8'] - Text encoding used to read the file.
 * @returns {Promise<string[]>} UPCs in file order, kept as strings so leading
 *   zeros survive and the values can be typed into a page as-is. Rows without
 *   a CODECONTENT value are skipped.
 */
function readCsvAsync(filename, delimiter = ',', encoding = 'utf-8') {
  return new Promise((resolve, reject) => {
    const rows = [];
    // A synchronous throw in here (e.g. invalid arguments) rejects the promise
    // automatically, so no surrounding try/catch is needed.
    const input = fs.createReadStream(filename, { encoding: encoding });

    // pipe() does not forward errors, so an unreadable file has to be
    // handled on the source stream itself.
    input.on('error', reject);

    input
      .pipe(parse({ separator: delimiter }))
      .on('data', (row) => {
        const upc = typeof row.CODECONTENT === 'string' ? row.CODECONTENT.trim() : '';
        if (upc !== '') {
          rows.push(upc);
        }
      })
      .on('end', () => resolve(rows))
      .on('error', reject);
  });
}

/**
 * Loads the UPC list from Book_Bulk.csv.
 *
 * @returns {Promise<string[]>} The UPCs, or an empty array when the file could
 *   not be read (the error is logged), so callers can always iterate the result.
 */
async function upcData() {
  try {
    // The sample file is comma separated, so the default delimiter is used.
    return await readCsvAsync('Book_Bulk.csv');
  } catch (err) {
    console.log(err);
    return [];
  }
}
/**
 * Loads every UPC from the CSV file, scrapes each one in turn with a single
 * browser page and logs the collected results.
 *
 * @returns {Promise<void>} Settles after the results are logged and the browser
 *   has been closed; rejects if scraping any UPC fails.
 */
async function main() {
  // Fall back to an empty list so the loop below never iterates `undefined`.
  const allupcs = (await upcData()) || [];
  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized'] });
  try {
    const page = await browser.newPage();
    const scrapedData = [];
    // Sequential on purpose: every lookup reuses the same page.
    for (const upc of allupcs) {
      const data = await getpageData(page, upc);
      scrapedData.push(data);
    }
    console.log(scrapedData);
  } finally {
    // Always release the browser, even when a lookup fails part-way through.
    await browser.close();
  }
}
// Report a failed run explicitly instead of leaving the promise rejection unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
As you have noticed, the CSV parser is asynchronous. "asynchronous" means you can't do this:
I've outlined the order of execution. The last
console.log()
runs immediately after you set up the read stream. upcData
will not contain anything at this point. But it will contain data at point #10, and #5 etc. will fill it.
That means: Whatever you want to do with
upcData
, do it inside the 'end'
event handler. Since the CSV reader will give you one row per
data
event, you can also do things directly in the data
event handler and not build an upcData
array at all. If you want to be able to
await
the whole thing, you must turn it into a promise first. In this case again the relevant step (promise resolution) happens in the end
callback: