I have an array of 262 links within a site. I am using Phantom in a NodeJS server to crawl each of these pages to create HTML snapshots. I am finding after the 222nd page, phantom crashes with the following error:
phantom stderr: 2015-06-10 11:44:20.159 phantomjs[4474:58792] Critical failure: the LastResort font is unavailable.
This has happened twice now, and is a real pain as it takes quite a while for it to crawl this many pages. For the record, when I had fewer pages, I never came across this issue, and for the first 222 pages there are no errors, so I don't think the issue is with my current code. However I will happily post some if anyone thinks it will be useful to help me solve this problem.
My initial assumptions are it is either a bug in phantom, something to do with a maximum number of pages allowed to be crawled, or a setting I can/should add to my instances of phantom.
I found someone with the same problem which didn't get solved here: https://groups.google.com/forum/#!topic/casperjs/KpDisQL7wxs
The code looks like this:
// Recursively crawl each URL in `arr`, saving an HTML snapshot of each page.
// A fresh PhantomJS instance is created per page and is now exited on EVERY
// path — including page-load failure — and always BEFORE recursing, so at
// most one headless process is alive at a time. (The original leaked the
// instance on failure and overlapped instances on success, which is the
// likely cause of the crash after ~222 pages.)
//
// @param {string[]} arr - list of page URLs to snapshot
// @param {number} [idx] - current index into `arr`; omit to start at 0
var crawlPage = function(arr, idx){
    // start/end crawl
    // --------------------------------
    if (!idx) idx = 0;
    if (idx >= arr.length) return;
    else if (idx === 0) console.log("Starting page crawl");
    // visit page
    // --------------------------------
    phantom.create(function(ph){
        ph.createPage(function(page){
            page.open(arr[idx], function(status){
                // error accessing page: release this phantom instance (it was
                // leaked before) and keep crawling the remaining pages instead
                // of halting the whole run
                if (status !== "success") {
                    console.log("Unable to access page");
                    ph.exit();
                    return crawlPage(arr, idx + 1);
                }
                // wait 5s for the page to render before snapshotting
                setTimeout(function(){
                    // serialize the rendered DOM inside the page context
                    page.evaluate(function(){
                        return document.all[0].outerHTML;
                    }, function(snapshotHTML){
                        // strip script tags so the saved snapshot is inert
                        snapshotHTML = snapshotHTML.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "");
                        // save snapshot to file
                        saveSnapshot(arr[idx], snapshotHTML);
                        // exit this phantom BEFORE recursing so instances
                        // never overlap
                        ph.exit();
                        // crawl next page
                        crawlPage(arr, idx + 1);
                    });
                }, 5000);
            });
        });
    });
};
// Full list of site URLs to snapshot, then kick off the crawl at index 0.
var siteLinks = [/*array of 262 links*/];
crawlPage(siteLinks, 0);