I am trying to resolve an issue with my Firebase scheduled Cloud Functions, and Firebase support advised me to check:

if the function in question maintains a persistent connection

I referred to the docs here, but the examples are HTTPS functions, i.e. functions that take req and res params. The function I am dealing with is a scheduled function, i.e.:

functions.pubsub.schedule('every 5 minutes').onRun((context) => {//some work})

Without req and res params, how can I check if the firebase schedule cloud function maintains a persistent connection?

EDIT:

I have received a reply from Firebase support, and they advised me to check for persistent connections on the HTTP requests that go out from my scheduled functions.

My scheduled function performs web scraping tasks by scraping pdf URLs, and then extracting plain text from these pdf documents. The issue I am facing is that these tasks always crash with "Bad end offset" error which I can never catch. Firebase support suspects that the connection isn't persistent when pdfjs reaches out to the pdf's URL.

Error: Bad end offset: 1376305
    at ChunkedStream.onReceiveData (/workspace/node_modules/pdfjs-dist/es5/build/pdf.worker.js:12255:15)
    at ChunkedStreamManager.onReceiveData (/workspace/node_modules/pdfjs-dist/es5/build/pdf.worker.js:12818:21)
    at /workspace/node_modules/pdfjs-dist/es5/build/pdf.worker.js:12602:15
    at process._tickCallback (internal/process/next_tick.js:68:7)

enter image description here

Code:

const pdf = require('../pdf/pdf')

/**
 * Extract the text of one fixed, hard-coded PDF.
 *
 * @returns {Promise<undefined>} Whatever `extract` returns for the options built from the URL.
 */
function getTextByUrl() {
    const pdfUrl = "https://www.nea.gov.sg/docs/default-source/our-services/amendments-to-copeh-for-the-removal-of-roof-gutters-for-a-amp-a-or-reconstruction-works-involving-roof-structures.pdf"

    return extract(pdf.setupPdfOptions(pdfUrl))
}

/**
 * Run the plain-text extraction for the given options and log the outcome.
 *
 * Both the success and the failure path only log, so the returned promise
 * always resolves to undefined.
 *
 * @param {Object} options - Options forwarded to `pdf.getPlainBody`.
 * @returns {Promise<undefined>}
 */
function extract(options) {
    const reportDone = (plainBody) => {
        console.log("Text extraction done:", plainBody)
    }
    const reportFailure = (error) => {
        console.log("Err:", error)
    }

    return pdf.getPlainBody(options).then(reportDone).catch(reportFailure)
}

At the pdf.js script:

const pdfjslib = require('pdfjs-dist/es5/build/pdf.js');

/**
 * Load a PDF and resolve to the text of all its pages.
 *
 * Any error is logged rather than re-thrown, in which case the promise
 * resolves to undefined.
 *
 * @param {Object} options - Options forwarded to `getDocument`.
 * @returns {Promise<string|undefined>}
 */
function getPlainBody(options) {
    console.log("1")

    const extractAllPages = (loadedDoc) => {
        console.log("4")
        return extractTexts(loadedDoc, loadedDoc.numPages)
    }
    const reportFailure = (err) => console.log('Get plainBody err:', err)

    return getDocument(options).then(extractAllPages).catch(reportFailure)
}

/**
 * Start a pdf.js loading task and resolve to the loaded document.
 *
 * A rejected loading task is logged and swallowed, so the returned promise
 * then resolves to undefined.
 *
 * @param {Object} options - Options passed straight to `pdfjslib.getDocument`.
 * @returns {Promise<Object|undefined>}
 */
function getDocument(options) {
    console.log("2")

    const onLoaded = (loadedDoc) => {
        console.log("3") //<== Not logged and crashed
        return loadedDoc
    }
    const onFailed = (err) => console.log("getDocument err:", err) //<== Didn't catch the "Bad end offset" error

    return pdfjslib.getDocument(options).promise.then(onLoaded).catch(onFailed)
}

/**
 * Extract the text of pages 1..maxNumPages and merge it into one string.
 *
 * All pages are requested concurrently via `getText`. The per-page strings
 * are concatenated in page order, runs of two or more whitespace characters
 * are collapsed to a single space, and the result is trimmed.
 *
 * @param {Object} doc - Loaded document, passed through to `getText`.
 * @param {number} maxNumPages - Number of pages to read, starting at page 1.
 * @returns {Promise<string|undefined>} The merged text, or undefined if
 *   combining the pages failed (the error is logged).
 */
function extractTexts(doc, maxNumPages) {
    const promises = []
    // `let` keeps the counter local. The previously undeclared `pageNum`
    // created an implicit global, which throws a ReferenceError in strict
    // mode / ES modules and is shared between overlapping calls otherwise.
    for (let pageNum = 1; pageNum <= maxNumPages; pageNum++) {
        promises.push(getText(pageNum, doc))
    }

    return Promise.all(promises)
    .then((results) => {
        // join() renders undefined/null entries as empty strings, so a page
        // that yielded no text simply contributes nothing.
        return results.join('').replace(/\s{2,}/g,' ').trim();
    })
    .catch((err) => {
        console.log('Error extracting text', err)
    })
}


/**
 * Resolve to the concatenated text of a single page.
 *
 * On any failure the error is logged and the promise resolves to undefined.
 *
 * @param {number} pageNum - Page number passed to `doc.getPage`.
 * @param {Object} doc - Document object exposing `getPage`.
 * @returns {Promise<string|undefined>}
 */
function getText(pageNum, doc) {
    console.log("5")

    const loadTextContent = (page) => page.getTextContent()
    const joinItemStrings = (content) => content.items.map((item) => item.str).join('')
    const reportFailure = (err) => {
        console.log('Error getting text', err)
    }

    return doc.getPage(pageNum)
    .then(loadTextContent)
    .then(joinItemStrings)
    .catch(reportFailure)
}

/**
 * Build the options object used to load a PDF: the document URL plus a
 * custom User-Agent request header.
 *
 * @param {string} url - URL of the PDF to load.
 * @returns {Object} Options with `url` and `httpHeaders` properties.
 */
function setupPdfOptions(url) {
    return {
        url: url,
        httpHeaders: {
            // NOTE: the header value is redacted in this snippet; a real string
            // must replace the placeholder comment below for this literal to parse.
            "User-Agent": //My useragent,
        },
    };
}

I attempted to implement agent = new http.Agent({keepAlive: true}); at setupPdfOptions but it still crashes.

What can I do to maintain a persistent connection between pdfjs and the pdf's URL?

I am running Node engine 10, and "pdfjs-dist": "^2.4.456".

0

There are 0 best solutions below