JS PDF Parsing: How do I extract the contents of PDF and automatically display it on the text fields?

50 Views Asked by At

I am working on our thesis project, which has an autofill feature similar to the one job-application websites offer when you upload your resume. The problem is that when I select a PDF file other than the one I used for testing, the information doesn't show up in the text fields, although I can see in the console that the text was extracted. Is there any other way I can capture and display the information? I have already tried different regex patterns.

Here is my full JS code:

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.min.js"></script>
<script>
// pdf.js (loaded by the <script> tag above) exposes its API under this global key.
// The typeof guard keeps the pure helpers below loadable outside a browser (e.g. in tests).
const pdfjsLib = typeof window !== 'undefined' ? window['pdfjs-dist/build/pdf'] : undefined;

/**
 * Collect the text of the first pages of a PDF.
 *
 * Text items of a page are joined with '\n', and pages are joined with '\n' as well.
 *
 * @param {{numPages: number, getPage: function(number): Promise}} pdf - Loaded pdf.js document.
 * @param {number} [maxPages=5] - Upper bound on how many pages are read.
 * @returns {Promise<string>} Concatenated text of the pages that were read.
 */
function extractPdfText(pdf, maxPages = 5) {
    // Never ask for a page the document does not have: a rejected getPage() would
    // reject the whole Promise.all and no field would be filled for short PDFs.
    const lastPage = Math.min(maxPages, pdf.numPages);
    const pagePromises = [];

    for (let pageNumber = 1; pageNumber <= lastPage; pageNumber++) {
        pagePromises.push(
            pdf.getPage(pageNumber)
                .then(page => page.getTextContent())
                .then(textContent => textContent.items.map(item => item.str).join('\n'))
        );
    }

    return Promise.all(pagePromises).then(pagesText => pagesText.join('\n'));
}

/**
 * Run the metadata patterns against the extracted text.
 *
 * NOTE(review): the patterns look tailored to one document layout (e.g. the title
 * is expected right after a line ending in "ARCHITECTURE"); PDFs laid out
 * differently will yield null for the affected fields — confirm the intended inputs.
 *
 * @param {string} text - Text produced by extractPdfText.
 * @returns {{title: ?string, author: ?string, year: ?string, abstract: ?string, adviser: ?string}}
 *     One entry per form field; null when the corresponding pattern did not match.
 */
function extractThesisFields(text) {
    const titleRegex = /ARCHITECTURE\n([\s\S]+?)\nA/;
    const authorRegex = /(By:|By|by:|by)\s*([\s\S]+?)\s*(January|February|March|April|May|June|July|August|September|October|November|December) \d{4}/;
    const yearRegex = /(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})/;
    const abstractRegex = /ABSTRACT\n([\s\S]+?)\nPage/;
    const adviserRegex = /(\w+)\s+Adviser/; // Capture word before "Adviser"

    const titleMatch = text.match(titleRegex);
    const authorMatch = text.match(authorRegex);
    const yearMatch = text.match(yearRegex);
    const abstractMatch = text.match(abstractRegex);
    const adviserMatch = text.match(adviserRegex);

    return {
        title: titleMatch ? titleMatch[1].trim() : null,
        author: authorMatch ? authorMatch[2].trim() : null, // group 1 is the "By" prefix
        year: yearMatch ? yearMatch[2] : null,              // group 1 is the month name
        abstract: abstractMatch ? abstractMatch[1].trim() : null,
        adviser: adviserMatch ? adviserMatch[1].trim() : null,
    };
}

/**
 * Copy every extracted value into the form control whose id equals the field name.
 * Fields without a value are left untouched so existing input is not wiped.
 *
 * @param {Object<string, ?string>} fields - Result of extractThesisFields.
 */
function fillFormFields(fields) {
    Object.keys(fields).forEach(id => {
        const value = fields[id];
        if (value === null) {
            return;
        }
        const element = document.getElementById(id);
        if (!element) {
            console.warn('No form field with id "' + id + '" was found.');
            return;
        }
        element.value = value;
    });
}

/**
 * 'change' handler of the file input: reads the chosen PDF, extracts its text and
 * fills the form fields with whatever metadata the patterns find.
 *
 * @param {Event} event - Change event fired by the file input.
 */
function handlePdfSelected(event) {
    const file = event.target.files[0];
    if (!file) {
        // The file dialog was cancelled; readAsArrayBuffer(undefined) would throw.
        return;
    }

    const fileReader = new FileReader();

    fileReader.onerror = () => {
        console.error('Could not read the selected file:', fileReader.error);
    };

    fileReader.onload = () => {
        const typedarray = new Uint8Array(fileReader.result);

        pdfjsLib.getDocument(typedarray).promise
            .then(pdf => extractPdfText(pdf, 5))
            .then(text => {
                console.log("Extracted text:", text);

                const fields = extractThesisFields(text);

                console.log("Title match:", fields.title);
                console.log("Author match:", fields.author);
                console.log("Year match:", fields.year);
                console.log("Abstract match:", fields.abstract);
                console.log("Adviser match:", fields.adviser);

                fillFormFields(fields);
            })
            .catch(error => {
                // Surface load/parse failures instead of leaving an unhandled rejection.
                console.error('Failed to extract text from the PDF:', error);
            });
    };

    fileReader.readAsArrayBuffer(file);
}

// Browser wiring. Skipped when there is no DOM; reports missing prerequisites
// instead of throwing a TypeError on an undefined library or element.
if (typeof document !== 'undefined') {
    const fileInput = document.getElementById('fileInput');

    if (!pdfjsLib) {
        console.error('pdf.js is not loaded; PDF autofill is disabled.');
    } else if (!fileInput) {
        console.error('No element with id "fileInput" was found; PDF autofill is disabled.');
    } else {
        // Set the location of the worker script
        pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.worker.min.js';

        fileInput.addEventListener('change', handlePdfSelected);
    }
}
</script>

0

There are 0 best solutions below