import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { WebPDFLoader } from 'langchain/document_loaders/web/pdf';
import * as pdfjs from 'pdfjs-dist';
///...code here handling file reading
const handleLoadPDF = async () => {
if (uploadedFile) {
const blob = new Blob([uploadedFile], { type: 'application/pdf' });
const loader = new WebPDFLoader(blob, {
pdfjs: () => import("pdfjs-dist/legacy/build/pdf.mjs"),
workerSrc: `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.js`,
});
try {
// Load the PDF document
const pdfDocument = await loader.load();
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000, // Adjust the chunk size as needed
chunkOverlap: 200, // Adjust the chunk overlap as needed
});
// Split the document into text chunks
const splitDocs = await textSplitter.splitDocuments(pdfDocument);
console.log({ splitDocs });
} catch (error) {
console.error('Error:', error);
// Handle errors as needed
}
}
};
pdf-W4I3FICS.js?v=815f6208:48416 Uncaught (in promise) Error: No PDFJS.workerSrc specified
at getWorkerSrc (pdf-W4I3FICS.js?v=815f6208:48416:23)
at PDFWorker2.PDFWorker_initialize [as _initialize] (pdf-W4I3FICS.js?v=815f6208:48476:38)
at new PDFWorker2 (pdf-W4I3FICS.js?v=815f6208:48454:22)
at getDocument (pdf-W4I3FICS.js?v=815f6208:47858:72)
at WebPDFLoader.load (langchain_document_loaders_web_pdf.js?v=815f6208:63:29)
at async loadPDF (Input.jsx:88:18)
Keep getting this error despite linking a workerSrc for PDFjs, tried this with sample code on Langchain's documentation and still get the same error. Does anyone know what I need to do? Thank you!
I've tried using sample code from a similar issue on GitHub, but this gave me the same error
https://github.com/langchain-ai/langchainjs/issues/3787