Dealing with PDFs containing both tables and non-tabular data using Camelot PDF parser

53 Views Asked by At

I am using the Camelot PDF parsing library to extract data from PDF files, but I am facing an issue when the PDFs contain both tables and non-tabular data. Camelot seems to only extract table data and ignores the non-tabular content. Here is the code snippet I am using:

from langchain.document_loaders.csv_loader import CSVLoader

import camelot
import uuid
from camelot.core import TableList

def export_tables_as_csv(filepath):
    """Write each table Camelot extracts from a PDF to its own CSV file.

    The files are named ``table_1.csv``, ``table_2.csv``, ... (1-based, in
    the order Camelot returns the tables) and are created relative to the
    current working directory.

    Args:
        filepath: Path of the PDF to parse with ``camelot.read_pdf``.
    """
    tables = camelot.read_pdf(filepath, backend="ghostscript")
    for i, table in enumerate(tables, start=1):
        # Export only the current table. Calling export() on the whole
        # TableList here would re-write every table on each iteration.
        table.to_csv(f'table_{i}.csv')

def generate_random_filename():
    """Return a freshly generated random UUID (version 4) as a string."""
    random_id = uuid.uuid4()
    return f"{random_id}"


from collections import namedtuple

# Lightweight record for one text chunk: the chunk's text plus a metadata
# dict describing where it came from.
Document = namedtuple("Document", "page_content metadata")

def formChunksForTable(filepath=None, file_type=None, url=None):
    """Turn every row of every table found in a PDF into a ``Document``.

    The PDF is parsed with ``camelot.read_pdf`` using the ``stream`` flavor.
    Each row of each non-empty table becomes one ``Document`` whose
    ``page_content`` is the row's cells joined by single spaces and whose
    ``metadata['source']`` is ``table-page-<t>-row-<r>``. Here ``<t>`` is the
    table's 1-based position in Camelot's result (not a PDF page number) and
    ``<r>`` is the 1-based row position within that table.

    Args:
        filepath: Path of the PDF to parse. A falsy value is reported and
            yields an empty list.
        file_type: Accepted but not used by this function.
        url: Accepted but not used by this function.

    Returns:
        A list of ``Document`` records, or an empty list when the filepath
        is missing, nothing usable was extracted, or any exception occurred.
        Progress and problems are reported with ``print`` rather than raised.
    """
    try:
        if not filepath:
            print("Error: Filepath is missing.")
            return []

        parsed = camelot.read_pdf(filepath, backend="ghostscript", flavor='stream')
        chunks = []

        if not isinstance(parsed, TableList):
            print("No tables found in the PDF.")
            return chunks

        for table_no, table in enumerate(parsed, start=1):
            frame = table.df
            if frame is None or frame.empty:
                print(f"Warning: Table {table_no} is empty.")
                continue

            # One Document per table row; cells are joined with spaces.
            chunks.extend(
                Document(
                    ' '.join(cells),
                    {'source': f'table-page-{table_no}-row-{row_no}'},
                )
                for row_no, cells in enumerate(frame.values, start=1)
            )

        if chunks:
            print("Documents:", chunks)
        else:
            print("No valid tables found in the PDF.")

        return chunks
    except Exception as err:
        # Best-effort boundary: report the failure and give the caller an
        # empty result instead of propagating.
        print(f"Error: {err}")
        return []

0

There are 0 best solutions below