Problem of pages being overwritten while using pytesseract and PyPDF2

75 Views Asked by At

enter image description hereI have the following code which aims to turn scanned pdf files into searchable (ctrl+F) files. But for some reason that I cannot understand, the program sometimes (not always) mixes the pages in a way that, for example, it outputs the following: page1, page2, page3, page2, page5... so it writes page2 again on top of page4.

I tried to make the code reproducable.

import pytesseract , PyPDF2 , io , os , tempfile
from pdf2image import convert_from_path
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# set required paths
pdf_folder_path = 'U:/pdf_folder/'
poppler_path = r'C:\poppler-24.02.0\Library\bin'
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# create example pdf file
c = canvas.Canvas("output.pdf", pagesize=letter)
width, height = letter
c.setFont("Helvetica", 100)

for i in range(50):
    c.saveState()
    c.translate(width/2, height/2)
    c.drawCentredString(0, 0, str(i + 1))
    c.restoreState()
    c.showPage()
c.save()

# list all pdfs in the folder (to be made searchable)
directory = os.fsencode(pdf_folder_path)
file_paths = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.pdf'): 
        file_paths.append(filename)
        continue
    else:
        continue

print('Files found:')

# convert scanned files into searchable format
for file in file_paths:

    original_pdf = 'U:/Make pdfs searchable/' + file
    searchable_pdf = original_pdf[:-4] + ' (searchable).pdf'

    print('Currently working on: ' + original_pdf)

    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)
        pdf_writer = PyPDF2.PdfWriter()
        for image in images:
            page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
            pdf = PyPDF2.PdfReader(io.BytesIO(page))
            pdf_writer.add_page(pdf.pages[0])
        with open(searchable_pdf, "wb") as f:
            pdf_writer.write(f)
    print(original_pdf + ' done!')

print('Process complete.')

Any ideas to why this is happening ? Thanks :)

Update 1/3: I have not figured out the cause of the problem, but I created a workaround using PdfMerger() which is however a bit slower.

for file in os.listdir(input_folder):

    if file.endswith(".pdf"):

        original_pdf = input_folder + file
        modified_pdf = output_folder + file[:-4] + ' (searchable).pdf'

        print('Current pdf: ' + file)

        with tempfile.TemporaryDirectory() as path:
            images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)
            merger = PyPDF2.PdfMerger()
            for count, image in enumerate(images, 1):
                print('Currently on page: ' + str(count))
                page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
                pdf = PyPDF2.PdfReader(io.BytesIO(page))
                merger.append(io.BytesIO(page))
            with open(modified_pdf, 'wb') as combined_pdf_file:
                merger.write(combined_pdf_file)

        print(file + ' done!')

print('Process complete.')
1

There are 1 best solutions below

3
Michael On

It often happens when you take pages directly from the PdfReader object. Let's adjust your code to keep track of the order of the pages before we work with them:

Change this:

for file in file_paths:

    original_pdf = 'U:/Make pdfs searchable/' + file
    searchable_pdf = original_pdf[:-4] + ' (searchable).pdf'

    print('Currently working on: ' + original_pdf)

    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)
        pdf_writer = PyPDF2.PdfWriter()
        for image in images:
            page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
            pdf = PyPDF2.PdfReader(io.BytesIO(page))
            pdf_writer.add_page(pdf.pages[0])
        with open(searchable_pdf, "wb") as f:
            pdf_writer.write(f)
    print(original_pdf + ' done!')

print('Process complete.')

to this:

# convert scanned files into searchable format
for file in file_paths:
    original_pdf = 'U:/Make pdfs searchable/' + file
    searchable_pdf = original_pdf[:-4] + ' (searchable).pdf'

    print('Currently working on: ' + original_pdf)

    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)

        # keeping track of original page numbers
        original_page_numbers = list(range(1, len(images) + 1))

        # initialize dict to store page content
        page_contents = {}

        for i, image in enumerate(images, 1):
            page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
            page_contents[i] = page

        # sort pages based on original page numbers
        sorted_pages = [page_contents[i] for i in original_page_numbers]

        # write sorted pages to searchable PDF
        pdf_writer = PyPDF2.PdfWriter()
        for page in sorted_pages:
            pdf = PyPDF2.PdfReader(io.BytesIO(page))
            pdf_writer.add_page(pdf.pages[0])

        with open(searchable_pdf, "wb") as f:
            pdf_writer.write(f)

    print(original_pdf + ' done!')

print('Process complete.')

This should keep your order of pages.