I have the following code, which aims to turn scanned PDF files into searchable (Ctrl+F) files. For some reason that I cannot understand, the program sometimes (not always) mixes up the pages. For example, it outputs page1, page2, page3, page2, page5... so page2 is written a second time in the place where page4 should be.
I tried to make the code reproducible.
import pytesseract , PyPDF2 , io , os , tempfile
from pdf2image import convert_from_path
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# locations of the external tools (Tesseract, Poppler) and of the folder
# holding the PDFs to process
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
poppler_path = r'C:\poppler-24.02.0\Library\bin'
pdf_folder_path = 'U:/pdf_folder/'
# create example pdf file: 50 letter-sized pages, each showing its own page
# number centred on the page
c = canvas.Canvas("output.pdf", pagesize=letter)
width, height = letter
for i in range(50):
    # showPage() resets the canvas graphics state (font included), so the
    # font must be selected again on every page; setting it once before the
    # loop only affects the first page.
    c.setFont("Helvetica", 100)
    c.saveState()
    # Move the origin to the page centre so the number is drawn around (0, 0).
    c.translate(width/2, height/2)
    c.drawCentredString(0, 0, str(i + 1))
    c.restoreState()
    c.showPage()
c.save()
# list all pdfs in the folder (to be made searchable)
# os.listdir() already returns str names for a str path, so no
# fsencode/fsdecode round trip is needed. Sorting makes the processing
# order deterministic, since os.listdir() returns entries in arbitrary order.
file_paths = sorted(
    filename
    for filename in os.listdir(pdf_folder_path)
    if filename.endswith('.pdf')
)
print('Files found:')
# Actually show what was found; the header alone carried no information.
for filename in file_paths:
    print(filename)
# convert scanned files into searchable format
for file in file_paths:
    # Read each file from the folder it was listed from; a different
    # hard-coded folder would not contain the names in file_paths.
    original_pdf = os.path.join(pdf_folder_path, file)
    searchable_pdf = original_pdf[:-4] + ' (searchable).pdf'
    print('Currently working on: ' + original_pdf)
    pdf_writer = PyPDF2.PdfWriter()
    # Keep every per-page reader referenced until the output has been written.
    # The writer appears to track which objects it has already copied per
    # source reader by object identity. If a reader is garbage-collected, a
    # later reader can be allocated at the same address and be mistaken for
    # it, so an earlier page is emitted again in place of the new one.
    # That matches the intermittent "page2 instead of page4" symptom.
    readers = []
    with tempfile.TemporaryDirectory() as path:
        # Page images are stored in the temporary folder, so all OCR work
        # has to happen before the folder is removed.
        images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)
        for image in images:
            # One single-page PDF (image plus invisible text layer) per scan.
            page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
            pdf = PyPDF2.PdfReader(io.BytesIO(page))
            readers.append(pdf)
            pdf_writer.add_page(pdf.pages[0])
    with open(searchable_pdf, "wb") as f:
        pdf_writer.write(f)
    print(original_pdf + ' done!')
print('Process complete.')
Any ideas as to why this is happening? Thanks :)
Update 1/3: I have not figured out the cause of the problem, but I created a workaround using PdfMerger(), which is, however, a bit slower.
# Workaround: build each searchable PDF with PdfMerger, appending one
# single-page OCR result per scanned page.
for file in os.listdir(input_folder):
    if not file.endswith(".pdf"):
        continue
    original_pdf = input_folder + file
    modified_pdf = output_folder + file[:-4] + ' (searchable).pdf'
    print('Current pdf: ' + file)
    merger = PyPDF2.PdfMerger()
    try:
        with tempfile.TemporaryDirectory() as path:
            # Page images are stored in the temporary folder, so OCR has to
            # finish before the folder is removed.
            images = convert_from_path(original_pdf, poppler_path=poppler_path, output_folder=path)
            for count, image in enumerate(images, 1):
                print('Currently on page: ' + str(count))
                page = pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang='eng')
                # The merger parses the bytes itself; building a separate
                # PdfReader that is never used only parsed each page twice.
                merger.append(io.BytesIO(page))
        with open(modified_pdf, 'wb') as combined_pdf_file:
            merger.write(combined_pdf_file)
    finally:
        # Release the merger's input streams even if OCR or writing failed.
        merger.close()
    print(file + ' done!')
print('Process complete.')
It often happens when you take pages directly from the PdfReader object. Let's adjust your code to keep track of the order of the pages before we work with them:
Change this:
to this:
This should keep your order of pages.