I created an app that generates keyword searchable PDF files with tesseact, but when the PDF file is created, the pages are randomly reordered by one page every few dozen pages. How can I get the PDF file to be created in the order in which the image files were read?
I used pool.map to match order but its not useless.
from multiprocessing import Pool, cpu_count
import pytesseract
import PyPDF2
import io
from datetime import datetime
import os
from PySide6.QtWidgets import QApplication, QFileDialog
import sys
# Tesseract path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def process_image(args):
index, file_path = args
try:
# image to OCR PDF create
page = pytesseract.image_to_pdf_or_hocr(file_path, extension='pdf', lang='eng+kor')
return (index, file_path, page)
except Exception as e:
return (index, file_path, None)
def create_searchable_pdf_and_doc(files):
pdf_writer = PyPDF2.PdfWriter()
total_files = len(files)
# multi processing
with Pool(processes=cpu_count()) as pool:
# file index and path
tasks = [(i, file) for i, file in enumerate(files)]
# pool.map match order
results = pool.map(process_image, tasks)
# arrange
results.sort(key=lambda x: x[0])
for i, (_, _, page) in enumerate(results):
# processing status
print(f"\rProcessing: {i+1}/{total_files}", end="")
sys.stdout.flush()
if page:
pdf = PyPDF2.PdfReader(io.BytesIO(page))
for pageNum in range(len(pdf.pages)):
pdf_writer.add_page(pdf.pages[pageNum])
print("\nAll files have been processed. Compiling into PDF...")
# file name creation
today_date = datetime.now().strftime("%Y%m%d")
directory_name = os.path.basename(os.path.dirname(files[0]))
final_pdf_name = f"{directory_name}_{today_date}.pdf"
# PDF file save
with open(final_pdf_name, "wb") as f_out:
pdf_writer.write(f_out)
print(f"PDF file created: {final_pdf_name}")
def select_files():
app = QApplication(sys.argv)
dialog = QFileDialog()
dialog.setFileMode(QFileDialog.ExistingFiles)
dialog.setNameFilter("Images (*.png *.xpm *.jpg *.jpeg *.bmp *.gif)")
if dialog.exec():
return dialog.selectedFiles()
return []
if __name__ == "__main__":
selected_files = select_files()
if selected_files:
create_searchable_pdf_and_doc(selected_files)
else:
print("Fie does not selected.")
How can I get the PDF file to be created in the order in which the image files were read?