Update Python code - the PyPDF2 library has deprecated objects used in Python code

2k Views Asked by At

I was able to run the Python code below on a Linux-based OS, but when I tried to run the same code on a Windows-based OS, I got deprecation error messages.

My question is: How can the code be updated to overcome the deprecation issues?

  1. The Python code used was:
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at `pdf_file` and return text extracted from it.

    Original version of the function, written against the old camelCase
    PyPDF2 API.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        # NOTE: on PyPDF2 3.0.0 this call raises DeprecationError;
        # PdfReader is the named replacement for PdfFileReader.
        pdf_reader = PyPDF2.PdfFileReader(file)
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): this return is inside the loop body, so the
            # function exits after the first page (and returns None when the
            # loop never runs). Dedent it one level to collect every page.
            return text

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

    # Keep the conversion under the guard: at module level these calls would
    # also run on import, where pdf_file and excel_file are never defined.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead."

  1. So I updated to this Python code:
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at `pdf_file` and return text extracted from it.

    Second attempt: the reader class is now PdfReader, but the page-count
    and page-access calls still use the old camelCase API.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # NOTE: on PyPDF2 3.0.0 getNumPages() raises DeprecationError;
        # len(pdf_reader.pages) is the named replacement.
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): this return is inside the loop body, so the
            # function exits after the first page (and returns None when the
            # loop never runs). Dedent it one level to collect every page.
            return text

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

    # Keep the conversion under the guard: at module level these calls would
    # also run on import, where pdf_file and excel_file are never defined.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead."

  1. Next, I updated the Python code based on the recommendation in https://pypdf2.readthedocs.io/en/latest/user/migration-1-to-2.html, which says to replace:

reader.getNumPages() / reader.numPages ➔ len(reader.pages)

import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at `pdf_file` and return text extracted from it.

    Third attempt: the page-count expression was changed, but incorrectly.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # NOTE: `len` is a builtin function, not a PdfReader attribute, so this
        # line raises AttributeError; `reader` is also not defined in this
        # function. The intended expression is len(pdf_reader.pages).
        for page_num in range(pdf_reader.len(reader.pages)):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): this return is inside the loop body, so the
            # function would exit after the first page. Dedent it one level
            # to collect every page.
            return text

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

    # Keep the conversion under the guard: at module level these calls would
    # also run on import, where pdf_file and excel_file are never defined.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)

Output: "AttributeError: 'PdfReader' object has no attribute 'len'"

  1. I then updated the code based on a comment by 'Abdul Aziz Barkat': "Typo: compare pdf_reader.len(reader.pages) with len(reader.pages) as stated in the deprecation message. You have to write len(pdf_reader.pages); len is a builtin function in Python."
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at `pdf_file` and return text extracted from it.

    Fourth attempt: the page count now uses len(pdf_reader.pages), but page
    access still uses the old camelCase API.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            # NOTE: on PyPDF2 3.0.0 getPage() raises DeprecationError;
            # pdf_reader.pages[page_num] is the named replacement.
            page = pdf_reader.getPage(page_num)
            # NOTE(review): extractText() is presumably affected the same way;
            # the final working version calls extract_text() - confirm.
            text += page.extractText()
            # NOTE(review): this return is inside the loop body, so the
            # function would exit after the first page. Dedent it one level
            # to collect every page.
            return text

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    pdf_file = "computers.pdf"
    excel_file = "output.xlsx"

    # Keep the conversion under the guard: at module level these calls would
    # also run on import, where pdf_file and excel_file are never defined.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead."

2

There are 2 best solutions below

2
Musabbir Arrafi On

The way you're trying to use the methods to read pdf has been deprecated in the new version. Follow the PdfFileReader class documentation to know more. Here's your corrected code:

import openpyxl
from PyPDF2 import PdfFileReader

def pdf_to_text(pdf_file):
    """Return the text of every page in `pdf_file`, concatenated in page order.

    Also prints the page count before extracting, as the original did.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        A single string holding the extracted text of all pages.
    """
    # PdfFileReader, numPages, getPage and extractText were removed in
    # PyPDF2 3.0.0. The replacement class is imported locally so this function
    # does not depend on the module-level import of the removed name.
    from PyPDF2 import PdfReader

    with open(pdf_file, "rb") as file:
        pdf_reader = PdfReader(file)
        pages = pdf_reader.pages
        print(len(pages))
        # Extract while the file is still open; the reader reads from it lazily.
        return "".join(page.extract_text() for page in pages)

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook for this run.
    pdf_file, excel_file = "test.pdf", "output.xlsx"

    # Extract the text, echo it to stdout, then export it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    print(pdf_text)
    save_text_to_excel(pdf_text, excel_file)
0
VicRam0001 On

Thanks (Abdul and Musabbir) for the feedback, I have updated the code as suggested, also using the Migration Guide to update the deprecated elements: https://pypdf2.readthedocs.io/en/latest/user/migration-1-to-2.html

This code now runs on Python 3.x on Windows:

import openpyxl
import PyPDF2

def pdf_to_text(pdf_file):
    """Return the text of every page in `pdf_file`, concatenated in page order.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        A single string holding the extracted text of all pages.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly instead of indexing by number, and join
        # once instead of growing the string page by page.
        return "".join(page.extract_text() for page in pdf_reader.pages)

def save_text_to_excel(text, excel_file):
    """Write `text` to a new workbook, one line per row in the first column.

    Args:
        text: String to export; it is split on newline characters and each
            piece is written to its own row, starting at row 1.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once, after all rows are filled in. Saving inside the loop rewrote
    # the whole file for every line of text.
    workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook for this run.
    pdf_file, excel_file = "PDF-file-name.pdf", "output.xlsx"

    # Extract the text, then export it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)