I have been able to run this Python code on a Linux-based OS, but when I tried to run the same code on a Windows-based OS, I got deprecation errors.
My question is: How can the code be updated to overcome the deprecation issues?
- The Python code used was:
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Return the text of every page in ``pdf_file``, concatenated in page order."""
    text = ""
    with open(pdf_file, "rb") as file:
        # PdfFileReader was removed in PyPDF2 3.0.0; on that version this call
        # raises the DeprecationError quoted for this attempt.
        pdf_reader = PyPDF2.PdfFileReader(file)
        # Pages are read while the file is still open.
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
    return text
def save_text_to_excel(text, excel_file):
    """Write ``text`` into a new workbook, one line per row, and save it.

    ``text`` is split on newline characters; each piece is written to
    column 1 of the workbook's active sheet, starting at row 1. The
    workbook is then saved to ``excel_file``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # Row numbering starts at 1, hence enumerate(..., 1).
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
    workbook.save(excel_file)
if __name__ == "__main__":
    # Script entry point: extract the PDF's text, then write it to a spreadsheet.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead."
- So I updated to this Python code:
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Return the text of every page in ``pdf_file``, concatenated in page order."""
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # getNumPages() was removed in PyPDF2 3.0.0; on that version this call
        # raises the DeprecationError quoted for this attempt, which suggests
        # len(reader.pages) instead.
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
    return text
def save_text_to_excel(text, excel_file):
    """Write ``text`` into a new workbook, one line per row, and save it.

    ``text`` is split on newline characters; each piece is written to
    column 1 of the workbook's active sheet, starting at row 1. The
    workbook is then saved to ``excel_file``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # Row numbering starts at 1, hence enumerate(..., 1).
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
    workbook.save(excel_file)
if __name__ == "__main__":
    # Script entry point: extract the PDF's text, then write it to a spreadsheet.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead."
- Next, I updated the Python code based on the recommendation from https://pypdf2.readthedocs.io/en/latest/user/migration-1-to-2.html, which says to replace:
reader.getNumPages() / reader.numPages ➔ len(reader.pages)
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Return the text of every page in ``pdf_file``, concatenated in page order."""
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # This line is the source of the AttributeError quoted for this attempt:
        # `len` is looked up as an attribute of the PdfReader object, which has
        # no such attribute. `reader` is also not a name defined in this function.
        for page_num in range(pdf_reader.len(reader.pages)):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
    return text
def save_text_to_excel(text, excel_file):
    """Write ``text`` into a new workbook, one line per row, and save it.

    ``text`` is split on newline characters; each piece is written to
    column 1 of the workbook's active sheet, starting at row 1. The
    workbook is then saved to ``excel_file``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # Row numbering starts at 1, hence enumerate(..., 1).
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
    workbook.save(excel_file)
if __name__ == "__main__":
    # Script entry point: extract the PDF's text, then write it to a spreadsheet.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "AttributeError: 'PdfReader' object has no attribute 'len'"
- I updated the code based on a comment by 'Abdul Aziz Barkat': "Typo: `pdf_reader.len(reader.pages)` — compare that to `len(reader.pages)` as stated in the deprecation message... You have to write `len(pdf_reader.pages)`; `len` is a builtin function in Python."
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Return the text of every page in ``pdf_file``, concatenated in page order."""
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            # getPage() was removed in PyPDF2 3.0.0; on that version this call
            # raises the DeprecationError quoted for this attempt, which
            # suggests reader.pages[page_number] instead.
            page = pdf_reader.getPage(page_num)
            # NOTE(review): extractText() is another pre-3.0 camelCase name;
            # the PyPDF2 migration guide lists extract_text() as its
            # replacement — confirm before relying on this line.
            text += page.extractText()
    return text
def save_text_to_excel(text, excel_file):
    """Write ``text`` into a new workbook, one line per row, and save it.

    ``text`` is split on newline characters; each piece is written to
    column 1 of the workbook's active sheet, starting at row 1. The
    workbook is then saved to ``excel_file``.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # Row numbering starts at 1, hence enumerate(..., 1).
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
    workbook.save(excel_file)
if __name__ == "__main__":
    # Script entry point: extract the PDF's text, then write it to a spreadsheet.
    pdf_file = "computers.pdf"
    excel_file = "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead."
The way you're trying to use the methods to read a PDF has been deprecated in the new version. Follow the PdfFileReader class documentation to learn more. Here's your corrected code: