Is there any code snippet that will work? I have tried this for converting pdf to html
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import os
import contextlib
import tempfile
rsrcmgr = PDFResourceManager()
laparams = LAParams()
converter = HTMLConverter if format == 'html' else TextConverter
out_file = "A:\folder"
in_file = "A:\folder\pyhtml.html"
pdf_filename = 'insurance.pdf'
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
PDFPage.get_pages(rsrcmgr, device, in_file, pagenos=[1], maxpages=1)
with contextlib.closing(tempfile.NamedTemporaryFile(mode='r', suffix='.xml')) as xmlin:
cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "%s" "%s"' % (
pdf_filename, xmlin.name.rpartition('.')[0])
os.system(cmd + " >/dev/null 2>&1")
result = xmlin.read().decode('utf-8')
when i run the above code it gives me following erroe
Traceback (most recent call last):
File "a:\folder\new - Copy.py", line 14, in <module>
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
AttributeError: 'str' object has no attribute 'write'
If there is attempt of
.write
that means you should provide write-able file-handle rather thanstr
, you might usewith open
... which will take care of closing file for you as follow, replaceusing
If you want to know more about
open
read Built-in Functions docs