Mapping headings to their corresponding paragraphs in a PDF file using Python.
I have a PDF with headings and paragraphs, and I want to map each heading to its respective paragraph. I want this because I have a list of keywords that I need to search for in that PDF file, returning all the headings under which each keyword falls. The code below is my current approach in Python, but it fails when there are two sides (columns) in a single page.
Is there a better approach to do this, or is there a Python library that provides this mapping or something similar?
import re
import pdfplumber
import pprint
from autocorrect import Speller
def _is_heading_line(line):
    """Return True when every character of *line* looks like heading text.

    A character counts as heading text when it is a digit, '.', ':' or is
    set in a font whose name contains 'Bold'. A line without any characters
    is never a heading, so it cannot register an empty heading title.
    """
    chars = line['chars']
    if not chars:
        return False
    return all(
        ch['text'].isdigit()
        or ch['text'] in ('.', ':')
        or 'Bold' in ch['fontname']
        for ch in chars
    )


# heading text -> paragraph text collected under that heading
heading_2_para = {}
# heading text -> 1-based page number where the heading was recorded
heading_2_page = {}
# most recent heading seen; '' until the first heading is found
heading = ''
spell = Speller()

# NOTE(review): pages with two side-by-side columns are reported to break this
# line-by-line approach; that layout problem is not addressed here.
# Open the PDF in a context manager so the file handle is always released.
with pdfplumber.open(path_to_pdf) as pdf:
    for page_no, page in enumerate(pdf.pages, start=1):
        extracted_lines = page.extract_text_lines(
            layout=False, strip=True, return_chars=True
        )
        for line in extracted_lines:
            text = line['text']
            if _is_heading_line(line):
                heading = text
                # Only (re)register a heading that has no paragraph text yet,
                # so a repeated heading keeps what was already collected.
                if heading_2_para.get(text, '') == '':
                    heading_2_para[text] = ''
                    heading_2_page[text] = page_no
            elif heading:
                # Join lines with a space so the last word of one line does
                # not fuse with the first word of the next.
                collected = heading_2_para[heading]
                heading_2_para[heading] = (
                    f'{collected} {text}' if collected else text
                )
# Keywords to look up in the paragraph text collected per heading.
keywords = ['a', 'b', 'c']
# (spell-checked) keyword -> list of [heading, page number] pairs
keyword_2_title = {}

# List every detected heading so the mapping can be checked by eye.
for title in heading_2_para:
    print(title)

for raw_keyword in keywords:
    # Results are keyed by the keyword as returned by the speller.
    keyword = spell(raw_keyword)
    # Escape the keyword so it is matched literally; regex metacharacters in a
    # keyword (e.g. '+', '.', '(') must not be interpreted as a pattern.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    keyword_2_title[keyword] = [
        [title, heading_2_page[title]]
        for title, para_text in heading_2_para.items()
        if pattern.search(para_text)
    ]

# Report, per keyword, the headings (with page numbers) whose text matched.
for keyword, results in keyword_2_title.items():
    print("keyword:", keyword)
    pprint.pprint(results)
    print('-----------------------------------------------------------')