Mapping Headings with their corresponding Paragraphs in a pdf file using python

47 Views Asked by At

Mapping Headings with their corresponding Paragraphs in a pdf file using python.

I have a PDF with headings and paragraphs, and I want to map every heading to its respective paragraph. I need this because I have a list of keywords to search for in the PDF, and for each keyword I want to return all the headings under which it appears. The code below is my current approach in Python, but it fails when there are two sides (e.g., two columns) on a single page.

Is there a better approach to do this, or a Python library that provides this mapping or something similar?

import re
import pdfplumber
import pprint
from autocorrect import Speller

# Build two lookups from the PDF:
#   heading_2_para: heading text -> text of the paragraph lines that follow it
#   heading_2_page: heading text -> 1-based page number where it was recorded
heading_2_para = {}
heading = ''
heading_2_page = {}

spell = Speller()


def _is_heading_char(char):
    """Return True if one extracted character is allowed in a heading line.

    A character qualifies when it is a digit, a '.' or ':' (numbering such as
    "1.2:"), or when its font name contains 'Bold'.
    """
    text = char['text']
    return text.isdigit() or text in ('.', ':') or 'Bold' in char['fontname']


def _is_heading_line(line):
    """Return True if every character of the extracted line is heading-like."""
    return all(_is_heading_char(char) for char in line['chars'])


# Open the PDF in a context manager so the file handle is always released,
# even if extraction raises part-way through.
with pdfplumber.open(path_to_pdf) as pdf:
    for page_no, page in enumerate(pdf.pages):
        extracted_lines = page.extract_text_lines(layout=False, strip=True, return_chars=True)
        # NOTE(review): lines are consumed in the order pdfplumber returns them;
        # pages with side-by-side columns are not handled specially here.
        for line in extracted_lines:
            text = line['text']
            if _is_heading_line(line):
                heading = text
                # Only (re)register a heading that has no paragraph text yet,
                # so an already-filled heading keeps its text and page number.
                if heading_2_para.get(text, '') == '':
                    heading_2_para[text] = ''
                    heading_2_page[text] = page_no + 1
            elif heading:
                # Lines are stripped, so join them with a space; otherwise the
                # last word of one line fuses with the first word of the next
                # and keyword searches over the paragraph give wrong results.
                existing = heading_2_para[heading]
                heading_2_para[heading] = (existing + ' ' + text) if existing else text

# Keywords to look for inside the paragraph text collected for each heading.
keywords = ['a', 'b', 'c']

# Maps each (spell-corrected) keyword to a list of [heading, page_number]
# pairs for the headings whose paragraph text contains that keyword.
keyword_2_title = {}

# Print every detected heading.
for i in heading_2_para:
    print(i)

for keyword in keywords:
    keyword = spell(keyword)
    # Escape the keyword so it is matched literally: characters such as
    # '+', '.', '(' or '?' would otherwise be interpreted as regex syntax and
    # either raise re.error or match unintended text.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    keyword_2_title[keyword] = [
        [heading, heading_2_page[heading]]
        for heading, para in heading_2_para.items()
        if pattern.search(para)
    ]

# Report the matching headings (with page numbers) for each keyword.
for keyword, results in keyword_2_title.items():
    print("keyword:", keyword)
    pprint.pprint(results)
    print('-----------------------------------------------------------')
0

There are 0 best solutions below