Unable to read document

I can't figure out how to get this script to read the actual documents behind the links it pulls; the text of the documents never comes back. I also tried going through the iframe and its src attribute, but was unsuccessful.

I have never tried anything like this before, so I'm a little stumped on what else to try.
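I suspect the links may not return raw PDF bytes at all. A quick check like this should confirm what actually comes back (a minimal sketch; the lq value is a placeholder for one real href copied from the results page):

import requests

# placeholder link: paste the href of one real "nocolor pphoto" anchor here
test_link = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=shelbyimages&lq=PLACEHOLDER"

resp = requests.get(test_link)
print(resp.headers.get("Content-Type"))  # "text/html" here means a viewer page, not the document
print(resp.content[:4])                  # raw PDF bytes always begin with b"%PDF"

If that prints HTML, PdfReader has nothing to parse, which would explain the failure.

Here is the full script: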

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse, parse_qs
import io
import requests  # used in the download loop below; missing from the original imports
from PyPDF2 import PdfReader

# specify the web driver path (Selenium 4 expects a Service object rather than a bare path)
driver = webdriver.Chrome(service=Service("path/to/chromedriver"))

# navigate to the website
url = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=opr&templateName=Main&searchQuery=richard+wygle&lq=&searchType=1&regex=%5B%5Ea-zA-Z0-9_%5D&regexwithspaces=%5B%5Ea-zA-Z0-9%5Cs_%5D&regexwithasterisks=%5B%5E*a-zA-Z0-9%5Cs_%5D&sortBy=InstrumentFilter&desc=N&searchable=DisplayName%2CLastName%2CFirstName%2CInstrument%2CRecDate%2CPartyRole%2CDocTypeDesc%2CDocType%2CBook%2CPage%2CLot%2CBlock%2CTownship%2COther%2CFreeform%2COtherName&isPhoneticSearch=&q=richard+wygle&basicSortOrder=InstrumentFilter%7CN&Instrument=&Instrument_select=AND&RecDate=&RecDate=&RecDate_select=AND&LastName=&LastName_select=AND&searchkindLast=StartsLast&FirstName=&FirstName_select=OR&FirstName2=&FirstName2_select=AND&DocTypeDesc=&DocTypeDesc_select=AND&Book=&Book_select=AND&Page=&Page_select=AND&MAPBOOK=&MAPBOOK_select=AND&MAPPAGE=&MAPPAGE_select=AND&Lot%23=&Lot%23_select=AND&Lot=&Lot_select=AND&Block=&Block_select=AND&Section=&Section_select=AND&Township=&Township_select=AND&Range=&Range_select=AND&QT=&QT_select=AND&BQT=&BQT_select=AND&LegacyNum=&LegacyNum_select=AND&advancedSortOrder=InstrumentFilter%7CN"
driver.get(url)

# get the page source
html = driver.page_source

# parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# find all the anchor tags carrying both the "nocolor" and "pphoto" classes
links = soup.select('a.nocolor.pphoto')

# Create an empty dictionary to store the links
unique_links = {}

# the DocTypeDesc dropdown belongs to the search form, so its options are
# the same for every result row; grab them once before the loop
options = soup.select('select[name="DocTypeDesc"] option')

for link in links:
    href = link['href']
    if href.startswith('/shelby/search.do?indexName=shelbyimages&lq='):
        # construct the full link
        full_link = 'https://probaterecords.shelbyal.com' + href
        # parse the query parameters from the link
        parsed_url = urlparse(full_link)
        query_params = parse_qs(parsed_url.query)
        # extract the instrument number from the query parameters
        instrument_number = query_params['lq'][0]
        # extract the document type: look for an option mentioning "deeds"
        for option in options:
            if "deeds" in option.get_text().lower():
                doc_type = option.get_text()
                # add the link to the dictionary
                unique_links[instrument_number] = (full_link, doc_type)

# Iterate over the unique links
for instrument_number, link_info in unique_links.items():
    full_link, doc_type = link_info
    # Open the PDF file from the url
    response = requests.get(full_link)
    pdf_file = io.BytesIO(response.content)
    pdf_reader = PdfReader(pdf_file)

    # Get the number of pages
    pages = len(pdf_reader.pages)

    # Initialize a variable to store the text
    text = ""

    # Iterate over the pages
    for page in pdf_reader.pages:
        # Extract the text from the page
        text += page.extract_text()

    # Print the document type, instrument number and the text
    print("Document Type: ", doc_type)
    print("Instrument Number: ", instrument_number)
    print("Text: ", text)