I can't figure out how to get this script to read the documents behind the links it pulls — it never brings back the text of the actual document each link points to. I also tried using the iframe and its src attribute, but was unsuccessful.
I have never tried anything like this before, so I'm a little stumped about what else to try.
import io
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# specify the web driver path
driver = webdriver.Chrome("path/to/chromedriver")
# navigate to the website
url = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=opr&templateName=Main&searchQuery=richard+wygle&lq=&searchType=1®ex=%5B%5Ea-zA-Z0-9_%5D®exwithspaces=%5B%5Ea-zA-Z0-9%5Cs_%5D®exwithasterisks=%5B%5E*a-zA-Z0-9%5Cs_%5D&sortBy=InstrumentFilter&desc=N&searchable=DisplayName%2CLastName%2CFirstName%2CInstrument%2CRecDate%2CPartyRole%2CDocTypeDesc%2CDocType%2CBook%2CPage%2CLot%2CBlock%2CTownship%2COther%2CFreeform%2COtherName&isPhoneticSearch=&q=richard+wygle&basicSortOrder=InstrumentFilter%7CN&Instrument=&Instrument_select=AND&RecDate=&RecDate=&RecDate_select=AND&LastName=&LastName_select=AND&searchkindLast=StartsLast&FirstName=&FirstName_select=OR&FirstName2=&FirstName2_select=AND&DocTypeDesc=&DocTypeDesc_select=AND&Book=&Book_select=AND&Page=&Page_select=AND&MAPBOOK=&MAPBOOK_select=AND&MAPPAGE=&MAPPAGE_select=AND&Lot%23=&Lot%23_select=AND&Lot=&Lot_select=AND&Block=&Block_select=AND&Section=&Section_select=AND&Township=&Township_select=AND&Range=&Range_select=AND&QT=&QT_select=AND&BQT=&BQT_select=AND&LegacyNum=&LegacyNum_select=AND&advancedSortOrder=InstrumentFilter%7CN"
driver.get(url)
# get the page source
html = driver.page_source
# parse the HTML
soup = BeautifulSoup(html, 'html.parser')
# find all the anchor tags with class "nocolor pphoto"
links = soup.select('a[class="nocolor pphoto"]')
# Create an empty dictionary to store the links
unique_links = {}
for link in links:
href = link['href']
if href.startswith('/shelby/search.do?indexName=shelbyimages&lq='):
# construct the full link
full_link = 'https://probaterecords.shelbyal.com' + href
# parse the query parameters from the link
parsed_url = urlparse(full_link)
query_params = parse_qs(parsed_url.query)
# extract the instrument number from the query parameters
instrument_number = query_params['lq'][0]
# Extract the document type
options = soup.select('select[name="DocTypeDesc"] option')
for option in options:
# check if the option value contains "deed"
if "deeds" in option.get_text().lower():
doc_type = option.get_text()
# add the link to the dictionary
unique_links[instrument_number] = (full_link, doc_type)
# Iterate over the unique links
for instrument_number, link_info in unique_links.items():
full_link, doc_type = link_info
# Open the PDF file from the url
response = requests.get(full_link)
pdf_file = io.BytesIO(response.content)
pdf_reader = PdfReader(pdf_file)
# Get the number of pages
pages = len(pdf_reader.pages)
# Initialize a variable to store the text
text = ""
# Iterate over the pages
for page in pdf_reader.pages:
# Extract the text from the page
text += page.extract_text()
# Print the document type, instrument number and the text
print("Document Type: ", doc_type)
print("Instrument Number: ", instrument_number)
print("Text: ", text)