I am trying to extract the "Security Ownership of Certain Beneficial Owners and Management"
section from proxy statements.
I can fetch the page content using Playwright, but I cannot extract any structured information from it. Here's what I've done so far:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.chains import create_extraction_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from playwright.sync_api import sync_playwright, Playwright
from bs4 import BeautifulSoup
# Extraction schema handed to create_extraction_chain: one record per
# beneficial owner found in the text.
schema = {
    "properties": {
        "name_of_beneficial_owner": {"type": "string"},
        "number_of_shares": {"type": "integer"},
        # Ownership percentages can be fractional (e.g. 7.5), so use the
        # JSON Schema "number" type rather than "integer".
        "percentage_owned": {"type": "number"},
    },
    "required": ["name_of_beneficial_owner", "number_of_shares", "percentage_owned"],
}
def run(playwright: Playwright):
    """Fetch a proxy statement from sec.gov and run LLM extraction on its text.

    Launches Firefox through Playwright, visits the EDGAR search page and
    then the proxy-statement URL, reduces the rendered HTML to plain text
    with BeautifulSoup, and feeds that text to a LangChain extraction chain
    built from the module-level ``schema`` and an Ollama ``mistral`` model.

    Args:
        playwright: Playwright driver, as yielded by ``sync_playwright()``.

    Returns:
        The value returned by ``chain.run`` for the page text.
    """
    browser = playwright.firefox.launch()
    try:
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                " Chrome/96.0.4664.93 Safari/537.36"
            ),
            # NOTE(review): this looks like an Accept-Language value rather
            # than a plain locale tag such as "en-US" — confirm it is intended.
            locale="en-US;q=0.5",
        )
        page = context.new_page()
        # The search page is opened before the filing itself; presumably this
        # makes the session resemble ordinary browsing — TODO confirm it is needed.
        page.goto('https://www.sec.gov/edgar/search/', referer='https://www.sec.gov/edgar/search-and-access')
        page.goto('https://www.sec.gov/Archives/edgar/data/320193/000130817923000019/laap2023_def14a.htm')
        html = page.content()
    finally:
        # Always release the browser, even if navigation fails.
        browser.close()

    soup = BeautifulSoup(html, 'html.parser')
    text_content = soup.get_text()

    # Tokens are streamed to stdout as the model generates them.
    llm = Ollama(model="mistral", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    # NOTE(review): the whole document text is passed in one call, and
    # create_extraction_chain may expect a function-calling chat model —
    # verify both against the LangChain documentation.
    chain = create_extraction_chain(schema, llm)
    return chain.run(text_content)
# Script entry: start the Playwright sync driver, run the extraction once,
# and let the context manager stop the driver on exit.
with sync_playwright() as pw:
    run(pw)