Is it possible to gain insights from any document using langchain and mistral?

174 Views Asked by At

I am trying to extract the "Security Ownership of Certain Beneficial Owners and Management" section from SEC proxy statements.

I can fetch the page content using Playwright, but I cannot get the LLM to extract structured insights from it. Here's what I've done so far:

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.chains import create_extraction_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from playwright.sync_api import sync_playwright, Playwright

from bs4 import BeautifulSoup

# Extraction schema handed to create_extraction_chain: one record per
# beneficial owner found in the document text.
schema = {
    "properties": {
        "name_of_beneficial_owner": {"type": "string"},
        "number_of_shares": {"type": "integer"},
        # Ownership percentages are usually fractional (e.g. 7.4), so this
        # must be a JSON-Schema "number"; "integer" forces truncation or
        # makes the model drop the value.
        "percentage_owned": {"type": "number"},
    },
    "required": ["name_of_beneficial_owner", "number_of_shares", "percentage_owned"],
}


def run(playwright: Playwright):
    """Download a proxy statement and run the extraction chain over its text.

    Launches Firefox through the given Playwright handle, loads the EDGAR
    search page and then the DEF 14A document, strips the HTML down to plain
    text with BeautifulSoup, and feeds that text to a langchain extraction
    chain backed by the local Ollama "mistral" model.

    Args:
        playwright: An active Playwright instance (from ``sync_playwright()``).

    Returns:
        Whatever ``chain.run`` returns for the page text (previously this
        value was discarded). Model output is also streamed to stdout by the
        callback handler.
    """
    browser = playwright.firefox.launch()
    try:
        context = browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/96.0.4664.93 Safari/537.36",
            # `locale` takes a plain locale tag; the previous "en-US;q=0.5"
            # is Accept-Language header syntax, not a valid locale.
            locale="en-US",
        )
        page = context.new_page()
        # Visit the search page first, then the filing itself.
        # NOTE(review): presumably this warms up the session so sec.gov
        # serves the archive page -- confirm it is still required.
        page.goto('https://www.sec.gov/edgar/search/', referer='https://www.sec.gov/edgar/search-and-access')
        page.goto('https://www.sec.gov/Archives/edgar/data/320193/000130817923000019/laap2023_def14a.htm')
        html = page.content()
    finally:
        # Always release the browser, even if navigation fails.
        browser.close()

    soup = BeautifulSoup(html, 'html.parser')
    text_content = soup.get_text()
    # NOTE(review): the entire document text is sent in a single prompt; a
    # full proxy statement may exceed the model's context window -- consider
    # narrowing the text to the ownership section before calling the chain.
    llm = Ollama(model="mistral", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    chain = create_extraction_chain(schema, llm)
    return chain.run(text_content)


# Script entry: start Playwright, run the scrape-and-extract pipeline once,
# and let the context manager shut Playwright down afterwards.
with sync_playwright() as pw:
    run(pw)

0

There are 0 best solutions below