Here's my Python file — a Streamlit app where you can upload documents and then ask questions about them; basically a standard "chat with your document" chatbot.
How should I modify it so that even if I close the browser tab and open it again, the vector store is saved and the user doesn't have to re-upload the files? In fact, the user should be able to continuously upload more files over time.
import os
from apikey import apikey
import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
# Expose the API key via the environment so the OpenAI client used by
# langchain's ChatOpenAI / OpenAIEmbeddings picks it up automatically.
os.environ["OPENAI_API_KEY"] = apikey
# Define a function to clear the conversation history stored in Streamlit's session state.
def clear_history():
    """Drop any stored chat history from Streamlit's session state.

    Used as the 'Add File' button callback so a newly ingested document
    starts with a clean conversation.
    """
    # pop() with a default is a no-op when the key is absent, so no
    # membership check is needed.
    st.session_state.pop('history', None)
# Set the title of the web page displayed to the user.
st.title('Chat with Document')
# Create a file uploader widget allowing users to upload documents in PDF, DOCX, or TXT format.
uploaded_file = st.file_uploader('Upload file:',type=['pdf','docx', 'txt'])
# Create a button that, when clicked, triggers the clear_history function to reset the session.
add_file = st.button('Add File', on_click=clear_history)
# Check if a file has been uploaded and the 'Add File' button has been pressed.
if uploaded_file and add_file:
with st.spinner('Reading, chunking and embedding file...'):
# Read the uploaded file's content as bytes.
bytes_data = uploaded_file.read()
# Construct a file path to save the uploaded file temporarily.
file_name = os.path. join('./', uploaded_file.name)
# Save the uploaded file to the constructed path.
with open (file_name, 'wb') as f:
f.write(bytes_data)
# Determine the file's extension to decide on the appropriate loader.
name, extension = os.path.splitext(file_name)
# Select the loader based on the file extension.
if extension == '.pdf':
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader(file_name)
elif extension == '.docx':
from langchain.document_loaders import Docx2txtLoader
loader = Docx2txtLoader(file_name)
elif extension == '.txt':
from langchain.document_loaders import TextLoader
loader = TextLoader(file_name)
else:
st.write('Document format is not supported!')
# Load the document using the selected loader.
documents = loader.load()
# Initialize the text splitter and split the loaded document into manageable chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
# Initialize embeddings and vector store for the document chunks.
embeddings = OpenAIEmbeddings()
vector_store = Chroma.from_documents(chunks, embeddings)
# Initialize a ChatOpenAI instance with GPT-3.5 turbo model and temperature set to 0 for objective responses.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)
# Create a retriever from the vector store for document retrieval.
retriever=vector_store.as_retriever()
# Initialize a conversational retrieval chain with the language model and the retriever.
crc = ConversationalRetrievalChain.from_llm(llm, retriever)
# Store the initialized conversational retrieval chain in Streamlit's session state.
st.session_state.crc = crc
# Display a success message once the file has been processed.
st.success('File uploaded, chunked and embedded successfully')
# Create an input widget for users to type in their questions.
# ---------------------------------------------------------------------------
# Question answering over the ingested documents.
# ---------------------------------------------------------------------------
question = st.text_input('Input your question')

if question:
    # Session state is wiped when the browser tab closes. If a persisted
    # vector store exists on disk, rebuild the retrieval chain from it so
    # the user does not have to re-upload their documents.
    if 'crc' not in st.session_state and os.path.isdir('./chroma_db'):
        embeddings = OpenAIEmbeddings()
        vector_store = Chroma(persist_directory='./chroma_db',
                              embedding_function=embeddings)
        llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)
        st.session_state.crc = ConversationalRetrievalChain.from_llm(
            llm, vector_store.as_retriever())

    if 'crc' in st.session_state:
        crc = st.session_state.crc

        # Lazily initialise the chat history on first question.
        if 'history' not in st.session_state:
            st.session_state['history'] = []

        # Pass the accumulated history so follow-up questions have context.
        response = crc.run({
            'question': question,
            'chat_history': st.session_state['history']
        })

        # Record the exchange, then show the answer and the full transcript
        # (the transcript includes the pair just appended).
        st.session_state['history'].append((question, response))
        st.write(response)
        for prompts in st.session_state['history']:
            st.write("Question: " + prompts[0])
            st.write("Answer: " + prompts[1])
    else:
        # BUG FIX: the original silently ignored questions asked before any
        # document had been processed; tell the user what to do instead.
        st.warning('Please upload and add a document first.')