This is the code I am using to load my FAISS vector store and print it as a pandas DataFrame. The store is loaded from my folder "faiss index", which contains two files. This is my directory:
faiss index index.faiss index.pkl
This is the output I get after printing out my FAISS vector_store.docstore._dict. What I want is to store it in a database such as SQLite3.
chunk_id document page content
0 6ead4019-13c7-41e6-8c5e-bd7458087e71 Little Book of Plagiarism.pdf 1 This booklet is based upon “The Little Book of... 1 68f52aa8-eea3-4c52-825f-cc4fa81735ac Little Book of Plagiarism.pdf 2 CONTENTS \n \nWhat is Plagiarism? .............. 2 7ecb569f-ecdc-408a-9014-fbf129e9a966 Little Book of Plagiarism.pdf 2 UK Academic Traditions ......................... 3 00b27c8e-4d77-47e3-98a1-d2c261d2cb44 Little Book of Plagiarism.pdf 2 Collusion ................................ ..... 4 f186b715-bb47-4dd5-9fca-469742b72d95 Little Book of Plagiarism.pdf 2 Making Notes ................................... .. ... ... ... ... 81 1df7ba99-0386-4354-9362-1b5aad11ed6f Policy on drugs and tobacco control at HEIs.pdf 10 10 | P a g e \n of compliance with this provi... 82 ea0b9d6f-d1ce-46e6-ae74-66123fc602b6 Policy on drugs and tobacco control at HEIs.pdf 11 11 | P a g e \n \nAnnex - A \nUNDERTAKING ... 83 de5f6e45-3fc2-41af-947e-a7ff73cd00d8 Policy on drugs and tobacco control at HEIs.pdf 11 any time and to take any measure to ensure ... 84 5a9168e9-348a-4624-a489-8e18fde3437f Policy on drugs and tobacco control at HEIs.pdf 12 12 | P a g e \n ANNEX -B \n \nUNDERTAKING FOR... 85 5ae17d45-f523-4c02-97cd-57d4a6d96247 Policy on drugs and tobacco control at HEIs.pdf 12 policie s. Further, I have read and am aware o...
[86 rows x 4 columns]
Python app.py code
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import pandas as pd
# import sqlite3
from IPython.display import display
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
# --- One-time ingestion (already run; kept for reference) --------------------
# loader = PyPDFLoader("Little Book of Plagiarism.pdf")
# pages = loader.load_and_split()
# print(pages)
# text_splitter = RecursiveCharacterTextSplitter(
# # Set a really small chunk size, just to show.
# chunk_size=800,
# chunk_overlap=100,
# length_function=len,
# is_separator_regex=False,
# )
# chunked_docs = text_splitter.split_documents(pages)
# print(chunked_docs[30])

# SECURITY NOTE(review): a real-looking HuggingFace token is hardcoded below.
# Rotate this token and load it from the environment or a secrets manager
# instead of committing it to source control.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
# Create embeddings and store them in a FAISS vector store.
# No model argument: uses the HuggingFaceEmbeddings default model.
embedder = HuggingFaceEmbeddings()
# allow_dangerous_deserialization=True is needed because index.pkl is a
# pickle file; only load indexes you created yourself (pickle can run code).
vector_store = FAISS.load_local("faiss index", embedder, allow_dangerous_deserialization=True)
#vector_store = FAISS.from_documents(chunked_docs, embedder)
# Conversation memory so the retrieval chain can carry chat history.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#print(vector_store.docstore._dict)
print("length", len(vector_store.docstore._dict))  # number of stored chunks
# Assuming vector_store is your example vector_store
# Collect just the chunk texts (only used by the commented re-embedding below).
page_contents = []
for doc_id, document in vector_store.docstore._dict.items():
    page_content = document.page_content
    page_contents.append(page_content)
#text_embeddings = embedder.embed_documents(page_contents)
#print(vector_store)
repo_id="google/flan-t5-large"
# Retrieval-augmented QA chain: FAISS retriever + hosted flan-t5-large model.
qa = ConversationalRetrievalChain.from_llm(
    HuggingFaceHub(
        repo_id=repo_id, model_kwargs={"temperature": 0,"max_length":512}
    ), vector_store.as_retriever(), memory=memory, chain_type="stuff")
chat_history = []  # not used below; the memory object tracks history instead
def show_vstore(store):
    """Render the vector store's contents as a DataFrame in the notebook."""
    display(store_to_df(store))
#convert vector store into df to convenient access
def store_to_df(store):
    """Flatten a FAISS vector store's docstore into a pandas DataFrame.

    Parameters
    ----------
    store : FAISS
        A LangChain FAISS vector store; its ``docstore._dict`` maps chunk
        ids to ``Document`` objects carrying ``metadata`` and ``page_content``.

    Returns
    -------
    pd.DataFrame
        One row per chunk with columns: chunk_id, document, page, content.
    """
    data_rows = [
        {
            "chunk_id": chunk_id,
            # basename is portable: handles '/' and Windows '\\' separators,
            # unlike the previous split('/')[-1].
            "document": os.path.basename(doc.metadata["source"]),
            # stored page numbers are 0-based; report 1-based for humans
            "page": doc.metadata["page"] + 1,
            "content": doc.page_content,
        }
        for chunk_id, doc in store.docstore._dict.items()
    ]
    return pd.DataFrame(data_rows)
import sqlite3

# Build the DataFrame view of the vector store once and show it
# (same console output as the original print call).
df = store_to_df(vector_store)
print(df)

# Persist the chunks to SQLite as intended: one row per chunk with its id,
# source document name, 1-based page number, and text. The embedding
# vectors themselves live in "faiss index/index.faiss" — reload them with
# FAISS.load_local, or add an embeddings column here if you also need the
# raw vectors inside SQL.
conn = sqlite3.connect("vector_store.db")
try:
    df.to_sql("faiss_chunks", conn, if_exists="replace", index=False)
    conn.commit()
finally:
    conn.close()
# df.to_csv('output.csv', index=False)  # optional CSV export
I have looked through both the LangChain and FAISS documentation but could not find a way to store my FAISS vector store in a database. I want to store the contents of my FAISS vector store in a database such as SQLite3.