import logging, os, pickle, torch, time
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoConfig, Pipeline
from dotenv import load_dotenv
# This part runs outside the Streamlit process.
# `chunks` is the list of text chunks produced earlier with RecursiveCharacterTextSplitter.
path = "instructor_xl"  # local model directory
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

start = time.time()
token_texts = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True).to(device)
with torch.no_grad():
    embeddings = model(**token_texts)
duration = time.time() - start
logging.info(f"time to embed the books on {device}: {duration}")

VectorStore = FAISS.from_texts(chunks, embeddings)  # fails: no attribute 'embed_documents'
So the path is a local directory, and by using AutoTokenizer and AutoModel I can run the embedding in batches. However, FAISS.from_texts cannot take embeddings as its second argument: it fails with an error saying the object has no attribute embed_documents.
Should I build the FAISS index from scratch, or can another library help?
Since you are using FAISS through the LangChain integration, it expects a wrapper Embeddings class, not the model directly. What you can do is create a CustomEmbeddings class and put your model in it.
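Here is a minimal sketch of such a wrapper, assuming mean pooling over the last hidden state is an acceptable sentence representation for your checkpoint (instructor models apply their own pooling, so adjust this to match how your model was trained); the class name CustomEmbeddings and the batch_size parameter are illustrative:

from typing import List

import torch
from langchain_core.embeddings import Embeddings

class CustomEmbeddings(Embeddings):
    """Wraps a local transformers model so LangChain's FAISS integration can call it."""

    def __init__(self, tokenizer, model, device="cpu", batch_size=32):
        self.tokenizer = tokenizer
        self.model = model.to(device).eval()
        self.device = device
        self.batch_size = batch_size

    def _encode(self, texts: List[str]) -> List[List[float]]:
        vectors = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i : i + self.batch_size]
            tokens = self.tokenizer(
                batch, return_tensors="pt", padding=True, truncation=True
            ).to(self.device)
            with torch.no_grad():
                output = self.model(**tokens)
            # Mean-pool the last hidden state, masking out padding tokens.
            mask = tokens["attention_mask"].unsqueeze(-1)
            summed = (output.last_hidden_state * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1)
            vectors.extend((summed / counts).cpu().tolist())
        return vectors

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._encode(texts)

    def embed_query(self, text: str) -> List[float]:
        return self._encode([text])[0]

With that in place, FAISS receives an object that implements embed_documents and embed_query, which is all the integration needs:

embeddings = CustomEmbeddings(tokenizer, model, device=device)
VectorStore = FAISS.from_texts(chunks, embeddings)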
If you are trying to use a model hosted on the Hugging Face Hub, like the models on sentence_transformers for example, you can use the class HuggingFaceEmbeddings instead.
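A short sketch, using "sentence-transformers/all-MiniLM-L6-v2" as a stand-in model name:

from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
VectorStore = FAISS.from_texts(chunks, embeddings)

Since you already import HuggingFaceInstructEmbeddings, note that its model_name may also point at a local directory, which could save you the custom wrapper for an instructor checkpoint.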