from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import gradio as gr

# Defaults used when callers do not override the splitting/embedding settings.
DEFAULT_CHUNK_SIZE = 1024
DEFAULT_CHUNK_OVERLAP = 64
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en"
DEFAULT_EMBEDDING_DEVICE = "cpu"


def load_doc(
    list_file_path,
    chunk_size=DEFAULT_CHUNK_SIZE,
    chunk_overlap=DEFAULT_CHUNK_OVERLAP,
):
    """Load one or more PDF files and split them into text chunks.

    Args:
        list_file_path: Iterable of paths to PDF files. Each path is handed
            to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The list produced by ``split_documents`` over the loaded documents
        of all files, in the order the files were given.
    """
    loaders = [PyPDFLoader(path) for path in list_file_path]

    pages = []
    for loader in loaders:
        pages.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)


def create_db(
    splits,
    model_name=DEFAULT_EMBEDDING_MODEL,
    device=DEFAULT_EMBEDDING_DEVICE,
):
    """Embed document chunks and store them in a FAISS vector store.

    Args:
        splits: Document chunks, e.g. the return value of ``load_doc``.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.
        device: Value of the ``device`` entry in the ``model_kwargs`` passed
            to ``HuggingFaceEmbeddings``.

    Returns:
        The vector store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``splits`` is empty.
    """
    # Fail early with a clear message instead of letting an empty list reach
    # FAISS.from_documents, which cannot build an index from zero vectors.
    if not splits:
        raise ValueError(
            "Cannot create a vector database from an empty list of document chunks."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )
    return FAISS.from_documents(splits, embeddings)


def initialize_database(list_file_obj, progress=gr.Progress()):
    """Build a vector database from uploaded PDF files.

    Args:
        list_file_obj: Uploaded files. Entries that are ``None`` are skipped.
            An entry's ``name`` attribute is used as its path; an entry
            without that attribute is used as the path itself. ``None`` is
            treated the same as an empty list.
        progress: Unused in this function; kept so the signature stays
            unchanged for existing callers.

    Returns:
        The vector store returned by ``create_db``.

    Raises:
        gr.Error: If no valid file was provided, or if the provided files
            produced no text chunks.
    """
    # `or []` covers the case where nothing was uploaded and None is passed.
    list_file_path = [
        getattr(file_obj, "name", file_obj)
        for file_obj in (list_file_obj or [])
        if file_obj is not None
    ]
    if not list_file_path:
        raise gr.Error("Please upload at least one PDF document.")

    doc_splits = load_doc(list_file_path)
    if not doc_splits:
        raise gr.Error("No text could be extracted from the uploaded documents.")

    return create_db(doc_splits)