Spaces:
Sleeping
Sleeping
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
import gradio as gr | |
# Load and split PDF documents
def load_doc(list_file_path, *, chunk_size=1024, chunk_overlap=64):
    """Load PDF files and split their pages into overlapping chunks.

    Args:
        list_file_path: Iterable of PDF file paths; each one is opened with
            ``PyPDFLoader``.
        chunk_size: ``chunk_size`` handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1024, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            64, the value that used to be hard-coded here.

    Returns:
        The result of ``split_documents`` applied to the pages gathered from
        every file (files are read in the order given), or an empty list
        when no pages were loaded.
    """
    pages = []
    for file_path in list_file_path:
        pages.extend(PyPDFLoader(file_path).load())

    if not pages:
        # Nothing to split, so skip building the splitter altogether.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)
def create_db(splits):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        splits: Documents to index; passed unchanged to
            ``FAISS.from_documents``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    # Embeddings come from the "BAAI/bge-small-en" model, pinned to the CPU.
    embedder = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_documents(splits, embedder)
def initialize_database(list_file_obj, progress=gr.Progress()):
    """Build a vector database from a list of uploaded file objects.

    Args:
        list_file_obj: Iterable of file objects exposing a ``name``
            attribute, which is used as the file path. ``None`` entries are
            skipped.
        progress: Not referenced in the body.
            NOTE(review): presumably present so Gradio can attach progress
            tracking to this handler -- confirm before removing.

    Returns:
        The vector database produced by ``create_db`` from the chunks that
        ``load_doc`` yields for the collected paths.
    """
    pdf_paths = []
    for uploaded in list_file_obj:
        if uploaded is None:
            # Ignore empty slots in the upload list.
            continue
        pdf_paths.append(uploaded.name)

    # Load and chunk the documents, then index the chunks.
    return create_db(load_doc(pdf_paths))