Spaces:
Sleeping
Sleeping
File size: 1,419 Bytes
5db687f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import gradio as gr
# Load and split PDF document
def load_doc(list_file_path, *, chunk_size=1024, chunk_overlap=64):
    """Load PDF files and split their pages into overlapping text chunks.

    Args:
        list_file_path: Iterable of PDF file paths. ``None`` or an empty
            collection yields an empty result.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The list returned by ``split_documents`` for the pages of every
        file, loaded in input order.
    """
    # Nothing to load: skip building the splitter and return no chunks.
    if not list_file_path:
        return []

    # All loaders are constructed before any file is read, so a failure in
    # a loader constructor surfaces before any loading work starts.
    loaders = [PyPDFLoader(file_path) for file_path in list_file_path]

    pages = []
    for loader in loaders:
        pages.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)
def create_db(splits, *, model_name="BAAI/bge-small-en", device="cpu"):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        splits: Document chunks to index, e.g. the output of ``load_doc``.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        device: Device name forwarded to the embedding model through
            ``model_kwargs``.

    Returns:
        The vector store built by ``FAISS.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )
    return FAISS.from_documents(splits, embeddings)
def initialize_database(list_file_obj, progress=gr.Progress()):
    """Build a vector database from the files uploaded through the UI.

    Args:
        list_file_obj: Uploaded files. Each entry may be an object exposing
            its path as ``.name`` or a plain path string; ``None`` entries
            are skipped. ``None`` itself is treated as "no files".
        progress: Not used in the body; kept so the signature is unchanged.
            NOTE(review): presumably Gradio's progress-tracking hook, which
            relies on this default being present - confirm.

    Returns:
        The vector database returned by ``create_db``.

    Raises:
        gr.Error: If no usable file was supplied.
    """
    # Fall back to the entry itself when it has no ``.name`` attribute, so
    # plain path strings work as well as file-like upload objects.
    list_file_path = [
        getattr(file_obj, "name", file_obj)
        for file_obj in (list_file_obj or [])
        if file_obj is not None
    ]

    # Fail with a readable message instead of handing an empty document
    # list to the splitter and the vector-store builder.
    if not list_file_path:
        raise gr.Error("Please upload at least one PDF document.")

    doc_splits = load_doc(list_file_path)
    return create_db(doc_splits)
|