File size: 1,222 Bytes
c002818 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# vectorstore.py
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
    """Load a FAISS index from ``index_folder``, or build one from a PDF.

    If ``index_folder`` already contains a saved index, it is loaded and
    returned. Otherwise the PDF at ``local_file`` is loaded, split into
    semantic chunks, embedded into a new FAISS index, and the index is
    saved to ``index_folder`` before being returned.

    Args:
        local_file: Path of the PDF handed to ``PyPDFLoader``. Only read
            when no saved index is found.
        index_folder: Directory the index is loaded from / saved to.
        embeddings: Embeddings object used by the semantic splitter and by
            FAISS for both building and loading the index.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        ValueError: If the PDF produced no chunks, so no index can be built.
    """
    # A bare folder-exists check treats an empty or half-written folder as a
    # valid cache, which makes every later call fail in load_local. Look for
    # the files themselves instead. The names assume save_local's default
    # index_name ("index"), which is what the save call below relies on.
    index_files = (
        os.path.join(index_folder, "index.faiss"),
        os.path.join(index_folder, "index.pkl"),
    )
    if all(os.path.isfile(path) for path in index_files):
        print("Loading existing FAISS index from disk...")
        # NOTE(security): allow_dangerous_deserialization=True unpickles
        # index.pkl. Only point index_folder at data this application wrote
        # itself; never at untrusted or user-supplied files.
        return FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print("Building a new FAISS index...")
    documents = PyPDFLoader(local_file).load()
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )
    chunked_docs = text_splitter.split_documents(documents)
    print(f"Document split into {len(chunked_docs)} chunks.")
    if not chunked_docs:
        # FAISS cannot infer the vector dimension from zero embeddings and
        # would otherwise fail with an opaque IndexError.
        raise ValueError(
            f"No text chunks could be extracted from {local_file!r}; "
            "cannot build a FAISS index."
        )

    vectorstore = FAISS.from_documents(chunked_docs, embeddings)
    vectorstore.save_local(index_folder)
    return vectorstore
|