# PDF ingestion utilities: load uploaded PDFs, split them into chunks, and
# embed the chunks into a persistent Chroma vector store via Ollama embeddings.
# Import required libraries
import os
import tempfile

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
# Load, split, and embed data from uploaded PDF documents into a Chroma vector store.
def process_documents(pdfs, chunk_size=1200, chunk_overlap=150, persist_directory="./chroma_db"):
    """
    Process PDF documents through loading, splitting, and embedding.

    Args:
        pdfs: Iterable of uploaded file objects exposing ``.name`` and
            ``.getbuffer()`` (e.g. Streamlit ``UploadedFile`` — TODO confirm).
        chunk_size: Maximum characters per text chunk (default 1200).
        chunk_overlap: Characters of overlap between consecutive chunks (default 150).
        persist_directory: Directory where the Chroma store is persisted.

    Returns:
        The populated Chroma vector store instance.
    """
    # Temp dir exists only while the loaders read from disk; it is removed
    # automatically when the context exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        documents = []
        for pdf in pdfs:
            # PDFPlumberLoader reads from a filesystem path, so the uploaded
            # bytes must be written out first.
            path = os.path.join(temp_dir, pdf.name)
            with open(path, "wb") as f:
                f.write(pdf.getbuffer())
            documents.extend(PDFPlumberLoader(path).load())

    # Split documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(documents)

    # Embed the chunks and persist them in the Chroma store.
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
# Initialize and return a retriever over the persisted vector store, used to
# fetch relevant chunks from the stored embeddings based on user queries.
def get_retriever(persist_directory="./chroma_db", k=3):
    """
    Open the persisted Chroma vector store and return a retriever over it.

    Args:
        persist_directory: Directory where the Chroma store was persisted
            (must match the one used by ``process_documents``).
        k: Number of chunks to retrieve per query (default 3).

    Returns:
        A retriever using MMR (Maximum Marginal Relevance) search, or ``None``
        if the vector store could not be initialized.
    """
    # Must use the same embedding model the store was built with.
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    try:
        vector_store = Chroma(
            embedding_function=embeddings,
            persist_directory=persist_directory,
        )
        # MMR balances relevance with diversity among the returned chunks.
        return vector_store.as_retriever(search_type="mmr", search_kwargs={"k": k})
    except Exception as e:
        # Deliberate best-effort contract: callers check for None instead of
        # handling exceptions, so log and swallow here.
        print(f"Error initializing vector store: {e}")
        return None