MercadonaAgent / utils.py
Miguelpef's picture
Upload 4 files
fb40ebb verified
# Import required libraries
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
# Function to load, split, and embed data from PDF documents into Chroma vector store
def process_documents(pdfs):
    """
    Load, split, and embed uploaded PDF documents into a Chroma vector store.

    Args:
        pdfs: Iterable of uploaded file objects. Each must expose a ``name``
            attribute and a ``getbuffer()`` method returning the file bytes
            (presumably Streamlit ``UploadedFile`` objects -- confirm against
            the caller).

    Returns:
        The Chroma vector store built from the document chunks, created with
        ``persist_directory="./chroma_db"``.
    """
    # The loader is given file paths, so uploads are first written to a
    # temporary directory that is removed when the ``with`` block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf_paths = []
        for index, pdf in enumerate(pdfs):
            # ``pdf.name`` comes from the uploader and must not be trusted:
            # an absolute name would make os.path.join() discard temp_dir,
            # and ".." components would escape it. Keep only the final path
            # component (treating backslashes as separators too).
            safe_name = os.path.basename(str(pdf.name).replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                safe_name = f"upload_{index}.pdf"

            # One subdirectory per upload so that two files sharing a name
            # do not overwrite each other, while the original file name is
            # kept as the final path component.
            upload_dir = os.path.join(temp_dir, str(index))
            os.makedirs(upload_dir)
            path = os.path.join(upload_dir, safe_name)

            with open(path, "wb") as f:
                f.write(pdf.getbuffer())
            pdf_paths.append(path)

        # Load every saved PDF into a single list of documents
        documents = []
        for path in pdf_paths:
            loader = PDFPlumberLoader(path)
            documents.extend(loader.load())

        # Split documents into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=150,
        )
        splits = text_splitter.split_documents(documents)

        # Instantiate the embeddings model
        embeddings = OllamaEmbeddings(model="nomic-embed-text")

        # Embed the chunks and build the vector store
        vector_store = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory="./chroma_db",
        )
        return vector_store
# Initialize and return a retriever for the vector store, which is used to fetch relevant chunks from the stored embeddings based on user queries.
def get_retriever():
    """
    Build a retriever over the Chroma store at ``./chroma_db``.

    Returns:
        A retriever configured for MMR search with ``k=3``, or ``None`` if
        opening the vector store or creating the retriever raises (the
        error is printed).
    """
    # The embedding model is created outside the try block, so a failure
    # here propagates to the caller instead of being reported as None.
    embedder = OllamaEmbeddings(model="nomic-embed-text")

    retriever = None
    try:
        store = Chroma(
            persist_directory="./chroma_db",
            embedding_function=embedder,
        )
        # MMR (Maximum Marginal Relevance) search returning 3 results
        retriever = store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 3},
        )
    except Exception as e:
        print(f"Error initializing vector store: {e}")
    return retriever