# RAG ingestion/retrieval helpers: load PDFs, chunk, embed, and query a Chroma store.
# Import required libraries
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
# Function to load, split, and embed data from PDF documents into Chroma vector store
def process_documents(pdfs):
    """
    Process uploaded PDF files: save them to a temporary directory, load
    their pages, split into overlapping chunks, embed the chunks, and
    persist a Chroma vector store.

    Args:
        pdfs: iterable of uploaded file-like objects exposing ``.name`` and
            ``.getbuffer()`` (e.g. Streamlit ``UploadedFile``).

    Returns:
        Chroma vector store built from the document chunks, persisted to
        ``./chroma_db``.

    Raises:
        ValueError: if *pdfs* is empty — Chroma cannot build a store from
            zero documents, so fail early with a clear message.
    """
    if not pdfs:
        raise ValueError("No PDF files provided")
    # Create temporary directory for PDF storage; cleaned up automatically
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save uploaded PDFs to the temp directory.  basename() strips any
        # path components from the uploaded name so a crafted filename
        # cannot write outside temp_dir.
        pdf_paths = []
        for pdf in pdfs:
            path = os.path.join(temp_dir, os.path.basename(pdf.name))
            with open(path, "wb") as f:
                f.write(pdf.getbuffer())
            pdf_paths.append(path)
        # Load the documents (one Document per PDF page)
        documents = []
        for path in pdf_paths:
            loader = PDFPlumberLoader(path)
            documents.extend(loader.load())
        # Split documents into chunks; overlap keeps context across boundaries
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=150
        )
        splits = text_splitter.split_documents(documents)
        # Embedding model — must match the one used by get_retriever()
        embeddings = OllamaEmbeddings(model="nomic-embed-text")
        # Create embeddings and persist the vector store to disk
        vector_store = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory="./chroma_db"
        )
        return vector_store
# Build a retriever over the persisted Chroma store so relevant chunks can
# be fetched for user queries.
def get_retriever():
    """Return an MMR retriever for the persisted vector store, or None on failure."""
    # Embedding model must match the one the store was created with
    embedding_model = OllamaEmbeddings(model="nomic-embed-text")
    try:
        # Re-open the Chroma collection persisted under ./chroma_db, then
        # wrap it in a Maximum Marginal Relevance retriever returning the
        # top 3 chunks per query.
        store = Chroma(
            embedding_function=embedding_model,
            persist_directory="./chroma_db"
        )
        return store.as_retriever(search_type="mmr", search_kwargs={"k": 3})
    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"Error initializing vector store: {e}")
        return None