# Test_App_8.1 / vector_embeddings.py
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import os
from dotenv import load_dotenv
from collections import OrderedDict
# Load environment variables from .env file
load_dotenv()
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
# Load the PDF
loader = PyPDFLoader("DOC From Adv.pdf") # Provide your PDF path here
documents = loader.load()
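# PyPDFLoader returns one Document per page of the source PDF.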
# Split the text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
texts = text_splitter.split_documents(documents)
# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
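# Note: all-MiniLM-L6-v2 is a compact sentence-transformers model that encodes
# each chunk as a 384-dimensional vector and runs locally (no API call needed).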
# Convert texts to embeddings
try:
    embeddings = embedding_model.embed_documents([doc.page_content for doc in texts])
    print("Vector embeddings created successfully")
except Exception as e:
    print(f"Error creating vector embeddings: {e}")
# Initialize Chroma vector store
vector_store = Chroma(embedding_function=embedding_model, persist_directory="data")
# Add documents to the vector store
vector_store.add_documents(documents=texts)
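# Note: add_documents is called without explicit ids, so re-running this script
# against the same persist_directory appends duplicate chunks; the
# deduplication step in the validation query below compensates for that.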
# Validate the setup
try:
    # Test query to validate data retrieval
    test_query = "What are some popular items for winter?"
    results = vector_store.search(query=test_query, search_type="similarity")
    # Deduplicate results by page content
    unique_results = OrderedDict()
    for doc in results:
        if doc.page_content not in unique_results:
            unique_results[doc.page_content] = doc
    # Convert unique results to a list and limit to top 3
    final_results = list(unique_results.values())[:3]
    print(f"Unique query results: {final_results}")
except Exception as e:
    print(f"Error during test query: {e}")