# Hugging Face Space (status: Sleeping)
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_chroma import Chroma | |
import os | |
#from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.embeddings import HuggingFaceInstructEmbeddings | |
from dotenv import load_dotenv | |
from collections import OrderedDict | |
# Load environment variables from a .env file so the Hugging Face token can be
# supplied without hard-coding it in this script.
load_dotenv()

# Re-export the token only when it is actually defined. os.environ accepts
# string values only, so assigning the None that os.getenv() returns for a
# missing variable would raise TypeError before any real work starts.
hf_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_api_token is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_token
else:
    print("Warning: HUGGINGFACEHUB_API_TOKEN is not set; continuing without it")
# Read the source PDF into LangChain documents.
pdf_path = "DOC From Adv.pdf"  # Point this at the PDF you want to index
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Break the loaded documents into chunks of up to 2000 characters, with a
# 300-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=300,
)
texts = text_splitter.split_documents(documents)
# Set up the sentence-transformers model used to embed the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the text of every chunk up front and report whether it worked.
# A failure is only printed; the script carries on afterwards.
# NOTE(review): `embeddings` is not read by any later statement visible in
# this file - confirm whether this pass is still needed.
try:
    chunk_contents = [chunk.page_content for chunk in texts]
    embeddings = embedding_model.embed_documents(chunk_contents)
    print("Vector Embeddings created successfully")
except Exception as exc:
    print(f"Error creating vector embeddings: {exc}")
# Initialize the Chroma vector store. It persists under the "data" directory
# and receives embedding_model as its embedding function.
vector_store = Chroma(embedding_function=embedding_model, persist_directory="data")

# Add the chunks to the vector store.
# Because the store is persisted, re-running this script without explicit ids
# appends another copy of every chunk. Ids derived from the source name and
# the chunk position stay the same between runs, so a re-run overwrites the
# same entries instead. Entries already stored under other ids are left
# untouched.
if texts:
    chunk_ids = [
        f"{chunk.metadata.get('source', 'document')}:{position}"
        for position, chunk in enumerate(texts)
    ]
    vector_store.add_documents(documents=texts, ids=chunk_ids)
else:
    # Nothing was extracted from the PDF; skip the insert rather than handing
    # the store an empty batch.
    print("No text chunks to index; skipping vector store update")
def dedupe_documents(docs, limit=3):
    """Return the first ``limit`` documents that have distinct ``page_content``.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute, in
            ranked order.
        limit: Maximum number of documents to return; ``None`` returns all
            unique documents.

    Returns:
        A list holding the first occurrence of each distinct ``page_content``,
        in the original order, truncated to ``limit`` items.
    """
    unique = {}
    for doc in docs:
        # setdefault keeps the first (highest-ranked) document for each text.
        unique.setdefault(doc.page_content, doc)
    return list(unique.values())[:limit]


# Validate the setup with a test query against the vector store.
test_query = "What are some popular items for winter?"

# Bound before the query so the name exists even when the search fails.
final_results = []
try:
    # Request more candidates than the three we keep: deduplicating only the
    # default-sized result set can leave fewer than three unique documents.
    results = vector_store.search(query=test_query, search_type="similarity", k=10)
    final_results = dedupe_documents(results, limit=3)
    print(f"Unique query results: {final_results}")
except Exception as e:
    # Validation is best-effort: report the problem without aborting.
    print(f"Error during test query: {e}")