import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import uuid
import numpy as np
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV") # e.g., "us-west-2"
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "shl-test-index")
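# Fail fast if the API key was not loaded (a small guard added as a sketch; the
# script otherwise assumes the .env file always provides it).
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file.")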
# === STEP 1: Preprocessing CSV & Chunking ===
def pre_processing_csv(csv_path):
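    """Read the SHL catalogue CSV, combine the key columns of each row into one text
    block, and split that block into overlapping chunks.

    Returns (documents, metadatas): the chunk texts and, per chunk, the source row's
    fields plus its row index ("row_id").
    """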
    df = pd.read_csv(csv_path)
    df.fillna("", inplace=True)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50
    )
    documents = []
    metadatas = []
    for idx, row in df.iterrows():
        combined_text = (
            f"Test Name: {row.get('Test Name', '')}\n"
            f"Description: {row.get('Description', '')}\n"
            f"Remote Testing: {row.get('Remote Testing', '')}\n"
            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}\n"
            f"Test Type: {row.get('Test Type', '')}\n"
        )
        chunks = text_splitter.split_text(combined_text)
        for chunk in chunks:
            documents.append(chunk)
            metadatas.append({
                "Test Name": row.get('Test Name', ''),
                "Test Link": row.get('Test Link', ''),
                "Remote Testing": row.get('Remote Testing', ''),
                "Adaptive/IRT": row.get('Adaptive/IRT', ''),
                "Test Type": row.get('Test Type', ''),
                "row_id": idx
            })
    return documents, metadatas
# === STEP 2: Embed and Store in Pinecone ===
def build_pinecone_store(documents, metadatas, model, index_name, pinecone_api_key, pinecone_env):
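    """Embed the document chunks with the given SentenceTransformer model and upsert
    them into a Pinecone serverless index, creating the index if it does not exist.

    Each vector's metadata keeps the chunk text under "document" so queries can
    return readable results. Returns (index, model, embeddings, documents, metadatas).
    """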
print("πŸ” Embedding documents...")
embeddings = model.encode(documents, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")
print("πŸ”‘ Initializing Pinecone client...")
# Import new classes from the pinecone package
from pinecone import Pinecone, ServerlessSpec
# Create a Pinecone client instance
pc = Pinecone(api_key=pinecone_api_key)
# Check if the index exists; if not, create a new one.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
print("πŸ“₯ Creating new Pinecone index...")
pc.create_index(
name=index_name,
dimension=embeddings.shape[1],
metric="cosine",
spec=ServerlessSpec(cloud="aws", region=pinecone_env)
)
# Optionally, you might need to wait a few moments for the new index to be ready.
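        # A minimal readiness poll (sketch; assumes the v3+ pinecone SDK, where
        # describe_index() returns a description whose status carries a "ready" flag).
        import time
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)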
    # Connect to the index
    index = pc.Index(index_name)
    print("📥 Upserting embeddings to Pinecone index...")
    to_upsert = []
    for i, (vec, meta) in enumerate(zip(embeddings, metadatas)):
        # Create a unique document id
        doc_id = str(uuid.uuid4())
        # Save the document text in metadata to return during queries
        meta_copy = meta.copy()
        meta_copy["document"] = documents[i]
        # Prepare tuple (id, vector, metadata)
        to_upsert.append((doc_id, vec.tolist(), meta_copy))
    # Upsert documents as a single batch (for large datasets, consider batching the upserts)
    index.upsert(vectors=to_upsert)
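    # Batched alternative (sketch only; the batch size of 100 is an assumption made
    # here to keep individual requests small, not a Pinecone requirement):
    #   for start in range(0, len(to_upsert), 100):
    #       index.upsert(vectors=to_upsert[start:start + 100])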
    return index, model, embeddings, documents, metadatas
# === STEP 3: Query the RAG Model using Pinecone ===
def ask_query(query, model, index, k=10):
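    """Embed the query, retrieve candidate matches from Pinecone, and deduplicate
    them by "Test Name", returning up to k (document, metadata) pairs."""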
print(f"\nπŸ’¬ Query: {query}")
# Generate query embedding
query_embedding = model.encode([query]).tolist()[0]
# Query Pinecone (retrieve extra candidates to filter duplicates)
query_response = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)
seen_tests = set()
final_results = []
# Loop through matches and filter for unique "Test Name"
for match in query_response['matches']:
meta = match.get('metadata', {})
test_name = meta.get("Test Name", "")
if test_name in seen_tests:
continue
seen_tests.add(test_name)
# Retrieve the stored document text from metadata
doc = meta.get("document", "")
final_results.append((doc, meta))
if len(final_results) >= k:
break
return final_results
# === Example Usage ===
if __name__ == "__main__":
    # Path to your CSV file
    csv_path = "shl_products.csv"
    # Step 1: Preprocess CSV and create document chunks
    documents, metadatas = pre_processing_csv(csv_path)
    # Load the SentenceTransformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")
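    # all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the index created in
    # build_pinecone_store ends up with dimension=384 (taken from embeddings.shape[1]).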
    # Step 2: Build the Pinecone vector store
    index, model, embeddings, documents, metadatas = build_pinecone_store(
        documents, metadatas, model, PINECONE_INDEX_NAME, PINECONE_API_KEY, PINECONE_ENV
    )
    # Step 3: Query the RAG model
    sample_query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
    results = ask_query(sample_query, model, index, k=10)
    # Display the results
    print(f"\nResults for query: {sample_query}\n{'='*80}")
    for i, (doc, meta) in enumerate(results, 1):
        print(f"Result {i}:")
        print(f"Test Name: {meta.get('Test Name', '')}")
        print(f"Test Link: https://www.shl.com{meta.get('Test Link', '')}")
        print(f"Chunk: {doc}")
        print("-" * 80)