import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import uuid
import numpy as np
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")  # e.g., "us-west-2"; used as the ServerlessSpec region when creating the index
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "shl-test-index")
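
# Optional sanity check (illustrative): fail fast if the API key is missing,
# rather than letting the Pinecone client raise a less obvious error later.
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file.")
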
# === STEP 1: Preprocessing CSV & Chunking ===
def pre_processing_csv(csv_path):
    df = pd.read_csv(csv_path)
    df.fillna("", inplace=True)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50
    )
    documents = []
    metadatas = []
    for idx, row in df.iterrows():
        combined_text = (
            f"Test Name: {row.get('Test Name', '')}\n"
            f"Description: {row.get('Description', '')}\n"
            f"Remote Testing: {row.get('Remote Testing', '')}\n"
            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}\n"
            f"Test Type: {row.get('Test Type', '')}\n"
        )
        chunks = text_splitter.split_text(combined_text)
        for chunk in chunks:
            documents.append(chunk)
            metadatas.append({
                "Test Name": row.get('Test Name', ''),
                "Test Link": row.get('Test Link', ''),
                "Remote Testing": row.get('Remote Testing', ''),
                "Adaptive/IRT": row.get('Adaptive/IRT', ''),
                "Test Type": row.get('Test Type', ''),
                "row_id": idx
            })
    return documents, metadatas

# === STEP 2: Embed and Store in Pinecone ===
def build_pinecone_store(documents, metadatas, model, index_name, pinecone_api_key, pinecone_env):
    print("Embedding documents...")
    embeddings = model.encode(documents, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    print("Initializing Pinecone client...")
    # Import the client classes from the pinecone package
    from pinecone import Pinecone, ServerlessSpec

    # Create a Pinecone client instance
    pc = Pinecone(api_key=pinecone_api_key)

    # Check if the index exists; if not, create a new one.
    existing_indexes = pc.list_indexes().names()
    if index_name not in existing_indexes:
        print("Creating new Pinecone index...")
        pc.create_index(
            name=index_name,
            dimension=embeddings.shape[1],
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region=pinecone_env)
        )
        # Optionally, you might need to wait a few moments for the new index to be ready.
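        # A minimal readiness wait could look like the sketch below (illustrative;
        # assumes the client exposes describe_index(...).status["ready"], as recent
        # pinecone SDKs do):
        #
        #     import time
        #     while not pc.describe_index(index_name).status["ready"]:
        #         time.sleep(1)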

    # Connect to the index
    index = pc.Index(index_name)

    print("Upserting embeddings to Pinecone index...")
    to_upsert = []
    for i, (vec, meta) in enumerate(zip(embeddings, metadatas)):
        # Create a unique document id
        doc_id = str(uuid.uuid4())
        # Save the document text in metadata to return during queries
        meta_copy = meta.copy()
        meta_copy["document"] = documents[i]
        # Prepare tuple (id, vector, metadata)
        to_upsert.append((doc_id, vec.tolist(), meta_copy))

    # Upsert documents as a single batch (for large datasets, consider batching the upserts)
    index.upsert(vectors=to_upsert)
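
    # A simple batched variant could look like the sketch below (illustrative;
    # the batch size is an arbitrary choice), useful when a single request would
    # exceed Pinecone's per-request size limits:
    #
    #     batch_size = 100
    #     for start in range(0, len(to_upsert), batch_size):
    #         index.upsert(vectors=to_upsert[start:start + batch_size])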

    return index, model, embeddings, documents, metadatas

# === STEP 3: Query the RAG Model using Pinecone ===
def ask_query(query, model, index, k=10):
    print(f"\nQuery: {query}")
    # Generate query embedding
    query_embedding = model.encode([query]).tolist()[0]

    # Query Pinecone (retrieve extra candidates to filter duplicates)
    query_response = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)
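
    # Pinecone also supports server-side metadata filters, which can narrow the
    # candidates before the top_k cut; a sketch (illustrative; the field/value
    # below are hypothetical for this dataset):
    #
    #     index.query(vector=query_embedding, top_k=k, include_metadata=True,
    #                 filter={"Remote Testing": {"$eq": "Yes"}})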

    seen_tests = set()
    final_results = []
    # Loop through matches and filter for unique "Test Name"
    for match in query_response['matches']:
        meta = match.get('metadata', {})
        test_name = meta.get("Test Name", "")
        if test_name in seen_tests:
            continue
        seen_tests.add(test_name)
        # Retrieve the stored document text from metadata
        doc = meta.get("document", "")
        final_results.append((doc, meta))
        if len(final_results) >= k:
            break
    return final_results

# === Example Usage ===
if __name__ == "__main__":
    # Path to your CSV file
    csv_path = "shl_products.csv"

    # Step 1: Preprocess CSV and create document chunks
    documents, metadatas = pre_processing_csv(csv_path)

    # Load the SentenceTransformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Step 2: Build the Pinecone vector store
    index, model, embeddings, documents, metadatas = build_pinecone_store(
        documents, metadatas, model, PINECONE_INDEX_NAME, PINECONE_API_KEY, PINECONE_ENV
    )

    # Step 3: Query the RAG model
    sample_query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
    results = ask_query(sample_query, model, index, k=10)

    # Display the results
    print(f"\nResults for query: {sample_query}\n{'='*80}")
    for i, (doc, meta) in enumerate(results, 1):
        print(f"Result {i}:")
        print(f"Test Name: {meta.get('Test Name', '')}")
        print(f"Test Link: https://www.shl.com{meta.get('Test Link', '')}")
        print(f"Chunk: {doc}")
        print("-" * 80)