Update rag.py
rag.py CHANGED
@@ -1,9 +1,16 @@
 import pandas as pd
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
-import chromadb
 import uuid
 import numpy as np
+from dotenv import load_dotenv
+import os
+
+# Load environment variables from .env file
+load_dotenv()
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+PINECONE_ENV = os.getenv("PINECONE_ENV")  # e.g., "us-west-2"
+PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "shl-test-index")
 
 # === STEP 1: Preprocessing CSV & Chunking ===
 def pre_processing_csv(csv_path):
@@ -19,14 +26,13 @@ def pre_processing_csv(csv_path):
     metadatas = []
 
     for idx, row in df.iterrows():
-
-        combined_text = f"""
-        Test Name: {row['Test Name']}
-        Description: {row['Description']}
-        Remote Testing: {row['Remote Testing']}
-        Adaptive/IRT: {row['Adaptive/IRT']}
-        Test Type: {row['Test Type']}
-        """
+        combined_text = (
+            f"Test Name: {row.get('Test Name', '')}\n"
+            f"Description: {row.get('Description', '')}\n"
+            f"Remote Testing: {row.get('Remote Testing', '')}\n"
+            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}\n"
+            f"Test Type: {row.get('Test Type', '')}\n"
+        )
 
         chunks = text_splitter.split_text(combined_text)
 
@@ -43,54 +49,101 @@ def pre_processing_csv(csv_path):
 
     return documents, metadatas
 
-# === STEP 2: Embed and Store in ChromaDB ===
-def build_chroma_store(documents, metadatas, client=None):
-    if client is None:
-        client = chromadb.Client()
-    collection = client.create_collection(name="shl_test_catalog")
+# === STEP 2: Embed and Store in Pinecone ===
+def build_pinecone_store(documents, metadatas, model, index_name, pinecone_api_key, pinecone_env):
     print("Embedding documents...")
-    model = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = model.encode(documents, show_progress_bar=True)
+    embeddings = np.array(embeddings).astype("float32")
 
-    print("Adding documents to ChromaDB...")
-    collection.add(
-        documents=documents,
-        embeddings=[e.tolist() for e in embeddings],
-        ids=[str(uuid.uuid4()) for _ in range(len(documents))],
-        metadatas=metadatas
-    )
-
-    return collection, model, embeddings, documents, metadatas
-
-# === STEP 3: Query the RAG Model ===
-def ask_query(query, model, collection, k=10):
-    print(f"\nQuery: {query}")
-    query_embedding = model.encode([query])
-
-    # Query ChromaDB
-    results = collection.query(
-        query_embeddings=query_embedding.tolist(),
-        n_results=k * 2
-    )
+    print("Initializing Pinecone client...")
+    # Import new classes from the pinecone package
+    from pinecone import Pinecone, ServerlessSpec
+
+    # Create a Pinecone client instance
+    pc = Pinecone(api_key=pinecone_api_key)
+
+    # Check if the index exists; if not, create a new one.
+    existing_indexes = pc.list_indexes().names()
+    if index_name not in existing_indexes:
+        print("Creating new Pinecone index...")
+        pc.create_index(
+            name=index_name,
+            dimension=embeddings.shape[1],
+            metric="cosine",
+            spec=ServerlessSpec(cloud="aws", region=pinecone_env)
+        )
+        # Optionally, you might need to wait a few moments for the new index to be ready.
+
+    # Connect to the index
+    index = pc.Index(index_name)
+
+    print("Upserting embeddings to Pinecone index...")
+    to_upsert = []
+    for i, (vec, meta) in enumerate(zip(embeddings, metadatas)):
+        # Create a unique document id
+        doc_id = str(uuid.uuid4())
+        # Save the document text in metadata to return during queries
+        meta_copy = meta.copy()
+        meta_copy["document"] = documents[i]
+        # Prepare tuple (id, vector, metadata)
+        to_upsert.append((doc_id, vec.tolist(), meta_copy))
+
+    # Upsert documents as a single batch (for large datasets, consider batching the upserts)
+    index.upsert(vectors=to_upsert)
+
+    return index, model, embeddings, documents, metadatas
+
+# === STEP 3: Query the RAG Model using Pinecone ===
+def ask_query(query, model, index, k=10):
+    print(f"\nQuery: {query}")
+    # Generate query embedding
+    query_embedding = model.encode([query]).tolist()[0]
+    # Query Pinecone (retrieve extra candidates to filter duplicates)
+    query_response = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)
 
-    # Process results to ensure diversity
     seen_tests = set()
     final_results = []
-
-    for i, doc in enumerate(results["documents"][0]):
-        meta = results["metadatas"][0][i]
-        test_name = meta.get("Test Name", "")
-
-        # Skip if we've already seen this test
+
+    # Loop through matches and filter for unique "Test Name"
+    for match in query_response['matches']:
+        meta = match.get('metadata', {})
+        test_name = meta.get("Test Name", "")
         if test_name in seen_tests:
             continue
-
         seen_tests.add(test_name)
+        # Retrieve the stored document text from metadata
+        doc = meta.get("document", "")
         final_results.append((doc, meta))
-
-        # Stop if we have enough diverse results
         if len(final_results) >= k:
             break
 
-    return final_results
+    return final_results
+
+# === Example Usage ===
+if __name__ == "__main__":
+    # Path to your CSV file
+    csv_path = "shl_products.csv"
+
+    # Step 1: Preprocess CSV and create document chunks
+    documents, metadatas = pre_processing_csv(csv_path)
+
+    # Load the SentenceTransformer model
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+
+    # Step 2: Build the Pinecone vector store
+    index, model, embeddings, documents, metadatas = build_pinecone_store(
+        documents, metadatas, model, PINECONE_INDEX_NAME, PINECONE_API_KEY, PINECONE_ENV
+    )
+
+    # Step 3: Query the RAG model
+    sample_query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
+    results = ask_query(sample_query, model, index, k=10)
+
+    # Display the results
+    print(f"\nResults for query: {sample_query}\n{'='*80}")
+    for i, (doc, meta) in enumerate(results, 1):
+        print(f"Result {i}:")
+        print(f"Test Name: {meta.get('Test Name', '')}")
+        print(f"Test Link: https://www.shl.com{meta.get('Test Link', '')}")
+        print(f"Chunk: {doc}")
+        print("-" * 80)
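For reference, load_dotenv() at the top of the new file reads a .env file next to the script. A sketch of that file with placeholder values; the region and index name below simply echo the example and default already given in the code:

PINECONE_API_KEY=<your-pinecone-api-key>
PINECONE_ENV=us-west-2
PINECONE_INDEX_NAME=shl-test-index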
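On the readiness comment inside build_pinecone_store: create_index can return before a new serverless index is actually ready to accept upserts. A minimal polling sketch, assuming the same pinecone v3+ SDK used in the diff (describe_index exposes a status mapping with a "ready" flag) and the pc and index_name names from the function above; the one-second interval is an arbitrary choice:

import time

# Block until the freshly created index reports ready (sketch, not part of the commit)
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)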
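The upsert step ships every vector in a single request, and the comment above it already suggests batching for large datasets. A sketch of that batching, reusing the index and to_upsert built in build_pinecone_store; the batch size of 100 is an assumed value, not a Pinecone requirement:

BATCH_SIZE = 100  # assumed size; tune to keep each request comfortably small

# Upsert in fixed-size slices instead of one large call
for start in range(0, len(to_upsert), BATCH_SIZE):
    index.upsert(vectors=to_upsert[start:start + BATCH_SIZE])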
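Finally, since ask_query over-fetches (top_k=k * 2) and dedupes by Test Name client-side, Pinecone's metadata filtering could narrow candidates server-side instead. A hedged sketch reusing index, query_embedding, and k from ask_query; it assumes a "Test Type" key was stored in the metadata dicts (only "Test Name", "Test Link", and "document" are shown being read), and the filter value is illustrative only:

# Restrict matches by metadata before the client-side dedup (sketch)
query_response = index.query(
    vector=query_embedding,
    top_k=k * 2,
    include_metadata=True,
    filter={"Test Type": {"$eq": "Knowledge & Skills"}},  # illustrative value
)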