Pravincoder committed on
Commit 542c92b · verified · 1 Parent(s): 48a0fce

Update rag.py

Files changed (1)
  1. rag.py +97 -44
rag.py CHANGED
@@ -1,9 +1,16 @@
 import pandas as pd
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
-import chromadb
 import uuid
 import numpy as np
+from dotenv import load_dotenv
+import os
+
+# Load environment variables from .env file
+load_dotenv()
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+PINECONE_ENV = os.getenv("PINECONE_ENV")  # e.g., "us-west-2"
+PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "shl-test-index")

 # === STEP 1: Preprocessing CSV & Chunking ===
 def pre_processing_csv(csv_path):
@@ -19,14 +26,13 @@ def pre_processing_csv(csv_path):
     metadatas = []

     for idx, row in df.iterrows():
-        # Combine multiple fields for better context
-        combined_text = f"""
-        Test Name: {row.get('Test Name', '')}
-        Description: {row.get('Description', '')}
-        Remote Testing: {row.get('Remote Testing', '')}
-        Adaptive/IRT: {row.get('Adaptive/IRT', '')}
-        Test Type: {row.get('Test Type', '')}
-        """
+        combined_text = (
+            f"Test Name: {row.get('Test Name', '')}\n"
+            f"Description: {row.get('Description', '')}\n"
+            f"Remote Testing: {row.get('Remote Testing', '')}\n"
+            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}\n"
+            f"Test Type: {row.get('Test Type', '')}\n"
+        )

         chunks = text_splitter.split_text(combined_text)

@@ -43,54 +49,101 @@ def pre_processing_csv(csv_path):

     return documents, metadatas

-# === STEP 2: Embed and Store in ChromaDB ===
-def build_chroma_store(documents, metadatas,client=None):
-    if client is None:
-        client = chromadb.Client()
-    collection = client.create_collection(name="shl_test_catalog")
+# === STEP 2: Embed and Store in Pinecone ===
+def build_pinecone_store(documents, metadatas, model, index_name, pinecone_api_key, pinecone_env):
     print("🔍 Embedding documents...")
-    model = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = model.encode(documents, show_progress_bar=True)
+    embeddings = np.array(embeddings).astype("float32")

-    print("📥 Adding to ChromaDB...")
-    collection.add(
-        documents=documents,
-        embeddings=[e.tolist() for e in embeddings],
-        ids=[str(uuid.uuid4()) for _ in range(len(documents))],
-        metadatas=metadatas
-    )
+    print("🔑 Initializing Pinecone client...")
+    # Import new classes from the pinecone package
+    from pinecone import Pinecone, ServerlessSpec

-    return collection, model
+    # Create a Pinecone client instance
+    pc = Pinecone(api_key=pinecone_api_key)
+
+    # Check if the index exists; if not, create a new one.
+    existing_indexes = pc.list_indexes().names()
+    if index_name not in existing_indexes:
+        print("📥 Creating new Pinecone index...")
+        pc.create_index(
+            name=index_name,
+            dimension=embeddings.shape[1],
+            metric="cosine",
+            spec=ServerlessSpec(cloud="aws", region=pinecone_env)
+        )
+        # Optionally, you might need to wait a few moments for the new index to be ready.

+    # Connect to the index
+    index = pc.Index(index_name)
+
+    print("📥 Upserting embeddings to Pinecone index...")
+    to_upsert = []
+    for i, (vec, meta) in enumerate(zip(embeddings, metadatas)):
+        # Create a unique document id
+        doc_id = str(uuid.uuid4())
+        # Save the document text in metadata to return during queries
+        meta_copy = meta.copy()
+        meta_copy["document"] = documents[i]
+        # Prepare tuple (id, vector, metadata)
+        to_upsert.append((doc_id, vec.tolist(), meta_copy))

+    # Upsert documents as a single batch (for large datasets, consider batching the upserts)
+    index.upsert(vectors=to_upsert)
+
+    return index, model, embeddings, documents, metadatas

-# === STEP 3: Query the RAG Model ===
-def ask_query(query, model, collection, k=10):
-    print(f"\n💬 Query: {query}")
-    query_embedding = model.encode(query)
-
-    # Get more results than needed for diversity
-    results = collection.query(
-        query_embeddings=[query_embedding.tolist()],
-        n_results=k*2 # Get more results for diversity
-    )
-
-    # Process results to ensure diversity
+# === STEP 3: Query the RAG Model using Pinecone ===
+def ask_query(query, model, index, k=10):
+    print(f"\n💬 Query: {query}")
+    # Generate query embedding
+    query_embedding = model.encode([query]).tolist()[0]
+    # Query Pinecone (retrieve extra candidates to filter duplicates)
+    query_response = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)
+
     seen_tests = set()
     final_results = []
-
-    for i in range(len(results['documents'][0])):
-        doc = results['documents'][0][i]
-        meta = results['metadatas'][0][i]
-        test_name = meta['Test Name']
-
-        # Skip if we've already seen this test
+
+    # Loop through matches and filter for unique "Test Name"
+    for match in query_response['matches']:
+        meta = match.get('metadata', {})
+        test_name = meta.get("Test Name", "")
+
         if test_name in seen_tests:
             continue
-
         seen_tests.add(test_name)
+        # Retrieve the stored document text from metadata
+        doc = meta.get("document", "")
         final_results.append((doc, meta))
-
-        # Stop if we have enough diverse results
         if len(final_results) >= k:
             break

-    return final_results
+    return final_results
+
+# === Example Usage ===
+if __name__ == "__main__":
+    # Path to your CSV file
+    csv_path = "shl_products.csv"
+
+    # Step 1: Preprocess CSV and create document chunks
+    documents, metadatas = pre_processing_csv(csv_path)
+
+    # Load the SentenceTransformer model
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+
+    # Step 2: Build the Pinecone vector store
+    index, model, embeddings, documents, metadatas = build_pinecone_store(
+        documents, metadatas, model, PINECONE_INDEX_NAME, PINECONE_API_KEY, PINECONE_ENV
+    )
+
+    # Step 3: Query the RAG model
+    sample_query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
+    results = ask_query(sample_query, model, index, k=10)
+
+    # Display the results
+    print(f"\nResults for query: {sample_query}\n{'='*80}")
+    for i, (doc, meta) in enumerate(results, 1):
+        print(f"Result {i}:")
+        print(f"Test Name: {meta.get('Test Name', '')}")
+        print(f"Test Link: https://www.shl.com{meta.get('Test Link', '')}")
+        print(f"Chunk: {doc}")
+        print("-" * 80)