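"""Minimal RAG pipeline over an SHL product catalog CSV.

Rows are chunked with LangChain's RecursiveCharacterTextSplitter, embedded
with a SentenceTransformer model, and stored in a Pinecone serverless index
that is then queried for the top-k unique tests matching a hiring query.
"""
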
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import uuid
import numpy as np
from dotenv import load_dotenv
import os
import time

# Load environment variables from .env file
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")  # AWS region for the serverless index, e.g., "us-west-2"
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "shl-test-index")
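# A .env file alongside this script might look like this (placeholder values,
# not real credentials):
#   PINECONE_API_KEY=pc-xxxxxxxxxxxxxxxx
#   PINECONE_ENV=us-west-2
#   PINECONE_INDEX_NAME=shl-test-index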

# === STEP 1: Preprocessing CSV & Chunking ===
def pre_processing_csv(csv_path):
    df = pd.read_csv(csv_path)
    df.fillna("", inplace=True)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50
    )

    documents = []
    metadatas = []

    for idx, row in df.iterrows():
        combined_text = (
            f"Test Name: {row.get('Test Name', '')}\n"
            f"Description: {row.get('Description', '')}\n"
            f"Remote Testing: {row.get('Remote Testing', '')}\n"
            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}\n"
            f"Test Type: {row.get('Test Type', '')}\n"
        )
        
        chunks = text_splitter.split_text(combined_text)

        for chunk in chunks:
            documents.append(chunk)
            metadatas.append({
                "Test Name": row.get('Test Name', ''),
                "Test Link": row.get('Test Link', ''),
                "Remote Testing": row.get('Remote Testing', ''),
                "Adaptive/IRT": row.get('Adaptive/IRT', ''),
                "Test Type": row.get('Test Type', ''),
                "row_id": idx
            })
    
    return documents, metadatas
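
# The two returned lists line up by position. For the first CSV row, roughly
# (actual values depend on the CSV contents):
#   documents[0] -> "Test Name: ...\nDescription: ...\nRemote Testing: ..."
#   metadatas[0] -> {"Test Name": "...", "Test Link": "...", ..., "row_id": 0}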

# === STEP 2: Embed and Store in Pinecone ===
def build_pinecone_store(documents, metadatas, model, index_name, pinecone_api_key, pinecone_env):
    print("πŸ” Embedding documents...")
    embeddings = model.encode(documents, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    print("πŸ”‘ Initializing Pinecone client...")
    # Import new classes from the pinecone package
    from pinecone import Pinecone, ServerlessSpec

    # Create a Pinecone client instance
    pc = Pinecone(api_key=pinecone_api_key)
    
    # Check if the index exists; if not, create a new one.
    existing_indexes = pc.list_indexes().names()
    if index_name not in existing_indexes:
        print("πŸ“₯ Creating new Pinecone index...")
        pc.create_index(
            name=index_name,
            dimension=embeddings.shape[1],
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region=pinecone_env)
        )
        # Wait until the new serverless index is ready before upserting.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    # Connect to the index
    index = pc.Index(index_name)

    print("πŸ“₯ Upserting embeddings to Pinecone index...")
    to_upsert = []
    for i, (vec, meta) in enumerate(zip(embeddings, metadatas)):
        # Create a unique document id
        doc_id = str(uuid.uuid4())
        # Save the document text in metadata to return during queries
        meta_copy = meta.copy()
        meta_copy["document"] = documents[i]
        # Prepare tuple (id, vector, metadata)
        to_upsert.append((doc_id, vec.tolist(), meta_copy))
    
    # Upsert in batches so large datasets stay within Pinecone's per-request size limit
    batch_size = 100
    for start in range(0, len(to_upsert), batch_size):
        index.upsert(vectors=to_upsert[start:start + batch_size])

    return index, model, embeddings, documents, metadatas
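
# Design note: the index dimension is taken from the computed embeddings rather
# than hard-coded; all-MiniLM-L6-v2 produces 384-dimensional vectors, so the
# index is created with dimension=384 when that model is used.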

# === STEP 3: Query the RAG Model using Pinecone ===
def ask_query(query, model, index, k=10):
    print(f"\nπŸ’¬ Query: {query}")
    # Generate query embedding
    query_embedding = model.encode([query]).tolist()[0]
    # Query Pinecone (retrieve extra candidates to filter duplicates)
    query_response = index.query(vector=query_embedding, top_k=k * 2, include_metadata=True)

    seen_tests = set()
    final_results = []

    # Loop through matches and filter for unique "Test Name"
    for match in query_response['matches']:
        meta = match.get('metadata', {})
        test_name = meta.get("Test Name", "")
        if test_name in seen_tests:
            continue
        seen_tests.add(test_name)
        # Retrieve the stored document text from metadata
        doc = meta.get("document", "")
        final_results.append((doc, meta))
        if len(final_results) >= k:
            break

    return final_results
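
# Pinecone can also filter on metadata server-side at query time. A sketch,
# assuming you only want rows whose "Remote Testing" column is "Yes" (the
# value is a placeholder; actual values depend on the CSV):
#
#   index.query(
#       vector=query_embedding,
#       top_k=k * 2,
#       include_metadata=True,
#       filter={"Remote Testing": {"$eq": "Yes"}},
#   )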

# === Example Usage ===
if __name__ == "__main__":
    # Path to your CSV file
    csv_path = "shl_products.csv"
    
    # Step 1: Preprocess CSV and create document chunks
    documents, metadatas = pre_processing_csv(csv_path)
    
    # Load the SentenceTransformer model
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Step 2: Build the Pinecone vector store
    index, model, embeddings, documents, metadatas = build_pinecone_store(
        documents, metadatas, model, PINECONE_INDEX_NAME, PINECONE_API_KEY, PINECONE_ENV
    )
    
    # Step 3: Query the RAG model
    sample_query = "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
    results = ask_query(sample_query, model, index, k=10)
    
    # Display the results
    print(f"\nResults for query: {sample_query}\n{'='*80}")
    for i, (doc, meta) in enumerate(results, 1):
        print(f"Result {i}:")
        print(f"Test Name: {meta.get('Test Name', '')}")
        print(f"Test Link: https://www.shl.com{meta.get('Test Link', '')}")
        print(f"Chunk: {doc}")
        print("-" * 80)