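"""RAG pipeline over the SHL test catalog.

Preprocesses a catalog CSV into overlapping text chunks, embeds them with a
SentenceTransformer model, stores them in ChromaDB, and answers queries with
de-duplicated (one-result-per-test) matches.
"""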
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
import uuid

# === STEP 1: Preprocessing CSV & Chunking ===
def pre_processing_csv(csv_path):
    df = pd.read_csv(csv_path)
    df.fillna("", inplace=True)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50
    )

    documents = []
    metadatas = []

    for idx, row in df.iterrows():
        # Combine multiple fields so each chunk carries full context.
        # Building the string field-by-field avoids the stray indentation
        # that an indented triple-quoted literal would inject into every chunk.
        combined_text = "\n".join([
            f"Test Name: {row.get('Test Name', '')}",
            f"Description: {row.get('Description', '')}",
            f"Remote Testing: {row.get('Remote Testing', '')}",
            f"Adaptive/IRT: {row.get('Adaptive/IRT', '')}",
            f"Test Type: {row.get('Test Type', '')}",
        ])
        
        chunks = text_splitter.split_text(combined_text)

        for chunk in chunks:
            documents.append(chunk)
            metadatas.append({
                "Test Name": row.get('Test Name', ''),
                "Test Link": row.get('Test Link', ''),
                "Remote Testing": row.get('Remote Testing', ''),
                "Adaptive/IRT": row.get('Adaptive/IRT', ''),
                "Test Type": row.get('Test Type', ''),
                "row_id": idx
            })
    
    return documents, metadatas

# === STEP 2: Embed and Store in ChromaDB ===
def build_chroma_store(documents, metadatas, client=None):
    if client is None:
        client = chromadb.Client()
    # get_or_create_collection avoids a "collection already exists" error
    # when the script is run more than once against the same client.
    collection = client.get_or_create_collection(name="shl_test_catalog")
    print("🔍 Embedding documents...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(documents, show_progress_bar=True)

    print("📥 Adding to ChromaDB...")
    collection.add(
        documents=documents,
        embeddings=[e.tolist() for e in embeddings],
        ids=[str(uuid.uuid4()) for _ in range(len(documents))],
        metadatas=metadatas
    )

    return collection, model
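
# chromadb.Client() keeps the index in memory only, so it is rebuilt on every
# run. A hedged sketch of persisting it to disk instead (assumes a chromadb
# version with PersistentClient, i.e. >= 0.4, and a local ./chroma_db directory):
#
#     client = chromadb.PersistentClient(path="./chroma_db")
#     collection, model = build_chroma_store(documents, metadatas, client=client)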

# === STEP 3: Query the RAG Model ===
def ask_query(query, model, collection, k=10):
    print(f"\n💬 Query: {query}")
    query_embedding = model.encode(query)
    
    # Over-fetch so duplicate tests can be dropped while still returning k results
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=k * 2
    )

    # Process results to ensure diversity
    seen_tests = set()
    final_results = []
    
    for i in range(len(results['documents'][0])):
        doc = results['documents'][0][i]
        meta = results['metadatas'][0][i]
        test_name = meta['Test Name']
        
        # Skip if we've already seen this test
        if test_name in seen_tests:
            continue
            
        seen_tests.add(test_name)
        final_results.append((doc, meta))
        
        # Stop if we have enough diverse results
        if len(final_results) >= k:
            break

    return final_results
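
# Minimal end-to-end usage sketch. The CSV path and the query string below are
# illustrative placeholders, not part of the original pipeline:
if __name__ == "__main__":
    docs, metas = pre_processing_csv("shl_test_catalog.csv")  # hypothetical path
    collection, model = build_chroma_store(docs, metas)
    for doc, meta in ask_query("adaptive remote cognitive ability test", model, collection, k=5):
        print(f"- {meta['Test Name']} ({meta['Test Link']})")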