File size: 1,918 Bytes
0df7243
 
 
 
 
e169b07
0df7243
8703987
e169b07
 
 
c20fa4f
ca74349
e169b07
 
3e71806
39162f9
ca74349
39162f9
0df7243
 
e169b07
0df7243
 
 
 
 
 
e169b07
0df7243
 
 
 
 
 
 
 
 
 
 
e169b07
0df7243
 
 
e169b07
 
 
 
 
 
 
 
8d72876
 
e169b07
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# vectordb_utils.py

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import uuid
import os

# --- Module-level configuration (runs once at import time) ---

# Local cache for the sentence-transformers model weights; defaults to
# /app/cache when MODEL_CACHE_DIR is unset (container-friendly default).
cache_dir = os.environ.get("MODEL_CACHE_DIR", "/app/cache")  # Fallback
os.makedirs(cache_dir, exist_ok=True)

# Qdrant credentials/endpoint. The URL can be overridden via QDRANT_URL;
# it falls back to the original hard-coded cluster for backward compatibility.
api_key = os.environ.get("QDRANT_API_KEY")
qdrant_url = os.environ.get(
    "QDRANT_URL",
    "https://b4e91bde-3e30-43ef-968e-c10a43f2e161.eu-west-2-0.aws.cloud.qdrant.io:6333",
)

# Embedding model: all-MiniLM-L6-v2 emits 384-dim vectors, which must match
# the VectorParams(size=384) used when the collection is created below.
encoder = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)

# NOTE(review): this opens a network client at import time — confirm that is
# intended for every consumer of this module.
qdrant = QdrantClient(
    url=qdrant_url,
    api_key=api_key,
)

collection_name = "customer_support_docsv1"

# Initialize collection
def init_qdrant_collection():
    """(Re)create the Qdrant collection from scratch.

    Drops any existing collection with this module's ``collection_name``,
    then creates a fresh one sized for 384-dim all-MiniLM-L6-v2 embeddings
    compared by cosine distance. Replaces the deprecated
    ``QdrantClient.recreate_collection()`` with the documented
    exists/delete/create sequence — same destructive semantics.
    """
    # Delete-then-create preserves the original recreate_collection behavior:
    # any previously stored points are discarded.
    if qdrant.collection_exists(collection_name):
        qdrant.delete_collection(collection_name)
    qdrant.create_collection(
        collection_name=collection_name,
        # size=384 must match the encoder's output dimensionality.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

# Add a query/response to DB
def add_to_vectordb(query, response):
    """Embed *query* and store it in Qdrant with *response* in the payload.

    The point gets a random UUID id; both the original query text and the
    response are kept in the payload for retrieval.
    """
    embedding = encoder.encode(query).tolist()
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding,
        payload={"query": query, "response": response},
    )
    qdrant.upload_points(collection_name=collection_name, points=[point])

# Search DB
def search_vectordb(query, limit=3):
    """Return up to *limit* stored points nearest to *query* (cosine order)."""
    query_vector = encoder.encode(query).tolist()
    return qdrant.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit,
    )

# 🆕 Load and populate from Hugging Face dataset
def populate_vectordb_from_hf():
    """Populate the vector DB from the Talhat/Customer_IT_Support dataset.

    Loads the ``train`` split from Hugging Face and inserts every row whose
    ``body``/``answer`` fields are both non-empty after stripping.

    Fix: the original called ``item.get("body", "").strip()``, which raises
    ``AttributeError`` when the key exists but holds ``None`` (``.get``'s
    default only covers *missing* keys); ``or ""`` guards that case.
    """
    print("Loading dataset from Hugging Face...")
    dataset = load_dataset("Talhat/Customer_IT_Support", split="train")

    print("Populating vector DB...")
    for item in dataset:
        # `or ""` coerces explicit None field values to an empty string.
        query = (item.get("body") or "").strip()
        response = (item.get("answer") or "").strip()
        if query and response:
            add_to_vectordb(query, response)

    print("Vector DB population complete.")