Spaces:
Sleeping
Sleeping
File size: 1,918 Bytes
0df7243 e169b07 0df7243 8703987 e169b07 c20fa4f ca74349 e169b07 3e71806 39162f9 ca74349 39162f9 0df7243 e169b07 0df7243 e169b07 0df7243 e169b07 0df7243 e169b07 8d72876 e169b07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# vectordb_utils.py
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import uuid
import os
# Model cache directory: taken from MODEL_CACHE_DIR, falling back to /app/cache.
# It is created up front so the encoder below has somewhere to write.
cache_dir = os.getenv("MODEL_CACHE_DIR", "/app/cache")
os.makedirs(cache_dir, exist_ok=True)

# Qdrant API key read from the environment (None when QDRANT_API_KEY is unset).
api_key = os.getenv("QDRANT_API_KEY")

# Sentence encoder used to embed text; cache_folder points at cache_dir.
encoder = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)

# Client for the hosted Qdrant cluster.
qdrant = QdrantClient(
    url="https://b4e91bde-3e30-43ef-968e-c10a43f2e161.eu-west-2-0.aws.cloud.qdrant.io:6333",
    api_key=api_key,
)

# Collection that the helper functions below read from and write to.
collection_name = "customer_support_docsv1"
# Initialize collection
def init_qdrant_collection():
    """(Re)create the Qdrant collection named by ``collection_name``.

    The collection is configured for 384-dimensional vectors compared with
    cosine distance.

    NOTE(review): per the qdrant-client documentation, ``recreate_collection``
    deletes an existing collection of the same name before creating it --
    confirm that is intended before calling this on a populated collection.
    NOTE(review): 384 presumably matches the output width of the
    "all-MiniLM-L6-v2" encoder configured in this module -- keep the two in sync.
    """
    vector_settings = VectorParams(size=384, distance=Distance.COSINE)
    qdrant.recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
# Add a query/response to DB
def add_to_vectordb(query, response):
    """Embed *query* and upload it, together with *response*, as one point.

    Args:
        query: Text passed to ``encoder.encode``; also stored in the point's
            payload under the ``"query"`` key.
        response: Value stored in the payload under the ``"response"`` key.

    The point id is a freshly generated random UUID string, so every call
    adds a new point to ``collection_name``.
    """
    embedding = encoder.encode(query).tolist()
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding,
        payload={"query": query, "response": response},
    )
    qdrant.upload_points(collection_name=collection_name, points=[point])
# Search DB
def search_vectordb(query, limit=3):
    """Search ``collection_name`` with the embedding of *query*.

    Args:
        query: Text passed to ``encoder.encode`` to build the query vector.
        limit: Forwarded to ``qdrant.search`` as its ``limit`` argument
            (defaults to 3).

    Returns:
        Whatever ``qdrant.search`` returns for the query vector.
    """
    query_embedding = encoder.encode(query).tolist()
    hits = qdrant.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit,
    )
    return hits
# Load and populate from Hugging Face dataset
def populate_vectordb_from_hf():
    """Load the support dataset from Hugging Face and store it in the vector DB.

    Reads the ``train`` split of ``Talhat/Customer_IT_Support`` and, for each
    row, takes the ``"body"`` field as the query and the ``"answer"`` field as
    the response. Both are whitespace-stripped; rows where either one ends up
    empty are skipped, and the rest are stored via ``add_to_vectordb``.

    Progress messages are printed to stdout.
    """
    print("Loading dataset from Hugging Face...")
    dataset = load_dataset("Talhat/Customer_IT_Support", split="train")
    print("Populating vector DB...")
    for item in dataset:
        # ``dict.get(key, "")`` only falls back when the key is missing; a
        # field that is present but null comes back as None and would crash
        # on ``.strip()``. Coalescing with ``or ""`` treats null as empty so
        # such rows are skipped instead of aborting the whole import.
        query = (item.get("body") or "").strip()
        response = (item.get("answer") or "").strip()
        if not (query and response):
            continue
        add_to_vectordb(query, response)
    print("Vector DB population complete.")
|