Spaces:
Sleeping
Sleeping
# vectordb_utils.py | |
from qdrant_client import QdrantClient | |
from qdrant_client.models import VectorParams, Distance, PointStruct | |
from sentence_transformers import SentenceTransformer | |
from datasets import load_dataset | |
import uuid | |
import os | |
# Directory where the sentence-transformers model files are cached.
# MODEL_CACHE_DIR overrides the "/app/cache" fallback.
cache_dir = os.getenv("MODEL_CACHE_DIR", "/app/cache")
os.makedirs(cache_dir, exist_ok=True)

# Qdrant credential taken from the environment (None when the variable is unset).
api_key = os.getenv("QDRANT_API_KEY")

# Embedding model shared by the indexing and search helpers below.
encoder = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)

# Client for the hosted Qdrant cluster.
qdrant = QdrantClient(
    url="https://b4e91bde-3e30-43ef-968e-c10a43f2e161.eu-west-2-0.aws.cloud.qdrant.io:6333",
    api_key=api_key,
)

# Collection that the helpers below read from and write to.
collection_name = "customer_support_docsv1"
# Initialize collection
def init_qdrant_collection():
    """Drop and re-create the Qdrant collection named by ``collection_name``.

    Any points already stored in the collection are discarded. The new
    collection uses cosine distance over 384-dimensional vectors.
    """
    # QdrantClient.recreate_collection() is deprecated; perform the same
    # delete-then-create sequence explicitly instead.
    if qdrant.collection_exists(collection_name=collection_name):
        qdrant.delete_collection(collection_name=collection_name)
    qdrant.create_collection(
        collection_name=collection_name,
        # The size must equal the dimension of the vectors produced by `encoder`.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )
# Add a query/response to DB
def add_to_vectordb(query, response):
    """Embed *query* and store it together with *response* as one point.

    Args:
        query: Text that is encoded into the point's vector.
        response: Text saved alongside the query in the point payload.
    """
    embedding = encoder.encode(query).tolist()
    point = PointStruct(
        id=str(uuid.uuid4()),  # fresh random id for every stored pair
        vector=embedding,
        payload={"query": query, "response": response},
    )
    qdrant.upload_points(collection_name=collection_name, points=[point])
# Search DB
def search_vectordb(query, limit=3):
    """Look up the stored points nearest to *query*.

    Args:
        query: Text to embed and search with.
        limit: Maximum number of hits to request (default 3).

    Returns:
        The result of ``qdrant.search`` for the encoded query.
    """
    query_embedding = encoder.encode(query).tolist()
    hits = qdrant.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit,
    )
    return hits
# Load and populate from Hugging Face dataset
def populate_vectordb_from_hf():
    """Index the ``Talhat/Customer_IT_Support`` train split into the vector DB.

    Each row's ``body`` is used as the query and its ``answer`` as the
    response. Rows where either field is missing, ``None`` or blank after
    stripping are skipped; the rest are stored via ``add_to_vectordb``.
    """
    print("Loading dataset from Hugging Face...")
    dataset = load_dataset("Talhat/Customer_IT_Support", split="train")
    print("Populating vector DB...")
    for item in dataset:
        # A column can be present but hold None; item.get(key, "") would then
        # return None and .strip() would raise, so coalesce before stripping.
        query = (item.get("body") or "").strip()
        response = (item.get("answer") or "").strip()
        if query and response:
            add_to_vectordb(query, response)
    print("Vector DB population complete.")