import json
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load once at import time
WEIGHTS = torch.load("tensor.pt").detach().numpy()  # shape: (V, D)
with open("vocab.json", "r") as f:
    TOKEN2IDX = json.load(f)

# Build reverse map: idx (as int) → token (str)
IDX2TOKEN = {int(i): w for w, i in TOKEN2IDX.items()}


def chat(question: str) -> str:
    """
    Embedding Q&A stub:
    - Tokenize by whitespace
    - Look up embeddings
    - Average them
    - Find the nearest token in the vocab
    """
    # Simple whitespace tokenizer; you can improve this later
    tokens = question.lower().split()

    # Map to indices, drop unknowns
    idxs = [TOKEN2IDX[t] for t in tokens if t in TOKEN2IDX]
    if not idxs:
        return "🤔 I don't recognize any of those words."

    # Average embedding vector; keepdims=True preserves the (1, D) shape
    # that cosine_similarity expects for its first argument
    q_embed = np.mean(WEIGHTS[idxs], axis=0, keepdims=True)

    # Cosine similarity against all vocab embeddings
    sims = cosine_similarity(q_embed, WEIGHTS)[0]
    best = int(np.argmax(sims))
    best_word = IDX2TOKEN.get(best, "")
    return f"🗣️ Nearest concept: **{best_word}**"
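

# A minimal usage sketch, assuming "tensor.pt" and "vocab.json" exist in the
# working directory. The sample question below is hypothetical; it only
# produces a meaningful answer if its words appear in vocab.json. Running the
# module directly prints the nearest-concept reply for the sample query.
if __name__ == "__main__":
    print(chat("what is a neural network"))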