import json
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load once at import time
WEIGHTS = torch.load("tensor.pt", map_location="cpu").detach().numpy()   # shape: (V, D)
with open("vocab.json", "r") as f:
    TOKEN2IDX = json.load(f)
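# Assumed vocab.json layout (entries are illustrative only):
#   {"hello": 0, "world": 1, ...}  -- token string -> integer row index into WEIGHTS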
# Build reverse map: idx (as int) → token (str)
IDX2TOKEN = {int(i): w for w, i in TOKEN2IDX.items()}

def chat(question: str) -> str:
    """
    Embedding Q&A stub:
    - Tokenize by whitespace
    - Lookup embeddings
    - Average them
    - Find nearest token in vocab
    """
    # Simple whitespace tokenizer; you can improve this later
    tokens = question.lower().split()
    # Map to indices, drop unknowns
    idxs = [int(TOKEN2IDX[t]) for t in tokens if t in TOKEN2IDX]
    if not idxs:
        return "🤔 I don't recognize any of those words."
    # Average embedding vector
    q_embed = np.mean(WEIGHTS[idxs], axis=0, keepdims=True)
    # Cosine similarity against all vocab embeddings
    sims = cosine_similarity(q_embed, WEIGHTS)[0]
    best = int(np.argmax(sims))
    best_word = IDX2TOKEN.get(best, "<unknown>")
    return f"🗣️ Nearest concept: **{best_word}**"