Rohit Rajpoot committed · fb2b4e2
1 Parent(s): aa99e83

Detach tensor before .numpy()
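For context on why this change is needed (a minimal, illustrative sketch; the variable name and shape below are made up, not taken from the repo): PyTorch refuses to convert a tensor that still tracks gradients, so .detach() has to come before .numpy().

    import torch

    # Illustrative weight matrix that still requires grad, as a saved
    # nn.Embedding weight typically would after torch.load.
    weights = torch.randn(100, 16, requires_grad=True)

    # weights.numpy()               # RuntimeError: Can't call numpy() on Tensor that requires grad
    arr = weights.detach().numpy()  # detach() drops the autograd graph, then conversion works
    print(arr.shape)                # (100, 16)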
Files changed: assist/chat.py (+29 -5)

assist/chat.py CHANGED
@@ -1,9 +1,33 @@
-
+import json
+import torch
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Load once at import time
+WEIGHTS = torch.load("tensor.pt").detach().numpy()  # shape: (V, D)
+with open("vocab.json", "r") as f:
+    TOKEN2IDX = json.load(f)
+# Build reverse map: idx (as int) → token (str)
+IDX2TOKEN = {int(i): w for w, i in TOKEN2IDX.items()}
 
 def chat(question: str) -> str:
     """
-
+    Embedding Q&A stub:
+    - Tokenize by whitespace
+    - Lookup embeddings
+    - Average them
+    - Find nearest token in vocab
     """
-
-
-
+    # Simple whitespace tokenizer; you can improve this later
+    tokens = question.lower().split()
+    # Map to indices, drop unknowns
+    idxs = [TOKEN2IDX[t] for t in tokens if t in TOKEN2IDX]
+    if not idxs:
+        return "🤔 I don't recognize any of those words."
+    # Average embedding vector
+    q_embed = np.mean(WEIGHTS[idxs], axis=0, keepdims=True)
+    # Cosine similarity against all vocab embeddings
+    sims = cosine_similarity(q_embed, WEIGHTS)[0]
+    best = int(np.argmax(sims))
+    best_word = IDX2TOKEN.get(best, "<unknown>")
+    return f"🗣️ Nearest concept: **{best_word}**"
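A quick smoke test for the updated function (a sketch, not part of the commit: it assumes tensor.pt and vocab.json sit in the working directory and that vocab.json maps tokens to integer row indices of the saved tensor):

    # run from the repo root; illustrative only
    from assist.chat import chat

    print(chat("hello world"))   # e.g. "🗣️ Nearest concept: **hello**"
    print(chat("qwertyuiop"))    # likely "🤔 I don't recognize any of those words."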