import json
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Load the Bayesian embeddings & vocab at import time
WEIGHTS = torch.load("tensor_bayes.pt").detach().numpy() # shape: (V, V)
with open("vocab_bayes.json", "r") as f:
    TOKEN2IDX = json.load(f)
IDX2TOKEN = {int(idx): tok for tok, idx in TOKEN2IDX.items()}
def bayes_chat(question: str) -> str:
    """
    Given a user question, tokenize → average Bayesian embeddings →
    find the nearest token in the vocab → return that as the "answer."
    """
    tokens = question.lower().split()
    idxs = [TOKEN2IDX[t] for t in tokens if t in TOKEN2IDX]
    if not idxs:
        return "🤔 I don’t recognize any of those words."
    # average the rows corresponding to each token
    qv = np.mean(WEIGHTS[idxs], axis=0, keepdims=True)
    # compute similarities against every token’s vector
    sims = cosine_similarity(qv, WEIGHTS)[0]
    best_idx = int(np.argmax(sims))
    best_tok = IDX2TOKEN.get(best_idx, "<unknown>")
    return f"🔬 Bayesian neighbor: **{best_tok}**"