import json

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the Bayesian embeddings & vocab at import time
WEIGHTS = torch.load("tensor_bayes.pt").detach().numpy()  # shape: (V, V)
with open("vocab_bayes.json", "r") as f:
    TOKEN2IDX = json.load(f)
IDX2TOKEN = {int(idx): tok for tok, idx in TOKEN2IDX.items()}


def bayes_chat(question: str) -> str:
    """
    Given a user question, tokenize → average Bayesian embeddings →
    find the nearest token in the vocab → return that as the "answer."
    """
    tokens = question.lower().split()
    idxs = [TOKEN2IDX[t] for t in tokens if t in TOKEN2IDX]
    if not idxs:
        return "🤔 I don’t recognize any of those words."

    # average the rows corresponding to each recognized token
    qv = np.mean(WEIGHTS[idxs], axis=0, keepdims=True)

    # compute similarities against every token’s vector
    sims = cosine_similarity(qv, WEIGHTS)[0]
    best_idx = int(np.argmax(sims))
    best_tok = IDX2TOKEN.get(best_idx, "")
    return f"🔬 Bayesian neighbor: **{best_tok}**"
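

# Minimal usage sketch: assumes "tensor_bayes.pt" and "vocab_bayes.json" are
# present next to this script; the example questions below are hypothetical
# and only illustrate how bayes_chat() is meant to be called.
if __name__ == "__main__":
    for question in ["what is a prior?", "explain the posterior update"]:
        print(question, "->", bayes_chat(question))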