deenasun committed on
Commit
fb9dd9f
·
1 Parent(s): c4f7a31

add closest word search if query isn't in the KeyedVector's vocabulary

Browse files
Files changed (1) hide show
  1. vectorizer.py +20 -2
vectorizer.py CHANGED
@@ -52,10 +52,24 @@ class Vectorizer:
52
 
53
  def encode(self, word):
54
  print(f"encoding {word}")
55
- if self.kv is not None and word in self.kv.key_to_index:
 
 
 
56
  return self.kv[word]
57
  else:
58
  print(f"Error: {word} is not in the KeyedVector's vocabulary")
 
 
 
 
 
 
 
 
 
 
 
59
  return None
60
 
61
  def encode_and_format(self, word):
@@ -70,10 +84,11 @@ class Vectorizer:
70
  try:
71
  await self.ensure_supabase_initialized()
72
  query_embedding = self.encode(query)
 
73
  if query_embedding is None:
74
  return {
75
  "match": False,
76
- "error": f"'{query}' not in vocabulary"
77
  }
78
 
79
  query_embedding = query_embedding.tolist()
@@ -139,8 +154,11 @@ def load_filtered_kv(model_name='word2vec-google-news-300', vocab=None):
139
  async def main():
140
  vectorizer = Vectorizer()
141
 
 
142
  vector = vectorizer.encode("test")
143
  print(vector)
 
 
144
  result = await vectorizer.vector_query_from_supabase("dog")
145
  print(result)
146
  result = await vectorizer.vector_query_from_supabase("cat")
 
52
 
53
  def encode(self, word):
54
  print(f"encoding {word}")
55
+ if self.kv is None:
56
+ print("KeyedVectors not loaded")
57
+ return None
58
+ if word in self.kv.key_to_index:
59
  return self.kv[word]
60
  else:
61
  print(f"Error: {word} is not in the KeyedVector's vocabulary")
62
+ # Try to find closest match
63
+ try:
64
+ closest_matches = self.kv.most_similar(word, topn=3)
65
+ if closest_matches:
66
+ closest_word = closest_matches[0][0]
67
+ print(f"Using closest match '{closest_word}' for '{word}'")
68
+ return self.kv[closest_word]
69
+ else:
70
+ print(f"No similar words found for '{word}'")
71
+ except Exception as e:
72
+ print(f"Error finding similar words: {e}")
73
  return None
74
 
75
  def encode_and_format(self, word):
 
84
  try:
85
  await self.ensure_supabase_initialized()
86
  query_embedding = self.encode(query)
87
+
88
  if query_embedding is None:
89
  return {
90
  "match": False,
91
+ "error": f"'{query}' not in vocabulary and no similar words found"
92
  }
93
 
94
  query_embedding = query_embedding.tolist()
 
154
  async def main():
155
  vectorizer = Vectorizer()
156
 
157
+ # Test exact word match
158
  vector = vectorizer.encode("test")
159
  print(vector)
160
+
161
+ # Test words not in vocabulary with closest match fallback
162
  result = await vectorizer.vector_query_from_supabase("dog")
163
  print(result)
164
  result = await vectorizer.vector_query_from_supabase("cat")