Arko Banik committed on
Commit
5c9e161
·
1 Parent(s): d02c4b6

produces pie chart but breaks when trying to change category

Browse files
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Part4
3
- emoji: 📈
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.30.0
8
  app_file: app.py
 
1
  ---
2
+ title: MiniProject1 P4
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.30.0
8
  app_file: app.py
app.py CHANGED
@@ -4,7 +4,6 @@ import numpy.linalg as la
4
  import pickle
5
  import os
6
  import gdown
7
- #import sentence_transformers
8
  from sentence_transformers import SentenceTransformer
9
  import matplotlib.pyplot as plt
10
  import math
@@ -21,11 +20,8 @@ def cosine_similarity(x, y):
21
  """
22
  ##################################
23
  ### TODO: Add code here ##########
24
- cos_sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
25
- exp_cos = np.exp(cos_sim) ######## find formula for exponentiated cosine similarity
26
-
27
  ##################################
28
- return exp_cos
29
 
30
 
31
  # Function to Load Glove Embeddings
@@ -67,7 +63,7 @@ def download_glove_embeddings_gdrive(model_type):
67
  gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
68
 
69
 
70
- @st.cache_data()
71
  def load_glove_embeddings_gdrive(model_type):
72
  word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
73
  embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
@@ -128,22 +124,18 @@ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, mode
128
  embedding = np.zeros(int(model_type.split("d")[0]))
129
  ##################################
130
  ##### TODO: Add code here ########
131
-
132
- #glove_word_set= load_glove_embeddings_gdrive(model_type)
133
-
134
- for word in sentence:
135
- #print(sentence)
136
- words = [word.strip('.,?!').lower() for word in sentence.split()]
137
- total = 0
138
- for w in words:
139
- if w in embeddings:
140
- embed += embeddings[w]
141
- total +=1
142
- if total != 0:
143
- embed = embed/total
144
-
145
- return embed
146
  ##################################
 
 
 
 
 
 
 
 
 
 
 
147
 
148
 
149
  def get_category_embeddings(embeddings_metadata):
@@ -182,7 +174,7 @@ def get_sorted_cosine_similarity(embeddings_metadata):
182
  (50 pts)
183
  """
184
  categories = st.session_state.categories.split(" ")
185
-
186
  if embeddings_metadata["embedding_model"] == "glove":
187
  word_index_dict = embeddings_metadata["word_index_dict"]
188
  embeddings = embeddings_metadata["embeddings"]
@@ -194,11 +186,10 @@ def get_sorted_cosine_similarity(embeddings_metadata):
194
 
195
  ##########################################
196
  ## TODO: Get embeddings for categories ###
197
- cat_embed = []
198
- for cat in categories:
199
- cat_embed.append(get_glove_embeddings(cat))
200
-
201
  ##########################################
 
 
 
202
 
203
  else:
204
  model_name = embeddings_metadata["model_name"]
@@ -208,38 +199,36 @@ def get_sorted_cosine_similarity(embeddings_metadata):
208
  category_embeddings = st.session_state["cat_embed_" + model_name]
209
 
210
  print("text_search = ", st.session_state.text_search)
 
211
  if model_name:
212
  input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
213
  else:
214
  input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
215
- #for index in range(len(categories)):
216
- #pass
217
- ##########################################
218
- # TODO: Compute cosine similarity between input sentence and categories
219
-
220
- cat_scores = []
221
- cat_idx = 0
222
- for cat_embed in category_embeddings:
223
- # Calc cosine sim
224
- cat_scores.append((cat_idx, np.dot(input,cat_embed)))
225
- # Store doc_id and score as a tuple
226
- cat_idx +=1
227
-
228
-
229
- sorted_list = sorted(cat_scores, key=lambda x: x[1])
230
 
231
- sorted_cats = [element[0] for element in sorted_list]
232
-
233
- #flip sorting order
234
- sorted_cats = sorted_cats[::-1]
235
- # Add list to Map
236
- result = sorted_cats[0]
237
- selected_cat = categories[result]
238
- print(selected_cat)
239
- # TODO: Update category embeddings if category not found
240
- ##########################################
241
 
242
- return selected_cat
243
 
244
 
245
  def plot_piechart(sorted_cosine_scores_items):
@@ -397,7 +386,8 @@ if st.session_state.text_search:
397
  }
398
  with st.spinner("Obtaining Cosine similarity for Glove..."):
399
  sorted_cosine_sim_glove = get_sorted_cosine_similarity(
400
- st.session_state.text_search, embeddings_metadata
 
401
  )
402
 
403
  # Sentence transformer embeddings
@@ -405,7 +395,8 @@ if st.session_state.text_search:
405
  embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
406
  with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
407
  sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
408
- st.session_state.text_search, embeddings_metadata
 
409
  )
410
 
411
  # Results and Plot Pie Chart for Glove
 
4
  import pickle
5
  import os
6
  import gdown
 
7
  from sentence_transformers import SentenceTransformer
8
  import matplotlib.pyplot as plt
9
  import math
 
20
  """
21
  ##################################
22
  ### TODO: Add code here ##########
 
 
 
23
  ##################################
24
+ return np.exp(np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y)))
25
 
26
 
27
  # Function to Load Glove Embeddings
 
63
  gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
64
 
65
 
66
+ # @st.cache_data()
67
  def load_glove_embeddings_gdrive(model_type):
68
  word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
69
  embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
 
124
  embedding = np.zeros(int(model_type.split("d")[0]))
125
  ##################################
126
  ##### TODO: Add code here ########
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ##################################
128
+
129
+ words = [word.strip('.,?!').lower() for word in sentence.split()]
130
+ total = 0
131
+ for w in words:
132
+ if w in word_index_dict:
133
+ embedding += embeddings[word_index_dict[w]]
134
+ total +=1
135
+ if total != 0:
136
+ embedding = embedding/total
137
+
138
+ return embedding
139
 
140
 
141
  def get_category_embeddings(embeddings_metadata):
 
174
  (50 pts)
175
  """
176
  categories = st.session_state.categories.split(" ")
177
+ cosine_sim = {}
178
  if embeddings_metadata["embedding_model"] == "glove":
179
  word_index_dict = embeddings_metadata["word_index_dict"]
180
  embeddings = embeddings_metadata["embeddings"]
 
186
 
187
  ##########################################
188
  ## TODO: Get embeddings for categories ###
 
 
 
 
189
  ##########################################
190
+ category_embeddings = {}
191
+ for cat in categories:
192
+ category_embeddings[cat] = get_glove_embeddings(cat, word_index_dict, embeddings, model_type)
193
 
194
  else:
195
  model_name = embeddings_metadata["model_name"]
 
199
  category_embeddings = st.session_state["cat_embed_" + model_name]
200
 
201
  print("text_search = ", st.session_state.text_search)
202
+ print(category_embeddings)
203
  if model_name:
204
  input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
205
  else:
206
  input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
207
+
208
+ cat_scores = []
209
+ for index in range(len(categories)):
210
+ ##########################################
211
+ # TODO: Compute cosine similarity between input sentence and categories
212
+ # TODO: Update category embeddings if category not found
213
+ ##########################################
214
+ cat = categories[index]
215
+ cat_embed = category_embeddings[cat]
216
+ # Calc cosine sim
217
+ cat_scores.append((index, np.dot(input_embedding,cat_embed)))
218
+ # Store doc_id and score as a tuple
 
 
 
219
 
220
+
221
+ sorted_list = sorted(cat_scores, key=lambda x: x[1])
222
+ sorted_list = sorted_list[::-1]
223
+
224
+ sorted_cats = [element[0] for element in sorted_list]
225
+
226
+ #flip sorting order
227
+ # Add list to Map
228
+ # for cat_pair in sorted_cats:
229
+ # cosine_sim[cat_pair[0]] = cat_pair[1]
230
 
231
+ return sorted_list
232
 
233
 
234
  def plot_piechart(sorted_cosine_scores_items):
 
386
  }
387
  with st.spinner("Obtaining Cosine similarity for Glove..."):
388
  sorted_cosine_sim_glove = get_sorted_cosine_similarity(
389
+ # st.session_state.text_search,
390
+ embeddings_metadata
391
  )
392
 
393
  # Sentence transformer embeddings
 
395
  embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
396
  with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
397
  sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
398
+ # st.session_state.text_search,
399
+ embeddings_metadata
400
  )
401
 
402
  # Results and Plot Pie Chart for Glove
embeddings_25d_temp.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eec0acf13b5c7d7c3bd178c1c84332347b9c0d55a474e37f4313e5289aacde3
3
+ size 238702880
embeddings_50d_temp.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74f88cde3ff2e36c815d13955c67983cf6f81829d2582cb6789c10786e5ef66
3
+ size 477405680
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
word_index_dict_25d_temp.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
3
+ size 60284151
word_index_dict_50d_temp.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
3
+ size 60284151