Spaces:
Sleeping
Sleeping
Arko Banik
committed on
Commit
·
5c9e161
1
Parent(s):
d02c4b6
produces pie chart but breaks when trying to change category
Browse files
- README.md +4 -4
- app.py +45 -54
- embeddings_25d_temp.npy +3 -0
- embeddings_50d_temp.npy +3 -0
- requirements.txt +0 -0
- word_index_dict_25d_temp.pkl +3 -0
- word_index_dict_50d_temp.pkl +3 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.30.0
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
+
title: MiniProject1 P4
|
3 |
+
emoji: π
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.30.0
|
8 |
app_file: app.py
|
app.py
CHANGED
@@ -4,7 +4,6 @@ import numpy.linalg as la
|
|
4 |
import pickle
|
5 |
import os
|
6 |
import gdown
|
7 |
-
#import sentence_transformers
|
8 |
from sentence_transformers import SentenceTransformer
|
9 |
import matplotlib.pyplot as plt
|
10 |
import math
|
@@ -21,11 +20,8 @@ def cosine_similarity(x, y):
|
|
21 |
"""
|
22 |
##################################
|
23 |
### TODO: Add code here ##########
|
24 |
-
cos_sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
|
25 |
-
exp_cos = np.exp(cos_sim) ######## find formula for exponentiated cosine similarity
|
26 |
-
|
27 |
##################################
|
28 |
-
return
|
29 |
|
30 |
|
31 |
# Function to Load Glove Embeddings
|
@@ -67,7 +63,7 @@ def download_glove_embeddings_gdrive(model_type):
|
|
67 |
gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
|
68 |
|
69 |
|
70 |
-
@st.cache_data()
|
71 |
def load_glove_embeddings_gdrive(model_type):
|
72 |
word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
|
73 |
embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
|
@@ -128,22 +124,18 @@ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, mode
|
|
128 |
embedding = np.zeros(int(model_type.split("d")[0]))
|
129 |
##################################
|
130 |
##### TODO: Add code here ########
|
131 |
-
|
132 |
-
#glove_word_set= load_glove_embeddings_gdrive(model_type)
|
133 |
-
|
134 |
-
for word in sentence:
|
135 |
-
#print(sentence)
|
136 |
-
words = [word.strip('.,?!').lower() for word in sentence.split()]
|
137 |
-
total = 0
|
138 |
-
for w in words:
|
139 |
-
if w in embeddings:
|
140 |
-
embed += embeddings[w]
|
141 |
-
total +=1
|
142 |
-
if total != 0:
|
143 |
-
embed = embed/total
|
144 |
-
|
145 |
-
return embed
|
146 |
##################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
|
149 |
def get_category_embeddings(embeddings_metadata):
|
@@ -182,7 +174,7 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
182 |
(50 pts)
|
183 |
"""
|
184 |
categories = st.session_state.categories.split(" ")
|
185 |
-
|
186 |
if embeddings_metadata["embedding_model"] == "glove":
|
187 |
word_index_dict = embeddings_metadata["word_index_dict"]
|
188 |
embeddings = embeddings_metadata["embeddings"]
|
@@ -194,11 +186,10 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
194 |
|
195 |
##########################################
|
196 |
## TODO: Get embeddings for categories ###
|
197 |
-
cat_embed = []
|
198 |
-
for cat in categories:
|
199 |
-
cat_embed.append(get_glove_embeddings(cat))
|
200 |
-
|
201 |
##########################################
|
|
|
|
|
|
|
202 |
|
203 |
else:
|
204 |
model_name = embeddings_metadata["model_name"]
|
@@ -208,38 +199,36 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
208 |
category_embeddings = st.session_state["cat_embed_" + model_name]
|
209 |
|
210 |
print("text_search = ", st.session_state.text_search)
|
|
|
211 |
if model_name:
|
212 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
|
213 |
else:
|
214 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
sorted_list = sorted(cat_scores, key=lambda x: x[1])
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
|
242 |
-
return
|
243 |
|
244 |
|
245 |
def plot_piechart(sorted_cosine_scores_items):
|
@@ -397,7 +386,8 @@ if st.session_state.text_search:
|
|
397 |
}
|
398 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
399 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
400 |
-
st.session_state.text_search,
|
|
|
401 |
)
|
402 |
|
403 |
# Sentence transformer embeddings
|
@@ -405,7 +395,8 @@ if st.session_state.text_search:
|
|
405 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
406 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
407 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
408 |
-
st.session_state.text_search,
|
|
|
409 |
)
|
410 |
|
411 |
# Results and Plot Pie Chart for Glove
|
|
|
4 |
import pickle
|
5 |
import os
|
6 |
import gdown
|
|
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
import matplotlib.pyplot as plt
|
9 |
import math
|
|
|
20 |
"""
|
21 |
##################################
|
22 |
### TODO: Add code here ##########
|
|
|
|
|
|
|
23 |
##################################
|
24 |
+
return np.exp(np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y)))
|
25 |
|
26 |
|
27 |
# Function to Load Glove Embeddings
|
|
|
63 |
gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
|
64 |
|
65 |
|
66 |
+
# @st.cache_data()
|
67 |
def load_glove_embeddings_gdrive(model_type):
|
68 |
word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
|
69 |
embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
|
|
|
124 |
embedding = np.zeros(int(model_type.split("d")[0]))
|
125 |
##################################
|
126 |
##### TODO: Add code here ########
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
##################################
|
128 |
+
|
129 |
+
words = [word.strip('.,?!').lower() for word in sentence.split()]
|
130 |
+
total = 0
|
131 |
+
for w in words:
|
132 |
+
if w in word_index_dict:
|
133 |
+
embedding += embeddings[word_index_dict[w]]
|
134 |
+
total +=1
|
135 |
+
if total != 0:
|
136 |
+
embedding = embedding/total
|
137 |
+
|
138 |
+
return embedding
|
139 |
|
140 |
|
141 |
def get_category_embeddings(embeddings_metadata):
|
|
|
174 |
(50 pts)
|
175 |
"""
|
176 |
categories = st.session_state.categories.split(" ")
|
177 |
+
cosine_sim = {}
|
178 |
if embeddings_metadata["embedding_model"] == "glove":
|
179 |
word_index_dict = embeddings_metadata["word_index_dict"]
|
180 |
embeddings = embeddings_metadata["embeddings"]
|
|
|
186 |
|
187 |
##########################################
|
188 |
## TODO: Get embeddings for categories ###
|
|
|
|
|
|
|
|
|
189 |
##########################################
|
190 |
+
category_embeddings = {}
|
191 |
+
for cat in categories:
|
192 |
+
category_embeddings[cat] = get_glove_embeddings(cat, word_index_dict, embeddings, model_type)
|
193 |
|
194 |
else:
|
195 |
model_name = embeddings_metadata["model_name"]
|
|
|
199 |
category_embeddings = st.session_state["cat_embed_" + model_name]
|
200 |
|
201 |
print("text_search = ", st.session_state.text_search)
|
202 |
+
print(category_embeddings)
|
203 |
if model_name:
|
204 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
|
205 |
else:
|
206 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
|
207 |
+
|
208 |
+
cat_scores = []
|
209 |
+
for index in range(len(categories)):
|
210 |
+
##########################################
|
211 |
+
# TODO: Compute cosine similarity between input sentence and categories
|
212 |
+
# TODO: Update category embeddings if category not found
|
213 |
+
##########################################
|
214 |
+
cat = categories[index]
|
215 |
+
cat_embed = category_embeddings[cat]
|
216 |
+
# Calc cosine sim
|
217 |
+
cat_scores.append((index, np.dot(input_embedding,cat_embed)))
|
218 |
+
# Store doc_id and score as a tuple
|
|
|
|
|
|
|
219 |
|
220 |
+
|
221 |
+
sorted_list = sorted(cat_scores, key=lambda x: x[1])
|
222 |
+
sorted_list = sorted_list[::-1]
|
223 |
+
|
224 |
+
sorted_cats = [element[0] for element in sorted_list]
|
225 |
+
|
226 |
+
#flip sorting order
|
227 |
+
# Add list to Map
|
228 |
+
# for cat_pair in sorted_cats:
|
229 |
+
# cosine_sim[cat_pair[0]] = cat_pair[1]
|
230 |
|
231 |
+
return sorted_list
|
232 |
|
233 |
|
234 |
def plot_piechart(sorted_cosine_scores_items):
|
|
|
386 |
}
|
387 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
388 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
389 |
+
# st.session_state.text_search,
|
390 |
+
embeddings_metadata
|
391 |
)
|
392 |
|
393 |
# Sentence transformer embeddings
|
|
|
395 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
396 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
397 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
398 |
+
# st.session_state.text_search,
|
399 |
+
embeddings_metadata
|
400 |
)
|
401 |
|
402 |
# Results and Plot Pie Chart for Glove
|
embeddings_25d_temp.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5eec0acf13b5c7d7c3bd178c1c84332347b9c0d55a474e37f4313e5289aacde3
|
3 |
+
size 238702880
|
embeddings_50d_temp.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e74f88cde3ff2e36c815d13955c67983cf6f81829d2582cb6789c10786e5ef66
|
3 |
+
size 477405680
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
word_index_dict_25d_temp.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
|
3 |
+
size 60284151
|
word_index_dict_50d_temp.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
|
3 |
+
size 60284151
|