ericlkc committed on
Commit 2913b41 · verified · 1 Parent(s): 9c23e38

in-class 01 demo

app.py ADDED
@@ -0,0 +1,327 @@
"""
In this code block, you can develop a class for Embeddings -
one that can fetch embeddings of different kinds for the purpose of "Semantic Search".
"""

import pickle

import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import streamlit as st
from sentence_transformers import SentenceTransformer


class Embeddings:

    def __init__(self):
        """
        Initialize the class
        """
        self.glove_embedding_dimension = 50

    def download_glove_embeddings(self):
        """
        Download GloVe embeddings from the web, or from your gdrive if already
        in optimized format. For now this is a placeholder that only records
        the gdrive paths; it does not perform the download.
        """
        # use data from gdrive
        embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy"
        word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl"

    def load_glove_embeddings(self, embedding_dimension):
        # Load data. Only the 50d files ship with this repo, so
        # embedding_dimension does not select a file here.
        word_index_temp = "word_index_dict_50d_temp.pkl"
        embeddings_temp = "embeddings_50d_temp.npy"

        # Load the word -> row-index dictionary
        with open(word_index_temp, "rb") as f:
            word_index_dict = pickle.load(f, encoding="latin")

        # Load the embeddings matrix
        embeddings = np.load(embeddings_temp)

        return word_index_dict, embeddings

    def get_glove_embedding(self, word, word_index_dict, embeddings):
        """
        Retrieve the GloVe embedding of a specific word
        """
        word = word.lower()
        if word in word_index_dict:
            return embeddings[word_index_dict[word]]
        else:
            return np.zeros(self.glove_embedding_dimension)
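
    # For example, get_glove_embedding("Seattle", ...) lowercases to "seattle";
    # an out-of-vocabulary token falls back to the 50-dim zero vector.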

    def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings):
        new_embedding = np.zeros(self.glove_embedding_dimension)

        # Subtract the embeddings of the negative words
        for word in negative_words:
            new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings)

        # Add the embeddings of the positive words
        for word in positive_words:
            new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings)

        return new_embedding

    def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"):
        """
        Encode a sentence using a sentence transformer and return the embedding.
        The model is cached after the first call so it is not reloaded every time.
        """
        if not hasattr(self, "_sentence_transformers"):
            self._sentence_transformers = {}
        if transformer_name not in self._sentence_transformers:
            self._sentence_transformers[transformer_name] = SentenceTransformer(transformer_name)
        return self._sentence_transformers[transformer_name].encode(sentence)

    def get_averaged_glove_embeddings(self, sentence, embeddings_dict):
        words = sentence.split(" ")
        # Initialize an array of zeros for the embedding
        glove_embedding = np.zeros(embeddings_dict['embeddings'].shape[1])

        count_words = 0
        for word in words:
            word = word.lower()  # Convert to lowercase to match the embeddings dictionary
            if word in embeddings_dict['word_index']:
                # Sum up embeddings for each word
                glove_embedding += embeddings_dict['embeddings'][embeddings_dict['word_index'][word]]
                count_words += 1

        if count_words > 0:
            # Average the embeddings
            glove_embedding /= count_words

        return glove_embedding
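
# A minimal usage sketch of the class above (illustration only, not part of
# the demo flow below; it assumes the two 50d files from this commit are in
# the working directory):
#
#   emb_model = Embeddings()
#   word_index_dict, embeddings = emb_model.load_glove_embeddings("50d")
#   vec = emb_model.get_glove_embedding("seattle", word_index_dict, embeddings)
#   vec.shape  # -> (50,)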


class Search:

    def __init__(self, embeddings_model):
        self.embeddings_model = embeddings_model

    def cosine_similarity(self, x, y):
        # The 1e-3 floor on the denominator guards against division by zero
        # for all-zero vectors (e.g. out-of-vocabulary words).
        return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3)
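
    # Worked example: for x = [1, 0] and y = [1, 1], np.dot(x, y) = 1 and
    # |x| * |y| = sqrt(2), so the similarity is 1 / sqrt(2) ≈ 0.707.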

    def normalize_func(self, vector):
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings):
        """
        Find the word in answer_list closest to the target embedding
        """
        highest_similarity = -50  # below any possible cosine similarity (>= -1)
        closest_answer = None

        for choice in answer_list:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity = self.cosine_similarity(current_embedding, choice_embedding)
            if similarity > highest_similarity:
                highest_similarity = similarity
                closest_answer = choice

        return closest_answer

    def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings):
        """
        Solve a word analogy: pick the choice whose offset from target_word best
        matches the offset between the two words in current_relation.
        """
        base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings)
        base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings)
        target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings)

        ref_difference = self.normalize_func(base_vector_b - base_vector_a)

        answer = None
        highest_similarity = -50  # below any possible cosine similarity

        for choice in answer_list:
            choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            choice_difference = self.normalize_func(choice_vector - target_vector)
            similarity = self.cosine_similarity(ref_difference, choice_difference)
            if similarity > highest_similarity:
                highest_similarity = similarity
                answer = choice

        return answer
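
    # Analogy sketch (the classic GloVe example; actual output depends on the
    # loaded vectors): with current_relation = ("man", "king") and
    # target_word = "woman", candidates like ["queen", "princess", "car"]
    # should ideally rank "queen" first, since queen - woman best aligns
    # with king - man.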

    def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings):
        similarity_scores = {}

        for choice in choices:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity = self.cosine_similarity(current_embedding, choice_embedding)
            similarity_scores[choice] = similarity

        return similarity_scores

    def get_topK_similar_categories(self, sentence, categories, top_k=10):
        """
        Return the top_k categories most similar to a given sentence -
        a baseline implementation of a semantic search engine.
        """
        sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence)

        similarities = {}
        for category, category_embedding in categories.items():
            similarity = self.cosine_similarity(sentence_embedding, category_embedding)
            similarities[category] = similarity

        # Sort categories by similarity, highest first, and keep the top K
        sorted_cosine_sim = dict(
            sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]
        )

        return sorted_cosine_sim
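
# Usage sketch (mirrors the Streamlit flow at the bottom of this file, where
# category_embeddings is built):
#
#   search = Search(Embeddings())
#   scores = search.get_topK_similar_categories(
#       "Roses are red", category_embeddings, top_k=3
#   )
#   next(iter(scores))  # the highest-scoring category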
181
+
182
+ def plot_alatirchart(sorted_cosine_scores_models):
183
+ models = list(sorted_cosine_scores_models.keys())
184
+ tabs = st.tabs(models)
185
+ figs = {}
186
+ for model in models:
187
+ # modified
188
+ figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
189
+
190
+ for index in range(len(tabs)):
191
+ with tabs[index]:
192
+ st.pyplot(figs[models[index]])
193
+


def plot_pie_chart(category_similarity_scores):
    categories = list(category_similarity_scores.keys())
    cur_similarities = list(category_similarity_scores.values())

    # Normalize the scores so the pie slices sum to 1
    similarities = [similar / sum(cur_similarities) for similar in cur_similarities]

    fig, ax = plt.subplots()
    ax.pie(similarities, labels=categories,
           autopct="%1.1f%%",
           startangle=90)
    ax.axis('equal')
    plt.show()


def plot_piechart_helper(sorted_cosine_scores_items):
    sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values()))
    categories_sorted = list(sorted_cosine_scores_items.keys())

    fig, ax = plt.subplots(figsize=(3, 3))
    # Pull the highest-scoring slices slightly out of the pie for emphasis
    my_explode = np.zeros(len(categories_sorted))
    my_explode[0] = 0.2
    if len(categories_sorted) == 3:
        my_explode[1] = 0.1
    elif len(categories_sorted) > 3:
        my_explode[2] = 0.05

    ax.pie(
        sorted_cosine_scores,
        labels=categories_sorted,
        autopct="%1.1f%%",
        explode=my_explode,
    )

    return fig


### Text Search ###
st.sidebar.title("GloVe Twitter")
st.sidebar.markdown(
    """
    GloVe is an unsupervised learning algorithm for obtaining vector representations for words, pretrained on
    2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).

    Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
    """
)

if 'categories' not in st.session_state:
    st.session_state['categories'] = "Flowers Colors Cars Weather Food"
if 'text_search' not in st.session_state:
    st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now"

embeddings_model = Embeddings()

model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)

st.title("In-class coding demo")
st.subheader(
    "Pass in space-separated categories you want this search demo to be about."
)

# Categories from user input. Keep the raw string in session state so the
# text_input round-trips across reruns, and split it into a list for use below.
user_categories = st.text_input(
    label="Categories", value=st.session_state.categories
)
st.session_state.categories = user_categories
categories = user_categories.split(" ")

print(categories)
print(type(categories))

st.subheader("Pass in an input word or even a sentence")
user_text_search = st.text_input(
    label="Input your sentence",
    value=st.session_state.text_search,
)

st.session_state.text_search = user_text_search

# Load GloVe embeddings
word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type)

# Embed each category once with the sentence transformer
category_embeddings = {
    category: embeddings_model.get_sentence_transformer_embedding(category)
    for category in categories
}

search_using_cos = Search(embeddings_model)

# Find the closest category to the input word or sentence
if st.session_state.text_search:
    # Sentence transformer embeddings
    print("sentence transformer Embedding")
    # Note: this metadata dict is assembled but not used further in this demo
    embeddings_metadata = {
        "word_index_dict": word_index_dict,
        "embeddings": embeddings,
        "model_type": model_type,
        "text_search": st.session_state.text_search
    }
    with st.spinner("Obtaining cosine similarity for the sentence transformer..."):
        sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories(
            st.session_state.text_search, category_embeddings
        )

    # Results and pie chart
    print("Categories are: ", categories)
    st.subheader(
        "Closest category among: "
        + " ".join(categories)
        + ", as per different embeddings"
    )

    print(sorted_cosine_sim_transformer)
    print(list(sorted_cosine_sim_transformer.keys())[0])

    st.write(
        f"Closest category using sentence transformer embeddings: {list(sorted_cosine_sim_transformer.keys())[0]}"
    )

    plot_alatirchart(
        {
            "sentence_transformer_384": sorted_cosine_sim_transformer,
        }
    )

    st.write("")
    st.write("Demo developed by Kechen Liu")
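
# To run this demo locally (standard pip/Streamlit commands; the file names
# are this repo's own):
#
#   pip install -r requirements.txt
#   streamlit run app.py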
embeddings_50d_temp.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e74f88cde3ff2e36c815d13955c67983cf6f81829d2582cb6789c10786e5ef66
size 477405680
requirements.txt ADDED
@@ -0,0 +1,6 @@
streamlit
numpy
pickleshare
gdown
sentence-transformers
matplotlib
word_index_dict_50d_temp.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
size 60284151