""" In this code block, you can develop a class for Embeddings - That can fetch embeddings of different kinds for the purpose of "Semantic Search" """ from sentence_transformers import SentenceTransformer import numpy as np import pickle import numpy.linalg as la class Embeddings: def __init__(self): """ Initialize the class """ self.glove_embedding_dimension = 50 def download_glove_embeddings(self): """ Download glove embeddings from web or from your gdrive if in optimized format """ # use data from gdrive embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy" word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl" def load_glove_embeddings(self, embedding_dimension): # load data word_index_temp = "word_index_dict_50d_temp.pkl" embeddings_temp = "embeddings_50d_temp.npy" # Load word index dictionary word_index_dict = pickle.load(open(word_index_temp, "rb"), encoding="latin") # Load embeddings numpy embeddings = np.load(embeddings_temp) return word_index_dict, embeddings def get_glove_embedding(self, word, word_index_dict, embeddings): """ Retrieve GloVe embedding of a specific dimension """ word = word.lower() if word in word_index_dict: return embeddings[word_index_dict[word]] else: return np.zeros(self.glove_embedding_dimension) def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings): new_embedding = np.zeros(self.glove_embedding_dimension) # for negative words for word in negative_words: new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings) # for positive words for word in positive_words: new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings) return new_embedding def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"): """ Encode a sentence using sentence transformer and return embedding """ sentenceTransformer = SentenceTransformer(transformer_name) return sentenceTransformer.encode(sentence) def get_averaged_glove_embeddings(self, sentence, embeddings_dict): words = sentence.split(" ") # Initialize an array of zeros for the embedding glove_embedding = np.zeros(embeddings_dict['embeddings'].shape[1]) count_words = 0 for word in words: word = word.lower() # Convert to lowercase to match the embeddings dictionary if word in embeddings_dict['word_index']: # Sum up embeddings for each word glove_embedding += embeddings_dict['embeddings'][embeddings_dict['word_index'][word]] count_words += 1 if count_words > 0: # Average the embeddings glove_embedding /= count_words return glove_embedding class Search: def __init__(self, embeddings_model): self.embeddings_model = embeddings_model def cosine_similarity(self, x, y): return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3) def normalize_func(self, vector): norm = np.linalg.norm(vector) if norm == 0: return vector return vector / norm def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings): """ Find the closest word to the target embedding from a list of answer_list """ highest_similarity = -50 closest_answer = None for choice in answer_list: choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings) similarity = self.cosine_similarity(current_embedding, choice_embedding) if similarity > highest_similarity: highest_similarity = similarity closest_answer = choice return closest_answer def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings): base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings) base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings) target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings) ref_difference = self.normalize_func(base_vector_b - base_vector_a) answer = None highest_similarity = -50 for choice in answer_list: choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings) choice_difference = self.normalize_func(choice_vector - target_vector) similarity = self.cosine_similarity(ref_difference, choice_difference) if similarity > highest_similarity: highest_similarity = similarity answer = choice return answer def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings): similarity_scores = {} for choice in choices: choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings) similarity = self.cosine_similarity(current_embedding, choice_embedding) similarity_scores[choice] = similarity return similarity_scores def get_topK_similar_categories(self, sentence, categories, top_k=10): """ Return the most similar categories to a given sentence - This is a baseline implementation of a semantic search engine """ # Implement your code here sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence) similarities = {} for category, category_embedding in categories.items(): similarity = self.cosine_similarity(sentence_embedding, category_embedding) similarities[category] = similarity # print(similarity) # sorted_categories ={} # sorted_categories = sorted(similarities, key=lambda x: x[1], reverse=True) sorted_cosine_sim = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True)) # Return top K categories return sorted_cosine_sim def plot_alatirchart(sorted_cosine_scores_models): models = list(sorted_cosine_scores_models.keys()) tabs = st.tabs(models) figs = {} for model in models: # modified figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model]) for index in range(len(tabs)): with tabs[index]: st.pyplot(figs[models[index]]) import matplotlib.pyplot as plt def plot_pie_chart(category_simiarity_scores): categories = list(category_simiarity_scores.keys()) cur_similarities = list(category_simiarity_scores.values()) similarities = [similar / sum(cur_similarities) for similar in cur_similarities] fig, ax = plt.subplots() ax.pie(similarities, labels=categories, autopct="%1.1f%%", startangle=90) ax.axis('equal') plt.show() def plot_piechart_helper(sorted_cosine_scores_items): sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values())) categories_sorted = list(sorted_cosine_scores_items.keys()) fig, ax = plt.subplots(figsize=(3, 3)) my_explode = np.zeros(len(categories_sorted)) my_explode[0] = 0.2 if len(categories_sorted) == 3: my_explode[1] = 0.1 elif len(categories_sorted) > 3: my_explode[2] = 0.05 ax.pie( sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%", explode=my_explode, ) return fig import streamlit as st ### Text Search ### st.sidebar.title("GloVe Twitter") st.sidebar.markdown( """ GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on 2 billion tweets with vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip). Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*. """ ) if 'categories' not in st.session_state: st.session_state['categories'] = "Flowers Colors Cars Weather Food" if 'text_search' not in st.session_state: st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now" embeddings_model = Embeddings() model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1) st.title("Demo in in-class coding") st.subheader( "Pass in space separated categories you want this search demo to be about." ) # categories of user input user_categories = st.text_input( label="Categories", value=st.session_state.categories ) st.session_state.categories = user_categories.split(" ") print(st.session_state.get("categories")) print(type(st.session_state.get("categories"))) st.subheader("Pass in an input word or even a sentence") user_text_search = st.text_input( label="Input your sentence", value=st.session_state.text_search, ) st.session_state.text_search = user_text_search # Load glove embeddings word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type) category_embeddings = {category: embeddings_model.get_sentence_transformer_embedding(category) for category in st.session_state.categories} search_using_cos = Search(embeddings_model) # Find closest word to an input word if st.session_state.text_search: # sentence transformer embeddings print("sentence transformer Embedding") embeddings_metadata = { "word_index_dict": word_index_dict, "embeddings": embeddings, "model_type": model_type, "text_search": st.session_state.text_search } with st.spinner("Obtaining Cosine similarity for Glove..."): sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories( st.session_state.text_search, category_embeddings ) # Results and Plot Pie Chart for Glove print("Categories are: ", st.session_state.categories) st.subheader( "Closest word I have between: " + " ".join(st.session_state.categories) + " as per different Embeddings" ) # print(sorted_cosine_sim_glove) print(sorted_cosine_sim_transformer) print(list(sorted_cosine_sim_transformer.keys())[0]) st.write( f"Closest category using sentence transformer embeddings : {list(sorted_cosine_sim_transformer.keys())[0]}") plot_alatirchart( { "sentence_transformer_384": sorted_cosine_sim_transformer, } ) st.write("") st.write( "Demo developed by Kechen Liu" )