ericlkc committed on
Commit 2913b41 · verified · 1 Parent(s): 9c23e38

in-class 01 demo

app.py ADDED
@@ -0,0 +1,327 @@
"""
In this code block, you can develop a class for Embeddings -
one that can fetch embeddings of different kinds for the purpose of "Semantic Search".
"""

import pickle

import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import streamlit as st
from sentence_transformers import SentenceTransformer


class Embeddings:

    def __init__(self):
        """
        Initialize the class
        """
        self.glove_embedding_dimension = 50

    def download_glove_embeddings(self):
        """
        Download GloVe embeddings from the web, or from your gdrive if already
        in optimized format. For now this is a placeholder that only records
        the gdrive paths; it does not perform the download.
        """
        # use data from gdrive
        embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy"
        word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl"

    def load_glove_embeddings(self, embedding_dimension):
        # Load data. Only the 50d files ship with this repo, so
        # embedding_dimension does not select a file here.
        word_index_temp = "word_index_dict_50d_temp.pkl"
        embeddings_temp = "embeddings_50d_temp.npy"

        # Load the word -> row-index dictionary
        with open(word_index_temp, "rb") as f:
            word_index_dict = pickle.load(f, encoding="latin")

        # Load the embeddings matrix
        embeddings = np.load(embeddings_temp)

        return word_index_dict, embeddings

    def get_glove_embedding(self, word, word_index_dict, embeddings):
        """
        Retrieve the GloVe embedding of a specific word
        """
        word = word.lower()
        if word in word_index_dict:
            return embeddings[word_index_dict[word]]
        else:
            return np.zeros(self.glove_embedding_dimension)
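
    # For example, get_glove_embedding("Seattle", ...) lowercases to "seattle";
    # an out-of-vocabulary token falls back to the 50-dim zero vector.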

    def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings):
        new_embedding = np.zeros(self.glove_embedding_dimension)

        # Subtract the embeddings of the negative words
        for word in negative_words:
            new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings)

        # Add the embeddings of the positive words
        for word in positive_words:
            new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings)

        return new_embedding

    def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"):
        """
        Encode a sentence using a sentence transformer and return the embedding.
        The model is cached after the first call so it is not reloaded every time.
        """
        if not hasattr(self, "_sentence_transformers"):
            self._sentence_transformers = {}
        if transformer_name not in self._sentence_transformers:
            self._sentence_transformers[transformer_name] = SentenceTransformer(transformer_name)
        return self._sentence_transformers[transformer_name].encode(sentence)

    def get_averaged_glove_embeddings(self, sentence, embeddings_dict):
        words = sentence.split(" ")
        # Initialize an array of zeros for the embedding
        glove_embedding = np.zeros(embeddings_dict['embeddings'].shape[1])

        count_words = 0
        for word in words:
            word = word.lower()  # Convert to lowercase to match the embeddings dictionary
            if word in embeddings_dict['word_index']:
                # Sum up embeddings for each word
                glove_embedding += embeddings_dict['embeddings'][embeddings_dict['word_index'][word]]
                count_words += 1

        if count_words > 0:
            # Average the embeddings
            glove_embedding /= count_words

        return glove_embedding
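
# A minimal usage sketch of the class above (illustration only, not part of
# the demo flow below; it assumes the two 50d files from this commit are in
# the working directory):
#
#   emb_model = Embeddings()
#   word_index_dict, embeddings = emb_model.load_glove_embeddings("50d")
#   vec = emb_model.get_glove_embedding("seattle", word_index_dict, embeddings)
#   vec.shape  # -> (50,)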


class Search:

    def __init__(self, embeddings_model):
        self.embeddings_model = embeddings_model

    def cosine_similarity(self, x, y):
        # The 1e-3 floor on the denominator guards against division by zero
        # for all-zero vectors (e.g. out-of-vocabulary words).
        return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3)
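
    # Worked example: for x = [1, 0] and y = [1, 1], np.dot(x, y) = 1 and
    # |x| * |y| = sqrt(2), so the similarity is 1 / sqrt(2) ≈ 0.707.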

    def normalize_func(self, vector):
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings):
        """
        Find the word in answer_list closest to the target embedding
        """
        highest_similarity = -50  # below any possible cosine similarity (>= -1)
        closest_answer = None

        for choice in answer_list:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity = self.cosine_similarity(current_embedding, choice_embedding)
            if similarity > highest_similarity:
                highest_similarity = similarity
                closest_answer = choice

        return closest_answer

    def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings):
        """
        Solve a word analogy: pick the choice whose offset from target_word best
        matches the offset between the two words in current_relation.
        """
        base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings)
        base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings)
        target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings)

        ref_difference = self.normalize_func(base_vector_b - base_vector_a)

        answer = None
        highest_similarity = -50  # below any possible cosine similarity

        for choice in answer_list:
            choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            choice_difference = self.normalize_func(choice_vector - target_vector)
            similarity = self.cosine_similarity(ref_difference, choice_difference)
            if similarity > highest_similarity:
                highest_similarity = similarity
                answer = choice

        return answer
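
    # Analogy sketch (the classic GloVe example; actual output depends on the
    # loaded vectors): with current_relation = ("man", "king") and
    # target_word = "woman", candidates like ["queen", "princess", "car"]
    # should ideally rank "queen" first, since queen - woman best aligns
    # with king - man.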

    def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings):
        similarity_scores = {}

        for choice in choices:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity = self.cosine_similarity(current_embedding, choice_embedding)
            similarity_scores[choice] = similarity

        return similarity_scores

    def get_topK_similar_categories(self, sentence, categories, top_k=10):
        """
        Return the top_k categories most similar to a given sentence -
        a baseline implementation of a semantic search engine.
        """
        sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence)

        similarities = {}
        for category, category_embedding in categories.items():
            similarity = self.cosine_similarity(sentence_embedding, category_embedding)
            similarities[category] = similarity

        # Sort categories by similarity, highest first, and keep the top K
        sorted_cosine_sim = dict(
            sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]
        )

        return sorted_cosine_sim
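
# Usage sketch (mirrors the Streamlit flow at the bottom of this file, where
# category_embeddings is built):
#
#   search = Search(Embeddings())
#   scores = search.get_topK_similar_categories(
#       "Roses are red", category_embeddings, top_k=3
#   )
#   next(iter(scores))  # the highest-scoring category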
181
+
182
+ def plot_alatirchart(sorted_cosine_scores_models):
183
+ models = list(sorted_cosine_scores_models.keys())
184
+ tabs = st.tabs(models)
185
+ figs = {}
186
+ for model in models:
187
+ # modified
188
+ figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
189
+
190
+ for index in range(len(tabs)):
191
+ with tabs[index]:
192
+ st.pyplot(figs[models[index]])
193
+


def plot_pie_chart(category_similarity_scores):
    categories = list(category_similarity_scores.keys())
    cur_similarities = list(category_similarity_scores.values())

    # Normalize the scores so the pie slices sum to 1
    similarities = [similar / sum(cur_similarities) for similar in cur_similarities]

    fig, ax = plt.subplots()
    ax.pie(similarities, labels=categories,
           autopct="%1.1f%%",
           startangle=90)
    ax.axis('equal')
    plt.show()


def plot_piechart_helper(sorted_cosine_scores_items):
    sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values()))
    categories_sorted = list(sorted_cosine_scores_items.keys())

    fig, ax = plt.subplots(figsize=(3, 3))
    # Pull the highest-scoring slices slightly out of the pie for emphasis
    my_explode = np.zeros(len(categories_sorted))
    my_explode[0] = 0.2
    if len(categories_sorted) == 3:
        my_explode[1] = 0.1
    elif len(categories_sorted) > 3:
        my_explode[2] = 0.05

    ax.pie(
        sorted_cosine_scores,
        labels=categories_sorted,
        autopct="%1.1f%%",
        explode=my_explode,
    )

    return fig


### Text Search ###
st.sidebar.title("GloVe Twitter")
st.sidebar.markdown(
    """
    GloVe is an unsupervised learning algorithm for obtaining vector representations for words, pretrained on
    2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).

    Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
    """
)

if 'categories' not in st.session_state:
    st.session_state['categories'] = "Flowers Colors Cars Weather Food"
if 'text_search' not in st.session_state:
    st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now"

embeddings_model = Embeddings()

model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)

st.title("In-class coding demo")
st.subheader(
    "Pass in space-separated categories you want this search demo to be about."
)

# Categories from user input. Keep the raw string in session state so the
# text_input round-trips across reruns, and split it into a list for use below.
user_categories = st.text_input(
    label="Categories", value=st.session_state.categories
)
st.session_state.categories = user_categories
categories = user_categories.split(" ")

print(categories)
print(type(categories))

st.subheader("Pass in an input word or even a sentence")
user_text_search = st.text_input(
    label="Input your sentence",
    value=st.session_state.text_search,
)

st.session_state.text_search = user_text_search

# Load GloVe embeddings
word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type)

# Embed each category once with the sentence transformer
category_embeddings = {
    category: embeddings_model.get_sentence_transformer_embedding(category)
    for category in categories
}

search_using_cos = Search(embeddings_model)

# Find the closest category to the input word or sentence
if st.session_state.text_search:
    # Sentence transformer embeddings
    print("sentence transformer Embedding")
    # Note: this metadata dict is assembled but not used further in this demo
    embeddings_metadata = {
        "word_index_dict": word_index_dict,
        "embeddings": embeddings,
        "model_type": model_type,
        "text_search": st.session_state.text_search
    }
    with st.spinner("Obtaining cosine similarity for the sentence transformer..."):
        sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories(
            st.session_state.text_search, category_embeddings
        )

    # Results and pie chart
    print("Categories are: ", categories)
    st.subheader(
        "Closest category among: "
        + " ".join(categories)
        + ", as per different embeddings"
    )

    print(sorted_cosine_sim_transformer)
    print(list(sorted_cosine_sim_transformer.keys())[0])

    st.write(
        f"Closest category using sentence transformer embeddings: {list(sorted_cosine_sim_transformer.keys())[0]}"
    )

    plot_alatirchart(
        {
            "sentence_transformer_384": sorted_cosine_sim_transformer,
        }
    )

    st.write("")
    st.write("Demo developed by Kechen Liu")
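
# To run this demo locally (standard pip/Streamlit commands; the file names
# are this repo's own):
#
#   pip install -r requirements.txt
#   streamlit run app.py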
embeddings_50d_temp.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e74f88cde3ff2e36c815d13955c67983cf6f81829d2582cb6789c10786e5ef66
size 477405680
requirements.txt ADDED
@@ -0,0 +1,6 @@
streamlit
numpy
pickleshare
gdown
sentence-transformers
matplotlib
word_index_dict_50d_temp.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
size 60284151