# in-class-01 / app.py
"""
In this code block, you can develop a class for Embeddings -
That can fetch embeddings of different kinds for the purpose of "Semantic Search"
"""
import pickle

import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import streamlit as st
from sentence_transformers import SentenceTransformer
class Embeddings:
def __init__(self):
"""
Initialize the class
"""
self.glove_embedding_dimension = 50
    def download_glove_embeddings(self):
        """
        Download GloVe embeddings from the web, or from Google Drive if they
        are already stored there in an optimized format.
        """
        # Paths to the pre-converted embeddings on a Colab Google Drive mount;
        # the actual download/copy step is left to the notebook environment.
        embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy"
        word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl"
    def load_glove_embeddings(self, embedding_dimension):
        """
        Load the pre-converted GloVe word index and embedding matrix from disk.
        Note: the 50d files are currently hardcoded regardless of embedding_dimension.
        """
        word_index_temp = "word_index_dict_50d_temp.pkl"
        embeddings_temp = "embeddings_50d_temp.npy"
        # Load the word -> row-index dictionary
        with open(word_index_temp, "rb") as f:
            word_index_dict = pickle.load(f, encoding="latin")
        # Load the embedding matrix (one row per word)
        embeddings = np.load(embeddings_temp)
        return word_index_dict, embeddings
def get_glove_embedding(self, word, word_index_dict, embeddings):
"""
Retrieve GloVe embedding of a specific dimension
"""
word = word.lower()
if word in word_index_dict:
return embeddings[word_index_dict[word]]
        else:
            # Out-of-vocabulary words fall back to a zero vector
            return np.zeros(self.glove_embedding_dimension)
    def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings):
        """
        Build an analogy query vector: sum the positive word embeddings and
        subtract the negative ones (see the comment example after this method).
        """
        new_embedding = np.zeros(self.glove_embedding_dimension)
# for negative words
for word in negative_words:
new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings)
# for positive words
for word in positive_words:
new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings)
return new_embedding
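    # A hypothetical example of the analogy arithmetic above: for
    # "man is to king as woman is to ?", passing positive_words=["king", "woman"]
    # and negative_words=["man"] yields king - man + woman, which in GloVe
    # space lands near the vector for "queen".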
    def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"):
        """
        Encode a sentence using a sentence transformer and return the embedding.
        The model is cached per name so repeated calls do not reload it.
        """
        if not hasattr(self, "_st_models"):
            self._st_models = {}
        if transformer_name not in self._st_models:
            self._st_models[transformer_name] = SentenceTransformer(transformer_name)
        return self._st_models[transformer_name].encode(sentence)
    def get_averaged_glove_embeddings(self, sentence, embeddings_dict):
        """
        Average the GloVe embeddings of all in-vocabulary words in a sentence.
        """
        words = sentence.split(" ")
# Initialize an array of zeros for the embedding
glove_embedding = np.zeros(embeddings_dict['embeddings'].shape[1])
count_words = 0
for word in words:
word = word.lower() # Convert to lowercase to match the embeddings dictionary
if word in embeddings_dict['word_index']:
# Sum up embeddings for each word
glove_embedding += embeddings_dict['embeddings'][embeddings_dict['word_index'][word]]
count_words += 1
if count_words > 0:
# Average the embeddings
glove_embedding /= count_words
return glove_embedding
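# A minimal usage sketch for the class above (assumes the pre-converted 50d
# GloVe files, word_index_dict_50d_temp.pkl and embeddings_50d_temp.npy, are
# present in the working directory):
#   emb = Embeddings()
#   word_index_dict, embeddings = emb.load_glove_embeddings("50d")
#   vector = emb.get_glove_embedding("seattle", word_index_dict, embeddings)
#   sent_vec = emb.get_sentence_transformer_embedding("Roses are red")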
class Search:
def __init__(self, embeddings_model):
self.embeddings_model = embeddings_model
    def cosine_similarity(self, x, y):
        # The 1e-3 floor on the denominator guards against division by zero
        # for all-zero vectors (e.g. out-of-vocabulary words)
        return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3)
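    # Toy examples: cosine_similarity([1, 0], [1, 0]) -> 1.0 (same direction),
    # cosine_similarity([1, 0], [0, 1]) -> 0.0 (orthogonal), and
    # cosine_similarity([1, 0], [-1, 0]) -> -1.0 (opposite directions).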
def normalize_func(self, vector):
norm = np.linalg.norm(vector)
if norm == 0:
return vector
return vector / norm
def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings):
"""
Find the closest word to the target embedding from a list of answer_list
"""
highest_similarity = -50
closest_answer = None
for choice in answer_list:
choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
similarity = self.cosine_similarity(current_embedding, choice_embedding)
if similarity > highest_similarity:
highest_similarity = similarity
closest_answer = choice
return closest_answer
    def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings):
        """
        Solve an analogy "a is to b as target_word is to ?": compare the
        normalized direction b - a against choice - target_word for each choice.
        """
base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings)
base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings)
target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings)
ref_difference = self.normalize_func(base_vector_b - base_vector_a)
answer = None
highest_similarity = -50
for choice in answer_list:
choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
choice_difference = self.normalize_func(choice_vector - target_vector)
similarity = self.cosine_similarity(ref_difference, choice_difference)
if similarity > highest_similarity:
highest_similarity = similarity
answer = choice
return answer
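    # Example: find_word_as(("man", "king"), "woman",
    #                       ["queen", "prince", "apple"],
    #                       word_index_dict, embeddings)
    # should return "queen", since king - man points in roughly the same
    # direction as queen - woman in GloVe space.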
def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings):
similarity_scores = {}
for choice in choices:
choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
similarity = self.cosine_similarity(current_embedding, choice_embedding)
similarity_scores[choice] = similarity
return similarity_scores
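    # Example: with current_embedding set to the GloVe vector for "rose",
    # find_similarity_scores(current_embedding, ["flower", "car"],
    #                        word_index_dict, embeddings)
    # returns a dict like {"flower": 0.78, "car": 0.12} (hypothetical values).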
    def get_topK_similar_categories(self, sentence, categories, top_k=10):
        """
        Return the top_k categories most similar to a given sentence -
        this is a baseline implementation of a semantic search engine.
        """
        sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence)
        similarities = {}
        for category, category_embedding in categories.items():
            similarities[category] = self.cosine_similarity(sentence_embedding, category_embedding)
        # Sort by similarity (descending) and keep only the top K categories
        sorted_cosine_sim = dict(
            sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]
        )
        return sorted_cosine_sim
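# A hypothetical end-to-end example: with category embeddings built from
# sentence transformer vectors for "Flowers", "Cars", and "Weather", calling
#   Search(Embeddings()).get_topK_similar_categories("Roses are red", category_embeddings)
# should rank "Flowers" first in the returned dict.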
def plot_alatirchart(sorted_cosine_scores_models):
    """
    Render one Streamlit tab per model, each showing a pie chart of that
    model's similarity scores.
    """
    models = list(sorted_cosine_scores_models.keys())
    tabs = st.tabs(models)
    figs = {}
    for model in models:
        figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
    for index in range(len(tabs)):
        with tabs[index]:
            st.pyplot(figs[models[index]])
def plot_pie_chart(category_similarity_scores):
    """Plot a pie chart of normalized similarity scores (currently unused)."""
    categories = list(category_similarity_scores.keys())
    cur_similarities = list(category_similarity_scores.values())
    # Normalize the scores so the slices sum to 100%
    similarities = [similar / sum(cur_similarities) for similar in cur_similarities]
fig, ax = plt.subplots()
ax.pie(similarities, labels=categories,
autopct="%1.1f%%",
startangle=90)
ax.axis('equal')
plt.show()
def plot_piechart_helper(sorted_cosine_scores_items):
sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values()))
categories_sorted = list(sorted_cosine_scores_items.keys())
fig, ax = plt.subplots(figsize=(3, 3))
    # Explode the largest slice (and slightly explode runners-up when there
    # are several categories) for readability
    my_explode = np.zeros(len(categories_sorted))
    my_explode[0] = 0.2
    if len(categories_sorted) == 3:
        my_explode[1] = 0.1
    elif len(categories_sorted) > 3:
        my_explode[2] = 0.05
ax.pie(
sorted_cosine_scores,
labels=categories_sorted,
autopct="%1.1f%%",
explode=my_explode,
)
return fig
### Text Search ###
st.sidebar.title("GloVe Twitter")
st.sidebar.markdown(
"""
    GloVe is an unsupervised learning algorithm for obtaining vector representations for words. This model was pretrained on
    2 billion tweets with a vocabulary of 1.2 million words. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
"""
)
if 'categories' not in st.session_state:
st.session_state['categories'] = "Flowers Colors Cars Weather Food"
if 'text_search' not in st.session_state:
st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now"
embeddings_model = Embeddings()
# Note: only the 50d files ship with this demo; see load_glove_embeddings
model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)
st.title("In-class coding demo")
st.subheader(
    "Enter the space-separated categories you want this search demo to cover."
)
# Categories from user input. Keep the raw string in session state so the
# text_input widget always receives a string on rerun, and split it into a
# list for downstream use.
user_categories = st.text_input(
    label="Categories", value=st.session_state.categories
)
st.session_state.categories = user_categories
categories_list = user_categories.split(" ")
print(categories_list)
st.subheader("Pass in an input word or even a sentence")
user_text_search = st.text_input(
label="Input your sentence",
value=st.session_state.text_search,
)
st.session_state.text_search = user_text_search
# Load glove embeddings
word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type)
category_embeddings = {
    category: embeddings_model.get_sentence_transformer_embedding(category)
    for category in categories_list
}
search_using_cos = Search(embeddings_model)
# Find closest word to an input word
if st.session_state.text_search:
    # Sentence transformer embeddings
    print("Sentence transformer embeddings")
embeddings_metadata = {
"word_index_dict": word_index_dict,
"embeddings": embeddings,
"model_type": model_type,
"text_search": st.session_state.text_search
}
    with st.spinner("Computing cosine similarities with the sentence transformer..."):
sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories(
st.session_state.text_search, category_embeddings
)
    # Results and pie chart for the sentence transformer embeddings
    print("Categories are: ", categories_list)
    st.subheader(
        "Closest category among: "
        + " ".join(categories_list)
        + " as per different embeddings"
    )
print(sorted_cosine_sim_transformer)
print(list(sorted_cosine_sim_transformer.keys())[0])
    st.write(
        f"Closest category using sentence transformer embeddings: {list(sorted_cosine_sim_transformer.keys())[0]}"
    )
plot_alatirchart(
{
"sentence_transformer_384": sorted_cosine_sim_transformer,
}
)
st.write("")
st.write(
"Demo developed by Kechen Liu"
)