"""Gradio demo: word-analogy arithmetic on pre-computed word embeddings.

Given three words, the app computes ``Word3 + (Word2 - Word1)``, looks up the
nearest vocabulary entries to that point, and renders a 2-D scatter plot of
the inputs, the neighbours and the computed point.
"""

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# NOTE(security): allow_pickle=True executes pickled code on load; only ever
# point this at a trusted, locally shipped file.
model = np.load('gpt2-red-1k-words.npy', allow_pickle=True).item()

# Row i of `data` is the embedding of `keys[i]` (dict order is preserved).
data = np.asarray(list(model.values()))
keys = np.asarray(list(model.keys()))

# Image file the plot is written to and that is handed back to Gradio.
cache = "/home/user/app/d.jpg"


def find_most_similar_vectors(vector, lookup_table, k=3):
    """
    Finds the indices of the k most similar vectors in the lookup table to the given vector.

    :param vector: A 1xN numpy array (the vector to compare against others)
    :param lookup_table: An MxN numpy array (a matrix of vectors)
    :param k: How many nearest rows to return (default 3)
    :return: A list of indices of the k most similar vectors from the lookup
             table, nearest first (fewer than k if the table has fewer rows)
    """
    # Euclidean distance from the given vector to every row of the table.
    distances = np.linalg.norm(lookup_table - vector, axis=1)

    # argsort is ascending, so the first k entries are the closest rows.
    return np.argsort(distances)[:k].tolist()


def reduce_dimensions(data, method='PCA'):
    """
    Projects the rows of ``data`` onto two dimensions.

    :param data: An MxN numpy array of vectors
    :param method: 'PCA' or 'TSNE'
    :return: An Mx2 numpy array
    :raises ValueError: if ``method`` is not one of the supported names
    """
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'TSNE':
        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
    else:
        raise ValueError(f"Unsupported reduction method: {method!r} (expected 'PCA' or 'TSNE')")
    return reducer.fit_transform(data)


def plot_reduced_data(reduced_data, labels, title):
    """
    Draws a labelled 2-D scatter plot with two analogy arrows and saves it to ``cache``.

    Row layout expected by the arrows: row 0 -> row 1 is the first arrow and
    row 2 -> last row is the second arrow.

    :param reduced_data: An Mx2 (or wider; only the first two columns are used) numpy array
    :param labels: An iterable of M strings, one per row
    :param title: Title of the figure
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
        for i, label in enumerate(labels):
            ax.annotate(" " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
        ax.set_title(title)

        arrow_style = dict(arrowstyle="->", color='green', lw=3)

        # Arrow 1: from row 0 to row 1.
        start_point = (reduced_data[0, 0], reduced_data[0, 1])
        end_point = (reduced_data[1, 0], reduced_data[1, 1])
        ax.annotate('', xy=end_point, xytext=start_point, arrowprops=arrow_style)

        # Arrow 2: from row 2 to the last row.
        start_point = (reduced_data[2, 0], reduced_data[2, 1])
        end_point = (reduced_data[-1, 0], reduced_data[-1, 1])
        ax.annotate('', xy=end_point, xytext=start_point, arrowprops=arrow_style)

        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True)
        fig.savefig(cache)
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # long-running server leaks one figure per request.
        plt.close(fig)


description = """
### Word Embedding Demo App
Universidade Federal de São Paulo - Escola Paulista de Medicina

The output is Word3 + (Word2 - Word1)

Credits:
  * Gensim
  * Glove
"""

Word1 = gr.Textbox()
Word2 = gr.Textbox()
Word3 = gr.Textbox()

# Currently not wired into the interface below; kept as a module-level name.
label = gr.Label(show_label=True, label="Word4")

sp = gr.Image()


def inference(word1, word2, word3):
    """
    Computes ``word3 + (word2 - word1)`` and plots it with its nearest neighbours.

    :param word1: Word subtracted in the analogy
    :param word2: Word added in the analogy
    :param word3: Base word the difference is applied to
    :return: Path of the saved plot image (``cache``)
    :raises gr.Error: if any of the words is not in the vocabulary
    """
    missing = [w for w in (word1, word2, word3) if w not in model]
    if missing:
        # A bare KeyError would surface as a generic error in the UI.
        raise gr.Error("Word(s) not in vocabulary: " + ", ".join(repr(w) for w in missing))

    transform = model[word3] + model[word2] - model[word1]
    neighbours = keys[find_most_similar_vectors(transform[np.newaxis, ...], data)]

    # Keep the three inputs at rows 0..2 even if they repeat, because the
    # arrows in plot_reduced_data are addressed by row index; neighbours that
    # coincide with an input are skipped instead of being drawn twice.
    labels = [word1, word2, word3]
    labels.extend(str(w) for w in neighbours if w not in labels)
    vectors = [model[w] for w in labels]

    # The computed point goes last: it is the head of the second arrow.
    labels.append(word3 + " + (" + word2 + " - " + word1 + ")")
    vectors.append(transform)

    points = np.stack(vectors, axis=0)

    # Only project when the embeddings are wider than the 2-D plot; vectors
    # that are already 2-D are plotted as they are.
    # NOTE(review): the file name suggests the stored vectors may already be
    # dimensionality-reduced - confirm.
    if points.shape[1] > 2:
        points = reduce_dimensions(points, method='PCA')

    plot_reduced_data(points, labels, 'PCA Results')
    return cache


examples = [
    ["woman", "man", "aunt"],
    ["woman", "man", "girl"],
    ["woman", "man", "granddaughter"],
]

iface = gr.Interface(
    fn=inference,
    inputs=[Word1, Word2, Word3],
    outputs=sp,
    description=description,
    examples=examples
)

if __name__ == "__main__":
    iface.launch()