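"""Gradio demo: word-analogy arithmetic with word2vec embeddings.

Given Word1, Word2 and Word3, the app computes Word3 + (Word2 - Word1),
looks up the nearest neighbours of the resulting vector, and returns a
2-D PCA plot of all the words involved.
"""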
import gensim.downloader
import gradio as gr
import numpy as np
import matplotlib
matplotlib.use("Agg")  # headless backend so the plot can be rendered without a display
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Pretrained embeddings; a lighter-weight alternative is "glove-wiki-gigaword-50".
model = gensim.downloader.load("word2vec-google-news-300")

# Where the generated plot is written; PNG so the browser can display it via gr.Image.
cache = "/home/user/app/d.png"

# Reduce high-dimensional embeddings to 2-D for plotting.
def reduce_dimensions(data, method='PCA'):
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'TSNE':
        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=4)
    else:
        raise ValueError(f"Unknown reduction method: {method}")
    return reducer.fit_transform(data)

# Plot the 2-D points, label them, and draw the two analogy arrows.
def plot_reduced_data(reduced_data, labels, title):
    plt.figure(figsize=(10, 8))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
    for i, label in enumerate(labels):
        plt.annotate("  " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
    plt.title(title)

    # Arrow 1: from Word1 (index 0) to Word2 (index 1)
    start_point = (reduced_data[0, 0], reduced_data[0, 1])
    end_point = (reduced_data[1, 0], reduced_data[1, 1])
    plt.annotate('', xy=end_point, xytext=start_point,
                 arrowprops=dict(arrowstyle="->", color='green', lw=3))

    # Arrow 2: from Word3 (index 2) to the analogy result (last point)
    start_point = (reduced_data[2, 0], reduced_data[2, 1])
    end_point = (reduced_data[-1, 0], reduced_data[-1, 1])
    plt.annotate('', xy=end_point, xytext=start_point,
                 arrowprops=dict(arrowstyle="->", color='green', lw=3))

    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(True)
    plt.savefig(cache, dpi=300)
    plt.close()  # free the figure so repeated calls do not accumulate memory

description = """
### Word Embedding Demo App
Universidade Federal de São Paulo - Escola Paulista de Medicina

The output is the word whose embedding is closest to Word3 + (Word2 - Word1), plotted in 2-D (PCA) together with its nearest neighbours.

Credits:
* Gensim
* Word2Vec
"""

Word1 = gr.Textbox(label="Word1")
Word2 = gr.Textbox(label="Word2")
Word3 = gr.Textbox(label="Word3")
label = gr.Label(show_label=True, label="Word4")  # defined but not wired into the interface below
sp = gr.Image(label="PCA plot")


def inference(word1, word2, word3):
    # Analogy arithmetic: Word3 + (Word2 - Word1)
    transform = model[word3] + model[word2] - model[word1]
    # Nearest neighbours of the resulting vector
    output = model.similar_by_vector(transform)
    print(output)
    word_list = [word1, word2, word3]
    word_list.extend([word for word, _ in output[:6]])
    words = {key: model[key] for key in word_list}
    words[f"{word3} + ({word2} - {word1})"] = transform
    data = np.stack(list(words.values()), axis=0)
    print(data.shape)
    labels = list(words.keys())
    reduced_data_pca = reduce_dimensions(data, method='PCA')
    print(reduced_data_pca.shape)
    plot_reduced_data(reduced_data_pca, labels, 'PCA Results')
    return cache

examples = [
    ["woman", "man", "girl"],  
    ["woman", "man", "granddaughter"],
    ["woman", "man", "aunt"],
]

iface = gr.Interface(
    fn=inference,
    inputs=[Word1, Word2, Word3],
    outputs=sp,
    description=description,
    examples=examples,
)

iface.launch()
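# When running locally, iface.launch(share=True) can optionally be used to get a temporary public link.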