# word2vec / app.py
import random

import numpy as np
import gradio as gr
import plotly.graph_objects as go
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from docs import NOVEL_TEXT
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Function to process each sentence
def process_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stop words and apply lemmatization
    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens if token.isalnum() and token not in stop_words
    ]
    return processed_tokens
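# Illustrative example (not from the novel): process_text("Joy felt a surge of happiness.")
# returns ['joy', 'felt', 'surge', 'happiness'] after lowercasing, punctuation and
# stop-word removal, and lemmatization.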
# Split text into sentences
sentences = nltk.sent_tokenize(NOVEL_TEXT)
# Process each sentence in the corpus
processed_corpus = [process_text(sentence) for sentence in sentences]
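# processed_corpus is a list of token lists, the iterable-of-sentences
# format that gensim's Word2Vec expects as training input.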
emotion_words = ['emotion', 'joy', 'fear', 'anger', 'sadness', 'disgust', 'anxiety', 'team', 'console', 'headquarters', 'feelings']
hockey_words = ['hockey', 'game', 'team', 'skates', 'stick', 'rink', 'practice', 'championship', 'score', 'goal', 'penalty']
memory_words = ['memory', 'sphere', 'shelves', 'life', 'experience', 'recall', 'remember', 'color', 'happy', 'sad', 'joyful']
friend_words = ['friend', 'riley', 'grace', 'bree', 'team', 'support', 'help', 'together', 'loyal', 'fun', 'friendship']
school_words = ['school', 'class', 'teacher', 'student', 'homework', 'study', 'exam', 'lesson', 'classmates', 'learn']
train_data = []
for _ in range(40):
    train_data.append(random.sample(emotion_words, k=random.randint(4, 6)))
    train_data.append(random.sample(hockey_words, k=random.randint(4, 6)))
    train_data.append(random.sample(memory_words, k=random.randint(4, 6)))
    train_data.append(random.sample(friend_words, k=random.randint(4, 6)))
    train_data.append(random.sample(school_words, k=random.randint(4, 6)))
random.shuffle(train_data)
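# Each pseudo-sentence above samples 4-6 words from a single theme, so
# Word2Vec sees same-theme words co-occurring inside one context window.
# This nudges words from the same theme toward each other in embedding space.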
def train_word2vec(sentences):
    model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
    return model
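# Hyperparameters: 100-dimensional vectors, a context window of 3 tokens,
# min_count=2 drops words seen fewer than twice, and sg=0 selects the CBOW
# architecture. A quick sanity check (illustrative), e.g. in a REPL:
#   m = train_word2vec(processed_corpus + train_data)
#   m.wv.most_similar('joy', topn=5)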
def apply_pca(word_vectors):
    pca = PCA(n_components=3)
    return pca.fit_transform(word_vectors)
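# PCA projects the 100-dimensional embeddings onto the 3 directions of
# greatest variance so they can be drawn in a 3D scatter plot. Relative
# distances are only approximately preserved by this projection.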
def get_unique(model):
    return list(model.wv.index_to_key)
def visualize_word(target_word):
    # Train on the processed novel corpus plus the synthetic themed sentences.
    # (The original referenced an undefined `X` here; combining the two
    # prepared datasets is an assumption that keeps both in use.)
    sentences = processed_corpus + train_data
    # Train the Word2Vec model
    model = train_word2vec(sentences)
    unique_words = get_unique(model)
    # Extract the embedding vector for each word
    word_vectors = np.array([model.wv[word] for word in unique_words])
    # Reduce dimensionality with PCA
    word_vectors_3d = apply_pca(word_vectors)
    # Set colors (with transparency); the target word is opaque red
    colors = ['rgba(128, 128, 128, 0.15)' if word != target_word else 'rgba(255, 0, 0, 1)' for word in unique_words]
    similar_words = []
    if target_word in model.wv:
        # Find the 10 nearest words and mark them green
        similar_words = model.wv.most_similar(target_word, topn=10)
        similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
        for idx in similar_word_indices:
            colors[idx] = 'rgba(0, 255, 0, 1)'
        # Find the 10 farthest words (lowest cosine similarity) and mark them purple
        dissimilar_words = sorted(
            [(word, model.wv.similarity(target_word, word))
             for word in unique_words if word != target_word],
            key=lambda x: x[1])[:10]
        dissimilar_word_indices = [unique_words.index(word) for word, _ in dissimilar_words]
        for idx in dissimilar_word_indices:
            colors[idx] = 'rgba(128, 0, 128, 1)'
    # Build the 3D scatter plot with Plotly
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=6,
            color=colors,
        )
    )])
    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="PCA 1",
            yaxis_title="PCA 2",
            zaxis_title="PCA 3"
        ),
        width=1000,
        height=1000
    )
    # List the 10 nearest words, or report an out-of-vocabulary input
    if target_word in model.wv:
        similar_words_text = "10 most similar words:\n" + "\n".join(f"{word}: {score:.4f}" for word, score in similar_words)
    else:
        similar_words_text = f"'{target_word}' is not in the model vocabulary."
    return fig, similar_words_text
# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Word Embedding 3D Visualization")
    gr.Markdown("Enter a word. Word2Vec and PCA are used to visualize the word embeddings in 3D. The word you enter is highlighted in red, its 10 most similar words in green, and the 10 most distant words in purple. The list of similar words is shown below the plot.")
    with gr.Row():
        # file_input = gr.File(label="Upload a text file (.txt)", file_types=[".txt"])
        word_input = gr.Textbox(label="Word to highlight")
    submit_btn = gr.Button("Submit")
    plot_output = gr.Plot(label="Word Embedding 3D Visualization")
    similar_words_output = gr.Textbox(label="Similar words")
    submit_btn.click(
        fn=visualize_word,
        inputs=[word_input],
        outputs=[plot_output, similar_words_output]
    )

if __name__ == "__main__":
    iface.launch()