# word2vec / app.py
# kkosmi's picture
# Update app.py
# ff648fb verified
# raw
# history blame
# 7.39 kB
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec
import gradio as gr
from sklearn.decomposition import PCA
import plotly.graph_objects as go
# Word2Vec model training function
def train_word2vec(sentences):
    """Train a Word2Vec model on ``sentences`` and return it.

    The model uses 50-dimensional vectors, a context window of 4, keeps
    every word (``min_count=1``), selects CBOW training (``sg=0``) and runs
    for 100 epochs.  An earlier configuration (``vector_size=100``,
    ``min_count=6``, ``workers=4``) is no longer used.

    Args:
        sentences: Training corpus handed straight to ``Word2Vec``
            (presumably an iterable of token lists -- confirm with caller).

    Returns:
        The trained ``Word2Vec`` model.
    """
    hyperparameters = {
        "vector_size": 50,
        "window": 4,
        "min_count": 1,
        "sg": 0,
        "epochs": 100,
    }
    return Word2Vec(sentences, **hyperparameters)
def apply_pca(word_vectors):
    """Reduce ``word_vectors`` to three PCA components.

    Args:
        word_vectors: Array-like of embedding vectors, one row per word.

    Returns:
        The result of fitting a 3-component PCA on ``word_vectors`` and
        transforming the same data.
    """
    return PCA(n_components=3).fit_transform(word_vectors)
def get_unique(model):
    """Return the model's vocabulary as a new list.

    Args:
        model: Object exposing ``wv.index_to_key`` (a trained Word2Vec
            model in this file).

    Returns:
        A fresh list holding the entries of ``model.wv.index_to_key`` in
        the same order, so callers may mutate it without touching the model.
    """
    # One copy is enough; the original copied the list twice.
    return list(model.wv.index_to_key)
def train_model(sentence):
    """Train a Word2Vec model and list its vocabulary.

    No preprocessing is applied here: ``sentence`` is passed to
    ``train_word2vec`` unchanged.

    Args:
        sentence: Training corpus forwarded to ``train_word2vec``.

    Returns:
        A ``(model, unique_words)`` tuple, where ``unique_words`` is the
        vocabulary list produced by ``get_unique``.
    """
    trained = train_word2vec(sentence)
    return trained, get_unique(trained)
def process_text(target_word):
    """Build the 3D embedding plot and neighbour lists for ``target_word``.

    The saved model is loaded from ``word2vec.model``, every vocabulary
    vector is projected to three dimensions with ``apply_pca``, and the
    points are coloured: the target word in pink, its 10 most similar words
    in orange and its 10 least similar words in purple.  Purple is applied
    last, so it wins if a word falls into both groups.

    Args:
        target_word: Word typed by the user.  It is stripped and
            lower-cased before the lookup; ``None`` is treated as an empty
            string.

    Returns:
        A ``(figure, similar_text, dissimilar_text)`` tuple.  Each text
        value holds one ``word: score`` line per neighbour.  Both are empty
        strings when the word is not in the vocabulary (previously that
        case raised ``UnboundLocalError`` because of a misspelled variable).
    """
    target_word = (target_word or "").strip().lower()

    # Load the trained model and its vocabulary.
    model = Word2Vec.load("word2vec.model")
    vectors = model.wv
    unique_words = get_unique(model)
    # Word -> row index, so highlighting does not rescan the list per word.
    position = {word: idx for idx, word in enumerate(unique_words)}

    # Embedding vector of every word, reduced to 3 dimensions with PCA.
    word_vectors = np.array([vectors[word] for word in unique_words])
    word_vectors_3d = apply_pca(word_vectors)

    # Default colour is a faint white; the target word stands out in pink.
    colors = [
        'rgba(255, 20, 147, 0.9)' if word == target_word else 'rgba(255, 255, 255, 0.15)'
        for word in unique_words
    ]

    similar_words = []
    dissimilar_words = []
    if target_word in vectors:
        # The 10 closest words are shown in orange.
        similar_words = vectors.most_similar(target_word, topn=10)
        for word, _ in similar_words:
            colors[position[word]] = 'rgba(255, 165, 0, 1)'

        # The 10 words with the lowest similarity are shown in purple.
        dissimilar_words = sorted(
            (
                (word, vectors.similarity(target_word, word))
                for word in unique_words
                if word != target_word
            ),
            key=lambda pair: pair[1],
        )[:10]
        for word, _ in dissimilar_words:
            colors[position[word]] = 'rgba(138, 43, 226, 0.8)'

    # 3D scatter plot of the whole vocabulary.
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=4,
            color=colors,
        )
    )])
    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="X",
            yaxis_title="Y",
            zaxis_title="Z"
        ),
        width=1100,
        height=900
    )

    # Empty neighbour lists simply produce empty strings.
    similar_words_text = "\n".join(
        f"{word}: {score:.4f}" for word, score in similar_words
    )
    dissimilar_words_text = "\n".join(
        f"{word}: {score:.4f}" for word, score in dissimilar_words
    )
    return fig, similar_words_text, dissimilar_words_text
# Gradio interface (revised)
# Gradio UI: a word input with a submit button and a usage note on top, and
# below them the 3D plot next to the two neighbour lists.
# NOTE(review): the non-ASCII text in the CSS comments, labels and help text
# looks mis-encoded (UTF-8 read with the wrong codec); it is reproduced
# unchanged here -- restore it from the original file if possible.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from syntax because the original indentation was lost -- confirm it.
with gr.Blocks(css="""
#input-box {
background-color: #ffeef3; /* ์—ฐํ•œ ํŒŒ์Šคํ…” ํ•‘ํฌ */
border: 2px solid #ffccd5; /* ์—ฐํ•œ ํ•‘ํฌ ํ…Œ๋‘๋ฆฌ */
color: #000; /* ํ…์ŠคํŠธ ์ƒ‰์ƒ */
border-radius: 8px; /* ๋‘ฅ๊ทผ ํ…Œ๋‘๋ฆฌ */
}
#submit-btn {
background-color: #ebfbea; /* ์—ฐํ•œ ํŒŒ์Šคํ…” ์—ฐ๋‘์ƒ‰ */
border: 2px solid #d6f5d6; /* ์—ฐํ•œ ์—ฐ๋‘์ƒ‰ ํ…Œ๋‘๋ฆฌ */
color: #000; /* ํ…์ŠคํŠธ ์ƒ‰์ƒ */
border-radius: 8px; /* ๋‘ฅ๊ทผ ํ…Œ๋‘๋ฆฌ */
}
#bulletin {
background-color: #eaf9ff; /* ์—ฐํ•œ ํŒŒ์Šคํ…” ํ•˜๋Š˜์ƒ‰ */
border: 2px solid #d3f0f7; /* ์—ฐํ•œ ํ•˜๋Š˜์ƒ‰ ํ…Œ๋‘๋ฆฌ */
color: #000; /* ํ…์ŠคํŠธ ์ƒ‰์ƒ */
border-radius: 8px; /* ๋‘ฅ๊ทผ ํ…Œ๋‘๋ฆฌ */
}
#similar-words {
background-color: #fff0e6; /* ์—ฐํ•œ ํŒŒ์Šคํ…” ์ฃผํ™ฉ์ƒ‰ */
border: 2px solid #ffe3cc; /* ์—ฐํ•œ ์ฃผํ™ฉ ํ…Œ๋‘๋ฆฌ */
color: #000; /* ํ…์ŠคํŠธ ์ƒ‰์ƒ */
border-radius: 8px; /* ๋‘ฅ๊ทผ ํ…Œ๋‘๋ฆฌ */
}
#dissimilar-words {
background-color: #f2e6ff; /* ์—ฐํ•œ ํŒŒ์Šคํ…” ๋ณด๋ผ์ƒ‰ */
border: 2px solid #e0ccff; /* ์—ฐํ•œ ๋ณด๋ผ ํ…Œ๋‘๋ฆฌ */
color: #000; /* ํ…์ŠคํŠธ ์ƒ‰์ƒ */
border-radius: 8px; /* ๋‘ฅ๊ทผ ํ…Œ๋‘๋ฆฌ */
}
label {
font-weight: bold; /* ์ œ๋ชฉ ๋ณผ๋“œ์ฒด */
}
""") as iface:
    gr.Markdown("# <Inside Out 2> ๋‹จ์–ด ์˜๋ฏธ ์ง€๋„ 3D ์‹œ๊ฐํ™”")
    with gr.Row():
        # The input box is styled (see #input-box) to make it stand out.
        with gr.Column():
            word_input = gr.Textbox(
                label="**๋‹จ์–ด ์ž…๋ ฅ**",
                elem_id="input-box",
                placeholder="ex. emotion, puberty, hockey, friend, anxiety, memory, ...",
                lines=1
            )
            submit_btn = gr.Button("์ œ์ถœ", elem_id="submit-btn")
        # Read-only usage instructions.
        bulletin = gr.Textbox(
            label="์‚ฌ์šฉ๋ฒ• ์•ˆ๋‚ด",
            interactive=False,
            lines=4,
            value=(
                "1. ์†Œ์„ค์— ๋‚˜์˜จ ๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•˜๊ณ  [์ œ์ถœ]์ด๋‚˜ [Enter]๋ฅผ ๋ˆ„๋ฅด์„ธ์š”\n"
                "2. ์ž…๋ ฅ ๋‹จ์–ด๋Š” ๋นจ๊ฐ„์ƒ‰, ๊ฐ€๊นŒ์šด ๋‹จ์–ด๋“ค์€ ์ฃผํ™ฉ์ƒ‰, ๋จผ ๋‹จ์–ด๋“ค์€ ๋ณด๋ผ์ƒ‰์œผ๋กœ ๊ฐ•์กฐ๋ฉ๋‹ˆ๋‹ค.\n"
                "3. <Error>๊ฐ€ ๋‚˜ํƒ€๋‚˜๋Š” ๊ฒฝ์šฐ, ๋‹ค๋ฅธ ๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•ด๋ณด์„ธ์š”.\n"
                "4. ๋งˆ์šฐ์Šค ๋“œ๋ž˜๊ทธ ๋ฐ ์Šคํฌ๋กค์„ ํ™œ์šฉํ•˜์—ฌ 3D ํ™”๋ฉด์„ ์‚ดํŽด๋ณด์„ธ์š”.\n"
                "5. ๋‹จ์–ด ์ž…๋ ฅ์ฐฝ์— ๋‹ค๋ฅธ ๋‹จ์–ด๋“ค๋„ ์ž…๋ ฅํ•ด๋ณด์„ธ์š”."
            ),
            elem_id="bulletin"
        )
    with gr.Row():
        # The original comment says the plot is enlarged through CSS, but the
        # stylesheet above has no rule for "plot-box".
        plot_output = gr.Plot(label="Word Embedding 3D ์‹œ๊ฐํ™”", elem_id="plot-box")
        # A low scale keeps the neighbour-list column narrow.
        # NOTE(review): Gradio documents ``scale`` as an integer -- confirm
        # that 0.3 behaves as intended on the installed version.
        with gr.Column(scale=0.3):
            similar_words_output = gr.Textbox(
                label="๊ฐ€์žฅ ๊ฐ€๊นŒ์šด ๋‹จ์–ด 10๊ฐœ",
                interactive=False,
                lines=5,
                elem_id="similar-words"
            )
            dissimilar_words_output = gr.Textbox(
                label="๊ฐ€์žฅ ๋จผ ๋‹จ์–ด 10๊ฐœ",
                interactive=False,
                lines=5,
                elem_id="dissimilar-words"
            )
    # Clicking the button and pressing Enter in the textbox run the same
    # handler with the same inputs and outputs.
    submit_btn.click(
        fn=process_text,
        inputs=[word_input],
        outputs=[plot_output, similar_words_output, dissimilar_words_output]
    )
    # ``preprocess`` is a boolean flag, not a hook: the lambda formerly passed
    # here was never called.  Lower-casing happens inside ``process_text``.
    word_input.submit(
        fn=process_text,
        inputs=[word_input],
        outputs=[plot_output, similar_words_output, dissimilar_words_output]
    )

if __name__ == "__main__":
    iface.launch()