# word2vec / app.py
import numpy as np
from gensim.models import Word2Vec
import gradio as gr
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
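# Note (assumption): newer NLTK releases may additionally require the
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' resources for
# word_tokenize/pos_tag; downloading them as well is harmless if unused.
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')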
# Train a Word2Vec model on the tokenized sentences
def train_word2vec(sentences):
    # 100-dimensional vectors, context window of 5, keep every word (min_count=1)
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
    return model
# Read a text file, extract its nouns, and build one-word "sentences" for training
def preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Tokenize and POS-tag the text
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    # Keep only nouns (NN, NNS, NNP, NNPS)
    nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
    # Deduplicate and sort
    unique_nouns = sorted(set(nouns))
    # Build simple sentences (each noun treated as its own sentence)
    sentences = [[noun] for noun in unique_nouns]
    return sentences, unique_nouns
# Reduce the 100-dimensional word vectors to 3 dimensions for plotting
def apply_pca(word_vectors):
    pca = PCA(n_components=3)
    return pca.fit_transform(word_vectors)
# Build the 3D visualization and the nearest-neighbor list for the target word
def process_text(file_path, target_word):
    # Preprocess the uploaded file
    sentences, unique_words = preprocess_text(file_path)
    # Train the Word2Vec model
    model = train_word2vec(sentences)
    # Extract the embedding vector for each word
    word_vectors = np.array([model.wv[word] for word in unique_words])
    # Reduce dimensionality with PCA
    word_vectors_3d = apply_pca(word_vectors)
    # Set marker colors (semi-transparent grey, with the target word in solid red)
    colors = ['rgba(128, 128, 128, 0.3)' if word != target_word else 'rgba(255, 0, 0, 1)' for word in unique_words]
    # Find the 10 most similar words
    if target_word in model.wv:
        similar_words = model.wv.most_similar(target_word, topn=10)
        similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
        for idx in similar_word_indices:
            colors[idx] = 'rgba(0, 255, 0, 1)'  # highlight the nearest neighbors in green
    # Create the 3D scatter plot with Plotly
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=8,
            color=colors,
        )
    )])
    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="PCA 1",
            yaxis_title="PCA 2",
            zaxis_title="PCA 3"
        ),
        width=800,
        height=800
    )
    # Build the list of the 10 most similar words
    similar_words_text = ""
    if target_word in model.wv:
        similar_words_text = "10 most similar words:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])
    return fig, similar_words_text
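# A minimal usage sketch (hypothetical): process_text can be exercised directly
# without the Gradio UI, assuming a plain-text file named "sample.txt" exists in
# the working directory and contains the noun "science".
#
#   fig, similar = process_text("sample.txt", "science")
#   fig.show()
#   print(similar)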
# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Word Embedding 3D Visualization")
    gr.Markdown("Upload a text file (.txt) and enter a word to highlight. Word2Vec and PCA are used to visualize the word embeddings in 3D. The entered word is highlighted in red and its 10 most similar words in green. The list of similar words is shown below the plot.")
    with gr.Row():
        file_input = gr.File(label="Upload a text file (.txt)", file_types=[".txt"])
        word_input = gr.Textbox(label="Word to highlight")
    submit_btn = gr.Button("Submit")
    plot_output = gr.Plot(label="Word Embedding 3D Visualization")
    similar_words_output = gr.Textbox(label="Similar words")
    submit_btn.click(
        fn=process_text,
        inputs=[file_input, word_input],
        outputs=[plot_output, similar_words_output]
    )
if __name__ == "__main__":
    iface.launch()
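# Running locally (a sketch, assuming gensim, gradio, scikit-learn, plotly and
# nltk are installed): `python app.py` starts the app, and Gradio serves it at
# http://127.0.0.1:7860 by default.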