# word2vec / app.py
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec
import gradio as gr
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from docs import NOVEL_TEXT
def download_nltk_library():
try:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
return True
    except Exception:
return False
# Function to process each sentence
def process_text(text):
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Tokenization
tokens = word_tokenize(text.lower())
# Remove stop words and apply lemmatization
processed_tokens = [
lemmatizer.lemmatize(token)
for token in tokens if token.isalnum() and token not in stop_words
]
return processed_tokens
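# Illustrative example (not part of the app's runtime path):
# process_text("The whales were swimming near the ship.") lower-cases and
# tokenizes the text, drops punctuation and stop words ("the", "were", "near"),
# and lemmatizes what remains, yielding roughly ['whale', 'swimming', 'ship'].
# Note that WordNetLemmatizer defaults to noun lemmatization, so verb forms
# such as "swimming" are left unchanged.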
# Word2Vec model training function
def train_word2vec(sentences):
model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
return model
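# Notes on the hyperparameters above: vector_size=100 produces 100-dimensional
# embeddings, window=3 uses a context of 3 words on either side, min_count=2
# discards words that appear only once, sg=0 selects the CBOW training
# algorithm (sg=1 would select skip-gram), and training runs for 100 epochs.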
# def preprocess_text(file_path):
# with open(file_path, 'r', encoding='utf-8') as file:
# text = file.read()
#     # Tokenize and apply POS tagging
# tokens = word_tokenize(text)
# tagged = pos_tag(tokens)
#     # Extract only nouns (NN, NNS, NNP, NNPS)
# nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
#     # Deduplicate and sort
# unique_nouns = sorted(set(nouns))
#     # Build simple sentences (treat each noun as its own one-word sentence)
# sentences = [[noun] for noun in unique_nouns]
# return sentences, unique_nouns
def apply_pca(word_vectors):
pca = PCA(n_components=3)
return pca.fit_transform(word_vectors)
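# For a vocabulary of N words, apply_pca maps the (N, 100) matrix of embedding
# vectors to an (N, 3) matrix of principal-component coordinates for plotting.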
# def process_text(file_path, target_word):
def get_unique(model):
    # Return the model's vocabulary (ordered by descending frequency)
    return list(model.wv.index_to_key)
def train_model(sentence):
    # The argument is already a list of tokenized sentences
    sentences = sentence
    # Train the Word2Vec model
    model = train_word2vec(sentences)
    unique_words = get_unique(model)
    return model, unique_words
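# A minimal sketch of the offline step that would produce the "word2vec.model"
# file loaded in process_model below (assumptions: NOVEL_TEXT is a plain-text
# string and this is run once outside the Gradio app):
#
#   sentences = [process_text(s) for s in nltk.sent_tokenize(NOVEL_TEXT)]
#   model, vocab = train_model(sentences)
#   model.save("word2vec.model")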
def process_model(target_word):
    # Load the pre-trained Word2Vec model from disk
    model = Word2Vec.load("word2vec.model")
    unique_words = get_unique(model)
    # Extract the embedding vector of every word in the vocabulary
    word_vectors = np.array([model.wv[word] for word in unique_words])
    # Reduce the vectors to 3 dimensions with PCA
    word_vectors_3d = apply_pca(word_vectors)
    # Set marker colors (with transparency); the target word is highlighted in pink
    colors = ['rgba(255, 255, 255, 0.15)' if word != target_word else 'rgba(255, 20, 147, 0.9)' for word in unique_words]
    # Find the 10 closest words
    if target_word in model.wv:
        similar_words = model.wv.most_similar(target_word, topn=10)
        similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
        for idx in similar_word_indices:
            colors[idx] = 'rgba(255, 165, 0, 1)'  # highlight the closest words in orange
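    # For reference, most_similar returns (word, cosine similarity) pairs sorted
    # from most to least similar, e.g. [('sea', 0.83), ('boat', 0.79), ...]
    # (the words and scores here are purely illustrative).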
    # Find the 10 most distant words
    if target_word in model.wv:
        all_words = model.wv.index_to_key  # all words in the model's vocabulary
        dissimilar_words = sorted(
            [(word, model.wv.similarity(target_word, word)) for word in all_words if word != target_word],
            key=lambda x: x[1]
        )[:10]  # keep the 10 words with the lowest similarity
        dissimilar_word_indices = [unique_words.index(word) for word, _ in dissimilar_words]
        for idx in dissimilar_word_indices:
            colors[idx] = 'rgba(138, 43, 226, 0.8)'  # highlight the most distant words in purple
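    # model.wv.similarity is the cosine similarity in [-1, 1], so sorting in
    # ascending order and taking the first 10 entries selects the words whose
    # embeddings point furthest away from the target word.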
    # Create a 3D scatter plot of the embeddings with Plotly
fig = go.Figure(data=[go.Scatter3d(
x=word_vectors_3d[:, 0],
y=word_vectors_3d[:, 1],
z=word_vectors_3d[:, 2],
mode='markers+text',
text=unique_words,
textposition="top center",
marker=dict(
size=4,
color=colors,
)
)])
fig.update_layout(
title="Word Embeddings 3D Visualization",
scene=dict(
xaxis_title="X",
yaxis_title="Y",
zaxis_title="Z"
),
width=800,
height=800
)
    # Build the text lists of the 10 closest and 10 most distant words
    similar_words_text = ""
    if target_word in model.wv:
        similar_words_text = "10 closest words:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])
    dissimilar_words_text = ""
    if target_word in model.wv:
        dissimilar_words_text = "10 most distant words:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in dissimilar_words])
return fig, similar_words_text, dissimilar_words_text
def change_button_state_true():
    # Return a Gradio update that enables the component wired as the output
    return gr.update(interactive=True)

def change_button_state_false():
    # Return a Gradio update that disables the component wired as the output
    return gr.update(interactive=False)
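# Note: gr.update returns a property-update object that Gradio applies to
# whichever component is listed in the click handler's `outputs`; wiring
# change_button_state_true with outputs=submit_btn (as done below) therefore
# re-enables the submit button once the model has been loaded.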
# Gradio interface
with gr.Blocks(css="#plot-box {width: 70%; height: 500px;}") as iface:
gr.Markdown("# Word Embedding 3D ์‹œ๊ฐํ™”")
gr.Markdown("๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”. Word2Vec๊ณผ PCA๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋‹จ์–ด ์ž„๋ฒ ๋”ฉ์„ 3D๋กœ ์‹œ๊ฐํ™”ํ•ฉ๋‹ˆ๋‹ค. ์ž…๋ ฅํ•œ ๋‹จ์–ด๋Š” ๋นจ๊ฐ„์ƒ‰์œผ๋กœ, ๊ฐ€์žฅ ์œ ์‚ฌํ•œ 10๊ฐœ ๋‹จ์–ด๋Š” ์ดˆ๋ก์ƒ‰, ๊ฐ€์žฅ ๋จผ ๋‹จ์–ด๋Š” ๋ณด๋ผ์ƒ‰์œผ๋กœ ๊ฐ•์กฐ๋ฉ๋‹ˆ๋‹ค. ์œ ์‚ฌํ•œ ๋‹จ์–ด ๋ชฉ๋ก์€ ๊ทธ๋ž˜ํ”„ ์•„๋ž˜์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.")
download_nltk_library()
with gr.Row():
        word_input = gr.Textbox(label="Word to highlight", elem_id="input-box", placeholder="Enter a word", lines=1, interactive=False)
with gr.Column(scale=1):
            # Style tweak to emphasize the user input box
            # word_input = gr.Textbox(label="Word to highlight", elem_id="input-box", placeholder="Enter a word", lines=1)
            load_btn = gr.Button("Load Model", elem_id="load-btn")
            submit_btn = gr.Button("Submit Word", elem_id="submit-btn", interactive=False)
with gr.Row():
        # Enlarge the visualization area via CSS
        plot_output = gr.Plot(label="Word Embedding 3D Visualization", elem_id="plot-box")
        with gr.Column(scale=0.3):  # lower the scale value to narrow this column
            similar_words_output = gr.Textbox(label="Similar Words", interactive=False, lines=5)
            dissimilar_words_output = gr.Textbox(label="Dissimilar Words", interactive=False, lines=5)
load_btn.click(
fn=process_model,
inputs=[word_input],
outputs=[plot_output, similar_words_output, dissimilar_words_output]
)
load_btn.click(
fn=change_button_state_true,
outputs=submit_btn
)
load_btn.click(
fn=change_button_state_true,
outputs=word_input
)
submit_btn.click(
fn=process_model,
inputs=[word_input],
outputs=[plot_output, similar_words_output, dissimilar_words_output]
)
submit_btn.click(
fn=change_button_state_false,
outputs=load_btn
)
if __name__ == "__main__":
iface.launch()