Update app.py
app.py CHANGED
@@ -1,68 +1,119 @@
 import numpy as np
+import pandas as pd
 from gensim.models import Word2Vec
 import gradio as gr
 from sklearn.decomposition import PCA
 import plotly.graph_objects as go
 import nltk
 from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer, WordNetLemmatizer
 from nltk.tag import pos_tag

-
-
-
-nltk.download('…
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('averaged_perceptron_tagger')
+
+from docs import NOVEL_TEXT
+
+# Initialize lemmatizer and stop words
+lemmatizer = WordNetLemmatizer()
+stop_words = set(stopwords.words('english'))
+
+# Process one sentence: tokenize, drop stop words and punctuation, lemmatize
+def preprocess_sentence(text):
+    # Tokenization
+    tokens = word_tokenize(text.lower())
+
+    # Remove stop words and apply lemmatization
+    processed_tokens = [
+        lemmatizer.lemmatize(token)
+        for token in tokens if token.isalnum() and token not in stop_words
+    ]
+
+    return processed_tokens
+
+# Split the text into sentences
+sentences = nltk.sent_tokenize(NOVEL_TEXT)
+
+# Process each sentence in the corpus
+processed_corpus = [preprocess_sentence(sentence) for sentence in sentences]
+
+import random
+
+emotion_words = ['emotion', 'joy', 'fear', 'anger', 'sadness', 'disgust', 'anxiety', 'team', 'console', 'headquarters', 'feelings']
+hockey_words = ['hockey', 'game', 'team', 'skates', 'stick', 'rink', 'practice', 'championship', 'score', 'goal', 'penalty']
+memory_words = ['memory', 'sphere', 'shelves', 'life', 'experience', 'recall', 'remember', 'color', 'happy', 'sad', 'joyful']
+friend_words = ['friend', 'riley', 'grace', 'bree', 'team', 'support', 'help', 'together', 'loyal', 'fun', 'friendship']
+school_words = ['school', 'class', 'teacher', 'student', 'homework', 'study', 'exam', 'lesson', 'classmates', 'learn']
+
+# Build a synthetic training corpus: 40 pseudo-sentences per topic
+train_data = []
+
+for _ in range(40):
+    train_data.append(random.sample(emotion_words, k=random.randint(4, 6)))
+    train_data.append(random.sample(hockey_words, k=random.randint(4, 6)))
+    train_data.append(random.sample(memory_words, k=random.randint(4, 6)))
+    train_data.append(random.sample(friend_words, k=random.randint(4, 6)))
+    train_data.append(random.sample(school_words, k=random.randint(4, 6)))
+
+random.shuffle(train_data)
+

-# Word2Vec model training function
 def train_word2vec(sentences):
-    model = Word2Vec(sentences, vector_size=100, window=…
+    model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
     return model

-def preprocess_text(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        text = file.read()
-
-    # Tokenize and POS-tag
-    tokens = word_tokenize(text)
-    tagged = pos_tag(tokens)
-
-    # Keep only the nouns (NN, NNS, NNP, NNPS)
-    nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
-
-    # Deduplicate and sort
-    unique_nouns = sorted(set(nouns))
-
-    # Build trivial sentences (treat each noun as its own sentence)
-    sentences = [[noun] for noun in unique_nouns]
-
-    return sentences, unique_nouns
-
 def apply_pca(word_vectors):
     pca = PCA(n_components=3)
     return pca.fit_transform(word_vectors)

-def process_text(file_path, target_word):
-    # Preprocessing
-    sentences, unique_words = preprocess_text(file_path)
+def get_unique(model):
+    # The trained model's vocabulary (index_to_key is kept in frequency order)
+    return list(model.wv.index_to_key)
+
+def process_text(target_word):
+    # Preprocessing: train on the synthetic corpus built above
+    sentences = train_data

     # Train the Word2Vec model
     model = train_word2vec(sentences)
+    unique_words = get_unique(model)

     # Extract each word's embedding vector
     word_vectors = np.array([model.wv[word] for word in unique_words])

     # Reduce dimensionality with PCA
     word_vectors_3d = apply_pca(word_vectors)

     # Color settings (with transparency)
-    colors = ['rgba(128, 128, 128, 0.…
+    colors = ['rgba(128, 128, 128, 0.15)' if word != target_word else 'rgba(255, 0, 0, 1)' for word in unique_words]

     # Find the 10 closest words
     if target_word in model.wv:
         similar_words = model.wv.most_similar(target_word, topn=10)
         similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
         for idx in similar_word_indices:
             colors[idx] = 'rgba(0, 255, 0, 1)'  # show the closest words in green

+    # Find the 10 most distant words
+    if target_word in model.wv:
+        all_words = model.wv.index_to_key  # every word in the model
+        dissimilar_words = sorted([(word, model.wv.similarity(target_word, word))
+                                   for word in all_words if word != target_word],
+                                  key=lambda x: x[1])[:10]  # the 10 lowest-similarity words
+
+        dissimilar_word_indices = [unique_words.index(word) for word, _ in dissimilar_words]
+        for idx in dissimilar_word_indices:
+            colors[idx] = 'rgba(128, 0, 128, 1)'  # show the most distant words in purple
+
     # Create the 3D scatter plot with Plotly
     fig = go.Figure(data=[go.Scatter3d(
         x=word_vectors_3d[:, 0],
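Note: this commit replaces the upload-a-text-file pipeline with a synthetic corpus. Each random.sample call above produces one pseudo-sentence of 4 to 6 words drawn from a single topic list, so words only ever co-occur with words from their own topic, which is what lets Word2Vec pull the five topics apart in embedding space. A minimal sketch of one such pseudo-sentence (abbreviated word list; the seed is mine, app.py does not set one):

    import random

    random.seed(0)  # fixed here only to make the sketch reproducible

    hockey_words = ['hockey', 'game', 'team', 'skates', 'stick', 'rink']

    # One pseudo-sentence: 4-6 distinct words sampled from one topic list
    sentence = random.sample(hockey_words, k=random.randint(4, 6))
    print(sentence)  # e.g. ['rink', 'team', 'game', 'skates']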
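Note: the new training call pins down all the hyperparameters the old truncated line left open: 100-dimensional vectors, a context window of 3, min_count=2 (words seen fewer than twice are dropped), sg=0 (CBOW rather than skip-gram), and 100 epochs to compensate for the tiny corpus. A self-contained sketch of training and querying a model the same way, on a toy corpus of my own:

    from gensim.models import Word2Vec

    # Toy corpus in the same shape as train_data: a list of token lists
    corpus = [['hockey', 'game', 'team'], ['team', 'rink', 'score']] * 50

    model = Word2Vec(corpus, vector_size=100, window=3, min_count=2,
                     workers=4, sg=0, epochs=100)

    print(model.wv.most_similar('hockey', topn=3))  # nearest neighbours by cosine similarity
    print(model.wv.similarity('hockey', 'score'))   # pairwise cosine similarity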
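Note: apply_pca projects the (vocab_size, 100) embedding matrix onto its first three principal components so it can be plotted. The 3D view silently discards whatever variance the remaining 97 components carry, so it can be worth checking how much survives; a sketch with random data standing in for the real embeddings:

    import numpy as np
    from sklearn.decomposition import PCA

    word_vectors = np.random.rand(80, 100)  # stand-in for the embedding matrix

    pca = PCA(n_components=3)
    word_vectors_3d = pca.fit_transform(word_vectors)  # shape (80, 3)

    # Fraction of the total variance kept by the three plotted axes
    print(pca.explained_variance_ratio_.sum())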
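Note: the most-distant-words block sorts every (word, similarity) pair only to keep the bottom ten. heapq.nsmallest expresses that selection directly and avoids the full sort; a behavior-equivalent sketch, assuming model and target_word as in process_text:

    import heapq

    def least_similar(model, target_word, n=10):
        # Bottom-n vocabulary words by cosine similarity to target_word
        return heapq.nsmallest(
            n,
            ((word, model.wv.similarity(target_word, word))
             for word in model.wv.index_to_key if word != target_word),
            key=lambda pair: pair[1],
        )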
@@ -72,11 +123,11 @@ def process_text(file_path, target_word):
         text=unique_words,
         textposition="top center",
         marker=dict(
-            size=…
+            size=6,
             color=colors,
         )
     )])

     fig.update_layout(
         title="Word Embeddings 3D Visualization",
         scene=dict(
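Note: the figure construction above boils down to a single Scatter3d trace in which the per-point text and marker.color arrays carry the word labels and the red/green/purple highlighting. A stripped-down, runnable version with placeholder data:

    import numpy as np
    import plotly.graph_objects as go

    coords = np.random.rand(5, 3)        # stand-in for word_vectors_3d
    words = ['a', 'b', 'c', 'd', 'e']    # stand-in for unique_words
    colors = ['rgba(128, 128, 128, 0.15)'] * 4 + ['rgba(255, 0, 0, 1)']

    fig = go.Figure(data=[go.Scatter3d(
        x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
        mode='markers+text',             # markers plus visible word labels
        text=words, textposition="top center",
        marker=dict(size=6, color=colors),
    )])
    fig.show()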
@@ -84,34 +135,35 @@ def process_text(file_path, target_word):
             yaxis_title="PCA 2",
             zaxis_title="PCA 3"
         ),
-        width=…
-        height=…
+        width=1000,
+        height=1000
     )

     # Build the list of the 10 closest words
     similar_words_text = ""
     if target_word in model.wv:
         similar_words_text = "10 closest words:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])

     return fig, similar_words_text

+
+# Gradio interface
 with gr.Blocks() as iface:
     gr.Markdown("# Word Embedding 3D Visualization")
-    gr.Markdown("…
+    gr.Markdown("Enter a word. Word2Vec and PCA are used to render the word embeddings in 3D: the entered word is highlighted in red, its 10 most similar words in green, and the most distant words in purple. The list of similar words is shown below the plot.")
+
     with gr.Row():
-        file_input = gr.File(label="Upload a text file (.txt)", file_types=[".txt"])
+        # file_input = gr.File(label="Upload a text file (.txt)", file_types=[".txt"])
         word_input = gr.Textbox(label="Enter a word to highlight")
-
-    submit_btn = gr.Button(…
-
+        submit_btn = gr.Button("Submit")
+
     plot_output = gr.Plot(label="Word Embedding 3D Visualization")
     similar_words_output = gr.Textbox(label="Similar words")

     submit_btn.click(
         fn=process_text,
-        inputs=[…
+        inputs=[word_input],
         outputs=[plot_output, similar_words_output]
     )
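Note: neither version of the file shown in this diff calls iface.launch(); the last hunk ends right after the click binding. If the hosting environment does not start the app automatically, the usual closing lines would be (a sketch, assuming no launch options are needed):

    if __name__ == "__main__":
        iface.launch()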
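Note: after this change the NLTK pipeline at the top still lemmatizes NOVEL_TEXT into processed_corpus, but process_text trains on train_data, so the novel text is now dead code. Switching the embedded corpus back is a one-line choice inside process_text (which corpus should win is the author's call):

    def process_text(target_word):
        # Current behavior: embed the synthetic topic pseudo-sentences
        sentences = train_data
        # Alternative: embed the lemmatized sentences of NOVEL_TEXT instead
        # sentences = processed_corpus
        ...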