Unggi committed on
Commit
770e786
·
verified ·
1 Parent(s): 10805d5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -0
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from gensim.models import Word2Vec
3
+ import gradio as gr
4
+ from sklearn.decomposition import PCA
5
+ import plotly.graph_objects as go
6
+ import nltk
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.tag import pos_tag
9
+
10
# Fetch the NLTK data needed by word_tokenize and pos_tag.
# Recent NLTK releases (3.9+) look up "punkt_tab" and
# "averaged_perceptron_tagger_eng", while older releases use the original
# package names, so both sets are requested to work on either version.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)
12
+
13
def train_word2vec(sentences):
    """Build a Word2Vec model from tokenised sentences.

    Args:
        sentences: Iterable of token lists, handed directly to ``Word2Vec``.

    Returns:
        The ``Word2Vec`` instance, configured with 100-dimensional vectors,
        a context window of 5 and ``min_count=1`` (no word is dropped for
        being rare).
    """
    return Word2Vec(sentences, min_count=1, window=5, vector_size=100)
17
+
18
def preprocess_text(file_path):
    """Read a UTF-8 text file and build noun-only training sentences.

    The text is split into sentences and each sentence is reduced to its
    lower-cased nouns. Keeping nouns grouped by sentence gives Word2Vec real
    co-occurrence context; one-word "sentences" contain no context pairs, so
    the embeddings would never move away from their random initialisation.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(sentences, unique_nouns)`` tuple where ``sentences`` is a list of
        noun lists (one per source sentence that contains at least one noun)
        and ``unique_nouns`` is the sorted list of distinct nouns.
    """
    # Local import keeps this function self-contained; the module-level
    # imports only bring in word_tokenize.
    from nltk.tokenize import sent_tokenize

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = []
    for sentence in sent_tokenize(text):
        # Tokenise and POS-tag one sentence at a time.
        tagged = pos_tag(word_tokenize(sentence))

        # Keep nouns only (NN, NNS, NNP, NNPS).
        nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
        if nouns:
            sentences.append(nouns)

    # De-duplicate and sort so the vocabulary order is deterministic.
    unique_nouns = sorted({noun for nouns in sentences for noun in nouns})

    return sentences, unique_nouns
36
+
37
def apply_pca(word_vectors):
    """Project word vectors onto (up to) three principal components.

    Args:
        word_vectors: 2-D array-like of shape ``(n_words, n_features)``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_words, 3)``. When fewer than three
        components can be computed (fewer than three words or features) the
        missing columns are filled with zeros so callers can always index
        columns 0, 1 and 2.
    """
    vectors = np.asarray(word_vectors)
    if vectors.size == 0:
        return np.zeros((0, 3))

    n_samples = vectors.shape[0]
    if n_samples < 2:
        # A single point has no variance to decompose; place it at the origin.
        return np.zeros((n_samples, 3))

    # scikit-learn requires n_components <= min(n_samples, n_features).
    n_components = min(3, n_samples, vectors.shape[-1])
    reduced = PCA(n_components=n_components).fit_transform(vectors)

    if n_components < 3:
        reduced = np.pad(reduced, ((0, 0), (0, 3 - n_components)))
    return reduced
40
+
41
def process_text(file_path, target_word):
    """Create the 3D embedding plot and the nearest-word summary.

    Args:
        file_path: Text file forwarded to ``preprocess_text``.
        target_word: Word to highlight. It is stripped and lower-cased before
            lookup because the vocabulary consists of lower-cased nouns.

    Returns:
        A ``(figure, similar_words_text)`` tuple: a Plotly ``Figure`` with one
        3D marker per noun (target in red, its ten nearest neighbours in
        green, everything else in translucent grey) and a text listing of
        those neighbours with their similarity scores. The text is empty when
        the target word is not in the vocabulary.
    """
    # Preprocessing
    sentences, unique_words = preprocess_text(file_path)

    # Train the Word2Vec model
    model = train_word2vec(sentences)

    # Collect the embedding vector of every word
    word_vectors = np.array([model.wv[word] for word in unique_words])

    # Reduce dimensionality with PCA
    word_vectors_3d = apply_pca(word_vectors)

    # Normalise the query the same way the vocabulary was normalised,
    # otherwise "Apple" or " apple " would never match.
    target_word = (target_word or '').strip().lower()

    # Marker colours (with transparency for non-highlighted words)
    colors = [
        'rgba(255, 0, 0, 1)' if word == target_word else 'rgba(128, 128, 128, 0.3)'
        for word in unique_words
    ]

    # Find the 10 nearest words, colour them green and build the text list
    similar_words_text = ""
    if target_word in model.wv:
        similar_words = model.wv.most_similar(target_word, topn=10)

        # One dict lookup per neighbour instead of a linear list.index scan.
        position = {word: idx for idx, word in enumerate(unique_words)}
        for word, _ in similar_words:
            colors[position[word]] = 'rgba(0, 255, 0, 1)'

        # User-facing header string, kept exactly as in the source.
        similar_words_text = "๊ฐ€์žฅ ๊ฐ€๊นŒ์šด ๋‹จ์–ด 10๊ฐœ:\n" + "\n".join(
            [f"{word}: {score:.4f}" for word, score in similar_words]
        )

    # Build the 3D scatter plot with Plotly
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=8,
            color=colors,
        )
    )])

    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="PCA 1",
            yaxis_title="PCA 2",
            zaxis_title="PCA 3"
        ),
        width=800,
        height=800
    )

    return fig, similar_words_text
95
+
96
# Gradio interface: a file upload and a word box side by side, a submit
# button, then the 3D plot and the list of similar words underneath.
with gr.Blocks() as iface:
    # Page title and usage instructions (user-facing text kept verbatim).
    gr.Markdown("# Word Embedding 3D ์‹œ๊ฐํ™”")
    gr.Markdown("ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜๊ณ  ๊ฐ•์กฐํ•  ๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”. Word2Vec๊ณผ PCA๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋‹จ์–ด ์ž„๋ฒ ๋”ฉ์„ 3D๋กœ ์‹œ๊ฐํ™”ํ•ฉ๋‹ˆ๋‹ค. ์ž…๋ ฅํ•œ ๋‹จ์–ด๋Š” ๋นจ๊ฐ„์ƒ‰์œผ๋กœ, ๊ฐ€์žฅ ์œ ์‚ฌํ•œ 10๊ฐœ ๋‹จ์–ด๋Š” ์ดˆ๋ก์ƒ‰์œผ๋กœ ๊ฐ•์กฐ๋ฉ๋‹ˆ๋‹ค. ์œ ์‚ฌํ•œ ๋‹จ์–ด ๋ชฉ๋ก์€ ๊ทธ๋ž˜ํ”„ ์•„๋ž˜์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.")

    # Inputs share a single row.
    with gr.Row():
        file_input = gr.File(
            label="ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ (.txt)",
            file_types=[".txt"],
        )
        word_input = gr.Textbox(label="๊ฐ•์กฐํ•  ๋‹จ์–ด ์ž…๋ ฅ")

    submit_btn = gr.Button("์ œ์ถœ")

    # Outputs are stacked below the button.
    plot_output = gr.Plot(label="Word Embedding 3D ์‹œ๊ฐํ™”")
    similar_words_output = gr.Textbox(label="์œ ์‚ฌํ•œ ๋‹จ์–ด")

    # Clicking the button runs process_text on (file, word) and routes its
    # two return values to the plot and the text box.
    submit_btn.click(
        process_text,
        [file_input, word_input],
        [plot_output, similar_words_output],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()