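"""Word Embedding 3D Visualization.

Extracts nouns from an uploaded text file, trains a Word2Vec model on them,
projects the 100-dimensional embeddings down to 3D with PCA, and renders an
interactive Plotly scatter plot inside a Gradio interface. The chosen target
word is drawn in red and its ten nearest neighbours in green.
"""
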
import numpy as np
from gensim.models import Word2Vec
import gradio as gr
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Only 'punkt', 'punkt_tab', and 'averaged_perceptron_tagger' are needed for
# tokenization and POS tagging; downloading 'all' is slower but fetches every
# NLTK resource so nothing is missing.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt_tab')
nltk.download('all')

# Train a Word2Vec model on the extracted sentences
def train_word2vec(sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
    return model

def preprocess_text(file_path):
    """Read a text file and return one-noun 'sentences' plus the sorted noun vocabulary."""
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize and POS-tag the text
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    # Keep only nouns (NN, NNS, NNP, NNPS)
    nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]

    # Deduplicate and sort
    unique_nouns = sorted(set(nouns))

    # Build trivial sentences (each noun treated as its own one-word sentence)
    sentences = [[noun] for noun in unique_nouns]

    return sentences, unique_nouns

# Reduce the word vectors to three dimensions with PCA so they can be plotted
def apply_pca(word_vectors):
    pca = PCA(n_components=3)
    return pca.fit_transform(word_vectors)

def process_text(file_path, target_word):
    # gr.File may pass either a plain path string or a tempfile-like object
    # depending on the Gradio version, so resolve it to a path either way
    file_path = getattr(file_path, 'name', file_path)
    # The vocabulary is lowercased during preprocessing, so normalize the
    # target word the same way or the highlight will never match
    target_word = target_word.strip().lower()

    # Preprocess the text
    sentences, unique_words = preprocess_text(file_path)

    # Train the Word2Vec model
    model = train_word2vec(sentences)

    # Extract the embedding vector for each word
    word_vectors = np.array([model.wv[word] for word in unique_words])

    # Reduce dimensionality to 3D with PCA
    word_vectors_3d = apply_pca(word_vectors)

    # Marker colors: the target word in solid red, everything else in translucent grey
    colors = [
        'rgba(255, 0, 0, 1)' if word == target_word else 'rgba(128, 128, 128, 0.3)'
        for word in unique_words
    ]
    
    # Find the 10 words closest to the target word
    if target_word in model.wv:
        similar_words = model.wv.most_similar(target_word, topn=10)
        similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
        for idx in similar_word_indices:
            colors[idx] = 'rgba(0, 255, 0, 1)'  # highlight the nearest words in green
    
    # Build the 3D scatter plot with Plotly
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=8,
            color=colors,
        )
    )])
    
    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="PCA 1",
            yaxis_title="PCA 2",
            zaxis_title="PCA 3"
        ),
        width=800,
        height=800
    )
    
    # Build the list of the 10 nearest words
    similar_words_text = ""
    if target_word in model.wv:
        similar_words_text = "Top 10 nearest words:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])
    
    return fig, similar_words_text

# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Word Embedding 3D Visualization")
    gr.Markdown("Upload a text file (.txt) and enter a word to highlight. Word2Vec and PCA are used to visualize the word embeddings in 3D. The entered word is shown in red and its 10 most similar words in green; the list of similar words appears below the plot.")

    with gr.Row():
        file_input = gr.File(label="Upload a text file (.txt)", file_types=[".txt"])
        word_input = gr.Textbox(label="Word to highlight")

    submit_btn = gr.Button("Submit")

    plot_output = gr.Plot(label="Word Embedding 3D Visualization")
    similar_words_output = gr.Textbox(label="Similar words")
    
    submit_btn.click(
        fn=process_text,
        inputs=[file_input, word_input],
        outputs=[plot_output, similar_words_output]
    )

if __name__ == "__main__":
    iface.launch()
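
# Assumed usage (the file name is hypothetical): `python app.py` starts a local
# Gradio server. Required packages: numpy, gensim, gradio, scikit-learn, plotly, nltk.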