kkosmi committed on
Commit
aca3b1d
·
verified ·
1 Parent(s): ded3781

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -49
app.py CHANGED
@@ -1,68 +1,119 @@
1
  import numpy as np
 
2
  from gensim.models import Word2Vec
3
  import gradio as gr
4
  from sklearn.decomposition import PCA
5
  import plotly.graph_objects as go
6
  import nltk
7
  from nltk.tokenize import word_tokenize
 
 
 
8
  from nltk.tag import pos_tag
9
 
10
- # nltk.download('punkt')
11
- # nltk.download('averaged_perceptron_tagger')
12
- # nltk.download('punkt_tab')
13
- nltk.download('all')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # Word2Vec model training function
16
  def train_word2vec(sentences):
17
- model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
18
  return model
19
 
20
- def preprocess_text(file_path):
21
- with open(file_path, 'r', encoding='utf-8') as file:
22
- text = file.read()
23
-
24
- # Tokenize and POS-tag
25
- tokens = word_tokenize(text)
26
- tagged = pos_tag(tokens)
27
-
28
- # Extract nouns only (NN, NNS, NNP, NNPS)
29
- nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
30
-
31
- # Remove duplicates and sort
32
- unique_nouns = sorted(set(nouns))
33
-
34
- # Build simple sentences (treat each noun as its own sentence)
35
- sentences = [[noun] for noun in unique_nouns]
36
-
37
- return sentences, unique_nouns
38
-
39
  def apply_pca(word_vectors):
40
  pca = PCA(n_components=3)
41
  return pca.fit_transform(word_vectors)
42
 
43
- def process_text(file_path, target_word):
44
- # Preprocessing
45
- sentences, unique_words = preprocess_text(file_path)
46
 
 
 
 
 
 
 
 
 
 
 
 
47
  # Train the Word2Vec model
48
  model = train_word2vec(sentences)
49
-
 
50
  # Extract the embedding vector of each word
51
  word_vectors = np.array([model.wv[word] for word in unique_words])
52
-
53
  # Reduce dimensionality with PCA
54
  word_vectors_3d = apply_pca(word_vectors)
55
-
56
  # Set colors (with transparency)
57
- colors = ['rgba(128, 128, 128, 0.3)' if word != target_word else 'rgba(255, 0, 0, 1)' for word in unique_words]
58
-
59
  # Find the 10 nearest words
60
  if target_word in model.wv:
61
  similar_words = model.wv.most_similar(target_word, topn=10)
62
  similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
63
  for idx in similar_word_indices:
64
  colors[idx] = 'rgba(0, 255, 0, 1)' # Mark the nearest words in green
65
-
 
 
 
 
 
 
 
 
 
 
 
66
  # Create a 3D scatter plot with Plotly
67
  fig = go.Figure(data=[go.Scatter3d(
68
  x=word_vectors_3d[:, 0],
@@ -72,11 +123,11 @@ def process_text(file_path, target_word):
72
  text=unique_words,
73
  textposition="top center",
74
  marker=dict(
75
- size=8,
76
  color=colors,
77
  )
78
  )])
79
-
80
  fig.update_layout(
81
  title="Word Embeddings 3D Visualization",
82
  scene=dict(
@@ -84,34 +135,35 @@ def process_text(file_path, target_word):
84
  yaxis_title="PCA 2",
85
  zaxis_title="PCA 3"
86
  ),
87
- width=800,
88
- height=800
89
  )
90
-
91
  # Build the list of the 10 nearest words
92
  similar_words_text = ""
93
  if target_word in model.wv:
94
  similar_words_text = "가장 가까운 단어 10개:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])
95
-
96
  return fig, similar_words_text
97
 
98
- # Gradio interface (modified)
 
99
  with gr.Blocks() as iface:
100
  gr.Markdown("# Word Embedding 3D 시각화")
101
- gr.Markdown("ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜๊ณ  ๊ฐ•์กฐํ•  ๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”. Word2Vec๊ณผ PCA๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋‹จ์–ด ์ž„๋ฒ ๋”ฉ์„ 3D๋กœ ์‹œ๊ฐํ™”ํ•ฉ๋‹ˆ๋‹ค. ์ž…๋ ฅํ•œ ๋‹จ์–ด๋Š” ๋นจ๊ฐ„์ƒ‰์œผ๋กœ, ๊ฐ€์žฅ ์œ ์‚ฌํ•œ 10๊ฐœ ๋‹จ์–ด๋Š” ์ดˆ๋ก์ƒ‰์œผ๋กœ ๊ฐ•์กฐ๋ฉ๋‹ˆ๋‹ค. ์œ ์‚ฌํ•œ ๋‹จ์–ด ๋ชฉ๋ก์€ ๊ทธ๋ž˜ํ”„ ์•„๋ž˜์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.")
102
-
103
  with gr.Row():
104
- file_input = gr.File(label="텍스트 파일 업로드 (.txt)", file_types=[".txt"])
105
  word_input = gr.Textbox(label="강조할 단어 입력")
106
-
107
- submit_btn = gr.Button("제출")
108
-
109
  plot_output = gr.Plot(label="Word Embedding 3D 시각화")
110
  similar_words_output = gr.Textbox(label="유사한 단어")
111
-
112
  submit_btn.click(
113
  fn=process_text,
114
- inputs=[file_input, word_input],
115
  outputs=[plot_output, similar_words_output]
116
  )
117
 
 
1
  import numpy as np
2
+ import pandas as pd
3
  from gensim.models import Word2Vec
4
  import gradio as gr
5
  from sklearn.decomposition import PCA
6
  import plotly.graph_objects as go
7
  import nltk
8
  from nltk.tokenize import word_tokenize
9
+ import nltk
10
+ from nltk.corpus import stopwords
11
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
12
  from nltk.tag import pos_tag
13
 
14
+ nltk.download('punkt')
15
+ nltk.download('stopwords')
16
+ nltk.download('wordnet')
17
+ nltk.download('averaged_perceptron_tagger')
18
+
19
+ from docs import NOVEL_TEXT
20
+
21
+ # Initialize lemmatizer and stop words
22
+ lemmatizer = WordNetLemmatizer()
23
+ stop_words = set(stopwords.words('english'))
24
+
25
+ # Function to process each sentence
26
def process_text(text):
    """Lower-case, tokenize and lemmatize one piece of text.

    Tokens that are not purely alphanumeric, or that appear in the
    module-level ``stop_words`` set, are discarded; every remaining token is
    passed through the module-level ``lemmatizer``.

    NOTE(review): a later ``def process_text(target_word)`` in this file
    rebinds the same name, so this version is only reachable by code that
    runs before that definition.

    Args:
        text: The string to process.

    Returns:
        The lemmatized tokens that survived filtering, in their original
        order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation / symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept
37
+
38
+ # Split text into sentences
39
+ sentences = nltk.sent_tokenize(NOVEL_TEXT)
40
+
41
+ # Process each sentence in the corpus
42
+ processed_corpus = [process_text(sentence) for sentence in sentences]
43
+
44
+ import random
45
+
46
+ emotion_words = ['emotion', 'joy', 'fear', 'anger', 'sadness', 'disgust', 'anxiety', 'team', 'console', 'headquarters', 'feelings']
47
+ hockey_words = ['hockey', 'game', 'team', 'skates', 'stick', 'rink', 'practice', 'championship', 'score', 'goal', 'penalty']
48
+ memory_words = ['memory', 'sphere', 'shelves', 'life', 'experience', 'recall', 'remember', 'color', 'happy', 'sad', 'joyful']
49
+ friend_words = ['friend', 'riley', 'grace', 'bree', 'team', 'support', 'help', 'together', 'loyal', 'fun', 'friendship']
50
+ school_words = ['school', 'class', 'teacher', 'student', 'homework', 'study', 'exam', 'lesson', 'classmates', 'learn']
51
+
52
+ train_data = []
53
+
54
+ for _ in range(40):
55
+ train_data.append(random.sample(emotion_words, k=random.randint(4, 6)))
56
+ train_data.append(random.sample(hockey_words, k=random.randint(4, 6)))
57
+ train_data.append(random.sample(memory_words, k=random.randint(4, 6)))
58
+ train_data.append(random.sample(friend_words, k=random.randint(4, 6)))
59
+ train_data.append(random.sample(school_words, k=random.randint(4, 6)))
60
+
61
+
62
+ random.shuffle(train_data)
63
+
64
+
65
 
 
66
  def train_word2vec(sentences):
67
+ model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
68
  return model
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def apply_pca(word_vectors):
71
  pca = PCA(n_components=3)
72
  return pca.fit_transform(word_vectors)
73
 
 
 
 
74
 
75
def get_unique(model):
    """Return the model's vocabulary as a new list.

    Args:
        model: Object exposing ``wv.index_to_key`` (the caller in this file
            passes the model returned by ``train_word2vec``).

    Returns:
        A fresh ``list`` holding the entries of ``model.wv.index_to_key`` in
        their original order; mutating it does not affect the model.
    """
    # list() already produces the shallow copy, so the previous
    # element-by-element append loop into a second list was redundant.
    return list(model.wv.index_to_key)
81
+
82
+ def process_text(target_word):
83
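+ # Preprocessing. NOTE(review): `X` below is not defined anywhere in the visible file; presumably `train_data` or `processed_corpus` was intended - confirm.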
84
+ sentences=X
85
+
86
  # Train the Word2Vec model
87
  model = train_word2vec(sentences)
88
+ unique_words = get_unique(model)
89
+
90
  # Extract the embedding vector of each word
91
  word_vectors = np.array([model.wv[word] for word in unique_words])
92
+
93
  # Reduce dimensionality with PCA
94
  word_vectors_3d = apply_pca(word_vectors)
95
+
96
  # ์ƒ‰์ƒ ์„ค์ • (ํˆฌ๋ช…๋„ ์ถ”๊ฐ€)
97
+ colors = ['rgba(128, 128, 128, 0.15)' if word != target_word else 'rgba(255, 0, 0, 1)' for word in unique_words]
98
+
99
  # Find the 10 nearest words
100
  if target_word in model.wv:
101
  similar_words = model.wv.most_similar(target_word, topn=10)
102
  similar_word_indices = [unique_words.index(word) for word, _ in similar_words]
103
  for idx in similar_word_indices:
104
  colors[idx] = 'rgba(0, 255, 0, 1)' # Mark the nearest words in green
105
+
106
+ # Find the 10 farthest words
107
+ if target_word in model.wv:
108
+ all_words = model.wv.index_to_key # List of every word contained in the model
109
+ dissimilar_words = sorted([(word, model.wv.similarity(target_word, word))
110
+ for word in all_words if word != target_word],
111
+ key=lambda x: x[1])[:10] # Select the 10 words with the lowest similarity
112
+
113
+ dissimilar_word_indices = [unique_words.index(word) for word, _ in dissimilar_words]
114
+ for idx in dissimilar_word_indices:
115
+ colors[idx] = 'rgba(128, 0, 128, 1)' # Mark the farthest words in purple
116
+
117
  # Create a 3D scatter plot with Plotly
118
  fig = go.Figure(data=[go.Scatter3d(
119
  x=word_vectors_3d[:, 0],
 
123
  text=unique_words,
124
  textposition="top center",
125
  marker=dict(
126
+ size=6,
127
  color=colors,
128
  )
129
  )])
130
+
131
  fig.update_layout(
132
  title="Word Embeddings 3D Visualization",
133
  scene=dict(
 
135
  yaxis_title="PCA 2",
136
  zaxis_title="PCA 3"
137
  ),
138
+ width=1000,
139
+ height=1000
140
  )
141
+
142
  # Build the list of the 10 nearest words
143
  similar_words_text = ""
144
  if target_word in model.wv:
145
  similar_words_text = "가장 가까운 단어 10개:\n" + "\n".join([f"{word}: {score:.4f}" for word, score in similar_words])
146
+
147
  return fig, similar_words_text
148
 
149
+
150
+ # Gradio interface
151
  with gr.Blocks() as iface:
152
  gr.Markdown("# Word Embedding 3D 시각화")
153
+ gr.Markdown("๋‹จ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”. Word2Vec๊ณผ PCA๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋‹จ์–ด ์ž„๋ฒ ๋”ฉ์„ 3D๋กœ ์‹œ๊ฐํ™”ํ•ฉ๋‹ˆ๋‹ค. ์ž…๋ ฅํ•œ ๋‹จ์–ด๋Š” ๋นจ๊ฐ„์ƒ‰์œผ๋กœ, ๊ฐ€์žฅ ์œ ์‚ฌํ•œ 10๊ฐœ ๋‹จ์–ด๋Š” ์ดˆ๋ก์ƒ‰, ๊ฐ€์žฅ ๋จผ ๋‹จ์–ด๋Š” ๋ณด๋ผ์ƒ‰์œผ๋กœ ๊ฐ•์กฐ๋ฉ๋‹ˆ๋‹ค. ์œ ์‚ฌํ•œ ๋‹จ์–ด ๋ชฉ๋ก์€ ๊ทธ๋ž˜ํ”„ ์•„๋ž˜์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.")
154
+
155
  with gr.Row():
156
+ # file_input = gr.File(label="텍스트 파일 업로드 (.txt)", file_types=[".txt"])
157
  word_input = gr.Textbox(label="강조할 단어 입력")
158
+ submit_btn = gr.Button("제출")
159
+
160
+
161
  plot_output = gr.Plot(label="Word Embedding 3D 시각화")
162
  similar_words_output = gr.Textbox(label="유사한 단어")
163
+
164
  submit_btn.click(
165
  fn=process_text,
166
+ inputs=[word_input],
167
  outputs=[plot_output, similar_words_output]
168
  )
169