Ridealist committed
Commit 9fcb29b · verified · 1 Parent(s): 0fbe44b

Update app.py

Files changed (1): app.py +60 -48
app.py CHANGED
@@ -1,29 +1,35 @@
 import numpy as np
 import pandas as pd
+import random
 from gensim.models import Word2Vec
 import gradio as gr
 from sklearn.decomposition import PCA
 import plotly.graph_objects as go
 import nltk
 from nltk.tokenize import word_tokenize
-import nltk
 from nltk.corpus import stopwords
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 from nltk.tag import pos_tag
 
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('wordnet')
-nltk.download('averaged_perceptron_tagger')
-
 from docs import NOVEL_TEXT
 
-# Initialize lemmatizer and stop words
-lemmatizer = WordNetLemmatizer()
-stop_words = set(stopwords.words('english'))
+def download_nltk_library():
+    try:
+        nltk.download('punkt')
+        nltk.download('stopwords')
+        nltk.download('wordnet')
+        nltk.download('averaged_perceptron_tagger')
+        nltk.download('punkt_tab')
+        return True
+    except:
+        return False
 
 # Function to process each sentence
 def process_text(text):
+
+    lemmatizer = WordNetLemmatizer()
+    stop_words = set(stopwords.words('english'))
+
     # Tokenization
     tokens = word_tokenize(text.lower())
 
@@ -35,43 +41,36 @@ def process_text(text):
 
     return processed_tokens
 
-# Split text into sentences
-sentences = nltk.sent_tokenize(NOVEL_TEXT)
-
-# Process each sentence in the corpus
-processed_corpus = [process_text(sentence) for sentence in sentences]
-
-import random
-
-emotion_words = ['emotion', 'joy', 'fear', 'anger', 'sadness', 'disgust', 'anxiety', 'team', 'console', 'headquarters', 'feelings']
-hockey_words = ['hockey', 'game', 'team', 'skates', 'stick', 'rink', 'practice', 'championship', 'score', 'goal', 'penalty']
-memory_words = ['memory', 'sphere', 'shelves', 'life', 'experience', 'recall', 'remember', 'color', 'happy', 'sad', 'joyful']
-friend_words = ['friend', 'riley', 'grace', 'bree', 'team', 'support', 'help', 'together', 'loyal', 'fun', 'friendship']
-school_words = ['school', 'class', 'teacher', 'student', 'homework', 'study', 'exam', 'lesson', 'classmates', 'learn']
-
-train_data = []
+# Word2Vec model training function
+def train_word2vec(sentences):
+    model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
+    return model
 
-for _ in range(40):
-    train_data.append(random.sample(emotion_words, k=random.randint(4, 6)))
-    train_data.append(random.sample(hockey_words, k=random.randint(4, 6)))
-    train_data.append(random.sample(memory_words, k=random.randint(4, 6)))
-    train_data.append(random.sample(friend_words, k=random.randint(4, 6)))
-    train_data.append(random.sample(school_words, k=random.randint(4, 6)))
+# def preprocess_text(file_path):
+#     with open(file_path, 'r', encoding='utf-8') as file:
+#         text = file.read()
 
+#     # Tokenize and POS-tag
+#     tokens = word_tokenize(text)
+#     tagged = pos_tag(tokens)
 
-random.shuffle(train_data)
+#     # Keep only nouns (NN, NNS, NNP, NNPS)
+#     nouns = [word.lower() for word, pos in tagged if pos.startswith('NN')]
 
+#     # Deduplicate and sort
+#     unique_nouns = sorted(set(nouns))
 
+#     # Build simple sentences (treat each noun as its own sentence)
+#     sentences = [[noun] for noun in unique_nouns]
 
-def train_word2vec(sentences):
-    model = Word2Vec(sentences, vector_size=100, window=3, min_count=2, workers=4, sg=0, epochs=100)
-    return model
+#     return sentences, unique_nouns
 
 def apply_pca(word_vectors):
     pca = PCA(n_components=3)
     return pca.fit_transform(word_vectors)
 
-
+# def process_text(file_path, target_word):
+
 def get_unique(model):
     vocablist1=list(model.wv.index_to_key)
     vocablist =[]
@@ -79,14 +78,22 @@ def get_unique(model):
         vocablist.append(i)
     return vocablist
 
-def process_text(target_word):
+def train_model(sentence):
     # Preprocessing
-    sentences=X
+    sentences=sentence
 
     # Train the Word2Vec model
     model = train_word2vec(sentences)
     unique_words = get_unique(model)
+
+    return model, unique_words
+
+def process_model(target_word):
 
+    # Load the Word2Vec model
+    model = Word2Vec.load("word2vec.model")
+    unique_words = get_unique(model)
+
     # Extract each word's embedding vector
     word_vectors = np.array([model.wv[word] for word in unique_words])
 
@@ -106,14 +113,15 @@ def process_text(target_word):
     # Find the 10 most distant words
     if target_word in model.wv:
         all_words = model.wv.index_to_key  # every word in the model's vocabulary
-        dissimilar_words = sorted([(word, model.wv.similarity(target_word, word))
+        dissimilar_words = sorted([(word, model.wv.similarity(target_word, word))
                                    for word in all_words if word != target_word],
                                   key=lambda x: x[1])[:10]  # pick the 10 words with the lowest similarity
-
+
         dissimilar_word_indices = [unique_words.index(word) for word, _ in dissimilar_words]
         for idx in dissimilar_word_indices:
             colors[idx] = 'rgba(128, 0, 128, 1)'  # color the most distant words purple
 
+
     # Create a 3D scatter plot with Plotly
     fig = go.Figure(data=[go.Scatter3d(
         x=word_vectors_3d[:, 0],
@@ -147,25 +155,29 @@ def process_text(target_word):
     return fig, similar_words_text
 
 
-# Gradio interface
-with gr.Blocks() as iface:
+# Revised Gradio interface
+with gr.Blocks(css=".plot-box {width: 70%; height: 500px;}") as iface:
     gr.Markdown("# Word Embedding 3D 시각화")
     gr.Markdown("단어를 입력하세요. Word2Vec과 PCA를 사용하여 단어 임베딩을 3D로 시각화합니다. 입력한 단어는 빨간색으로, 가장 유사한 10개 단어는 초록색, 가장 먼 단어는 보라색으로 강조됩니다. 유사한 단어 목록은 그래프 아래에 표시됩니다.")
 
     with gr.Row():
-        # file_input = gr.File(label="텍스트 파일 업로드 (.txt)", file_types=[".txt"])
-        word_input = gr.Textbox(label="강조할 단어 입력")
-        submit_btn = gr.Button("제출")
+        # Restyled to make the user input box stand out
+        word_input = gr.Textbox(label="**강조할 단어 입력**", elem_id="input-box", placeholder="단어를 입력하세요", lines=1)
+        submit_btn = gr.Button("제출", elem_id="submit-btn")
 
+    with gr.Row():
+        # Plot area enlarged via the CSS above
+        plot_output = gr.Plot(label="Word Embedding 3D 시각화", elem_id="plot-box")
 
-    plot_output = gr.Plot(label="Word Embedding 3D 시각화")
-    similar_words_output = gr.Textbox(label="유사한 단어")
+    with gr.Column(scale=0.3):  # lower scale to narrow this column
+        similar_words_output = gr.Textbox(label="유사한 단어", interactive=False, lines=5)
+        dissimilar_words_output = gr.Textbox(label="유사하지 않은 단어", interactive=False, lines=5)
 
     submit_btn.click(
         fn=process_text,
         inputs=[word_input],
-        outputs=[plot_output, similar_words_output]
+        outputs=[plot_output, similar_words_output, dissimilar_words_output]
     )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
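
Two loose ends in this revision are worth sketching. First, the new `process_model()` loads its embeddings with `Word2Vec.load("word2vec.model")`, but nothing shown in this diff creates that file. Below is a minimal sketch of the implied startup glue, reusing the functions defined in app.py; the corpus-building lines mirror the ones this commit removed, and the `model.save(...)` step is an assumption about how the file `process_model()` expects gets produced:

```python
import nltk

# Assumed one-time setup before iface.launch() (not shown in this commit):
if download_nltk_library():                            # fetch punkt, stopwords, wordnet, ...
    raw_sentences = nltk.sent_tokenize(NOVEL_TEXT)     # split the novel into sentences
    corpus = [process_text(s) for s in raw_sentences]  # tokenized/lemmatized token lists
    model, unique_words = train_model(corpus)          # wraps train_word2vec()
    model.save("word2vec.model")                       # assumption: the file process_model() loads
```

Second, `submit_btn.click` still targets `process_text` (which, after this commit, is the sentence preprocessor returning a token list) while registering three outputs, and the visualizer returns only `fig, similar_words_text`. Consistent wiring would presumably look like the following, assuming `process_model()` is extended with a hypothetical third return value carrying the dissimilar-words text:

```python
# Hypothetical: process_model would need to return
# (fig, similar_words_text, dissimilar_words_text) for this to unpack cleanly.
submit_btn.click(
    fn=process_model,
    inputs=[word_input],
    outputs=[plot_output, similar_words_output, dissimilar_words_output]
)
```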