DanielSc4 commited on
Commit
2180e70
1 Parent(s): b351586

Fixed some code smell

Browse files
Files changed (1) hide show
  1. app.py +51 -46
app.py CHANGED
@@ -8,12 +8,11 @@ import nltk, spacy, gensim
8
  from sklearn.decomposition import LatentDirichletAllocation
9
  from sklearn.feature_extraction.text import CountVectorizer
10
  from pprint import pprint
 
 
11
 
12
- def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
13
- format_s = "{s}\n{c}"
14
- return [
15
- format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
16
- ]
17
 
18
  def sent_to_words(sentences):
19
  for sentence in sentences:
@@ -28,38 +27,34 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
28
  ]))
29
  return texts_out
30
 
31
-
32
- def main(choose_context):
33
  df = pd.read_csv('./data/results.csv', index_col=0)
34
-
35
- print(choose_context)
36
-
37
- if choose_context == 'comment':
38
- data = df.comment
39
- elif choose_context == 'sup comment':
40
- data = df.sup_comment
41
- elif choose_context == 'sup comment + comment':
42
- data = concat_comments(df.sup_comment, df.comment)
43
-
44
  data_words = list(sent_to_words(data))
45
 
46
- print('downloading en_core_web_sm')
47
- os.system("python -m spacy download en_core_web_sm")
48
- print('en_core_web_sm downloaded')
49
-
 
 
 
50
  nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
51
  data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) #select noun and verb
52
 
 
53
  vectorizer = CountVectorizer(
54
- analyzer='word',
55
- min_df=10,
56
- stop_words='english',
57
- lowercase=True,
58
- token_pattern='[a-zA-Z0-9]{3,}'
59
- )
60
- data_vectorized = vectorizer.fit_transform(data_lemmatized)
61
 
 
 
62
 
 
63
  lda_model = LatentDirichletAllocation(
64
  n_components=5,
65
  max_iter=10,
@@ -68,19 +63,28 @@ def main(choose_context):
68
  batch_size=128,
69
  evaluate_every = -1,
70
  n_jobs = -1,
 
71
  )
 
 
72
  lda_output = lda_model.fit_transform(data_vectorized)
73
  print(lda_model) # Model attributes
74
 
 
 
75
  # Log Likelihood: Higher the better
76
- print("Log Likelihood: ", lda_model.score(data_vectorized))
77
  # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
78
- print("Perplexity: ", lda_model.perplexity(data_vectorized))
 
 
79
  # See model parameters
80
  pprint(lda_model.get_params())
81
 
 
82
  best_lda_model = lda_model
83
 
 
84
  lda_output = best_lda_model.transform(data_vectorized)
85
 
86
  topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
@@ -96,8 +100,6 @@ def main(choose_context):
96
  # Assign Column and Index
97
  df_topic_keywords.columns = vectorizer.get_feature_names_out()
98
  df_topic_keywords.index = topicnames
99
- # View
100
- df_topic_keywords
101
 
102
  # Show top n keywords for each topic
103
  def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
@@ -190,25 +192,28 @@ def main(choose_context):
190
  return fig
191
 
192
 
 
 
 
 
 
193
  with gr.Blocks() as demo:
194
- gr.Markdown("## Dashboard per l'analisi di LDA")
195
- gr.Markdown("#### Questo è un sottotitolo")
196
- button = gr.Radio(
197
- label="Plot type",
198
- choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
199
- )
200
  # gradio.Dataframe(···)
201
- choose_context = gr.Radio(
202
- label="Context LDA",
203
- choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
204
- )
205
- plot = gr.Plot(label="Plot")
206
- choose_context.change(main, inputs=[choose_context], outputs=[plot])
207
 
208
  btn = gr.Button(value="Submit")
209
- btn.click(main, inputs=[choose_context], outputs=[plot])
 
 
 
 
 
 
210
 
211
- demo.load(main, inputs=[choose_context], outputs=[plot])
212
 
213
 
214
  # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
8
  from sklearn.decomposition import LatentDirichletAllocation
9
  from sklearn.feature_extraction.text import CountVectorizer
10
  from pprint import pprint
11
+ import matplotlib
12
+ matplotlib.use('agg')
13
 
14
def concat_comments(*columns):
    """Join corresponding elements of several string iterables with newlines.

    Each positional argument is an iterable of strings (e.g. a DataFrame
    column). Element i of the result is the i-th item of every input,
    joined by '\n'.

    Note: like ``zip``, this truncates to the shortest input; with no
    arguments it returns an empty list.
    """
    # Renamed from '*kwargs': these are positional varargs, not keyword args.
    return ['\n'.join(parts) for parts in zip(*columns)]
 
 
 
16
 
17
  def sent_to_words(sentences):
18
  for sentence in sentences:
 
27
  ]))
28
  return texts_out
29
 
30
+ def get_lda(n_components):
 
31
  df = pd.read_csv('./data/results.csv', index_col=0)
32
+ data = concat_comments(df.subreddit, df.sup_comment, df.comment)
 
 
 
 
 
 
 
 
 
33
  data_words = list(sent_to_words(data))
34
 
35
+
36
+ if not spacy.util.is_package("en_core_web_sm"):
37
+ print('[x] en_core_web_sm not found, downloading...')
38
+ os.system("python -m spacy download en_core_web_sm")
39
+ print('[x] en_core_web_sm downloaded')
40
+
41
+ print('[x] Lemmatization begins')
42
  nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
43
  data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) #select noun and verb
44
 
45
+ print('[x] Vectorizing')
46
  vectorizer = CountVectorizer(
47
+ analyzer='word',
48
+ min_df=10,
49
+ stop_words='english',
50
+ lowercase=True,
51
+ token_pattern='[a-zA-Z0-9]{3,}'
52
+ )
 
53
 
54
+ print('[x] Fitting vectorized data on lemmatization')
55
+ data_vectorized = vectorizer.fit_transform(data_lemmatized)
56
 
57
+ print('[x] Init LDA model')
58
  lda_model = LatentDirichletAllocation(
59
  n_components=5,
60
  max_iter=10,
 
63
  batch_size=128,
64
  evaluate_every = -1,
65
  n_jobs = -1,
66
+ verbose=1,
67
  )
68
+
69
+ print('[x] Fitting LDA model')
70
  lda_output = lda_model.fit_transform(data_vectorized)
71
  print(lda_model) # Model attributes
72
 
73
+ print('[x] Getting performances')
74
+ performances = lda_model.score(data_vectorized), lda_model.perplexity(data_vectorized)
75
  # Log Likelihood: Higher the better
76
+ print("Log Likelihood: ", performances[0])
77
  # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
78
+ print("Perplexity: ", performances[1])
79
+
80
+ print('[x] Check parameters if they look correct')
81
  # See model parameters
82
  pprint(lda_model.get_params())
83
 
84
+ # switching to the best model
85
  best_lda_model = lda_model
86
 
87
+ print('[x] Getting LDA output')
88
  lda_output = best_lda_model.transform(data_vectorized)
89
 
90
  topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
 
100
  # Assign Column and Index
101
  df_topic_keywords.columns = vectorizer.get_feature_names_out()
102
  df_topic_keywords.index = topicnames
 
 
103
 
104
  # Show top n keywords for each topic
105
  def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
 
192
  return fig
193
 
194
 
195
+ # def main():
196
+
197
+
198
+
199
+
200
  with gr.Blocks() as demo:
201
+ gr.Markdown("# Dashboard per l'analisi con LDA")
202
+ gr.Markdown("### Questo è un sottotitolo")
 
 
 
 
203
  # gradio.Dataframe(···)
204
+
205
+ n_comp = gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
 
 
 
 
206
 
207
  btn = gr.Button(value="Submit")
208
+
209
+ plot = gr.Plot(label="Plot")
210
+
211
+ btn.click(get_lda, inputs=[n_comp[0]], outputs=[plot])
212
+
213
+
214
+
215
 
216
+ # demo.load(main, inputs=[], outputs=[plot])
217
 
218
 
219
  # iface = gr.Interface(fn=greet, inputs="text", outputs="text")