DanielSc4 commited on
Commit
465ab59
1 Parent(s): 6534dfb

updated app

Browse files
Files changed (2) hide show
  1. app.py +28 -22
  2. test.ipynb +3 -3
app.py CHANGED
@@ -27,12 +27,17 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
27
  ]))
28
  return texts_out
29
 
30
- def get_lda(n_components):
31
  df = pd.read_csv('./data/results.csv', index_col=0)
32
  data = concat_comments(df.subreddit, df.sup_comment, df.comment)
33
  data_words = list(sent_to_words(data))
34
 
35
-
 
 
 
 
 
36
  if not spacy.util.is_package("en_core_web_sm"):
37
  print('[x] en_core_web_sm not found, downloading...')
38
  os.system("python -m spacy download en_core_web_sm")
@@ -162,7 +167,7 @@ def get_lda(n_components):
162
  print('Percentuale di commenti ironici per ogni topic')
163
  perc_topic_irony = {}
164
  for t in topics:
165
- total_0label = sum((df.label == 1) & (df.Topic_key_word == t))
166
  if total_0label != 0:
167
  total_X_topic = df.Topic_key_word.value_counts()[t]
168
  else:
@@ -182,10 +187,6 @@ def get_lda(n_components):
182
  plt.xticks(rotation=70)
183
  plt.legend()
184
  plt.axhline(0.5, color = 'red', ls=":")
185
-
186
- # Should this be a parameter?
187
- # Max number of biggest subreddits to analyse
188
- n_top_subreddit_to_analyse = 20
189
 
190
  # probably not necessary (?) To drop eventually if log are to much cluttered!
191
  print('Percentage of each topic for each subreddit')
@@ -205,17 +206,11 @@ def get_lda(n_components):
205
  print('[x] Generating plot [2]')
206
  # plot
207
  subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
208
-
209
- # weight_counts = {
210
- # t: [
211
- # df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
212
- # ] for t in topics
213
- # }
214
 
215
  irony_percs = {
216
  t: [
217
  len(
218
- df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit].label == 1)]
219
  ) /
220
  len(
221
  df[df.subreddit == subreddit]
@@ -234,7 +229,7 @@ def get_lda(n_components):
234
  ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
235
  bottom += v
236
 
237
- ax.set_title("Perc of topics for each subreddit")
238
  ax.legend(loc="upper right")
239
  plt.xticks(rotation=50)
240
 
@@ -250,21 +245,32 @@ def get_lda(n_components):
250
 
251
  with gr.Blocks() as demo:
252
  gr.Markdown("# Dashboard per l'analisi con LDA")
253
- gr.Markdown("### Questo un sottotitolo")
254
  # gradio.Dataframe(···)
255
-
 
 
 
 
 
 
 
 
 
 
256
 
257
  btn = gr.Button(value="Submit")
 
 
 
258
 
259
  btn.click(
260
  get_lda,
261
- inputs=[
262
- gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
263
- ],
264
  outputs=[
265
  gr.DataFrame(),
266
- gr.Plot(label="Plot 1"),
267
- gr.Plot(label="Plot 2"),
268
  ]
269
  )
270
 
 
27
  ]))
28
  return texts_out
29
 
30
+ def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
31
  df = pd.read_csv('./data/results.csv', index_col=0)
32
  data = concat_comments(df.subreddit, df.sup_comment, df.comment)
33
  data_words = list(sent_to_words(data))
34
 
35
+ if what_label_to_use == 'Use True label':
36
+ label = 'label'
37
+ else:
38
+ label = 'prediction'
39
+
40
+
41
  if not spacy.util.is_package("en_core_web_sm"):
42
  print('[x] en_core_web_sm not found, downloading...')
43
  os.system("python -m spacy download en_core_web_sm")
 
167
  print('Percentuale di commenti ironici per ogni topic')
168
  perc_topic_irony = {}
169
  for t in topics:
170
+ total_0label = sum((df[label] == 1) & (df.Topic_key_word == t))
171
  if total_0label != 0:
172
  total_X_topic = df.Topic_key_word.value_counts()[t]
173
  else:
 
187
  plt.xticks(rotation=70)
188
  plt.legend()
189
  plt.axhline(0.5, color = 'red', ls=":")
 
 
 
 
190
 
191
  # probably not necessary (?) To drop eventually if log are to much cluttered!
192
  print('Percentage of each topic for each subreddit')
 
206
  print('[x] Generating plot [2]')
207
  # plot
208
  subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
 
 
 
 
 
 
209
 
210
  irony_percs = {
211
  t: [
212
  len(
213
+ df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit][label] == 1)]
214
  ) /
215
  len(
216
  df[df.subreddit == subreddit]
 
229
  ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
230
  bottom += v
231
 
232
+ ax.set_title("% of topics for each subreddit")
233
  ax.legend(loc="upper right")
234
  plt.xticks(rotation=50)
235
 
 
245
 
246
  with gr.Blocks() as demo:
247
  gr.Markdown("# Dashboard per l'analisi con LDA")
248
+ gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
249
  # gradio.Dataframe(···)
250
+
251
+ inputs = []
252
+ with gr.Row():
253
+ inputs.append(gr.Slider(2, 25, value=5, step = 1, label="LDA N components", info="Scegli il numero di componenti per LDA"))
254
+ inputs.append(gr.Slider(2, 20, value=5, step = 1, label="Subreddit dal dataset", info="Numero di subreddit da analizzare"))
255
+ inputs.append(gr.Radio(
256
+ choices = ['Use True label', 'Use BERT prediction'],
257
+ value = 'Use True label',
258
+ label = "Scegliere quali label sull'ironia utilizzare:",
259
+ )
260
+ )
261
 
262
  btn = gr.Button(value="Submit")
263
+
264
+ gr.Markdown("## Risultati ottenuti")
265
+ gr.Markdown("#### Top 15 parole che più contribuiscono al topic di riferimento (ultima colonna):")
266
 
267
  btn.click(
268
  get_lda,
269
+ inputs=inputs,
 
 
270
  outputs=[
271
  gr.DataFrame(),
272
+ gr.Plot(label="Quanto i topic trovati portano ironia?"),
273
+ gr.Plot(label="Come i topic sono correlati ai diversi subreddit del dataset?"),
274
  ]
275
  )
276
 
test.ipynb CHANGED
@@ -255,9 +255,9 @@
255
  "metadata": {},
256
  "source": [
257
  "TODO:\n",
258
- "- Show LDA top words for each topic\n",
259
- "- I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
260
- "- Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
261
  ]
262
  }
263
  ],
 
255
  "metadata": {},
256
  "source": [
257
  "TODO:\n",
258
+ "- [x] Show LDA top words for each topic\n",
259
+ "- [ ] I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
260
+ "- [x] Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
261
  ]
262
  }
263
  ],