updated app
- app.py +28 -22
- test.ipynb +3 -3
app.py
CHANGED
@@ -27,12 +27,17 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=None):
         ]))
     return texts_out
 
-def get_lda(n_components):
+def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     df = pd.read_csv('./data/results.csv', index_col=0)
     data = concat_comments(df.subreddit, df.sup_comment, df.comment)
     data_words = list(sent_to_words(data))
 
-
+    if what_label_to_use == 'Use True label':
+        label = 'label'
+    else:
+        label = 'prediction'
+
+
     if not spacy.util.is_package("en_core_web_sm"):
         print('[x] en_core_web_sm not found, downloading...')
         os.system("python -m spacy download en_core_web_sm")
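The new `what_label_to_use` argument switches the analysis between the dataset's ground-truth labels and BERT's predictions. A minimal standalone sketch of the same dispatch (hypothetical helper name; the 'label' and 'prediction' column names come from the diff itself):

# Hypothetical helper mirroring the diff's label selection; a sketch,
# not the app's code.
LABEL_COLUMNS = {
    'Use True label': 'label',            # ground-truth irony labels
    'Use BERT prediction': 'prediction',  # BERT model predictions
}

def resolve_label_column(choice: str) -> str:
    """Map the Gradio radio choice to a results.csv column name."""
    if choice not in LABEL_COLUMNS:
        raise ValueError(f"Unknown label choice: {choice!r}")
    return LABEL_COLUMNS[choice]

Unlike the if/else in the diff, which silently falls back to 'prediction' for any unexpected value, the lookup fails loudly.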
@@ -162,7 +167,7 @@ def get_lda(n_components):
     print('Percentuale di commenti ironici per ogni topic')
     perc_topic_irony = {}
     for t in topics:
-        total_0label = sum((df.label == 1) & (df.Topic_key_word == t))
+        total_0label = sum((df[label] == 1) & (df.Topic_key_word == t))
         if total_0label != 0:
             total_X_topic = df.Topic_key_word.value_counts()[t]
         else:
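The loop above counts ironic comments one topic at a time. Assuming the resolved label column is binary (0/1), a vectorized pandas equivalent gets every topic's irony share in one pass; a sketch, not the app's code:

# Mean of a 0/1 column within each Topic_key_word group equals the
# fraction of ironic comments per topic (assumes `df` and `label`
# as defined in the diff).
perc_topic_irony = df.groupby('Topic_key_word')[label].mean().to_dict()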
@@ -182,10 +187,6 @@ def get_lda(n_components):
     plt.xticks(rotation=70)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
-
-    # Should this be a parameter?
-    # Max number of biggest subreddits to analyse
-    n_top_subreddit_to_analyse = 20
 
     # probably not necessary (?) To drop if the logs get too cluttered!
     print('Percentage of each topic for each subreddit')
@@ -205,17 +206,11 @@ def get_lda(n_components):
     print('[x] Generating plot [2]')
     # plot
     subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
-
-    # weight_counts = {
-    #     t: [
-    #         df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
-    #     ] for t in topics
-    # }
 
     irony_percs = {
         t: [
             len(
-                df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit].label == 1)]
+                df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit][label] == 1)]
             ) /
             len(
                 df[df.subreddit == subreddit]
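The `irony_percs` comprehension re-filters the whole dataframe for every (topic, subreddit) pair. A sketch of the same fractions built once with `pd.crosstab`, assuming `label`, `subreddits`, and `df` as in the diff:

import pandas as pd

# Count ironic comments per (topic, subreddit), then divide by each
# subreddit's total comment count; topics with no ironic comments
# anywhere simply drop out of `counts.index`.
sub_sizes = df.subreddit.value_counts()
ironic = df[df[label] == 1]
counts = pd.crosstab(ironic.Topic_key_word, ironic.subreddit)
irony_percs = {
    t: [counts.loc[t].get(s, 0) / sub_sizes[s] for s in subreddits]
    for t in counts.index
}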
@@ -234,7 +229,7 @@ def get_lda(n_components):
     ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
     bottom += v
 
-    ax.set_title("
+    ax.set_title("% of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
 
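The plot loop stacks one bar segment per topic by accumulating a running `bottom` offset. A self-contained sketch of that stacking pattern with toy data (subreddit names and values are illustrative only):

import numpy as np
import matplotlib.pyplot as plt

subreddits = ['AskReddit', 'politics', 'worldnews']      # toy data
percs = {'topic_a': np.array([0.20, 0.40, 0.10]),
         'topic_b': np.array([0.30, 0.10, 0.50])}

fig, ax = plt.subplots()
bottom = np.zeros(len(subreddits))
for topic, values in percs.items():
    ax.bar(subreddits, values, 0.5, bottom=bottom, label=topic)
    bottom += values       # the next topic stacks on top of this one

ax.set_title("% of topics for each subreddit")
ax.legend(loc="upper right")
plt.xticks(rotation=50)
plt.show()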
@@ -250,21 +245,32 @@ def get_lda(n_components):
 
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
-    gr.Markdown("###
+    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
     # gradio.Dataframe(···)
-
+
+    inputs = []
+    with gr.Row():
+        inputs.append(gr.Slider(2, 25, value=5, step = 1, label="LDA N components", info="Scegli il numero di componenti per LDA"))
+        inputs.append(gr.Slider(2, 20, value=5, step = 1, label="Subreddit dal dataset", info="Numero di subreddit da analizzare"))
+        inputs.append(gr.Radio(
+            choices = ['Use True label', 'Use BERT prediction'],
+            value = 'Use True label',
+            label = "Scegliere quali label sull'ironia utilizzare:",
+        )
+        )
 
     btn = gr.Button(value="Submit")
+
+    gr.Markdown("## Risultati ottenuti")
+    gr.Markdown("#### Top 15 parole che più contribuiscono al topic di riferimento (ultima colonna):")
 
     btn.click(
         get_lda,
-        inputs=[
-            gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
-        ],
+        inputs=inputs,
         outputs=[
             gr.DataFrame(),
-            gr.Plot(label="
-            gr.Plot(label="
+            gr.Plot(label="Quanto i topic trovati portano ironia?"),
+            gr.Plot(label="Come i topic sono correlati ai diversi subreddit del dataset?"),
         ]
     )
 
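The main UI change in this commit: the controls move into a `gr.Row`, are collected in an `inputs` list, and that list is passed to `btn.click`. A minimal runnable sketch of the wiring, with a placeholder function and labels instead of the app's `get_lda` and plots:

import gradio as gr

def analyse(n_components, n_subreddits, label_choice):
    # Placeholder: the real app trains an LDA model and returns a
    # dataframe plus two plots.
    return f"{n_components} topics, {n_subreddits} subreddits, {label_choice}"

with gr.Blocks() as demo:
    inputs = []
    with gr.Row():
        inputs.append(gr.Slider(2, 25, value=5, step=1, label="LDA N components"))
        inputs.append(gr.Slider(2, 20, value=5, step=1, label="Subreddits to analyse"))
        inputs.append(gr.Radio(choices=['Use True label', 'Use BERT prediction'],
                               value='Use True label', label="Label source"))
    btn = gr.Button(value="Submit")
    out = gr.Textbox()
    btn.click(analyse, inputs=inputs, outputs=[out])

demo.launch()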
test.ipynb
CHANGED
@@ -255,9 +255,9 @@
    "metadata": {},
    "source": [
     "TODO:\n",
-    "- Show LDA top words for each topic\n",
-    "- I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
-    "- Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
+    "- [x] Show LDA top words for each topic\n",
+    "- [ ] I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
+    "- [x] Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
     ]
    }
   ],