Spaces:

abidlabs
/

Sentiments_topic_modeling_ITALIAN

Build error

App Files Files Community

abidlabs HF Staff committed on Aug 7, 2022

Commit

44e164c

1 Parent(s): 7a5cdac

Upload app.py

Browse files

Files changed (1) hide show

app.py +421 -0

app.py ADDED Viewed

	@@ -0,0 +1,421 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import plotly.express as px
+from stop_words import get_stop_words
+from wordcloud import WordCloud
+from datasets import load_dataset
+import re
+## import data
+dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
+data = pd.DataFrame.from_dict(dataset["train"])
+# load stop words
+it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
+it_stop = pd.DataFrame.from_dict(it_stop_words["train"])
+it_stop = it_stop.text.to_list()
+## Optimize stop words according to Luca's repo
+def format_input(user_key, stopwords):
+  '''
+  format user input request to lookup in the database of frequencies
+  input:
+    user_key is a string
+    stopwords is a list of strings
+  output:
+    key is a string
+  '''
+  key = user_key.lower()
+  key = re.sub(r'[^\w\s]', ' ', key)
+  key = ' '.join([el for el in key.split() if not (el in stopwords)])
+  return key
+### Loading TFIDF
+TFIDF_21_Jul_Oct = load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
+TFIDF_22_Feb_Apr = load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
+TFIDF_22_May_Jul = load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")
+TFIDF_21_Nov_22_Jan = load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")
+## Loading whole_text
+whole_text_21_Jul_Oct = load_dataset("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
+whole_text_22_Feb_Apr = load_dataset("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
+whole_text_22_May_Jul = load_dataset("Santarabantoosoo/whole_text_TF_22_May_Jul")
+whole_text_21_Nov_22_Jan = load_dataset("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
+TFIDF_21_Jul_Oct = pd.DataFrame.from_dict(TFIDF_21_Jul_Oct["train"])
+TFIDF_22_Feb_Apr = pd.DataFrame.from_dict(TFIDF_22_Feb_Apr["train"])
+TFIDF_22_May_Jul = pd.DataFrame.from_dict(TFIDF_22_May_Jul["train"])
+TFIDF_21_Nov_22_Jan = pd.DataFrame.from_dict(TFIDF_21_Nov_22_Jan["train"])
+whole_text_21_Jul_Oct = pd.DataFrame.from_dict(whole_text_21_Jul_Oct["train"])
+whole_text_22_Feb_Apr = pd.DataFrame.from_dict(whole_text_22_Feb_Apr["train"])
+whole_text_22_May_Jul = pd.DataFrame.from_dict(whole_text_22_May_Jul["train"])
+whole_text_21_Nov_22_Jan = pd.DataFrame.from_dict(whole_text_21_Nov_22_Jan["train"])
+ser_TFIDF = []
+ser_TFIDF.append(TFIDF_21_Jul_Oct.transpose()[0])
+ser_TFIDF.append(TFIDF_22_Feb_Apr.transpose()[0])
+ser_TFIDF.append(TFIDF_22_May_Jul.transpose()[0])
+ser_TFIDF.append(TFIDF_21_Nov_22_Jan.transpose()[0])
+ser_whole_text = []
+ser_whole_text.append(whole_text_21_Jul_Oct.transpose()[0])
+ser_whole_text.append(whole_text_22_Feb_Apr.transpose()[0])
+ser_whole_text.append(whole_text_22_May_Jul.transpose()[0])
+ser_whole_text.append(whole_text_21_Nov_22_Jan.transpose()[0])
+def plot_time_series(choice, keyword, user_keys):
+    x = np.arange(2,10,2)
+    y = [[] for j in range(len(keyword))]
+    for j in range(len(keyword)):
+      i=0
+      while i < len(choice):
+        try:
+          y[j].append(choice[i][keyword[j]])
+          i += 1
+        except:
+          y[j].append(0.0)
+          i += 1
+      y[j] = np.array(y[j])
+    x_ticks_labels = ['Q1','Q2','Q3','Q4']
+    fig, ax = plt.subplots(1,1)
+    for j in range(len(keyword)):
+      ax.plot(x,y[j], label = user_keys[j].lower())
+    # Set number of ticks for x-axis
+    ax.set_xticks(x)
+    ax.set_xticklabels(x_ticks_labels, fontsize=12)
+    leg = plt.legend(loc='best')
+    plt.xlabel('Time')
+    plt.title("keywords quartely analysis (July 2021 - July 2022)")
+    plt.ylabel(f'Freq. from {user_keys}')
+    return fig
+# Wordcloud with anger tweets
+angry_tweets = data['tweet'][data["emotion"] == 'anger']
+angry_tweets = angry_tweets.apply(format_input, args = [it_stop])
+stop_words = ["https", 'http', "co", "RT"] + list(it_stop)
+anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(angry_tweets))
+# Wordcloud with sad tweets
+sad_tweets = data['tweet'][data["emotion"] == 'sadness']
+sad_tweets = sad_tweets.apply(format_input, args = [it_stop])
+stop_words = ["https", 'http', "co", "RT"] + list(it_stop)
+sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(sad_tweets))
+ # Wordcloud with joy tweets
+joy_tweets = data['tweet'][data["emotion"] == 'joy']
+joy_tweets = joy_tweets.apply(format_input, args = [it_stop])
+stop_words = ["https", 'http', "co", "RT"] + list(it_stop)
+joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(joy_tweets))
+ # Wordcloud with fear tweets
+fear_tweets = data['tweet'][data["emotion"] == 'fear']
+fear_tweets = fear_tweets.apply(format_input, args = [it_stop])
+stop_words = ["https", 'http', "co", "RT"] + list(it_stop)
+fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(fear_tweets))
+## COmbine all plots in a single plot
+wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
+# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
+wc_fig.tight_layout()
+ax1.imshow(sad_wordcloud, interpolation="bilinear")
+ax1.axis("off")
+ax1.set_title('Sadness', {'fontsize': 30})
+ax2.imshow(joy_wordcloud, interpolation="bilinear")
+ax2.axis("off")
+ax2.set_title('Joy', {'fontsize': 30})
+ax3.imshow(fear_wordcloud, interpolation="bilinear")
+ax3.axis("off")
+ax3.set_title('Fear', {'fontsize': 30})
+ax4.imshow(anger_wordcloud, interpolation="bilinear")
+ax4.axis("off")
+ax4.set_title('Anger', {'fontsize': 30})
+# plot a pie plot for emotions' distribution
+number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
+number_tweets_per_day["tweet_date"] = pd.to_datetime(number_tweets_per_day["date"])
+time_fig = px.line(number_tweets_per_day, x = 'tweet_date', y = 'id', labels = {'id': 'count'}, color = 'emotion',
+                  color_discrete_sequence=px.colors.qualitative.G10)
+# create a lineplot for emotions
+sentiment_counts = data.groupby('emotion').agg({'id' : 'size'}).reset_index()
+sentiment_counts.rename(columns = {'id':'count'}, inplace = True)
+sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Tweets within each emotion', labels = {'id': 'count'},
+                 color_discrete_sequence=px.colors.qualitative.G10)
+sent_fig
+def display_plot(image_choice):
+    if image_choice == 'Sentiment distribution':
+        return sent_fig
+    elif image_choice == 'Time series':
+        return time_fig
+    elif image_choice == 'Word clouds':
+        return wc_fig
+def display_freq_plot(choice, *args):
+    user_keys = [arg for arg in args]
+    # clean input strings to match keywords in the database
+    keyword = []
+    for key in user_keys:
+        keyword.append(format_input(key, it_stop))
+    if choice == "TFIDF":
+        return plot_time_series(ser_TFIDF, keyword, user_keys)
+    elif choice == "Whole_text":
+        return plot_time_series(ser_whole_text, keyword, user_keys)
+def display_output(tweet_index):
+    topics = "<ol>\
+    <li>Discussion about scientific studies</li>\
+    <li>Anxiety about pandemic and the information about it OR Specific people in the context of LC</li>\
+    <li>Discussion about LC impact in terms of time periods</li>\
+    <li>Discussion about LC impact on patient life (impact on life so far or scope for lifelong impact)</li>\
+    <li>Treatment scenario</li>\
+    <li>Impact/Consequences of LC on children</li>\
+    </ol>"
+    item = topic_dist_list[tweet_index]
+    distribution = f'<html><body><h3>Topics Distribution</h3>({item[0][0]+1}, {item[0][1]}), ({item[1][0]+1}, {item[1][1]}), ({item[2][0]+1}, {item[2][1]}), ({item[3][0]+1}, {item[3][1]}), ({item[4][0]+1}, {item[4][1]}), ({item[5][0]+1}, {item[5][1]})\
+    </body></html>'
+    return gr.HTML.update(distribution, visible=True)
+def display_output_Q2_Q4(tweet_index):
+    item = topic_dist_list_Q2_Q4[tweet_index]
+    distribution = f'<html><body><h3>Topics Distribution</h3>({item[0][0]+1}, {item[0][1]}), ({item[1][0]+1}, {item[1][1]}), ({item[2][0]+1}, {item[2][1]}), ({item[3][0]+1}, {item[3][1]}), ({item[4][0]+1}, {item[4][1]}), ({item[5][0]+1}, {item[5][1]})\
+    </body></html>'
+    return gr.HTML.update(distribution, visible=True)
+# with gr.Blocks() as demo:
+#     gr.Markdown("## Choose your adventure")
+#     with gr.Tabs():
+#         with gr.TabItem("Topic modeling"):
+#             gr.Markdown("Nothing here yet")
+#         with gr.TabItem("Word frequency"):
+#             inputs =  [gr.Radio(choices = ['TFIDF', 'Whole_text'], label = 'Choose ur method'),
+#                       gr.Textbox(label = 'word 1'),
+#                       gr.Textbox(label = 'word 2'),
+#                       gr.Textbox(label = 'word 3'),
+#                       gr.Textbox(label = 'word 4')]
+#             plot_output = gr.Plot(elem_id = 1)
+#             freq_button = gr.Button("Submit")
+#         with gr.TabItem("Sentiment analysis"):
+#             text_input =  gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')
+#             sent_plot = gr.Plot()
+#             sent_button = gr.Button("Submit")
+#     sent_button.click(display_plot, inputs=text_input, outputs= sent_plot)
+#     freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)
+with gr.Blocks() as demo:
+    gr.Markdown("## Choose your adventure")
+    with gr.Tabs():
+        with gr.TabItem("Topic modeling"):
+             gr.Markdown(
+                """
+            ## <div style="text-align: center;">Topic modeling analysis on Twitter</div>
+            """
+            )
+             with gr.Tabs():
+                 with gr.TabItem("July-Semptember 2021"):
+                     with gr.Row():
+                         gr.Image("./wordclouds_Q1 data.png", label="July-September 2021")
+                     tweets_list = ['C\'è uno studio a riguardo condotto proprio sui più giovani che identifica il long covid alla stregua di ogni strascico di malattie infettive polmonari. Il long covid è dannoso come una polmonite in quanto a effetti a lungo termine.  Se lo ritrovo te lo passo, ora sono fuori...',
+                    'Mio cugino Ã¨ guarito dal covid dopo 4 mesi di ospedale,  di cui  piÃ¹ di 2 intubato, grazie alla testardaggine dei medici che hanno fatto di tutto per salvargli la vita a 57 anni. Ora Ã¨ nella fase long covid per recuperare i danni fisici riportati',
+                    'Ãˆ importante parlare di #LongCovid e sensibilizzare tutti, giovani compresi, che non Ã¨ un gioco ma una malattia debilitante/invalidante che puÃ² stravolgere la vita. Io 39 anni e #LongCovid da 18 mesi (con 4 figli piccoli). #countlongcovid',
+                    'Il Long Covid Ã¨ una diretta conseguenza di quelli che nei primi tempi sono stati abbandonati a se stessi giorni e giorni e curati solo quando molto aggravati, in ospedale. Se ti curi tempestivamente non hai nessuna conseguenza.',
+                    'Non sai di cosa parli sono stato un mese attaccato ad un respiratore e sono salvo per miracolo. Ma questo Ã¨ niente in confronto con il #LongCovid che mi porto dietro da mesi e mesi. Siete dei criminali a pensare ch\'Ã¨ meglio curare che prevenire. Dei pazzi da rinchiudere',
+                    'A chi dice ""Il COVID è innocuo per i bambini"".   Oltre ad alcuni decessi 500+ bambini sono morti di COVID negli USA 2020)  c\'è #LongCOVID.  Se ne parla in questo studio:   ""Studio inglese rileva che il COVID a lungo colpisce fino a 1 bambino su 7 mesi dopo l\'infezione']
+                     q1_data_topic_list=['0. Discussion about scientific studies','1. Anxiety about pandemic and the information about it OR Specific people in the context of LC',
+                    '2. Discussion about LC impact in terms of time periods','3. Discussion about LC impact on patient life (impact on life so far or scope for lifelong impact)' ,
+                    '4. Treatment scenario', '5. Impact/Consequences of LC on children']
+                     topic_dist_list=[[(0, 0.2181524), (1, 0.13380228), (2, 0.021277282), (3, 0.48123622), (4, 0.01883339), (5, 0.12669843)],
+                    [(0, 0.0145399235), (1, 0.01287178), (2, 0.43158862), (3, 0.24750596), (4, 0.264914), (5, 0.028579665)],
+                    [(0, 0.016303344), (1, 0.014450405), (2, 0.36162496), (3, 0.48426068), (4, 0.023487965), (5, 0.09987263)],
+                    [(0, 0.018612841), (1, 0.016472807), (2, 0.44922927), (3, 0.033633586), (4, 0.026889767), (5, 0.45516175)],
+                    [(0, 0.016305258), (1, 0.014453228), (2, 0.7628153), (3, 0.029092493), (4, 0.14613572), (5, 0.031198042)],
+                    [(0, 0.016303508), (1, 0.014449066), (2, 0.15605325), (3, 0.029179793), (4, 0.023376595), (5, 0.7606378)]]
+                     topics = '<html><body>\
+                        <h3><b>Topics July to Sept, 2021</b></h3>\
+                        <ol type="1">\
+                        <li>1. Discussion about scientific studies</li>\
+                        <li>2. Anxiety about pandemic and the information about it OR Specific people in the context of LC</li>\
+                        <li>3. Discussion about LC impact in terms of time periods</li>\
+                        <li>4. Discussion about LC impact on patient life (impact on life so far or scope for lifelong impact)</li>\
+                        <li>5. Treatment scenario</li>\
+                        <li>6. Impact/Consequences of LC on children</li>\
+                        </ol>\
+                        </body></html>'
+                     Q1_topics = gr.HTML(topics, visible=True)
+                     gr.Markdown(
+                        """
+                    ### Test our topic modeling model : select a tweet and check the topics distribution !
+                    """
+                    )
+                     tweet = gr.Dropdown(tweets_list, label="Example tweets", interactive=True, type="index")
+                     model_output = gr.HTML("", visible=False)
+                     tweet.change(display_output, tweet, model_output)
+                 with gr.TabItem("October 2021-July 2022"):
+                     topic_dist_list_Q2_Q4=[[(0, 0.4377157), (1, 0.05924045), (2, 0.1525337), (3, 0.1941842), (4, 0.075339705), (5, 0.08098622)],
+                    [(0, 0.16064012), (1, 0.063850455), (2, 0.08664099), (3, 0.2870743), (4, 0.081202514), (5, 0.32059166)],
+                    [(0, 0.14904374), (1, 0.059243646), (2, 0.08039133), (3, 0.26638654), (4, 0.07534457), (5, 0.36959016)],
+                    [(0, 0.14897935), (1, 0.059245925), (2, 0.08039324), (3, 0.41068354), (4, 0.14752874), (5, 0.15316921)],
+                    [(0, 0.089826144), (1, 0.069229595), (2, 0.09393969), (3, 0.5643193), (4, 0.08804329), (5, 0.09464199)],
+                    [(0, 0.08284077), (1, 0.29718927), (2, 0.08663448), (3, 0.36485678), (4, 0.08119658), (5, 0.08728213)]]
+                     with gr.Row():
+                            gr.Image("./wordclouds_Q2-Q2 data.png", label="October 2021-July 2022")
+                     Q2_Q4_topics = '<html><body>\
+                            <h3><b>Topics October 2021 to July 2022</b></h3>\
+                            <ol type="1">\
+                            <li>1. Variants</li>\
+                            <li>2. Vaccine side-effects (and general anti-vax/ anti-LC narrative)</li>\
+                            <li>3. Aftermath of LC or vaccine</li>\
+                            <li>4. Impact of LC in terms of time OR Risks/Symptoms of LC</li>\
+                            <li>5.  Anger or anxiety about LC information</li>\
+                            <li>6. Discussion or Information about the science/knowledge surrounding LC</li>\
+                            </ol>\
+                            </body></html>'
+                     Q2_Q4_topics_html = gr.HTML(Q2_Q4_topics, visible=True)
+                     tweet_list_Q2_Q4=["Omicron e Long Covid: palpitazioni e perdita d'udito tra i sintomi - #Omicron #Covid: #palpitazioni ",
+                    'Long Covid e trombosi. La correlazione è spiegata da Giovanni Esposito, Presidente GISE, in un articolo sul sito  https://t.co/8TdI9nhDHY e avvalorata da uno studio svedese pubblicato sul British Medical Journal.  https://t.co/UebaXUtfbz',
+                    'Peccato che il ""long COVID"" che è proprio ciò di cui parla l\'esimio dottore citato determini una alterazione o soppressione del sistema immunitario di cui si sa ancora poco ma che può portare a conseguenze fatali per il paziente.',
+                    'Il Long covid rappresentava un problema solo fino ad aprile 2021, i vaccini hanno molto ridotto l\'impatto e la gravitÃ delle patologie a lungo termine, in pratica si puÃ² dire che il long covid non esiste piÃ¹',
+                    'Sicuro, 100-150 morti al giorno, 6 ondate l anno, rischio long covid, rischio evoluzionario, e via dicendo — finitissimo',
+                    'le cure le fai giorno dopo giorno... ci sono casi di long-covid dopo 6 mesi dall\'infezione. [Vaccino > >Cure] è un dato di fatto',
+                    'A parte il rischio di sviluppare il #longcovid, il pericolo grave di lasciar circolare il virus e di farlo diventare endemico come preconizza il governo e lo sciagurato #speranza non Ã¨ nel decorso del singolo caso ma nell\'aumento proporzionale dell\'insorgere di nuove varianti']
+                     gr.Markdown(
+                        """
+                    ### Test our topic modeling model : select a tweet and check the topics distribution !
+                    """
+                    )
+                     tweet_Q2_Q4 = gr.Dropdown(tweet_list_Q2_Q4, label="Example tweets", interactive=True, type="index")
+                     model_output_Q2_Q4 = gr.HTML("", visible=False)
+                     tweet_Q2_Q4.change(display_output_Q2_Q4, tweet_Q2_Q4, model_output_Q2_Q4)
+        with gr.TabItem("Word frequency"):
+            inputs =  [gr.Radio(choices = ['TFIDF', 'Whole_text'], label = 'Choose ur method'),
+                      gr.Textbox(label = 'word 1'),
+                      gr.Textbox(label = 'word 2'),
+                      gr.Textbox(label = 'word 3')]
+            plot_output = gr.Plot()
+            freq_button = gr.Button("Submit")
+            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)
+            gr.Examples(
+              examples= [['TFIDF', 'Stanchezza', "l'età", '#LongCovidKids'], ['Whole_text', 'nebbia mentale', 'mal di testa', 'Ansia']],
+              inputs= inputs)
+        with gr.TabItem("Sentiment analysis"):
+            text_input =  gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')
+            sent_plot = gr.Plot()
+            sent_button = gr.Button("Submit")
+            sent_button.click(display_plot, inputs=text_input, outputs= sent_plot)
+demo.launch(debug=True, show_error = True);