Spaces:

madoss
/

gdiy

Runtime error

App Files Community

madoss committed on Jul 9, 2023

Commit

c8ef7fe

1 Parent(s): 697528c

update app

Browse files

Files changed (1) hide show

app.py +12 -52

app.py CHANGED Viewed

@@ -8,12 +8,11 @@ from wordcloud import WordCloud
 import nltk
 import plotly.express as px
 import plotly.graph_objects as go
-import pickle
 import warnings
 nltk.download('stopwords')
-from nltk.corpus import stopwords
 nlp = spacy.load("fr_core_news_sm")
-stopword = stopwords.words('french')
 warnings.filterwarnings('ignore')
 from nltk import FreqDist
@@ -40,45 +39,24 @@ def clean_data(df):
 df_clean = clean_data(df)
-def clean_up1(row: str, stopword, pos=None):
-    """ Args : text
-    Returns : List
-    - Remove `\xa0` and `\u200a`
-    - Remove word with length lower than 2"""
-    texts = row.replace(f'\xa0', '')
-    texts = texts.replace(f'\u200a', '')
-    text_ = " ".join([token for token in texts.split() if token.isalpha() and len(token) > 2])
-    texts = nlp(text_)
-    if pos is not None:
-        list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword \
-                       and token.pos_ not in pos]
-    else:
-        list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword]
-    return list_tokens
 # Part of Speech to be remove : 'ADV' refers to adverb, 'ADJ' refers to Adjective
 pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
 # Some frequently used in the podcast
-context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
            'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
-           'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
-stopword = stopword + context # add some frequent words in stopword
-clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
-docs = clean_text.apply(lambda x: " ".join(x)).tolist()
 #load the model
-topic_model = BERTopic.load("bertopic.pkl")
-with open('topics', 'rb') as f:
-    topics = pickle.load(f)
-timestamps = df_clean.index
-topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                 global_tuning=True,
                                                 evolution_tuning=True,
                                                 nr_bins=20)
@@ -116,34 +94,16 @@ podcast_per_year.rename(columns ={'index' :'year', 'year' : 'nb_podcast'}, inpla
 fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
 fig.update_traces(textposition="bottom right")
 fig.update_layout(autosize=False, width=800)
-#barchart of number of podcast per year
-#fig1 = px.pie(data_frame=podcast_per_year, values = 'nb_podcast', names= 'year', hole=.4)
-fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
-                              values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
-#fig1.update_layout(autosize=False, width=800)
-fig1.update_traces(textposition='inside', textinfo='value+label')
 st.write('''
 # Nous sommes la moyenne des personnes que nous fréquentons.
 Hello''')
-st.header('Nombre de podcasts par année')
-st.plotly_chart(fig1, use_container_width=False,
-                sharing="streamlit")
 st.header('Durée moyenne des podcasts par année')
 st.plotly_chart(fig, use_container_width=False,
                 sharing="streamlit")
 #word cloud of all terms
 st.header('Les mots fréquemment utilisés dans le podcast')
-text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
-wordcloud = WordCloud(background_color='white').generate(text_cloud)
-fig, ax = plt.subplots()
-ax.imshow(wordcloud, interpolation='bilinear')
-plt.axis("off")
-plt.show()
-st.pyplot(fig)
 #show topics
 st.header('Sujets évoqués dans le podcast')
 st.plotly_chart(topic_fig, use_container_width=False,

 import nltk
 import plotly.express as px
 import plotly.graph_objects as go
+import json
 import warnings
+from datetime import datetime
 nltk.download('stopwords')
 nlp = spacy.load("fr_core_news_sm")
 warnings.filterwarnings('ignore')
 from nltk import FreqDist
 df_clean = clean_data(df)
 # Part of Speech to be remove : 'ADV' refers to adverb, 'ADJ' refers to Adjective
 pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
 # Some frequently used in the podcast
+context = ['ouais', 'épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
            'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
+           'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait','abonnez', 'parce',
+             'ouai', 'sai', 'it', 'do', 'mets', 'yourself','si', 'chose','oui', 'truc', 'dessus', 'traite',
+             'that'] # add some frequent words in stopword
+with open('./clean_docs.json', 'r') as f:
+    clean_text = json.load(f)
+docs = clean_text['text']
 #load the model
+topic_model = BERTopic.load("./model_dir/")
+timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
+topics_over_time = topic_model.topics_over_time(docs, timestamps,
                                                 global_tuning=True,
                                                 evolution_tuning=True,
                                                 nr_bins=20)
 fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
 fig.update_traces(textposition="bottom right")
 fig.update_layout(autosize=False, width=800)
 st.write('''
 # Nous sommes la moyenne des personnes que nous fréquentons.
 Hello''')
 st.header('Durée moyenne des podcasts par année')
 st.plotly_chart(fig, use_container_width=False,
                 sharing="streamlit")
 #word cloud of all terms
 st.header('Les mots fréquemment utilisés dans le podcast')
 #show topics
 st.header('Sujets évoqués dans le podcast')
 st.plotly_chart(topic_fig, use_container_width=False,