Spaces:

madoss
/

gdiy

Runtime error

App Files Files Community

madoss committed on Jul 9, 2023

Commit

f2b56a9

1 Parent(s): 7629f8d

app

Browse files

Files changed (2) hide show

.gitattributes +1 -0
app.py +51 -33

.gitattributes CHANGED Viewed

@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.psd filter=lfs diff=lfs merge=lfs -text

 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.psd filter=lfs diff=lfs merge=lfs -text
+clean_docs.json filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -6,14 +6,15 @@ import seaborn as sns
 from bertopic import BERTopic
 from wordcloud import WordCloud
 import nltk
 nltk.download('stopwords')
 from nltk.corpus import stopwords
-import pickle
-import plotly.express as px
 nlp = spacy.load("fr_core_news_sm")
 stopword = stopwords.words('french')
-import warnings
 warnings.filterwarnings('ignore')
 from nltk import FreqDist
@@ -22,12 +23,16 @@ df = pd.read_csv("gdiy_data.csv", sep=',',
 def clean_data(df):
     df = df.drop('Unnamed: 0', axis=1)
     df['description'] = df['description'].str.lower()
     df = df.set_index('release_date')
     df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
     df.loc[:, 'duration_min'] = df['duration_ms'].apply(
-        lambda row: row / (60 * 1000))  # convertir la durée de ms en minutes
     df['year'] = df.index.year
     df['month'] = df.index.month
     return df
@@ -37,9 +42,10 @@ df_clean = clean_data(df)
 def clean_up1(row: str, stopword, pos=None):
-    """ Prend une un text:
-    - Supprime les caractères `\xa0` et `\u200a`
-    - Supprime les mots avec moins de lettres """
     texts = row.replace(f'\xa0', '')
     texts = texts.replace(f'\u200a', '')
@@ -54,18 +60,18 @@ def clean_up1(row: str, stopword, pos=None):
     return list_tokens
-pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
 context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
            'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
            'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
-stopword = stopword + context # add some frequent words in the documents
 clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
 docs = clean_text.apply(lambda x: " ".join(x)).tolist()
-topic_model = BERTopic.load("bertopic")
 with open('topics', 'rb') as f:
     topics = pickle.load(f)
@@ -76,23 +82,24 @@ topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                 global_tuning=True,
                                                 evolution_tuning=True,
                                                 nr_bins=20)
 time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
 topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
 topics_over_time.set_index('Timestamp', inplace=True)
 topics_over_time['year'] = topics_over_time.index.year
 topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
 topic_fig = topic_model.visualize_barchart(n_words=10)
-fig1, ax = plt.subplots()
-sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis');
-# plt.ylabel('Nombre de podcasts');
 def wordscloud(text: str):
     WordCloud()
     word_cloud = WordCloud(background_color='white').generate(text)
     fig, ax = plt.subplots()
@@ -102,9 +109,19 @@ def wordscloud(text: str):
     st.pyplot(fig)
-data = df_clean.resample('Y')['duration_min'].mean()
 fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
 fig.update_traces(textposition="bottom right")
 st.write('''
 # Nous sommes la moyenne des personnes que nous fréquentons.
@@ -112,12 +129,13 @@ Hello''')
 st.header('Nombre de podcasts par année')
-st.write(fig1)
 st.header('Durée moyenne des podcasts par année')
 st.plotly_chart(fig, use_container_width=False,
                 sharing="streamlit")
 st.header('Les mots fréquemment utilisés dans le podcast')
 text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
 wordcloud = WordCloud(background_color='white').generate(text_cloud)
@@ -126,35 +144,35 @@ ax.imshow(wordcloud, interpolation='bilinear')
 plt.axis("off")
 plt.show()
 st.pyplot(fig)
 st.header('Sujets évoqués dans le podcast')
 st.plotly_chart(topic_fig, use_container_width=False,
                 sharing="streamlit")
 st.header('Sujets évoqués au cours du temps dans le podcast')
 st.plotly_chart(time_fig, use_container_width=False,
                 sharing="streamlit")
 st.header('Sujets en 2O17')
 text = topic_per_year[2017].replace(',', "")
 wordscloud(text)
 st.header('Sujets en 2O18')
 text = topic_per_year[2018].replace(',', "")
 wordscloud(text)
 st.header('Sujets en 2O19')
 text = topic_per_year[2019].replace(',', "")
 wordscloud(text)
 st.header('Sujets en 2O20')
 text = topic_per_year[2020].replace(',', "")
 wordscloud(text)
 st.header('Sujets en 2O21')
 text = topic_per_year[2021].replace(',', "")
 wordscloud(text)
 st.header('Sujets en 2O22')
 text = topic_per_year[2022].replace(',', "")
 wordscloud(text)

 from bertopic import BERTopic
 from wordcloud import WordCloud
 import nltk
+import plotly.express as px
+import plotly.graph_objects as go
+import pickle
+import warnings
 nltk.download('stopwords')
 from nltk.corpus import stopwords
 nlp = spacy.load("fr_core_news_sm")
 stopword = stopwords.words('french')
 warnings.filterwarnings('ignore')
 from nltk import FreqDist
 def clean_data(df):
+    '''
+    Args : pd DataFrame
+    Returns : pd DataFrame'''
     df = df.drop('Unnamed: 0', axis=1)
     df['description'] = df['description'].str.lower()
     df = df.set_index('release_date')
+    # Remove the podcasts whose name starts with [EXTRAIT] or [REDIFF]
     df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
     df.loc[:, 'duration_min'] = df['duration_ms'].apply(
+        lambda row: row / (60 * 1000))  # convert duration in minutes
     df['year'] = df.index.year
     df['month'] = df.index.month
     return df
 def clean_up1(row: str, stopword, pos=None):
+    """ Args : text
+    Returns : List
+    - Remove `\xa0` and `\u200a`
+    - Remove words with length lower than 2"""
     texts = row.replace(f'\xa0', '')
     texts = texts.replace(f'\u200a', '')
     return list_tokens
+# Parts of speech to be removed: 'ADV' refers to adverb, 'ADJ' refers to adjective
+pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
+# Some words frequently used in the podcast
 context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
            'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
            'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
+stopword = stopword + context # add some frequent words in stopword
 clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
 docs = clean_text.apply(lambda x: " ".join(x)).tolist()
+#load the model
+topic_model = BERTopic.load("bertopic.pkl")
 with open('topics', 'rb') as f:
     topics = pickle.load(f)
                                                 global_tuning=True,
                                                 evolution_tuning=True,
                                                 nr_bins=20)
+#visualize topics over time
 time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
+time_fig.update(layout_showlegend=False)
+time_fig.update_layout(autosize=False, width=800, height=400,)
+#group topics per year
 topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
 topics_over_time.set_index('Timestamp', inplace=True)
 topics_over_time['year'] = topics_over_time.index.year
 topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
+# barchart of topics
 topic_fig = topic_model.visualize_barchart(n_words=10)
+topic_fig.update_layout(autosize=False, width=800)
 def wordscloud(text: str):
+    ''' Compute the word cloud of a text and display it
+    Args : text in string format
+    Displays : matplotlib figure (through st.pyplot)'''
     WordCloud()
     word_cloud = WordCloud(background_color='white').generate(text)
     fig, ax = plt.subplots()
     st.pyplot(fig)
+data = df_clean.resample('Y')['duration_min'].mean() #average duration per year
+podcast_per_year = df_clean['year'].value_counts().reset_index() # count the number of podcasts per year
+podcast_per_year.rename(columns ={'index' :'year', 'year' : 'nb_podcast'}, inplace=True) #rename columns
+#visualize duration by the year
 fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
 fig.update_traces(textposition="bottom right")
+fig.update_layout(autosize=False, width=800)
+#pie chart of the number of podcasts per year
+#fig1 = px.pie(data_frame=podcast_per_year, values = 'nb_podcast', names= 'year', hole=.4)
+fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
+                              values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
+#fig1.update_layout(autosize=False, width=800)
+fig1.update_traces(textposition='inside', textinfo='value+label')
 st.write('''
 # Nous sommes la moyenne des personnes que nous fréquentons.
 st.header('Nombre de podcasts par année')
+st.plotly_chart(fig1, use_container_width=False,
+                sharing="streamlit")
 st.header('Durée moyenne des podcasts par année')
 st.plotly_chart(fig, use_container_width=False,
                 sharing="streamlit")
+#word cloud of all terms
 st.header('Les mots fréquemment utilisés dans le podcast')
 text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
 wordcloud = WordCloud(background_color='white').generate(text_cloud)
 plt.axis("off")
 plt.show()
 st.pyplot(fig)
+#show topics
 st.header('Sujets évoqués dans le podcast')
 st.plotly_chart(topic_fig, use_container_width=False,
                 sharing="streamlit")
+#show topics over years
 st.header('Sujets évoqués au cours du temps dans le podcast')
 st.plotly_chart(time_fig, use_container_width=False,
                 sharing="streamlit")
+#Terms used in 2017
 st.header('Sujets en 2O17')
 text = topic_per_year[2017].replace(',', "")
 wordscloud(text)
+#Terms used in 2018
 st.header('Sujets en 2O18')
 text = topic_per_year[2018].replace(',', "")
 wordscloud(text)
+#Terms used in 2019
 st.header('Sujets en 2O19')
 text = topic_per_year[2019].replace(',', "")
 wordscloud(text)
+#Terms used in 2020
 st.header('Sujets en 2O20')
 text = topic_per_year[2020].replace(',', "")
 wordscloud(text)
+#Terms used in 2021
 st.header('Sujets en 2O21')
 text = topic_per_year[2021].replace(',', "")
 wordscloud(text)
+#Terms used in 2022
 st.header('Sujets en 2O22')
 text = topic_per_year[2022].replace(',', "")
 wordscloud(text)