madoss committed on
Commit
f2b56a9
·
1 Parent(s): 7629f8d
Files changed (2) hide show
  1. .gitattributes +1 -0
  2. app.py +51 -33
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
32
  *.psd filter=lfs diff=lfs merge=lfs -text
 
 
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
32
  *.psd filter=lfs diff=lfs merge=lfs -text
33
+ clean_docs.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -6,14 +6,15 @@ import seaborn as sns
6
  from bertopic import BERTopic
7
  from wordcloud import WordCloud
8
  import nltk
 
 
 
 
9
  nltk.download('stopwords')
10
  from nltk.corpus import stopwords
11
- import pickle
12
- import plotly.express as px
13
-
14
  nlp = spacy.load("fr_core_news_sm")
15
  stopword = stopwords.words('french')
16
- import warnings
17
  warnings.filterwarnings('ignore')
18
  from nltk import FreqDist
19
 
@@ -22,12 +23,16 @@ df = pd.read_csv("gdiy_data.csv", sep=',',
22
 
23
 
24
  def clean_data(df):
 
 
 
25
  df = df.drop('Unnamed: 0', axis=1)
26
  df['description'] = df['description'].str.lower()
27
  df = df.set_index('release_date')
 
28
  df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
29
  df.loc[:, 'duration_min'] = df['duration_ms'].apply(
30
- lambda row: row / (60 * 1000)) # convertir la durée de ms en minutes
31
  df['year'] = df.index.year
32
  df['month'] = df.index.month
33
  return df
@@ -37,9 +42,10 @@ df_clean = clean_data(df)
37
 
38
 
39
  def clean_up1(row: str, stopword, pos=None):
40
- """ Prend une un text:
41
- - Supprime les caractères `\xa0` et `\u200a`
42
- - Supprime les mots avec moins de lettres """
 
43
 
44
  texts = row.replace(f'\xa0', '')
45
  texts = texts.replace(f'\u200a', '')
@@ -54,18 +60,18 @@ def clean_up1(row: str, stopword, pos=None):
54
 
55
  return list_tokens
56
 
57
-
58
- pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
59
-
60
  context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
61
  'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
62
  'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
63
- stopword = stopword + context # add some frequent words in the documents
64
 
65
  clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
66
  docs = clean_text.apply(lambda x: " ".join(x)).tolist()
67
-
68
- topic_model = BERTopic.load("bertopic")
69
 
70
  with open('topics', 'rb') as f:
71
  topics = pickle.load(f)
@@ -76,23 +82,24 @@ topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
76
  global_tuning=True,
77
  evolution_tuning=True,
78
  nr_bins=20)
79
-
80
  time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
81
-
 
 
82
  topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
83
  topics_over_time.set_index('Timestamp', inplace=True)
84
  topics_over_time['year'] = topics_over_time.index.year
85
  topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
86
-
87
  topic_fig = topic_model.visualize_barchart(n_words=10)
88
-
89
- fig1, ax = plt.subplots()
90
- sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis');
91
 
92
 
93
- # plt.ylabel('Nombre de podcasts');
94
-
95
  def wordscloud(text: str):
 
 
 
96
  WordCloud()
97
  word_cloud = WordCloud(background_color='white').generate(text)
98
  fig, ax = plt.subplots()
@@ -102,9 +109,19 @@ def wordscloud(text: str):
102
  st.pyplot(fig)
103
 
104
 
105
- data = df_clean.resample('Y')['duration_min'].mean()
 
 
 
106
  fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
107
  fig.update_traces(textposition="bottom right")
 
 
 
 
 
 
 
108
 
109
  st.write('''
110
  # Nous sommes la moyenne des personnes que nous fréquentons.
@@ -112,12 +129,13 @@ Hello''')
112
 
113
  st.header('Nombre de podcasts par année')
114
 
115
- st.write(fig1)
 
116
 
117
  st.header('Durée moyenne des podcasts par année')
118
  st.plotly_chart(fig, use_container_width=False,
119
  sharing="streamlit")
120
-
121
  st.header('Les mots fréquemment utilisés dans le podcast')
122
  text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
123
  wordcloud = WordCloud(background_color='white').generate(text_cloud)
@@ -126,35 +144,35 @@ ax.imshow(wordcloud, interpolation='bilinear')
126
  plt.axis("off")
127
  plt.show()
128
  st.pyplot(fig)
129
-
130
  st.header('Sujets évoqués dans le podcast')
131
  st.plotly_chart(topic_fig, use_container_width=False,
132
  sharing="streamlit")
133
-
134
  st.header('Sujets évoqués au cours du temps dans le podcast')
135
  st.plotly_chart(time_fig, use_container_width=False,
136
  sharing="streamlit")
137
-
138
  st.header('Sujets en 2O17')
139
  text = topic_per_year[2017].replace(',', "")
140
  wordscloud(text)
141
-
142
  st.header('Sujets en 2O18')
143
  text = topic_per_year[2018].replace(',', "")
144
  wordscloud(text)
145
-
146
  st.header('Sujets en 2O19')
147
  text = topic_per_year[2019].replace(',', "")
148
  wordscloud(text)
149
-
150
  st.header('Sujets en 2O20')
151
  text = topic_per_year[2020].replace(',', "")
152
  wordscloud(text)
153
-
154
  st.header('Sujets en 2O21')
155
  text = topic_per_year[2021].replace(',', "")
156
  wordscloud(text)
157
-
158
  st.header('Sujets en 2O22')
159
  text = topic_per_year[2022].replace(',', "")
160
  wordscloud(text)
 
6
  from bertopic import BERTopic
7
  from wordcloud import WordCloud
8
  import nltk
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ import pickle
12
+ import warnings
13
  nltk.download('stopwords')
14
  from nltk.corpus import stopwords
 
 
 
15
  nlp = spacy.load("fr_core_news_sm")
16
  stopword = stopwords.words('french')
17
+
18
  warnings.filterwarnings('ignore')
19
  from nltk import FreqDist
20
 
 
23
 
24
 
25
  def clean_data(df):
26
+ '''
27
+ args : pd DataFrame
28
+ Return : pd DataFrame'''
29
  df = df.drop('Unnamed: 0', axis=1)
30
  df['description'] = df['description'].str.lower()
31
  df = df.set_index('release_date')
32
+ # Remove EXTRAIT and REDIFF in the podcasts
33
  df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
34
  df.loc[:, 'duration_min'] = df['duration_ms'].apply(
35
+ lambda row: row / (60 * 1000)) # convert duration in minutes
36
  df['year'] = df.index.year
37
  df['month'] = df.index.month
38
  return df
 
42
 
43
 
44
  def clean_up1(row: str, stopword, pos=None):
45
+ """ Args : text
46
+ Returns : List
47
+ - Remove `\xa0` and `\u200a`
48
+ - Remove words with length lower than 2"""
49
 
50
  texts = row.replace(f'\xa0', '')
51
  texts = texts.replace(f'\u200a', '')
 
60
 
61
  return list_tokens
62
 
63
+ # Parts of speech to be removed: 'ADV' refers to adverb, 'ADJ' refers to adjective
64
+ pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
65
+ # Some words frequently used in the podcast
66
  context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
67
  'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
68
  'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
69
+ stopword = stopword + context # add some frequent words in stopword
70
 
71
  clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
72
  docs = clean_text.apply(lambda x: " ".join(x)).tolist()
73
+ #load the model
74
+ topic_model = BERTopic.load("bertopic.pkl")
75
 
76
  with open('topics', 'rb') as f:
77
  topics = pickle.load(f)
 
82
  global_tuning=True,
83
  evolution_tuning=True,
84
  nr_bins=20)
85
+ #visualize topics over time
86
  time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
87
+ time_fig.update(layout_showlegend=False)
88
+ time_fig.update_layout(autosize=False, width=800, height=400,)
89
+ #group topics per year
90
  topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
91
  topics_over_time.set_index('Timestamp', inplace=True)
92
  topics_over_time['year'] = topics_over_time.index.year
93
  topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
94
+ # barchart of topics
95
  topic_fig = topic_model.visualize_barchart(n_words=10)
96
+ topic_fig.update_layout(autosize=False, width=800)
 
 
97
 
98
 
 
 
99
  def wordscloud(text: str):
100
+ ''' compute wordcloud of some strings
101
+ Args : text is strings format
102
+ Return : matplotlib figure'''
103
  WordCloud()
104
  word_cloud = WordCloud(background_color='white').generate(text)
105
  fig, ax = plt.subplots()
 
109
  st.pyplot(fig)
110
 
111
 
112
+ data = df_clean.resample('Y')['duration_min'].mean() #average duration per year
113
+ podcast_per_year = df_clean['year'].value_counts().reset_index() # count the number of podcasts per year
114
+ podcast_per_year.rename(columns ={'index' :'year', 'year' : 'nb_podcast'}, inplace=True) #rename columns
115
+ #visualize duration by the year
116
  fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
117
  fig.update_traces(textposition="bottom right")
118
+ fig.update_layout(autosize=False, width=800)
119
+ #pie chart of number of podcasts per year
120
+ #fig1 = px.pie(data_frame=podcast_per_year, values = 'nb_podcast', names= 'year', hole=.4)
121
+ fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
122
+ values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
123
+ #fig1.update_layout(autosize=False, width=800)
124
+ fig1.update_traces(textposition='inside', textinfo='value+label')
125
 
126
  st.write('''
127
  # Nous sommes la moyenne des personnes que nous fréquentons.
 
129
 
130
  st.header('Nombre de podcasts par année')
131
 
132
+ st.plotly_chart(fig1, use_container_width=False,
133
+ sharing="streamlit")
134
 
135
  st.header('Durée moyenne des podcasts par année')
136
  st.plotly_chart(fig, use_container_width=False,
137
  sharing="streamlit")
138
+ #word cloud of all terms
139
  st.header('Les mots fréquemment utilisés dans le podcast')
140
  text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
141
  wordcloud = WordCloud(background_color='white').generate(text_cloud)
 
144
  plt.axis("off")
145
  plt.show()
146
  st.pyplot(fig)
147
+ #show topics
148
  st.header('Sujets évoqués dans le podcast')
149
  st.plotly_chart(topic_fig, use_container_width=False,
150
  sharing="streamlit")
151
+ #show topics over years
152
  st.header('Sujets évoqués au cours du temps dans le podcast')
153
  st.plotly_chart(time_fig, use_container_width=False,
154
  sharing="streamlit")
155
+ #Terms used in 2017
156
  st.header('Sujets en 2O17')
157
  text = topic_per_year[2017].replace(',', "")
158
  wordscloud(text)
159
+ #Terms used in 2018
160
  st.header('Sujets en 2O18')
161
  text = topic_per_year[2018].replace(',', "")
162
  wordscloud(text)
163
+ #Terms used in 2019
164
  st.header('Sujets en 2O19')
165
  text = topic_per_year[2019].replace(',', "")
166
  wordscloud(text)
167
+ #Terms used in 2020
168
  st.header('Sujets en 2O20')
169
  text = topic_per_year[2020].replace(',', "")
170
  wordscloud(text)
171
+ #Terms used in 2021
172
  st.header('Sujets en 2O21')
173
  text = topic_per_year[2021].replace(',', "")
174
  wordscloud(text)
175
+ #Terms used in 2022
176
  st.header('Sujets en 2O22')
177
  text = topic_per_year[2022].replace(',', "")
178
  wordscloud(text)