app
Browse files
- .gitattributes +1 -0
- app.py +51 -33
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.psd filter=lfs diff=lfs merge=lfs -text
+clean_docs.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -6,14 +6,15 @@ import seaborn as sns
 from bertopic import BERTopic
 from wordcloud import WordCloud
 import nltk
+import plotly.express as px
+import plotly.graph_objects as go
+import pickle
+import warnings
 nltk.download('stopwords')
 from nltk.corpus import stopwords
-import pickle
-import plotly.express as px
-
 nlp = spacy.load("fr_core_news_sm")
 stopword = stopwords.words('french')
 
 warnings.filterwarnings('ignore')
 from nltk import FreqDist
 
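Note on the import changes: plotly.express, plotly.graph_objects, pickle and warnings now sit with the other imports, and the new `import warnings` makes the later `warnings.filterwarnings('ignore')` call self-contained. The French spaCy pipeline and the NLTK stopword corpus used here are downloaded artifacts rather than plain pip dependencies; a minimal environment sketch (standard package and model names assumed, not part of this commit):

import nltk
import spacy
from nltk.corpus import stopwords

# One-time setup, assuming the standard distributions:
#   pip install streamlit bertopic spacy nltk wordcloud plotly
#   python -m spacy download fr_core_news_sm
nltk.download('stopwords')            # cached locally after the first run
nlp = spacy.load("fr_core_news_sm")   # raises OSError if the model was never downloaded
stopword = stopwords.words('french')  # base French stopword list the app extends later
print(len(stopword))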
@@ -22,12 +23,16 @@ df = pd.read_csv("gdiy_data.csv", sep=',',
 
 
 def clean_data(df):
+    '''
+    Args: pd.DataFrame
+    Returns: pd.DataFrame'''
     df = df.drop('Unnamed: 0', axis=1)
     df['description'] = df['description'].str.lower()
     df = df.set_index('release_date')
+    # Remove the [EXTRAIT] and [REDIFF] episodes from the podcast list
     df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
     df.loc[:, 'duration_min'] = df['duration_ms'].apply(
-        lambda row: row / (60 * 1000))  #
+        lambda row: row / (60 * 1000))  # convert the duration to minutes
     df['year'] = df.index.year
     df['month'] = df.index.month
     return df
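The [EXTRAIT]/[REDIFF] filter in clean_data loops over positional indices, and `df['name'][i]` mixes positional lookup with the release_date index that was just set. A vectorized equivalent (a sketch, assuming `name` holds plain strings and a pandas version where Series.str.startswith accepts a tuple):

import pandas as pd

def drop_reruns(df: pd.DataFrame) -> pd.DataFrame:
    # Same effect as the list-comprehension filter above, without positional indexing
    mask = df['name'].str.startswith(('[EXTRAIT]', '[REDIFF]'))
    return df.loc[~mask]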
@@ -37,9 +42,10 @@ df_clean = clean_data(df)
 
 
 def clean_up1(row: str, stopword, pos=None):
-    """
-
-    """
+    """ Args: text (str)
+    Returns: list of tokens
+    - Removes `\xa0` and `\u200a`
+    - Removes words shorter than 2 characters"""
 
     texts = row.replace(f'\xa0', '')
     texts = texts.replace(f'\u200a', '')
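The body of clean_up1 between the docstring and the `texts = ...` lines is untouched by this commit and not shown here. Purely as a hypothetical reconstruction of what the new docstring describes (spaCy lemmatization, part-of-speech filtering, stopword and short-token removal), it could look like:

import spacy

nlp = spacy.load("fr_core_news_sm")

def clean_up_sketch(row: str, stopword, pos=None):
    # Hypothetical stand-in for the real clean_up1 body (not part of the commit):
    # strip the non-breaking/hair spaces, lemmatize, then filter the tokens.
    pos = pos or []
    texts = row.replace('\xa0', '').replace('\u200a', '')
    list_tokens = [
        tok.lemma_.lower()
        for tok in nlp(texts)
        if tok.pos_ not in pos                    # drop unwanted part-of-speech tags
        and tok.lemma_.lower() not in stopword    # drop French stopwords and context words
        and len(tok.lemma_) >= 2                  # drop words shorter than 2 characters
    ]
    return list_tokens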
@@ -54,18 +60,18 @@ def clean_up1(row: str, stopword, pos=None):
 
     return list_tokens
 
-
-pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
-
+# Part-of-speech tags to be removed: e.g. 'ADV' is adverb, 'ADJ' is adjective
+pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']  # part-of-speech tags to remove
+# Some words used frequently in the podcast
 context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
            'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
            'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
-stopword = stopword + context # add some frequent words in
+stopword = stopword + context  # add some frequent words to the stopword list
 
 clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
 docs = clean_text.apply(lambda x: " ".join(x)).tolist()
-
-topic_model = BERTopic.load("bertopic")
+# load the pre-trained BERTopic model
+topic_model = BERTopic.load("bertopic.pkl")
 
 with open('topics', 'rb') as f:
     topics = pickle.load(f)
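The model path changes from "bertopic" to "bertopic.pkl"; either name works as long as it matches the path the model was saved under. A sketch of the offline save step that this load and the `topics` pickle assume (the training call is hypothetical, with `docs` being the cleaned descriptions built above):

import pickle
from bertopic import BERTopic

# Hypothetical offline step: fit the model once, then persist the artifacts the app loads.
topic_model = BERTopic(language="multilingual")
topics, probs = topic_model.fit_transform(docs)   # docs: list of cleaned episode descriptions

topic_model.save("bertopic.pkl")                  # must match BERTopic.load("bertopic.pkl")
with open('topics', 'wb') as f:
    pickle.dump(topics, f)                        # read back by the pickle.load above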
@@ -76,23 +82,24 @@ topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                 global_tuning=True,
                                                 evolution_tuning=True,
                                                 nr_bins=20)
-
+# visualize topics over time
 time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
-
+time_fig.update(layout_showlegend=False)
+time_fig.update_layout(autosize=False, width=800, height=400,)
+# group topics per year
 topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
 topics_over_time.set_index('Timestamp', inplace=True)
 topics_over_time['year'] = topics_over_time.index.year
 topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
-
+# bar chart of the topics
 topic_fig = topic_model.visualize_barchart(n_words=10)
-
-fig1, ax = plt.subplots()
-sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis');
+topic_fig.update_layout(autosize=False, width=800)
 
 
-# plt.ylabel('Nombre de podcasts');
-
 def wordscloud(text: str):
+    ''' Compute a word cloud from a string.
+    Args: text, a string
+    Returns: a matplotlib figure'''
     WordCloud()
     word_cloud = WordCloud(background_color='white').generate(text)
     fig, ax = plt.subplots()
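Only the head of wordscloud appears in this hunk (its closing st.pyplot(fig) is in the next one); the middle of the function is unchanged. For reference, a complete version consistent with the standalone word-cloud block later in the file (a sketch, not the literal committed body) would be:

import matplotlib.pyplot as plt
import streamlit as st
from wordcloud import WordCloud

def wordscloud(text: str):
    '''Compute a word cloud from a string and render it in Streamlit.'''
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)

The bare WordCloud() call kept on the first line of the committed function has no effect and could be dropped.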
@@ -102,9 +109,19 @@ def wordscloud(text: str):
     st.pyplot(fig)
 
 
-data = df_clean.resample('Y')['duration_min'].mean()
+data = df_clean.resample('Y')['duration_min'].mean()  # average episode duration per year
+podcast_per_year = df_clean['year'].value_counts().reset_index()  # count the number of podcasts per year
+podcast_per_year.rename(columns={'index': 'year', 'year': 'nb_podcast'}, inplace=True)  # rename the columns
+# visualize the average duration by year
 fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
 fig.update_traces(textposition="bottom right")
+fig.update_layout(autosize=False, width=800)
+# pie chart of the number of podcasts per year
+# fig1 = px.pie(data_frame=podcast_per_year, values='nb_podcast', names='year', hole=.4)
+fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
+                              values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
+# fig1.update_layout(autosize=False, width=800)
+fig1.update_traces(textposition='inside', textinfo='value+label')
 
 st.write('''
 # Nous sommes la moyenne des personnes que nous fréquentons.
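One portability caveat with the new pie-chart data: the rename of 'index'/'year' assumes the pandas < 2.0 output of Series.value_counts().reset_index(); on pandas >= 2.0 the columns come back as 'year' and 'count', so this rename would mislabel them and the go.Pie lookups would fail. A version-agnostic sketch using the same df_clean frame:

# Count podcasts per year without depending on the pandas version
podcast_per_year = (
    df_clean['year']
    .value_counts()
    .rename_axis('year')               # name the index (the year values) explicitly
    .reset_index(name='nb_podcast')    # name the count column explicitly
)

Note also that the hard-coded pull=[0, 0, 0.2, 0, 0, 0] assumes exactly six year slices (2017 through 2022).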
@@ -112,12 +129,13 @@ Hello''')
 
 st.header('Nombre de podcasts par année')
 
-st.
+st.plotly_chart(fig1, use_container_width=False,
+                sharing="streamlit")
 
 st.header('Durée moyenne des podcasts par année')
 st.plotly_chart(fig, use_container_width=False,
                 sharing="streamlit")
-
+# word cloud of all terms
 st.header('Les mots fréquemment utilisés dans le podcast')
 text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
 wordcloud = WordCloud(background_color='white').generate(text_cloud)
@@ -126,35 +144,35 @@ ax.imshow(wordcloud, interpolation='bilinear')
 plt.axis("off")
 plt.show()
 st.pyplot(fig)
-
+# show the topics
 st.header('Sujets évoqués dans le podcast')
 st.plotly_chart(topic_fig, use_container_width=False,
                 sharing="streamlit")
-
+# show the topics over the years
 st.header('Sujets évoqués au cours du temps dans le podcast')
 st.plotly_chart(time_fig, use_container_width=False,
                 sharing="streamlit")
-
+# terms used in 2017
 st.header('Sujets en 2017')
 text = topic_per_year[2017].replace(',', "")
 wordscloud(text)
-
+# terms used in 2018
 st.header('Sujets en 2018')
 text = topic_per_year[2018].replace(',', "")
 wordscloud(text)
-
+# terms used in 2019
 st.header('Sujets en 2019')
 text = topic_per_year[2019].replace(',', "")
 wordscloud(text)
-
+# terms used in 2020
 st.header('Sujets en 2020')
 text = topic_per_year[2020].replace(',', "")
 wordscloud(text)
-
+# terms used in 2021
 st.header('Sujets en 2021')
 text = topic_per_year[2021].replace(',', "")
 wordscloud(text)
-
+# terms used in 2022
 st.header('Sujets en 2022')
 text = topic_per_year[2022].replace(',', "")
 wordscloud(text)
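The six "Sujets en 20XX" blocks differ only in the year. A loop over the grouped Series would keep them in sync if more years are added (a sketch reusing the topic_per_year and wordscloud names defined above):

import streamlit as st

# Equivalent to the six hard-coded sections above, one per year present in the data
for year, words in topic_per_year.items():
    st.header(f'Sujets en {year}')
    wordscloud(words.replace(',', ''))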
|