add app
- app.py +160 -0
- gdiy_data.csv +0 -0
- requirements.txt +198 -0
app.py
ADDED
@@ -0,0 +1,160 @@
import warnings

import streamlit as st
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle  # imported but never used below
from bertopic import BERTopic
from nltk import FreqDist  # imported but never used below
from nltk.corpus import stopwords
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

nlp = spacy.load("fr_core_news_sm")
stopword = stopwords.words('french')
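# Note (assumption, not in the original commit): `stopwords.words('french')`
# above raises LookupError on a machine where the NLTK stopword corpus has
# never been downloaded. A guard such as the following, placed before that
# call, keeps the app self-contained:
#
#     import nltk
#     try:
#         nltk.data.find('corpora/stopwords')
#     except LookupError:
#         nltk.download('stopwords')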

df = pd.read_csv("gdiy_data.csv", sep=',',
                 parse_dates=['release_date'])  # parse `release_date` as a datetime column

def clean_data(df):
    df = df.drop('Unnamed: 0', axis=1)
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')
    # drop excerpt and rebroadcast episodes
    df = df[~df['name'].str.startswith(('[EXTRAIT]', '[REDIFF]'))]
    df.loc[:, 'duration_min'] = df['duration_ms'].apply(
        lambda row: row / (60 * 1000))  # convert the duration from ms to minutes
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df

df_clean = clean_data(df)

def clean_up1(row: str, stopword, pos=None):
    r"""Clean one episode description:
    - strip the '\xa0' and '\u200a' space characters
    - keep only alphabetic tokens longer than two letters
    - lemmatize with spaCy, dropping stopwords and, if `pos` is given,
      tokens whose part-of-speech tag is listed in `pos`."""

    texts = row.replace('\xa0', '')
    texts = texts.replace('\u200a', '')
    text_ = " ".join([token for token in texts.split() if token.isalpha() and len(token) > 2])
    texts = nlp(text_)
    if pos is not None:
        list_tokens = [token.lemma_ for token in texts
                       if token.lemma_ not in stopword and token.pos_ not in pos]
    else:
        list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword]

    return list_tokens

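# For illustration (hypothetical call, not from the dataset): with the
# `stopword` and `pos` lists defined below, a sentence like
# "il recommande deux livres sur la création et les entreprises" keeps only
# alphabetic tokens longer than two letters, lemmatizes them, and should come
# out roughly as ['création', 'entreprise'] -- everything else is a stopword,
# a filtered part of speech, or a corpus-specific word from `context`.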

pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

# frequent corpus-specific words to ignore on top of the NLTK stopwords
context = ['épisode', 'faire', 'morgan', 'prudhomme', 'lire', 'génération', 'podcast', 'gdiy',
           'recommande', 'deux', 'quand', 'the', 'livre', 'être', 'yourself', 'orso', 'doi', 'an',
           'merci', 'avoir', 'timeline', 'face', 'million', 'monde', 'vie', 'and', 'fait']
stopword = stopword + context

clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
docs = clean_text.apply(lambda x: " ".join(x)).tolist()

topic_model = BERTopic(language="multilingual",
                       nr_topics=6,
                       top_n_words=30,
                       low_memory=True,
                       n_gram_range=(1, 2))

topics, _ = topic_model.fit_transform(docs)

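# A possible refinement (assumption, not part of the original commit):
# Streamlit re-executes the whole script on every interaction, so the
# BERTopic fit above is repeated on each page load. On the Streamlit
# versions of this era the expensive step could be wrapped in a cached
# function, e.g.:
#
#     @st.cache(allow_output_mutation=True)
#     def fit_topics(docs):
#         model = BERTopic(language="multilingual", nr_topics=6,
#                          top_n_words=30, low_memory=True,
#                          n_gram_range=(1, 2))
#         topics, _ = model.fit_transform(docs)
#         return model, topics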

topic_fig = topic_model.visualize_barchart(n_words=10)

timestamps = df_clean.index
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]  # drop the outlier topic
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))

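# `topic_per_year` is a Series indexed by year whose values concatenate the
# topic words of every time bin falling in that year; the per-year word
# clouds at the bottom of the page are generated from these strings.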

fig1, ax = plt.subplots()
sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis')
# plt.ylabel('Nombre de podcasts')

def wordscloud(text: str):
    """Render a word cloud for `text` in the Streamlit app."""
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    st.pyplot(fig)

data = df_clean.resample('Y')['duration_min'].mean()
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")

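# `data` holds the mean episode duration in minutes per calendar year
# (year-end resampling); `text=data.astype('int')` labels each marker with
# the truncated value.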

st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')

st.header('Nombre de podcasts par année')

st.write(fig1)

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
                sharing="streamlit")

st.header('Les mots fréquemment utilisés dans le podcast')
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
wordscloud(text_cloud)

st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
                sharing="streamlit")

st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
                sharing="streamlit")

for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    text = topic_per_year[year].replace(',', "")
    wordscloud(text)
gdiy_data.csv
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,198 @@
aiohttp==3.8.1
aiosignal==1.2.0
altair==4.2.0
anyio==3.5.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
async-timeout==4.0.2
attrs==21.4.0
Babel==2.9.1
backcall==0.2.0
bertopic==0.11.0
black==22.1.0
bleach==4.1.0
blinker==1.5
blis==0.7.6
bpemb==0.3.3
cachetools==5.2.0
catalogue==2.0.7
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.11
click==8.0.3
cloudpickle==2.1.0
colorama==0.4.4
commonmark==0.9.1
conllu==4.5.1
cycler==0.11.0
cymem==2.0.6
Cython==0.29.23
datasets==2.2.2
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.13
dill==0.3.4
docopt==0.6.2
entrypoints==0.4
executing==0.8.2
fastjsonschema==2.16.1
filelock==3.6.0
flair==0.11.3
fonttools==4.29.1
fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl
frozenlist==1.3.0
fsspec==2022.5.0
ftfy==6.1.1
funcy==1.17
future==0.18.2
gdown==4.4.0
gitdb==4.0.9
GitPython==3.1.27
hdbscan==0.8.28
huggingface-hub==0.4.0
hyperopt==0.2.7
idna==3.3
importlib-metadata==3.10.1
ipykernel==6.9.0
ipython==8.0.1
ipython-genutils==0.2.0
ipywidgets==7.7.0
Janome==0.4.2
jedi==0.18.1
Jinja2==3.0.3
joblib==1.1.0
Js2Py==0.71
json5==0.9.6
jsonschema==4.4.0
kiwisolver==1.3.2
konoha==4.6.5
langcodes==3.3.0
langdetect==1.0.9
llvmlite==0.38.1
lxml==4.8.0
MarkupSafe==2.0.1
matplotlib==3.5.1
matplotlib-inline==0.1.3
mistune==0.8.4
more-itertools==8.13.0
mpld3==0.3
multidict==6.0.2
multiprocess==0.70.12.2
murmurhash==1.0.6
mypy-extensions==0.4.3
nest-asyncio==1.5.4
networkx==2.8.4
nltk==3.7
numba==0.55.2
numexpr==2.8.1
numpy==1.22.4
overrides==3.1.0
packaging==21.3
pandas==1.4.0
pandoc==2.0.1
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.9.0
pathy==0.6.1
pickleshare==0.7.5
Pillow==9.0.1
pipwin==0.5.2
platformdirs==2.4.1
plotly==5.9.0
plumbum==1.7.2
ply==3.11
pptree==3.1
preshed==3.0.6
prometheus-client==0.13.1
prompt-toolkit==3.0.27
protobuf==3.20.1
pure-eval==0.2.2
py4j==0.10.9.5
pyarrow==8.0.0
pycparser==2.21
pydantic==1.8.2
pydeck==0.7.1
Pygments==2.11.2
pyjsparser==2.7.1
pylibscrypt==2.0.0
pymongo==4.0.2
Pympler==1.0.1
pynndescent==0.5.7
pyparsing==3.0.7
PyPrind==2.11.3
pyrsistent==0.18.1
pySmartDL==1.3.4
PySocks==1.7.1
python-dateutil==2.8.2
pytz==2021.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==5.4.1
pyzmq==22.3.0
redis==4.3.4
regex==2022.3.15
requests==2.27.1
responses==0.18.0
rich==12.5.1
sacremoses==0.0.49
scikit-learn==1.0.2
scipy==1.8.0
scrypt==0.8.20
seaborn==0.11.2
segtok==1.5.11
semver==2.13.0
Send2Trash==1.8.0
sentence-transformers==2.2.2
sentencepiece==0.1.95
seqeval==1.2.2
six==1.16.0
sklearn==0.0
smart-open==5.2.1
smmap==5.0.0
sniffio==1.2.0
soupsieve==2.3.1
spacy==3.2.3
spacy-legacy==3.0.9
spacy-loggers==1.0.1
spotipy==2.20.0
sqlitedict==2.0.0
srsly==2.4.2
stack-data==0.1.4
tabulate==0.8.10
tenacity==8.0.1
terminado==0.13.1
testpath==0.5.0
thinc==8.0.15
threadpoolctl==3.1.0
tinycss2==1.1.1
tokenizers==0.12.0
toml==0.10.2
tomli==2.0.1
toolz==0.12.0
torch==1.11.0
torchtext==0.12.0
torchvision==0.12.0
tornado==6.1
tqdm==4.63.0
traitlets==5.1.1
transformers==4.17.0
typer==0.4.0
typing_extensions==4.0.1
tzdata==2022.1
tzlocal==4.2
umap-learn==0.5.3
urllib3==1.26.8
validators==0.20.0
voluptuous==0.13.1
wasabi==0.9.0
watchdog==2.1.9
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.2.3
widgetsnbextension==3.6.0
wordcloud==1.8.2.2
wrapt==1.14.1
xxhash==3.0.0
yarl==1.7.2
zipp==3.8.1