File size: 5,223 Bytes
d139382 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import streamlit as st
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from wordcloud import WordCloud
from nltk.corpus import stopwords
import pickle
import plotly.express as px
nlp = spacy.load("fr_core_news_sm")
stopword = stopwords.words('french')
import warnings
warnings.filterwarnings('ignore')
from nltk import FreqDist
df = pd.read_csv("gdiy_data.csv", sep=',',
parse_dates=['release_date']) # use `release_date` as date in pandas
def clean_data(df):
df = df.drop('Unnamed: 0', axis=1)
df['description'] = df['description'].str.lower()
df = df.set_index('release_date')
df = df.loc[[not (df['name'][i].startswith(('[EXTRAIT]', '[REDIFF]'))) for i in range(len(df))]]
df.loc[:, 'duration_min'] = df['duration_ms'].apply(
lambda row: row / (60 * 1000)) # convertir la durée de ms en minutes
df['year'] = df.index.year
df['month'] = df.index.month
return df
df_clean = clean_data(df)
def clean_up1(row: str, stopword, pos=None):
""" Prend une un text:
- Supprime les caractères `\xa0` et `\u200a`
- Supprime les mots avec moins de lettres """
texts = row.replace(f'\xa0', '')
texts = texts.replace(f'\u200a', '')
text_ = " ".join([token for token in texts.split() if token.isalpha() and len(token) > 2])
texts = nlp(text_)
if pos is not None:
list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword \
and token.pos_ not in pos]
else:
list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword]
return list_tokens
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
stopword = stopword + context # add some frequent words in the documents
clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
docs = clean_text.apply(lambda x: " ".join(x)).tolist()
topic_model = BERTopic(language="multilingual",
nr_topics=6,
top_n_words=30,
low_memory=True,
n_gram_range=(1, 2))
topics, _ = topic_model.fit_transform(docs)
topic_fig = topic_model.visualize_barchart(n_words=10)
timestamps = df_clean.index
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
global_tuning=True,
evolution_tuning=True,
nr_bins=20)
time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
fig1, ax = plt.subplots()
sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis');
# plt.ylabel('Nombre de podcasts');
def wordscloud(text: str):
WordCloud()
word_cloud = WordCloud(background_color='white').generate(text)
fig, ax = plt.subplots()
ax.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
st.pyplot(fig)
data = df_clean.resample('Y')['duration_min'].mean()
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')
st.header('Nombre de podcasts par année')
st.write(fig1)
st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
sharing="streamlit")
st.header('Les mots fréquemment utilisés dans le podcast')
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
wordcloud = WordCloud(background_color='white').generate(text_cloud)
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
st.pyplot(fig)
st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
sharing="streamlit")
st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
sharing="streamlit")
st.header('Sujets en 2O17')
text = topic_per_year[2017].replace(',', "")
wordscloud(text)
st.header('Sujets en 2O18')
text = topic_per_year[2018].replace(',', "")
wordscloud(text)
st.header('Sujets en 2O19')
text = topic_per_year[2019].replace(',', "")
wordscloud(text)
st.header('Sujets en 2O20')
text = topic_per_year[2020].replace(',', "")
wordscloud(text)
st.header('Sujets en 2O21')
text = topic_per_year[2021].replace(',', "")
wordscloud(text)
st.header('Sujets en 2O22')
text = topic_per_year[2022].replace(',', "")
wordscloud(text)
|