import streamlit as st |
import pandas as pd |
import matplotlib.pyplot as plt |
from bertopic import BERTopic |
from wordcloud import WordCloud |
import plotly.express as px |
import plotly.graph_objects as go |
import json |
import warnings |
from datetime import datetime |
# Silence every warning category for the whole process so library
# deprecation notices do not clutter the Streamlit output.
warnings.filterwarnings(action='ignore')

# Raw episode table; 'release_date' is parsed to datetime at load time
# because clean_data() later uses it as a DatetimeIndex.
df = pd.read_csv(
    "gdiy_data.csv",
    sep=',',
    parse_dates=['release_date'],
)
def clean_data(df):
    '''Prepare the raw episode table for analysis.

    Steps performed:
      * drop the stray 'Unnamed: 0' column if it is present,
      * lower-case the 'description' column,
      * index the frame by 'release_date',
      * remove episodes whose name starts with '[EXTRAIT]' or '[REDIFF]',
      * add 'duration_min' (from 'duration_ms'), 'year' and 'month' columns.

    Args : pd DataFrame with at least the columns 'name', 'description',
        'duration_ms' and a datetime 'release_date'.
    Return : a new, cleaned pd DataFrame (the input is not modified).'''
    skipped_prefixes = ('[EXTRAIT]', '[REDIFF]')

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    # errors='ignore' keeps the function usable on a CSV saved without the
    # leftover index column.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')

    # Iterate over the values directly: positional `series[i]` on a
    # datetime-indexed Series is deprecated, and non-string names (e.g. NaN)
    # must not crash the filter.
    keep = [
        not (isinstance(name, str) and name.startswith(skipped_prefixes))
        for name in df['name']
    ]
    # Explicit copy so the column assignments below never write into a view.
    df = df.loc[keep].copy()

    # Milliseconds -> minutes, vectorised.
    df['duration_min'] = df['duration_ms'] / (60 * 1000)
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df
# Cleaned episode table used by the duration chart further down.
df_clean = clean_data(df)

# Part-of-speech tags and context-specific words. They are not referenced by
# the code visible in this script (presumably stop-lists from the text
# pre-processing step -- TODO confirm); kept so the module namespace is
# unchanged.
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
context = ['ouais', 'épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
           'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
           'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait','abonnez', 'parce',
           'ouai', 'sai', 'it', 'do', 'mets', 'yourself','si', 'chose','oui', 'truc', 'dessus', 'traite',
           'that']

# Pre-processed documents: a JSON object with parallel 'text' and 'date'
# lists. The encoding is given explicitly so accented French text is decoded
# the same way on every platform instead of relying on the locale default.
with open('./clean_docs.json', 'r', encoding='utf-8') as f:
    clean_text = json.load(f)
docs = clean_text['text']

# Previously fitted topic model.
topic_model = BERTopic.load("./model_dir/")

# Dates are stored as day/month/year strings.
timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
topics_over_time = topic_model.topics_over_time(
    docs,
    timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)

# Topic-frequency-over-time figure (top 10 topics), fixed size, no legend.
time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
time_fig.update(layout_showlegend=False)
time_fig.update_layout(autosize=False, width=800, height=400)

# Discard rows for topic -1 (BERTopic's outlier bucket) and index by
# timestamp. set_index() returns a new frame, so adding the 'year' column
# below does not write into a filtered slice of the original.
topics_over_time = (
    topics_over_time[topics_over_time['Topic'] != -1]
    .set_index('Timestamp')
)
topics_over_time['year'] = topics_over_time.index.year

# One space-joined string of topic words per year; feeds the word clouds.
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))

# Bar chart of the 10 most representative words per topic.
topic_fig = topic_model.visualize_barchart(n_words=10)
topic_fig.update_layout(autosize=False, width=800)
def wordscloud(text: str):
    '''Draw a word cloud of `text` and display it in the Streamlit page.

    Args : text is a string of words to draw
    Return : None (the matplotlib figure is sent to Streamlit with
        st.pyplot and then closed)'''
    word_cloud = WordCloud(background_color='white').generate(text)

    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    # Use the Axes object rather than pyplot's implicit "current axes" so the
    # call cannot affect another figure.
    ax.axis("off")

    st.pyplot(fig)
    # The script is re-executed on every Streamlit interaction; close the
    # figure once it has been rendered so open figures do not pile up.
    plt.close(fig)
# Mean episode duration (minutes) per calendar year.
# NOTE(review): the 'Y' resample alias is reported as deprecated in recent
# pandas releases in favour of 'YE' -- confirm against the pinned version
# before changing it.
data = df_clean.resample('Y')['duration_min'].mean()

# Number of episodes per year as a two-column frame ('year', 'nb_podcast').
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that
# labelling changed between pandas 1.x and 2.x).
podcast_per_year = (
    df_clean['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='nb_podcast')
)

# Line chart of the yearly mean duration, labelled with the truncated value.
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
fig.update_layout(autosize=False, width=800)

st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
                sharing="streamlit")

st.header('Les mots fréquemment utilisés dans le podcast')

st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
                sharing="streamlit")

st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
                sharing="streamlit")

# One word cloud per year, 2017 through 2022. The headers previously spelled
# the year with a capital letter "O" instead of a zero ("2O17").
for year in range(2017, 2023):
    # A year with no topic rows is skipped instead of raising KeyError.
    if year not in topic_per_year.index:
        continue
    st.header(f'Sujets en {year}')
    # Topic words arrive comma-separated; strip the commas before drawing.
    text = topic_per_year[year].replace(',', "")
    wordscloud(text)