# gdiy / app.py
# madoss's picture
# update requirements
# 3aa558d
# raw
# history blame
# 5.04 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from bertopic import BERTopic
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
import json
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')
# Load the raw episode table; `release_date` is parsed as a datetime column.
df = pd.read_csv(
    "gdiy_data.csv",
    sep=",",
    parse_dates=["release_date"],
)
def clean_data(df):
    """Prepare the raw episode table for analysis.

    Args:
        df: pandas DataFrame with at least the columns ``name``,
            ``description``, ``release_date`` (datetime-like) and
            ``duration_ms``. A stray ``Unnamed: 0`` column is dropped when
            present.

    Returns:
        A new DataFrame indexed by ``release_date`` in which:
        - rows whose ``name`` starts with ``[EXTRAIT]`` or ``[REDIFF]`` are
          removed,
        - ``description`` is lower-cased,
        - ``duration_min`` (duration in minutes), ``year`` and ``month``
          columns are added.
        The input DataFrame is left unmodified.
    """
    excluded_prefixes = ('[EXTRAIT]', '[REDIFF]')

    # errors='ignore': the column only exists when the CSV was saved together
    # with its index, so its absence should not be fatal.
    cleaned = df.drop(columns='Unnamed: 0', errors='ignore')
    cleaned = cleaned.set_index('release_date')

    # Iterate over the values instead of looking them up with integer keys:
    # the index holds dates, so integer lookups would rely on a deprecated
    # positional fallback.
    keep = [not name.startswith(excluded_prefixes) for name in cleaned['name']]

    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a slice of the input.
    cleaned = cleaned.loc[keep].copy()

    cleaned['description'] = cleaned['description'].str.lower()
    cleaned['duration_min'] = cleaned['duration_ms'] / (60 * 1000)  # ms -> min
    cleaned['year'] = cleaned.index.year
    cleaned['month'] = cleaned.index.month
    return cleaned
# Cleaned episode table (result of `clean_data` applied to the raw CSV data).
df_clean = clean_data(df)
# Part-of-speech tags to be removed from the text
# (e.g. 'ADV' is an adverb, 'ADJ' an adjective).
pos = [
    'ADV',
    'PRON',
    'CCONJ',
    'PUNCT',
    'DET',
    'ADP',
    'SPACE',
    'ADJ',
    'VERB',
]

# Words used frequently in the podcast, to be added to the stop words.
context = [
    'ouais', 'épisode', 'faire', 'morgan', 'prudhomme', 'lire',
    'génération', 'podcast', 'gdiy',
    'recommande', 'deux', 'quand', 'the', 'livre', 'être', 'yourself',
    'orso', 'doi', 'an',
    'merci', 'avoir', 'timeline', 'face', 'million', 'monde', 'vie', 'and',
    'fait', 'abonnez', 'parce',
    'ouai', 'sai', 'it', 'do', 'mets', 'yourself', 'si', 'chose', 'oui',
    'truc', 'dessus', 'traite',
    'that',
]
# Load the pre-processed documents. The encoding is given explicitly so that
# accented text is decoded the same way regardless of the platform's default
# locale encoding.
with open('./clean_docs.json', 'r', encoding='utf-8') as f:
    clean_text = json.load(f)
# The 'text' entry holds the documents fed to the topic model.
docs = clean_text['text']
# Load the saved BERTopic model from disk.
topic_model = BERTopic.load("./model_dir/")

# The 'date' entries are day/month/year strings; turn them into datetimes.
timestamps = [
    datetime.strptime(raw_date, "%d/%m/%Y") for raw_date in clean_text["date"]
]

# Topic representations over time, computed on 20 time bins.
topics_over_time = topic_model.topics_over_time(
    docs,
    timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)

# Chart of the topics over time (top_n_topics=10), without legend and with a
# fixed 800x400 size.
time_fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
)
time_fig.update_layout(
    showlegend=False,
    autosize=False,
    width=800,
    height=400,
)
# Group the topic words per year.
# Rows with Topic == -1 are discarded first (presumably BERTopic's outlier
# topic -- confirm against the BERTopic documentation).
# The filtered frame is copied and re-indexed without `inplace=True` so that
# adding the `year` column below does not write to a slice of the original
# frame (which raises pandas' SettingWithCopyWarning).
topics_over_time = (
    topics_over_time[topics_over_time['Topic'] != -1]
    .copy()
    .set_index('Timestamp')
)
topics_over_time['year'] = topics_over_time.index.year
# One space-separated string of topic words per year.
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(
    lambda words: words.str.cat(sep=' ')
)
# Bar chart of the topics (n_words=10 words per topic), fixed to a width of 800.
topic_fig = topic_model.visualize_barchart(n_words=10)
topic_fig.update_layout(autosize=False, width=800)
def wordscloud(text: str):
    """Render a word cloud of ``text`` in the Streamlit page.

    Args:
        text: The words to draw, as a single string.

    Returns:
        None. The figure is displayed with ``st.pyplot`` and then closed.
    """
    word_cloud = WordCloud(background_color='white').generate(text)

    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    # Act on this figure's axes rather than on pyplot's global "current axes".
    ax.axis("off")

    st.pyplot(fig)
    # Close the figure once it has been handed to Streamlit; otherwise every
    # call leaves another open figure registered in pyplot.
    plt.close(fig)
# Average episode duration (in minutes) per year.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of
# 'YE' -- confirm against the pinned pandas version before changing it.
data = df_clean.resample('Y')['duration_min'].mean()

# Number of podcasts per year, as a frame with columns `year` / `nb_podcast`.
# The index and the counts are named explicitly instead of renaming the
# default column labels, because those defaults differ between pandas
# versions ('index'/'year' before 2.0, 'year'/'count' from 2.0 on).
podcast_per_year = (
    df_clean['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='nb_podcast')
)

# Line chart of the average duration per year, labelled with the integer part.
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
fig.update_layout(autosize=False, width=800)
# Keyword arguments shared by every Plotly chart on the page.
_chart_options = {"use_container_width": False, "sharing": "streamlit"}

# Page title and introduction.
st.write('\n# Nous sommes la moyenne des personnes que nous fréquentons.\nHello')

# Average duration per year.
st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, **_chart_options)

# Word cloud of all terms.
# NOTE(review): only the header is emitted here; nothing is rendered under it.
st.header('Les mots fréquemment utilisés dans le podcast')

# Topics bar chart.
st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, **_chart_options)

# Topics over the years.
st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, **_chart_options)
# One section per year from 2017 to 2022: a header followed by the word cloud
# of that year's topic words (commas stripped first).
# The header is built from the year itself, so the digits are always real
# zeros (the hand-written titles used a capital letter "O").
for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    text = topic_per_year[year].replace(',', "")
    wordscloud(text)