File size: 6,572 Bytes
d139382
 
 
 
 
 
 
8fabf7b
dcc8e89
 
 
 
8fabf7b
d139382
 
 
dcc8e89
d139382
 
 
 
 
 
 
 
dcc8e89
 
 
d139382
 
 
dcc8e89
d139382
 
dcc8e89
d139382
 
 
 
 
 
 
 
 
dcc8e89
 
 
 
d139382
 
 
 
 
 
 
 
 
 
 
 
 
 
dcc8e89
 
 
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
664ec70
d139382
1013d06
 
d139382
 
 
 
 
 
 
dcc8e89
d139382
dcc8e89
 
 
d139382
 
 
 
dcc8e89
1013d06
dcc8e89
d139382
 
 
dcc8e89
 
 
d139382
 
 
 
 
 
 
 
 
dcc8e89
 
 
 
d139382
 
dcc8e89
 
 
 
 
 
 
d139382
 
 
 
 
 
 
dcc8e89
 
d139382
 
 
 
dcc8e89
d139382
 
 
 
 
 
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
dcc8e89
d139382
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import streamlit as st
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from wordcloud import WordCloud
import nltk
import plotly.express as px
import plotly.graph_objects as go
import pickle
import warnings
# Fetch the NLTK stop-word corpus before it is read a few lines below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# French spaCy pipeline; clean_up1 calls it to obtain lemmas and POS tags.
nlp = spacy.load("fr_core_news_sm")
# Base French stop-word list; it is extended further down with podcast-specific terms.
stopword = stopwords.words('french')

# Silence every warning raised from this point on.
warnings.filterwarnings('ignore')
# NOTE(review): FreqDist is not referenced in the visible part of this file.
from nltk import FreqDist

# Load the raw episode table; `release_date` is parsed into datetimes on read.
df = pd.read_csv(
    "gdiy_data.csv",
    sep=",",
    parse_dates=["release_date"],
)


def clean_data(df):
    """Prepare the raw episode table for analysis.

    Steps, in order:
      * drop the leftover ``Unnamed: 0`` column if it is present,
      * lower-case ``description``,
      * index the frame by ``release_date``,
      * discard rows whose ``name`` starts with ``[EXTRAIT]`` or ``[REDIFF]``,
      * add ``duration_min`` (``duration_ms`` converted to minutes),
        ``year`` and ``month`` (both taken from the index).

    Args:
        df: DataFrame with at least the columns ``name``, ``description``,
            ``duration_ms`` and a datetime ``release_date``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    excluded_prefixes = ('[EXTRAIT]', '[REDIFF]')

    # `drop` returns a new frame, so the assignments below never touch the
    # caller's object. `errors='ignore'` tolerates a CSV saved without the
    # index column.
    df = df.drop('Unnamed: 0', axis=1, errors='ignore')
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')

    # Iterate over the values directly: integer keys on a Series that carries
    # a DatetimeIndex only work through a deprecated positional fallback.
    keep = [not name.startswith(excluded_prefixes) for name in df['name']]
    # Explicit copy so the new columns are not written into a slice.
    df = df.loc[keep].copy()

    df['duration_min'] = df['duration_ms'] / (60 * 1000)  # milliseconds -> minutes
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df


# Cleaned episode table, indexed by release date; the analysis below reads from it.
df_clean = clean_data(df)


def clean_up1(row: str, stopword, pos=None):
    """Tokenise and lemmatise one text, dropping unwanted words.

    Args:
        row: the raw text to process.
        stopword: collection of lemmas to discard.
        pos: optional collection of POS tags (compared against ``token.pos_``);
            tokens carrying one of these tags are discarded. ``None`` disables
            the POS filter.

    Returns:
        List of lemmas, in order of appearance, for the words that
        - are purely alphabetic and longer than 2 characters,
        - are not in ``stopword``,
        - do not carry a tag listed in ``pos``.
    """
    # `\xa0` (no-break space) and `\u200a` (hair space) are whitespace.
    # Replace them with a plain space rather than nothing: deleting them glues
    # the neighbouring tokens together (e.g. "mot\xa0:" -> "mot:"), and the
    # alphabetic filter below would then throw the word away.
    text = row.replace('\xa0', ' ').replace('\u200a', ' ')
    kept_words = [word for word in text.split() if word.isalpha() and len(word) > 2]
    doc = nlp(" ".join(kept_words))

    # Sets give constant-time membership tests for every token.
    stop_lemmas = frozenset(stopword)
    banned_pos = frozenset(pos) if pos is not None else frozenset()

    return [token.lemma_ for token in doc
            if token.lemma_ not in stop_lemmas and token.pos_ not in banned_pos]

# Parts of speech to remove, e.g. 'ADV' is adverb and 'ADJ' is adjective.
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']
# Words that come up so often in the podcast descriptions that they carry no signal.
context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
           'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
           'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
stopword = stopword + context  # extend the stop words with the frequent words above

# One list of lemmas per episode, then one space-joined document per episode.
clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
docs = clean_text.apply(lambda x: " ".join(x)).tolist()
# Load the saved topic model.
topic_model = BERTopic.load("bertopic.pkl")

# NOTE(security): pickle.load can execute arbitrary code; 'topics' must only
# ever be a file produced by this project, never user-supplied data.
with open('topics', 'rb') as f:
    topics = pickle.load(f)


timestamps = df_clean.index
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)
# Figure: topics over time.
time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
time_fig.update(layout_showlegend=False)
time_fig.update_layout(autosize=False, width=800, height=400,)
# Group the topic words per year. Rows with Topic == -1 are excluded
# (presumably BERTopic's outlier bucket - confirm against the model).
# `set_index` returns a new frame, so the `year` column below is added to an
# independent object rather than to a slice of the filtered frame.
topics_over_time = topics_over_time[topics_over_time['Topic'] != -1].set_index('Timestamp')
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
# Figure: bar chart of the words of each topic.
topic_fig = topic_model.visualize_barchart(n_words=10)
topic_fig.update_layout(autosize=False, width=800)


def wordscloud(text: str):
    """Render a word cloud of ``text`` in the Streamlit page.

    Args:
        text: the words to display, as a single string.

    Returns:
        None. The figure is sent to the page with ``st.pyplot``. Nothing is
        returned on purpose, so that a bare ``wordscloud(...)`` statement has
        no value that could be echoed to the page a second time.
    """
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; release
    # it so figures do not pile up each time the script is re-run.
    plt.close(fig)


data = df_clean.resample('Y')['duration_min'].mean()  # mean episode duration per calendar year
# Number of episodes per year. Naming the index and the value column
# explicitly yields the columns `year` / `nb_podcast` whatever names
# `value_counts` gives its result (these changed in pandas 2.0).
podcast_per_year = (
    df_clean['year'].value_counts()
    .rename_axis('year')
    .reset_index(name='nb_podcast')
)
# Line chart: mean duration per year.
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
fig.update_layout(autosize=False, width=800)
# Pie chart: number of episodes per year, with the third slice pulled out.
pull = [0.2 if position == 2 else 0 for position in range(len(podcast_per_year))]
fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
                              values=podcast_per_year['nb_podcast'], pull=pull)])
fig1.update_traces(textposition='inside', textinfo='value+label')

st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')

st.header('Nombre de podcasts par année')

st.plotly_chart(fig1, use_container_width=False,
                sharing="streamlit")

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
                sharing="streamlit")
# Word cloud of all terms, drawn with the same helper as the per-year clouds.
st.header('Les mots fréquemment utilisés dans le podcast')
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
wordscloud(text_cloud)
# Show the topics.
st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
                sharing="streamlit")
# Show the topics over the years.
st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
                sharing="streamlit")
# One word cloud per year, 2017 through 2022. The headers previously spelled
# the year with a capital letter O in place of the zero.
for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    wordscloud(topic_per_year[year].replace(',', ""))