madoss committed on
Commit
c8ef7fe
·
1 Parent(s): 697528c

update app

Browse files
Files changed (1) hide show
  1. app.py +12 -52
app.py CHANGED
@@ -8,12 +8,11 @@ from wordcloud import WordCloud
8
  import nltk
9
  import plotly.express as px
10
  import plotly.graph_objects as go
11
- import pickle
12
  import warnings
 
13
  nltk.download('stopwords')
14
- from nltk.corpus import stopwords
15
  nlp = spacy.load("fr_core_news_sm")
16
- stopword = stopwords.words('french')
17
 
18
  warnings.filterwarnings('ignore')
19
  from nltk import FreqDist
@@ -40,45 +39,24 @@ def clean_data(df):
40
 
41
  df_clean = clean_data(df)
42
 
43
-
44
- def clean_up1(row: str, stopword, pos=None):
45
- """ Args : text
46
- Returns : List
47
- - Remove `\xa0` and `\u200a`
48
- - Remove word with length lower than 2"""
49
-
50
- texts = row.replace(f'\xa0', '')
51
- texts = texts.replace(f'\u200a', '')
52
- text_ = " ".join([token for token in texts.split() if token.isalpha() and len(token) > 2])
53
- texts = nlp(text_)
54
- if pos is not None:
55
- list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword \
56
- and token.pos_ not in pos]
57
-
58
- else:
59
- list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword]
60
-
61
- return list_tokens
62
-
63
  # Part of Speech to be remove : 'ADV' refers to adverb, 'ADJ' refers to Adjective
64
  pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
65
  # Some frequently used in the podcast
66
- context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
67
  'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
68
- 'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
69
- stopword = stopword + context # add some frequent words in stopword
 
 
 
70
 
71
- clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
72
- docs = clean_text.apply(lambda x: " ".join(x)).tolist()
73
  #load the model
74
- topic_model = BERTopic.load("bertopic.pkl")
75
 
76
- with open('topics', 'rb') as f:
77
- topics = pickle.load(f)
78
 
79
-
80
- timestamps = df_clean.index
81
- topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
82
  global_tuning=True,
83
  evolution_tuning=True,
84
  nr_bins=20)
@@ -116,34 +94,16 @@ podcast_per_year.rename(columns ={'index' :'year', 'year' : 'nb_podcast'}, inpla
116
  fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
117
  fig.update_traces(textposition="bottom right")
118
  fig.update_layout(autosize=False, width=800)
119
- #barchart of number of podcast per year
120
- #fig1 = px.pie(data_frame=podcast_per_year, values = 'nb_podcast', names= 'year', hole=.4)
121
- fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
122
- values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
123
- #fig1.update_layout(autosize=False, width=800)
124
- fig1.update_traces(textposition='inside', textinfo='value+label')
125
 
126
  st.write('''
127
  # Nous sommes la moyenne des personnes que nous fréquentons.
128
  Hello''')
129
 
130
- st.header('Nombre de podcasts par année')
131
-
132
- st.plotly_chart(fig1, use_container_width=False,
133
- sharing="streamlit")
134
-
135
  st.header('Durée moyenne des podcasts par année')
136
  st.plotly_chart(fig, use_container_width=False,
137
  sharing="streamlit")
138
  #word cloud of all terms
139
  st.header('Les mots fréquemment utilisés dans le podcast')
140
- text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
141
- wordcloud = WordCloud(background_color='white').generate(text_cloud)
142
- fig, ax = plt.subplots()
143
- ax.imshow(wordcloud, interpolation='bilinear')
144
- plt.axis("off")
145
- plt.show()
146
- st.pyplot(fig)
147
  #show topics
148
  st.header('Sujets évoqués dans le podcast')
149
  st.plotly_chart(topic_fig, use_container_width=False,
 
8
  import nltk
9
  import plotly.express as px
10
  import plotly.graph_objects as go
11
+ import json
12
  import warnings
13
+ from datetime import datetime
14
  nltk.download('stopwords')
 
15
  nlp = spacy.load("fr_core_news_sm")
 
16
 
17
  warnings.filterwarnings('ignore')
18
  from nltk import FreqDist
 
39
 
40
  df_clean = clean_data(df)
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Part of Speech to be remove : 'ADV' refers to adverb, 'ADJ' refers to Adjective
43
  pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
44
  # Some frequently used in the podcast
45
+ context = ['ouais', 'épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
46
  'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
47
+ 'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait','abonnez', 'parce',
48
+ 'ouai', 'sai', 'it', 'do', 'mets', 'yourself','si', 'chose','oui', 'truc', 'dessus', 'traite',
49
+ 'that'] # add some frequent words in stopword
50
+ with open('./clean_docs.json', 'r') as f:
51
+ clean_text = json.load(f)
52
 
53
+ docs = clean_text['text']
 
54
  #load the model
55
+ topic_model = BERTopic.load("./model_dir/")
56
 
 
 
57
 
58
+ timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
59
+ topics_over_time = topic_model.topics_over_time(docs, timestamps,
 
60
  global_tuning=True,
61
  evolution_tuning=True,
62
  nr_bins=20)
 
94
  fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
95
  fig.update_traces(textposition="bottom right")
96
  fig.update_layout(autosize=False, width=800)
 
 
 
 
 
 
97
 
98
  st.write('''
99
  # Nous sommes la moyenne des personnes que nous fréquentons.
100
  Hello''')
101
 
 
 
 
 
 
102
  st.header('Durée moyenne des podcasts par année')
103
  st.plotly_chart(fig, use_container_width=False,
104
  sharing="streamlit")
105
  #word cloud of all terms
106
  st.header('Les mots fréquemment utilisés dans le podcast')
 
 
 
 
 
 
 
107
  #show topics
108
  st.header('Sujets évoqués dans le podcast')
109
  st.plotly_chart(topic_fig, use_container_width=False,