update app
Browse files
app.py
CHANGED
@@ -8,12 +8,11 @@ from wordcloud import WordCloud
|
|
8 |
import nltk
|
9 |
import plotly.express as px
|
10 |
import plotly.graph_objects as go
|
11 |
-
import
|
12 |
import warnings
|
|
|
13 |
nltk.download('stopwords')
|
14 |
-
from nltk.corpus import stopwords
|
15 |
nlp = spacy.load("fr_core_news_sm")
|
16 |
-
stopword = stopwords.words('french')
|
17 |
|
18 |
warnings.filterwarnings('ignore')
|
19 |
from nltk import FreqDist
|
@@ -40,45 +39,24 @@ def clean_data(df):
|
|
40 |
|
41 |
df_clean = clean_data(df)
|
42 |
|
43 |
-
|
44 |
-
def clean_up1(row: str, stopword, pos=None):
    """Clean one raw text and return its filtered lemmas.

    Args:
        row: Raw text to clean.
        stopword: Collection of lemmas to discard.
        pos: Optional collection of part-of-speech tags; a token whose
            ``pos_`` is in it is discarded. ``None`` disables this filter.

    Returns:
        List of the lemmas that survive the filters.

    Steps:
        - Turn no-break spaces (U+00A0) and hair spaces (U+200A) into
          plain spaces.
        - Keep only purely alphabetic words longer than 2 characters.
        - Run the module-level ``nlp`` pipeline and keep each token's lemma
          unless it is a stopword or carries an excluded tag.
    """
    # Both characters are word separators. Deleting them (instead of replacing
    # them with a space) glues neighbouring words together, e.g. "mot\xa0!"
    # becomes "mot!", which then fails isalpha() and is silently dropped.
    texts = row.replace('\xa0', ' ').replace('\u200a', ' ')
    text_ = " ".join(
        token for token in texts.split()
        if token.isalpha() and len(token) > 2
    )
    doc = nlp(text_)

    # An empty tuple makes the tag filter a no-op when no tags were given.
    excluded_pos = () if pos is None else pos
    return [
        token.lemma_
        for token in doc
        if token.lemma_ not in stopword and token.pos_ not in excluded_pos
    ]
|
62 |
-
|
63 |
# Parts of speech to remove: 'ADV' refers to adverbs, 'ADJ' refers to adjectives
|
64 |
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
|
65 |
# Some words frequently used in the podcast
|
66 |
-
context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
|
67 |
'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
|
68 |
-
'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait'
|
69 |
-
|
|
|
|
|
|
|
70 |
|
71 |
-
|
72 |
-
docs = clean_text.apply(lambda x: " ".join(x)).tolist()
|
73 |
#load the model
|
74 |
-
topic_model = BERTopic.load("
|
75 |
|
76 |
-
with open('topics', 'rb') as f:
|
77 |
-
topics = pickle.load(f)
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
|
82 |
global_tuning=True,
|
83 |
evolution_tuning=True,
|
84 |
nr_bins=20)
|
@@ -116,34 +94,16 @@ podcast_per_year.rename(columns ={'index' :'year', 'year' : 'nb_podcast'}, inpla
|
|
116 |
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
|
117 |
fig.update_traces(textposition="bottom right")
|
118 |
fig.update_layout(autosize=False, width=800)
|
119 |
-
# pie chart of the number of podcasts per year
|
120 |
-
#fig1 = px.pie(data_frame=podcast_per_year, values = 'nb_podcast', names= 'year', hole=.4)
|
121 |
-
fig1 = go.Figure(data=[go.Pie(labels=podcast_per_year['year'],
|
122 |
-
values=podcast_per_year['nb_podcast'], pull=[0, 0, 0.2, 0, 0, 0])])
|
123 |
-
#fig1.update_layout(autosize=False, width=800)
|
124 |
-
fig1.update_traces(textposition='inside', textinfo='value+label')
|
125 |
|
126 |
st.write('''
|
127 |
# Nous sommes la moyenne des personnes que nous fréquentons.
|
128 |
Hello''')
|
129 |
|
130 |
-
st.header('Nombre de podcasts par année')
|
131 |
-
|
132 |
-
st.plotly_chart(fig1, use_container_width=False,
|
133 |
-
sharing="streamlit")
|
134 |
-
|
135 |
st.header('Durée moyenne des podcasts par année')
|
136 |
st.plotly_chart(fig, use_container_width=False,
|
137 |
sharing="streamlit")
|
138 |
#word cloud of all terms
|
139 |
st.header('Les mots fréquemment utilisés dans le podcast')
|
140 |
-
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
|
141 |
-
wordcloud = WordCloud(background_color='white').generate(text_cloud)
|
142 |
-
fig, ax = plt.subplots()
|
143 |
-
ax.imshow(wordcloud, interpolation='bilinear')
|
144 |
-
plt.axis("off")
|
145 |
-
plt.show()
|
146 |
-
st.pyplot(fig)
|
147 |
#show topics
|
148 |
st.header('Sujets évoqués dans le podcast')
|
149 |
st.plotly_chart(topic_fig, use_container_width=False,
|
|
|
8 |
import nltk
|
9 |
import plotly.express as px
|
10 |
import plotly.graph_objects as go
|
11 |
+
import json
|
12 |
import warnings
|
13 |
+
from datetime import datetime
|
14 |
nltk.download('stopwords')
|
|
|
15 |
nlp = spacy.load("fr_core_news_sm")
|
|
|
16 |
|
17 |
warnings.filterwarnings('ignore')
|
18 |
from nltk import FreqDist
|
|
|
39 |
|
40 |
df_clean = clean_data(df)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
# Parts of speech to remove: 'ADV' refers to adverbs, 'ADJ' refers to adjectives
|
43 |
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB'] #list of part of speech to be removed
|
44 |
# Some words frequently used in the podcast
|
45 |
+
context = ['ouais', 'épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
|
46 |
'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
|
47 |
+
'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait','abonnez', 'parce',
|
48 |
+
'ouai', 'sai', 'it', 'do', 'mets', 'yourself','si', 'chose','oui', 'truc', 'dessus', 'traite',
|
49 |
+
'that'] # add some frequent words in stopword
|
50 |
+
with open('./clean_docs.json', 'r') as f:
|
51 |
+
clean_text = json.load(f)
|
52 |
|
53 |
+
docs = clean_text['text']
|
|
|
54 |
#load the model
|
55 |
+
topic_model = BERTopic.load("./model_dir/")
|
56 |
|
|
|
|
|
57 |
|
58 |
+
timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
|
59 |
+
topics_over_time = topic_model.topics_over_time(docs, timestamps,
|
|
|
60 |
global_tuning=True,
|
61 |
evolution_tuning=True,
|
62 |
nr_bins=20)
|
|
|
94 |
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
|
95 |
fig.update_traces(textposition="bottom right")
|
96 |
fig.update_layout(autosize=False, width=800)
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
st.write('''
|
99 |
# Nous sommes la moyenne des personnes que nous fréquentons.
|
100 |
Hello''')
|
101 |
|
|
|
|
|
|
|
|
|
|
|
102 |
st.header('Durée moyenne des podcasts par année')
|
103 |
st.plotly_chart(fig, use_container_width=False,
|
104 |
sharing="streamlit")
|
105 |
#word cloud of all terms
|
106 |
st.header('Les mots fréquemment utilisés dans le podcast')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
#show topics
|
108 |
st.header('Sujets évoqués dans le podcast')
|
109 |
st.plotly_chart(topic_fig, use_container_width=False,
|