Spaces:

madoss
/

gdiy

Runtime error

App Files Files Community

gdiy / app.py

madoss

app

f2b56a9 about 2 years ago

raw

history blame

6.57 kB

	import streamlit as st
	import pandas as pd
	import spacy
	import matplotlib.pyplot as plt
	import seaborn as sns
	from bertopic import BERTopic
	from wordcloud import WordCloud
	import nltk
	import plotly.express as px
	import plotly.graph_objects as go
	import pickle
	import warnings
	# Download the NLTK stop-word corpus that `stopwords.words('french')` reads below.
	nltk.download('stopwords')
	from nltk.corpus import stopwords
	# French spaCy pipeline; `clean_up1` calls it to obtain lemmas and part-of-speech tags.
	nlp = spacy.load("fr_core_news_sm")
	# Base French stop-word list; it is extended further down with podcast-specific words.
	stopword = stopwords.words('french')

	# Silence every warning category for the rest of the script.
	warnings.filterwarnings('ignore')
	from nltk import FreqDist

	# Raw episode table read from the CSV that ships with the app.
	df = pd.read_csv("gdiy_data.csv", sep=',',
	parse_dates=['release_date']) # parse `release_date` as datetime so it can serve as a time index


	def clean_data(df):
	    """Prepare the raw episode table for analysis.

	    The input frame is not modified; a new frame is returned.

	    Args:
	        df: pandas DataFrame with at least the columns ``name``,
	            ``description``, ``duration_ms`` and a datetime
	            ``release_date``. A leftover ``Unnamed: 0`` column (written by
	            ``to_csv`` when the index is saved) is dropped if present.

	    Returns:
	        pandas DataFrame indexed by ``release_date`` with lower-cased
	        descriptions, without the ``[EXTRAIT]`` / ``[REDIFF]`` episodes, and
	        with the extra columns ``duration_min``, ``year`` and ``month``.
	    """
	    skipped_prefixes = ('[EXTRAIT]', '[REDIFF]')
	    ms_per_minute = 60 * 1000

	    # errors='ignore' keeps the function usable on a CSV saved without the
	    # spurious index column.
	    df = df.drop(columns='Unnamed: 0', errors='ignore')
	    df['description'] = df['description'].str.lower()
	    df = df.set_index('release_date')

	    # Iterate over the values directly: integer `Series[i]` lookups on a
	    # datetime index rely on a positional fallback that pandas has deprecated.
	    keep = [not name.startswith(skipped_prefixes) for name in df['name']]
	    # .copy() so the column assignments below act on an independent frame
	    # rather than on a filtered view of the previous one.
	    df = df.loc[keep].copy()

	    df['duration_min'] = df['duration_ms'] / ms_per_minute
	    df['year'] = df.index.year
	    df['month'] = df.index.month
	    return df


	# Cleaned, datetime-indexed episode table used by the statistics and figures below.
	df_clean = clean_data(df)


	def clean_up1(row: str, stopword, pos=None):
	    """Tokenize, filter and lemmatize one episode description.

	    Steps:
	    - treat the no-break space (``\\xa0``) and the hair space (``\\u200a``) as
	      ordinary word separators;
	    - keep only purely alphabetic words longer than two characters;
	    - lemmatize what is left with the module-level spaCy pipeline ``nlp``;
	    - drop lemmas found in ``stopword`` and, when ``pos`` is given, tokens
	      whose ``pos_`` tag is listed in ``pos``.

	    Args:
	        row: description text. Anything that is not a string (for example the
	            NaN pandas uses for an empty CSV cell) yields an empty list.
	        stopword: iterable of lemmas to discard.
	        pos: optional iterable of part-of-speech tags to discard; ``None``
	            keeps every tag.

	    Returns:
	        List of the remaining lemmas, in document order.
	    """
	    if not isinstance(row, str):
	        return []

	    # Replace the special spaces with a plain space instead of deleting them:
	    # deleting glues the neighbouring words together ("quoi\xa0?" -> "quoi?"),
	    # and the glued token is then rejected by the isalpha() filter.
	    text = row.replace('\xa0', ' ').replace('\u200a', ' ')
	    words = [word for word in text.split() if word.isalpha() and len(word) > 2]
	    doc = nlp(" ".join(words))

	    # Sets give constant-time membership tests; the inputs are usually lists.
	    banned_lemmas = set(stopword)
	    banned_pos = set(pos) if pos is not None else set()

	    return [
	        token.lemma_
	        for token in doc
	        if token.lemma_ not in banned_lemmas and token.pos_ not in banned_pos
	    ]

	# Part-of-speech tags whose tokens are discarded during cleaning
	# (e.g. 'ADV' = adverb, 'ADJ' = adjective).
	pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

	# Words that come up in the podcast so often that they are treated as stop words.
	context = [
	    'épisode', 'faire', 'morgan', 'prudhomme', 'lire', 'génération', 'podcast', 'gdiy',
	    'recommande', 'deux', 'quand', 'the', 'livre', 'être', 'yourself', 'orso', 'doi', 'an',
	    'merci', 'avoir', 'timeline', 'face', 'million', 'monde', 'vie', 'and', 'fait',
	]
	# Build a new list holding the base stop words followed by the frequent words.
	stopword = [*stopword, *context]

	# One list of lemmas per episode description ...
	clean_text = df_clean['description'].apply(clean_up1, args=(stopword, pos))
	# ... and the same content as one space-joined string per episode.
	docs = [" ".join(tokens) for tokens in clean_text]
	# Restore the BERTopic model stored in `bertopic.pkl`.
	topic_model = BERTopic.load("bertopic.pkl")

	# Per-document topic assignments, unpickled from the `topics` file.
	# NOTE: pickle can execute code while loading; only ship a trusted `topics` file.
	with open('topics', 'rb') as f:
	    topics = pickle.load(f)

	# Topic representation over time, using the release dates as timestamps.
	timestamps = df_clean.index
	topics_over_time = topic_model.topics_over_time(
	    docs,
	    topics,
	    timestamps,
	    global_tuning=True,
	    evolution_tuning=True,
	    nr_bins=20,
	)

	# Figure of the topics over time (top 10 topics), legend hidden, fixed size.
	time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
	time_fig.update_layout(showlegend=False)
	time_fig.update_layout(autosize=False, width=800, height=400)

	# Drop the rows whose topic id is -1, index by timestamp, then concatenate
	# the topic words of each year into a single string.
	topics_over_time = topics_over_time[topics_over_time['Topic'] != -1].set_index('Timestamp')
	topics_over_time['year'] = topics_over_time.index.year
	topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(
	    lambda words: words.str.cat(sep=' ')
	)

	# Bar chart showing 10 words per topic.
	topic_fig = topic_model.visualize_barchart(n_words=10)
	topic_fig.update_layout(autosize=False, width=800)


	def wordscloud(text: str):
	    """Render a word cloud of ``text`` in the Streamlit page.

	    Args:
	        text: the words to draw, as a single string.

	    Returns:
	        None. The figure is handed to ``st.pyplot`` and then closed.
	    """
	    word_cloud = WordCloud(background_color='white').generate(text)
	    fig, ax = plt.subplots()
	    ax.imshow(word_cloud, interpolation='bilinear')
	    # Act on this figure's axes rather than on pyplot's implicit current axes.
	    ax.axis("off")
	    st.pyplot(fig)
	    # The script calls this function several times per run; close the figure
	    # once Streamlit has rendered it so pyplot does not keep it alive.
	    plt.close(fig)


	# Mean episode duration (in minutes) per calendar year.
	data = df_clean.resample('Y')['duration_min'].mean()

	# Number of episodes per year, as a two-column frame (`year`, `nb_podcast`).
	# Naming the index and the value column explicitly keeps the result the same
	# whatever default names `value_counts()` / `reset_index()` produce in the
	# installed pandas version.
	podcast_per_year = (
	    df_clean['year']
	    .value_counts()
	    .rename_axis('year')
	    .reset_index(name='nb_podcast')
	)

	# Line chart of the mean duration per year; point labels are the integer part.
	fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
	fig.update_traces(textposition="bottom right")
	fig.update_layout(autosize=False, width=800)

	# Pie chart of the number of episodes per year. The third slice is pulled
	# out; the pull list is sized from the data instead of assuming six years.
	fig1 = go.Figure(data=[go.Pie(
	    labels=podcast_per_year['year'],
	    values=podcast_per_year['nb_podcast'],
	    pull=[0.2 if position == 2 else 0 for position in range(len(podcast_per_year))],
	)])
	fig1.update_traces(textposition='inside', textinfo='value+label')

	# Page title and introduction (markdown).
	st.write('''
	# Nous sommes la moyenne des personnes que nous fréquentons.
	Hello''')

	# `sharing="streamlit"` is omitted from the plotly_chart calls: it is the
	# default mode, and newer Streamlit releases deprecate the parameter.
	st.header('Nombre de podcasts par année')
	st.plotly_chart(fig1, use_container_width=False)

	st.header('Durée moyenne des podcasts par année')
	st.plotly_chart(fig, use_container_width=False)

	# Word cloud of all cleaned terms, drawn with the same helper as the
	# per-year clouds instead of a duplicated inline copy of its body.
	st.header('Les mots fréquemment utilisés dans le podcast')
	text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
	wordscloud(text_cloud)

	# Words of each topic.
	st.header('Sujets évoqués dans le podcast')
	st.plotly_chart(topic_fig, use_container_width=False)

	# Topics over time.
	st.header('Sujets évoqués au cours du temps dans le podcast')
	st.plotly_chart(time_fig, use_container_width=False)
	# One word cloud per year found in `topic_per_year`. Looping over the years
	# present in the data replaces six copy-pasted stanzas, avoids a KeyError
	# when a year has no topic words, and fixes the headers, which spelled the
	# year with a capital letter O ("2O17") instead of a zero.
	for year, words in topic_per_year.items():
	    st.header(f'Sujets en {year}')
	    # Strip the commas before drawing the cloud.
	    text = words.replace(',', "")
	    wordscloud(text)