import json
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from bertopic import BERTopic
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

# Load the episode metadata and parse `release_date` as a datetime column.
df = pd.read_csv("gdiy_data.csv", sep=',', parse_dates=['release_date'])


def clean_data(df):
    '''Clean the raw episode DataFrame.

    Args:
        df: raw pd.DataFrame read from gdiy_data.csv
    Returns:
        cleaned pd.DataFrame indexed by release date
    '''
    df = df.drop('Unnamed: 0', axis=1)
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')
    # Remove [EXTRAIT] and [REDIFF] episodes (excerpts and re-runs).
    df = df[~df['name'].str.startswith(('[EXTRAIT]', '[REDIFF]'))]
    # Convert the duration from milliseconds to minutes.
    df['duration_min'] = df['duration_ms'] / (60 * 1000)
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df


df_clean = clean_data(df)

# Parts of speech removed during preprocessing ('ADV' = adverb, 'ADJ' = adjective, ...).
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

# Extra stopwords: words used frequently in the podcast context.
context = ['ouais', 'épisode', 'faire', 'morgan', 'prudhomme', 'lire', 'génération',
           'podcast', 'gdiy', 'recommande', 'deux', 'quand', 'the', 'livre', 'être',
           'yourself', 'orso', 'doi', 'an', 'merci', 'avoir', 'timeline', 'face',
           'million', 'monde', 'vie', 'and', 'fait', 'abonnez', 'parce', 'ouai',
           'sai', 'it', 'do', 'mets', 'si', 'chose', 'oui', 'truc',
           'dessus', 'traite', 'that']
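# ---------------------------------------------------------------------------
# NOTE: `pos` and `context` are not used directly below; they document how the
# pre-cleaned corpus in clean_docs.json was presumably produced. The helper
# below is a minimal, hypothetical sketch of that step, assuming spaCy's
# French model (fr_core_news_sm); it is not called by this app.
# ---------------------------------------------------------------------------
def preprocess_descriptions(descriptions):
    '''Hypothetical sketch: lemmatize French descriptions with spaCy, dropping
    the POS tags listed in `pos` and the stopwords listed in `context`.
    Assumes the fr_core_news_sm model is installed. Not called in this app.'''
    import spacy  # local import so the dashboard runs without spaCy installed
    nlp = spacy.load("fr_core_news_sm")
    cleaned = []
    for doc in nlp.pipe(descriptions):
        tokens = [tok.lemma_.lower() for tok in doc
                  if tok.pos_ not in pos and tok.lemma_.lower() not in context]
        cleaned.append(" ".join(tokens))
    return cleaned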
# Load the pre-cleaned episode descriptions and their release dates.
with open('./clean_docs.json', 'r') as f:
    clean_text = json.load(f)
docs = clean_text['text']

# Load the trained BERTopic model.
topic_model = BERTopic.load("./model_dir/")

timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
topics_over_time = topic_model.topics_over_time(docs, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

# Visualize topics over time.
time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
time_fig.update(layout_showlegend=False)
time_fig.update_layout(autosize=False, width=800, height=400)

# Group the topic words per year (drop the outlier topic -1).
topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))

# Bar chart of the top words per topic.
topic_fig = topic_model.visualize_barchart(n_words=10)
topic_fig.update_layout(autosize=False, width=800)


def wordscloud(text: str):
    '''Render a word cloud of the given text in the Streamlit app.

    Args:
        text: a single string of space-separated words
    Returns:
        None (the matplotlib figure is drawn with st.pyplot)
    '''
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)


# Average episode duration per year.
data = df_clean.resample('Y')['duration_min'].mean()

# Number of episodes per year (computed here but not displayed yet).
podcast_per_year = df_clean['year'].value_counts().reset_index()
podcast_per_year.rename(columns={'index': 'year', 'year': 'nb_podcast'}, inplace=True)

# Line chart of the average duration per year.
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
fig.update_layout(autosize=False, width=800)

st.write('''# Nous sommes la moyenne des personnes que nous fréquentons.''')

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False, sharing="streamlit")

# Word cloud of all terms.
st.header('Les mots fréquemment utilisés dans le podcast')

# Topics discussed in the podcast.
st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False, sharing="streamlit")

# Topics over the years.
st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False, sharing="streamlit")

# Word clouds of the topic words used each year (2017-2022).
for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    text = topic_per_year[year].replace(',', "")
    wordscloud(text)
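# ---------------------------------------------------------------------------
# Usage note: launch the dashboard locally with Streamlit's CLI, e.g.
#   streamlit run app.py
# ("app.py" is a placeholder -- substitute the actual name of this script).
# ---------------------------------------------------------------------------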