add app
- app.py +160 -0
- gdiy_data.csv +0 -0
- requirements.txt +198 -0
app.py
ADDED
@@ -0,0 +1,160 @@
import warnings

import streamlit as st
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle  # imported but never used below
from bertopic import BERTopic
from nltk import FreqDist  # imported but never used below
from nltk.corpus import stopwords
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

nlp = spacy.load("fr_core_news_sm")
stopword = stopwords.words('french')
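# Note (assumption, not in the original commit): `stopwords.words('french')`
# above raises LookupError on a machine where the NLTK stopword corpus has
# never been downloaded. A guard such as the following, placed before that
# call, keeps the app self-contained:
#
#     import nltk
#     try:
#         nltk.data.find('corpora/stopwords')
#     except LookupError:
#         nltk.download('stopwords')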

df = pd.read_csv("gdiy_data.csv", sep=',',
                 parse_dates=['release_date'])  # parse `release_date` as a datetime column

def clean_data(df):
    df = df.drop('Unnamed: 0', axis=1)
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')
    # drop excerpt and rebroadcast episodes
    df = df[~df['name'].str.startswith(('[EXTRAIT]', '[REDIFF]'))]
    df.loc[:, 'duration_min'] = df['duration_ms'].apply(
        lambda row: row / (60 * 1000))  # convert the duration from ms to minutes
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df

df_clean = clean_data(df)

def clean_up1(row: str, stopword, pos=None):
    r"""Clean one episode description:
    - strip the '\xa0' and '\u200a' space characters
    - keep only alphabetic tokens longer than two letters
    - lemmatize with spaCy, dropping stopwords and, if `pos` is given,
      tokens whose part-of-speech tag is listed in `pos`."""

    texts = row.replace('\xa0', '')
    texts = texts.replace('\u200a', '')
    text_ = " ".join([token for token in texts.split() if token.isalpha() and len(token) > 2])
    texts = nlp(text_)
    if pos is not None:
        list_tokens = [token.lemma_ for token in texts
                       if token.lemma_ not in stopword and token.pos_ not in pos]
    else:
        list_tokens = [token.lemma_ for token in texts if token.lemma_ not in stopword]

    return list_tokens

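# For illustration (hypothetical call, not from the dataset): with the
# `stopword` and `pos` lists defined below, a sentence like
# "il recommande deux livres sur la création et les entreprises" keeps only
# alphabetic tokens longer than two letters, lemmatizes them, and should come
# out roughly as ['création', 'entreprise'] -- everything else is a stopword,
# a filtered part of speech, or a corpus-specific word from `context`.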

pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

# frequent corpus-specific words to ignore on top of the NLTK stopwords
context = ['épisode', 'faire', 'morgan', 'prudhomme', 'lire', 'génération', 'podcast', 'gdiy',
           'recommande', 'deux', 'quand', 'the', 'livre', 'être', 'yourself', 'orso', 'doi', 'an',
           'merci', 'avoir', 'timeline', 'face', 'million', 'monde', 'vie', 'and', 'fait']
stopword = stopword + context

clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
docs = clean_text.apply(lambda x: " ".join(x)).tolist()

topic_model = BERTopic(language="multilingual",
                       nr_topics=6,
                       top_n_words=30,
                       low_memory=True,
                       n_gram_range=(1, 2))

topics, _ = topic_model.fit_transform(docs)

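# A possible refinement (assumption, not part of the original commit):
# Streamlit re-executes the whole script on every interaction, so the
# BERTopic fit above is repeated on each page load. On the Streamlit
# versions of this era the expensive step could be wrapped in a cached
# function, e.g.:
#
#     @st.cache(allow_output_mutation=True)
#     def fit_topics(docs):
#         model = BERTopic(language="multilingual", nr_topics=6,
#                          top_n_words=30, low_memory=True,
#                          n_gram_range=(1, 2))
#         topics, _ = model.fit_transform(docs)
#         return model, topics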

topic_fig = topic_model.visualize_barchart(n_words=10)

timestamps = df_clean.index
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]  # drop the outlier topic
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))

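# `topic_per_year` is a Series indexed by year whose values concatenate the
# topic words of every time bin falling in that year; the per-year word
# clouds at the bottom of the page are generated from these strings.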

fig1, ax = plt.subplots()
sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis')
# plt.ylabel('Nombre de podcasts')

def wordscloud(text: str):
    """Render a word cloud for `text` in the Streamlit app."""
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    st.pyplot(fig)

data = df_clean.resample('Y')['duration_min'].mean()
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")

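# `data` holds the mean episode duration in minutes per calendar year
# (year-end resampling); `text=data.astype('int')` labels each marker with
# the truncated value.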

st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')

st.header('Nombre de podcasts par année')

st.write(fig1)

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
                sharing="streamlit")

st.header('Les mots fréquemment utilisés dans le podcast')
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
wordscloud(text_cloud)

st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
                sharing="streamlit")

st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
                sharing="streamlit")

for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    text = topic_per_year[year].replace(',', "")
    wordscloud(text)
gdiy_data.csv
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,198 @@
aiohttp==3.8.1
aiosignal==1.2.0
altair==4.2.0
anyio==3.5.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
async-timeout==4.0.2
attrs==21.4.0
Babel==2.9.1
backcall==0.2.0
bertopic==0.11.0
black==22.1.0
bleach==4.1.0
blinker==1.5
blis==0.7.6
bpemb==0.3.3
cachetools==5.2.0
catalogue==2.0.7
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.11
click==8.0.3
cloudpickle==2.1.0
colorama==0.4.4
commonmark==0.9.1
conllu==4.5.1
cycler==0.11.0
cymem==2.0.6
Cython==0.29.23
datasets==2.2.2
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.13
dill==0.3.4
docopt==0.6.2
entrypoints==0.4
executing==0.8.2
fastjsonschema==2.16.1
filelock==3.6.0
flair==0.11.3
fonttools==4.29.1
fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl
frozenlist==1.3.0
fsspec==2022.5.0
ftfy==6.1.1
funcy==1.17
future==0.18.2
gdown==4.4.0
gitdb==4.0.9
GitPython==3.1.27
hdbscan==0.8.28
huggingface-hub==0.4.0
hyperopt==0.2.7
idna==3.3
importlib-metadata==3.10.1
ipykernel==6.9.0
ipython==8.0.1
ipython-genutils==0.2.0
ipywidgets==7.7.0
Janome==0.4.2
jedi==0.18.1
Jinja2==3.0.3
joblib==1.1.0
Js2Py==0.71
json5==0.9.6
jsonschema==4.4.0
kiwisolver==1.3.2
konoha==4.6.5
langcodes==3.3.0
langdetect==1.0.9
llvmlite==0.38.1
lxml==4.8.0
MarkupSafe==2.0.1
matplotlib==3.5.1
matplotlib-inline==0.1.3
mistune==0.8.4
more-itertools==8.13.0
mpld3==0.3
multidict==6.0.2
multiprocess==0.70.12.2
murmurhash==1.0.6
mypy-extensions==0.4.3
nest-asyncio==1.5.4
networkx==2.8.4
nltk==3.7
numba==0.55.2
numexpr==2.8.1
numpy==1.22.4
overrides==3.1.0
packaging==21.3
pandas==1.4.0
pandoc==2.0.1
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.9.0
pathy==0.6.1
pickleshare==0.7.5
Pillow==9.0.1
pipwin==0.5.2
platformdirs==2.4.1
plotly==5.9.0
plumbum==1.7.2
ply==3.11
pptree==3.1
preshed==3.0.6
prometheus-client==0.13.1
prompt-toolkit==3.0.27
protobuf==3.20.1
pure-eval==0.2.2
py4j==0.10.9.5
pyarrow==8.0.0
pycparser==2.21
pydantic==1.8.2
pydeck==0.7.1
Pygments==2.11.2
pyjsparser==2.7.1
pylibscrypt==2.0.0
pymongo==4.0.2
Pympler==1.0.1
pynndescent==0.5.7
pyparsing==3.0.7
PyPrind==2.11.3
pyrsistent==0.18.1
pySmartDL==1.3.4
PySocks==1.7.1
python-dateutil==2.8.2
pytz==2021.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==5.4.1
pyzmq==22.3.0
redis==4.3.4
regex==2022.3.15
requests==2.27.1
responses==0.18.0
rich==12.5.1
sacremoses==0.0.49
scikit-learn==1.0.2
scipy==1.8.0
scrypt==0.8.20
seaborn==0.11.2
segtok==1.5.11
semver==2.13.0
Send2Trash==1.8.0
sentence-transformers==2.2.2
sentencepiece==0.1.95
seqeval==1.2.2
six==1.16.0
sklearn==0.0
smart-open==5.2.1
smmap==5.0.0
sniffio==1.2.0
soupsieve==2.3.1
spacy==3.2.3
spacy-legacy==3.0.9
spacy-loggers==1.0.1
spotipy==2.20.0
sqlitedict==2.0.0
srsly==2.4.2
stack-data==0.1.4
tabulate==0.8.10
tenacity==8.0.1
terminado==0.13.1
testpath==0.5.0
thinc==8.0.15
threadpoolctl==3.1.0
tinycss2==1.1.1
tokenizers==0.12.0
toml==0.10.2
tomli==2.0.1
toolz==0.12.0
torch==1.11.0
torchtext==0.12.0
torchvision==0.12.0
tornado==6.1
tqdm==4.63.0
traitlets==5.1.1
transformers==4.17.0
typer==0.4.0
typing_extensions==4.0.1
tzdata==2022.1
tzlocal==4.2
umap-learn==0.5.3
urllib3==1.26.8
validators==0.20.0
voluptuous==0.13.1
wasabi==0.9.0
watchdog==2.1.9
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.2.3
widgetsnbextension==3.6.0
wordcloud==1.8.2.2
wrapt==1.14.1
xxhash==3.0.0
yarl==1.7.2
zipp==3.8.1