Update app.py
Browse files
app.py
CHANGED
@@ -27,6 +27,8 @@ from bokeh.plotting import figure
|
|
27 |
from bokeh.models import ColumnDataSource
|
28 |
from datetime import datetime
|
29 |
|
|
|
|
|
30 |
model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
|
31 |
|
32 |
@st.cache_data(show_spinner=True)
|
@@ -778,98 +780,161 @@ elif selected_menu_option == "Clustering":
|
|
778 |
min_samples = None
|
779 |
elif clustering_algo == "HDBSCAN":
|
780 |
st.sidebar.markdown("### HDBSCAN Options")
|
781 |
-
min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://
|
782 |
-
min_samples = st.sidebar.slider("Minimum samples", 2, 50, 5)
|
783 |
n_clusters = None
|
784 |
if dim_reduction == "UMAP":
|
785 |
st.sidebar.markdown("### UMAP Options")
|
786 |
-
n_components = st.sidebar.slider("Number of
|
787 |
-
n_neighbors = st.sidebar.slider("Number of neighbors", 2, 20, 15)
|
788 |
-
min_dist = st.sidebar.slider("Minimum distance", 0.0, 1.0, 0.0)
|
789 |
else:
|
790 |
st.sidebar.markdown("### PCA Options")
|
791 |
-
n_components = st.sidebar.slider("Number of
|
792 |
n_neighbors = None
|
793 |
min_dist = None
|
794 |
|
795 |
-
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
-
|
800 |
-
embeddings = dataset['img_embs']
|
801 |
-
|
802 |
-
# Cluster embeddings
|
803 |
-
labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
|
804 |
-
st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
|
805 |
-
|
806 |
-
df_clustered = df.copy()
|
807 |
-
df_clustered['cluster'] = labels
|
808 |
-
df_clustered = df_clustered.set_index('cluster').reset_index()
|
809 |
-
st.dataframe(
|
810 |
-
data=filter_dataframe(df_clustered),
|
811 |
-
# use_container_width=True,
|
812 |
-
column_config={
|
813 |
-
"image": st.column_config.ImageColumn(
|
814 |
-
"Image", help="Instagram image"
|
815 |
-
),
|
816 |
-
"URL": st.column_config.LinkColumn(
|
817 |
-
"Link", help="Instagram link", width="small"
|
818 |
-
)
|
819 |
-
},
|
820 |
-
hide_index=True,
|
821 |
-
)
|
822 |
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
|
827 |
-
|
828 |
-
|
829 |
-
|
830 |
-
|
831 |
-
|
832 |
-
|
833 |
-
|
834 |
-
|
835 |
-
|
836 |
-
|
837 |
-
|
838 |
-
|
839 |
-
|
840 |
-
|
841 |
-
|
842 |
-
|
843 |
-
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
-
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
854 |
<div>
|
855 |
-
<
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
|
860 |
-
></img>
|
861 |
-
</div>
|
862 |
-
<div>
|
863 |
-
<span style="font-size: 12px; font-weight: bold;">@desc</span>
|
864 |
-
</div>
|
865 |
</div>
|
866 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
867 |
|
868 |
-
|
869 |
-
|
870 |
|
871 |
-
p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
|
872 |
-
st.bokeh_chart(p)
|
873 |
|
874 |
|
875 |
elif selected_menu_option == "Stats":
|
|
|
27 |
from bokeh.models import ColumnDataSource
|
28 |
from datetime import datetime
|
29 |
|
30 |
+
#st.set_page_config(layout="wide")
|
31 |
+
|
32 |
model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
|
33 |
|
34 |
@st.cache_data(show_spinner=True)
|
|
|
780 |
min_samples = None
|
781 |
elif clustering_algo == "HDBSCAN":
|
782 |
st.sidebar.markdown("### HDBSCAN Options")
|
783 |
+
min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-cluster-size)", 2, 200, 5)
|
784 |
+
min_samples = st.sidebar.slider("[Minimum samples](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-samples)", 2, 50, 5)
|
785 |
n_clusters = None
|
786 |
if dim_reduction == "UMAP":
|
787 |
st.sidebar.markdown("### UMAP Options")
|
788 |
+
n_components = st.sidebar.slider("[Number of components](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-components)", 2, 80, 50)
|
789 |
+
n_neighbors = st.sidebar.slider("[Number of neighbors](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-neighbors)", 2, 20, 15)
|
790 |
+
min_dist = st.sidebar.slider("[Minimum distance](https://umap-learn.readthedocs.io/en/latest/parameters.html#min-dist)", 0.0, 1.0, 0.0)
|
791 |
else:
|
792 |
st.sidebar.markdown("### PCA Options")
|
793 |
+
n_components = st.sidebar.slider("[Number of components](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)", 2, 80, 2)
|
794 |
n_neighbors = None
|
795 |
min_dist = None
|
796 |
|
797 |
+
st.markdown("### Clustering Results")
|
798 |
+
if type_embeddings == "Text":
|
799 |
+
embeddings = dataset['txt_embs']
|
800 |
+
elif type_embeddings == "Image":
|
801 |
+
embeddings = dataset['img_embs']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
802 |
|
803 |
+
# Cluster embeddings
|
804 |
+
labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
|
805 |
+
st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
|
806 |
+
|
807 |
+
df_clustered = df.copy()
|
808 |
+
df_clustered['cluster'] = labels
|
809 |
+
df_clustered = df_clustered.set_index('cluster').reset_index()
|
810 |
+
st.dataframe(
|
811 |
+
data=filter_dataframe(df_clustered),
|
812 |
+
# use_container_width=True,
|
813 |
+
column_config={
|
814 |
+
"image": st.column_config.ImageColumn(
|
815 |
+
"Image", help="Instagram image"
|
816 |
+
),
|
817 |
+
"URL": st.column_config.LinkColumn(
|
818 |
+
"Link", help="Instagram link", width="small"
|
819 |
+
)
|
820 |
+
},
|
821 |
+
hide_index=True,
|
822 |
+
)
|
823 |
+
|
824 |
+
st.download_button(
|
825 |
+
"Download dataset with labels",
|
826 |
+
df_clustered.to_csv(index=False).encode('utf-8'),
|
827 |
+
f'ditaduranuncamais_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv',
|
828 |
+
"text/csv",
|
829 |
+
key='download-csv'
|
830 |
+
)
|
831 |
+
|
832 |
+
st.markdown("### Cluster Plot")
|
833 |
+
# Plot the clusters as a Bokeh scatter plot with the cluster labels as colors; reduce further to 2 dimensions if n_components > 2
|
834 |
+
if n_components > 2:
|
835 |
+
reducer = umap.UMAP(n_components=2, random_state=42)
|
836 |
+
reduced_embeddings = reducer.fit_transform(reduced_embeddings)
|
837 |
+
# set the labels to be the cluster labels dynamically
|
838 |
+
|
839 |
+
# visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
|
840 |
+
descriptions = df_clustered['Description'].tolist()
|
841 |
+
images = df_clustered['image'].tolist()
|
842 |
+
glasbey_colors = cc.glasbey_hv
|
843 |
+
color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
|
844 |
+
colors = [color_dict[label] for label in labels]
|
845 |
+
|
846 |
+
source = ColumnDataSource(data=dict(
|
847 |
+
x=reduced_embeddings[:, 0],
|
848 |
+
y=reduced_embeddings[:, 1],
|
849 |
+
desc=descriptions,
|
850 |
+
imgs=images,
|
851 |
+
colors=colors
|
852 |
+
))
|
853 |
+
|
854 |
+
TOOLTIPS = """
|
855 |
+
<div>
|
856 |
<div>
|
857 |
+
<img
|
858 |
+
src="@imgs" height="100" alt="@imgs" width="100"
|
859 |
+
style="float: left; margin: 0px 15px 15px 0px;"
|
860 |
+
border="2"
|
861 |
+
></img>
|
|
|
|
|
|
|
|
|
|
|
862 |
</div>
|
863 |
+
<div>
|
864 |
+
<span style="font-size: 12px; font-weight: bold;">@desc</span>
|
865 |
+
</div>
|
866 |
+
</div>
|
867 |
+
"""
|
868 |
+
|
869 |
+
p = figure(width=800, height=800, tooltips=TOOLTIPS,
|
870 |
+
title="Mouse over the dots")
|
871 |
+
|
872 |
+
p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
|
873 |
+
st.bokeh_chart(p)
|
874 |
+
|
875 |
+
# Insert a time series graph for clusters sorted by size (except cluster -1): show the top 5 by default, but include a selectbox. Reuse resample_dict for binning.
|
876 |
+
st.markdown("### Cluster Size")
|
877 |
+
cluster_sizes = df_clustered.groupby('cluster').size().reset_index(name='counts')
|
878 |
+
cluster_sizes = cluster_sizes.sort_values(by='counts', ascending=False)
|
879 |
+
cluster_sizes = cluster_sizes[cluster_sizes['cluster'] != -1]
|
880 |
+
cluster_sizes = cluster_sizes.set_index('cluster').reset_index()
|
881 |
+
cluster_sizes = cluster_sizes.rename(columns={'cluster': 'Cluster', 'counts': 'Size'})
|
882 |
+
st.dataframe(cluster_sizes)
|
883 |
+
|
884 |
+
st.markdown("### Cluster Time Series")
|
885 |
+
|
886 |
+
# Dropdown to select variables
|
887 |
+
variable = st.selectbox('Select Variable', ['Likes', 'Comments', 'Followers at Posting', 'Total Interactions'])
|
888 |
+
|
889 |
+
# Map the time-resampling labels offered in the dropdown to the frequency strings passed to .resample()
|
890 |
+
resample_dict = {
|
891 |
+
'Day': 'D',
|
892 |
+
'Three Days': '3D',
|
893 |
+
'Week': 'W',
|
894 |
+
'Two Weeks': '2W',
|
895 |
+
'Month': 'M',
|
896 |
+
'Quarter': 'Q',
|
897 |
+
'Year': 'Y'
|
898 |
+
}
|
899 |
+
|
900 |
+
# Dropdown to select time resampling
|
901 |
+
resample_time = st.selectbox('Select Time Resampling', list(resample_dict.keys()))
|
902 |
+
|
903 |
+
# Slider for date range selection
|
904 |
+
min_date = df_clustered['Post Created'].min().date()
|
905 |
+
max_date = df_clustered['Post Created'].max().date()
|
906 |
+
|
907 |
+
date_range = st.slider('Select Date Range', min_value=min_date, max_value=max_date, value=(min_date, max_date))
|
908 |
+
|
909 |
+
# Filter dataframe based on selected date range
|
910 |
+
df_resampled = df_clustered[(df_clustered['Post Created'].dt.date >= date_range[0]) & (df_clustered['Post Created'].dt.date <= date_range[1])]
|
911 |
+
df_resampled = df_resampled.set_index('Post Created')
|
912 |
+
|
913 |
+
# Get unique clusters and their sizes
|
914 |
+
cluster_sizes = df_resampled[df_resampled['cluster'] != -1]['cluster'].value_counts()
|
915 |
+
clusters = cluster_sizes.index
|
916 |
+
|
917 |
+
# Select the largest 5 clusters by default
|
918 |
+
default_clusters = cluster_sizes.sort_values(ascending=False).head(5).index.tolist()
|
919 |
+
|
920 |
+
# Multiselect widget to choose clusters
|
921 |
+
selected_clusters = st.multiselect('Select Clusters', options=clusters.tolist(), default=default_clusters)
|
922 |
+
|
923 |
+
# Create a new DataFrame for the plot
|
924 |
+
df_plot = pd.DataFrame()
|
925 |
+
|
926 |
+
# Loop through selected clusters
|
927 |
+
for cluster in selected_clusters:
|
928 |
+
# Create a separate DataFrame for each cluster, resample and add to the plot DataFrame
|
929 |
+
df_cluster = df_resampled[df_resampled['cluster'] == cluster][variable].resample(resample_dict[resample_time]).sum()
|
930 |
+
df_plot = pd.concat([df_plot, df_cluster], axis=1)
|
931 |
+
|
932 |
+
# Add legend (use cluster numbers as legend)
|
933 |
+
df_plot.columns = selected_clusters
|
934 |
|
935 |
+
# Create the line chart
|
936 |
+
st.line_chart(df_plot)
|
937 |
|
|
|
|
|
938 |
|
939 |
|
940 |
elif selected_menu_option == "Stats":
|