rjadr committed on
Commit
e94d0a2
1 Parent(s): b751266

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -79
app.py CHANGED
@@ -27,6 +27,8 @@ from bokeh.plotting import figure
27
  from bokeh.models import ColumnDataSource
28
  from datetime import datetime
29
 
 
 
30
  model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
31
 
32
  @st.cache_data(show_spinner=True)
@@ -778,98 +780,161 @@ elif selected_menu_option == "Clustering":
778
  min_samples = None
779
  elif clustering_algo == "HDBSCAN":
780
  st.sidebar.markdown("### HDBSCAN Options")
781
- min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://github.com/scikit-learn-contrib/hdbscan/blob/master/docs/parameter_selection.rst)", 2, 200, 5)
782
- min_samples = st.sidebar.slider("Minimum samples", 2, 50, 5)
783
  n_clusters = None
784
  if dim_reduction == "UMAP":
785
  st.sidebar.markdown("### UMAP Options")
786
- n_components = st.sidebar.slider("Number of dimensions", 2, 80, 50)
787
- n_neighbors = st.sidebar.slider("Number of neighbors", 2, 20, 15)
788
- min_dist = st.sidebar.slider("Minimum distance", 0.0, 1.0, 0.0)
789
  else:
790
  st.sidebar.markdown("### PCA Options")
791
- n_components = st.sidebar.slider("Number of dimensions", 2, 80, 2)
792
  n_neighbors = None
793
  min_dist = None
794
 
795
- if st.sidebar.button('Run clustering'):
796
- st.markdown("### Clustering Results")
797
- if type_embeddings == "Text":
798
- embeddings = dataset['txt_embs']
799
- elif type_embeddings == "Image":
800
- embeddings = dataset['img_embs']
801
-
802
- # Cluster embeddings
803
- labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
804
- st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
805
-
806
- df_clustered = df.copy()
807
- df_clustered['cluster'] = labels
808
- df_clustered = df_clustered.set_index('cluster').reset_index()
809
- st.dataframe(
810
- data=filter_dataframe(df_clustered),
811
- # use_container_width=True,
812
- column_config={
813
- "image": st.column_config.ImageColumn(
814
- "Image", help="Instagram image"
815
- ),
816
- "URL": st.column_config.LinkColumn(
817
- "Link", help="Instagram link", width="small"
818
- )
819
- },
820
- hide_index=True,
821
- )
822
 
823
- st.download_button(
824
- "Download dataset with labels",
825
- df_clustered.to_csv(index=False).encode('utf-8'),
826
- f'ditaduranuncamais_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv',
827
- "text/csv",
828
- key='download-csv'
829
- )
830
-
831
- st.markdown("### Cluster Plot")
832
- # Plot the scatter plot in plotly with the cluster labels as colors reduce further to 2 dimensions if n_components > 2
833
- if n_components > 2:
834
- reducer = umap.UMAP(n_components=2, random_state=42)
835
- reduced_embeddings = reducer.fit_transform(reduced_embeddings)
836
- # set the labels to be the cluster labels dynamically
837
-
838
- # visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
839
- descriptions = df_clustered['Description'].tolist()
840
- images = df_clustered['image'].tolist()
841
- glasbey_colors = cc.glasbey_hv
842
- color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
843
- colors = [color_dict[label] for label in labels]
844
-
845
- source = ColumnDataSource(data=dict(
846
- x=reduced_embeddings[:, 0],
847
- y=reduced_embeddings[:, 1],
848
- desc=descriptions,
849
- imgs=images,
850
- colors=colors
851
- ))
852
-
853
- TOOLTIPS = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
854
  <div>
855
- <div>
856
- <img
857
- src="@imgs" height="100" alt="@imgs" width="100"
858
- style="float: left; margin: 0px 15px 15px 0px;"
859
- border="2"
860
- ></img>
861
- </div>
862
- <div>
863
- <span style="font-size: 12px; font-weight: bold;">@desc</span>
864
- </div>
865
  </div>
866
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
867
 
868
- p = figure(width=800, height=800, tooltips=TOOLTIPS,
869
- title="Mouse over the dots")
870
 
871
- p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
872
- st.bokeh_chart(p)
873
 
874
 
875
  elif selected_menu_option == "Stats":
 
27
  from bokeh.models import ColumnDataSource
28
  from datetime import datetime
29
 
30
+ #st.set_page_config(layout="wide")
31
+
32
  model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
33
 
34
  @st.cache_data(show_spinner=True)
 
780
  min_samples = None
781
  elif clustering_algo == "HDBSCAN":
782
  st.sidebar.markdown("### HDBSCAN Options")
783
+ min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-cluster-size)", 2, 200, 5)
784
+ min_samples = st.sidebar.slider("[Minimum samples](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-samples)", 2, 50, 5)
785
  n_clusters = None
786
  if dim_reduction == "UMAP":
787
  st.sidebar.markdown("### UMAP Options")
788
+ n_components = st.sidebar.slider("[Number of components](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-components)", 2, 80, 50)
789
+ n_neighbors = st.sidebar.slider("[Number of neighbors](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-neighbors)", 2, 20, 15)
790
+ min_dist = st.sidebar.slider("[Minimum distance](https://umap-learn.readthedocs.io/en/latest/parameters.html#min-dist)", 0.0, 1.0, 0.0)
791
  else:
792
  st.sidebar.markdown("### PCA Options")
793
+ n_components = st.sidebar.slider("[Number of components](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)", 2, 80, 2)
794
  n_neighbors = None
795
  min_dist = None
796
 
797
+ st.markdown("### Clustering Results")
798
+ if type_embeddings == "Text":
799
+ embeddings = dataset['txt_embs']
800
+ elif type_embeddings == "Image":
801
+ embeddings = dataset['img_embs']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
 
803
+ # Cluster embeddings
804
+ labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
805
+ st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
806
+
807
+ df_clustered = df.copy()
808
+ df_clustered['cluster'] = labels
809
+ df_clustered = df_clustered.set_index('cluster').reset_index()
810
+ st.dataframe(
811
+ data=filter_dataframe(df_clustered),
812
+ # use_container_width=True,
813
+ column_config={
814
+ "image": st.column_config.ImageColumn(
815
+ "Image", help="Instagram image"
816
+ ),
817
+ "URL": st.column_config.LinkColumn(
818
+ "Link", help="Instagram link", width="small"
819
+ )
820
+ },
821
+ hide_index=True,
822
+ )
823
+
824
+ st.download_button(
825
+ "Download dataset with labels",
826
+ df_clustered.to_csv(index=False).encode('utf-8'),
827
+ f'ditaduranuncamais_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv',
828
+ "text/csv",
829
+ key='download-csv'
830
+ )
831
+
832
+ st.markdown("### Cluster Plot")
833
+ # Plot the scatter plot in bokeh with the cluster labels as colors; reduce further to 2 dimensions if n_components > 2
834
+ if n_components > 2:
835
+ reducer = umap.UMAP(n_components=2, random_state=42)
836
+ reduced_embeddings = reducer.fit_transform(reduced_embeddings)
837
+ # set the labels to be the cluster labels dynamically
838
+
839
+ # visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
840
+ descriptions = df_clustered['Description'].tolist()
841
+ images = df_clustered['image'].tolist()
842
+ glasbey_colors = cc.glasbey_hv
843
+ color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
844
+ colors = [color_dict[label] for label in labels]
845
+
846
+ source = ColumnDataSource(data=dict(
847
+ x=reduced_embeddings[:, 0],
848
+ y=reduced_embeddings[:, 1],
849
+ desc=descriptions,
850
+ imgs=images,
851
+ colors=colors
852
+ ))
853
+
854
+ TOOLTIPS = """
855
+ <div>
856
  <div>
857
+ <img
858
+ src="@imgs" height="100" alt="@imgs" width="100"
859
+ style="float: left; margin: 0px 15px 15px 0px;"
860
+ border="2"
861
+ ></img>
 
 
 
 
 
862
  </div>
863
+ <div>
864
+ <span style="font-size: 12px; font-weight: bold;">@desc</span>
865
+ </div>
866
+ </div>
867
+ """
868
+
869
+ p = figure(width=800, height=800, tooltips=TOOLTIPS,
870
+ title="Mouse over the dots")
871
+
872
+ p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
873
+ st.bokeh_chart(p)
874
+
875
+ # Insert time series graph for clusters sorted by size (except cluster -1); show the top 5 by default, but include a selectbox. Reuse resample_dict for binning.
876
+ st.markdown("### Cluster Size")
877
+ cluster_sizes = df_clustered.groupby('cluster').size().reset_index(name='counts')
878
+ cluster_sizes = cluster_sizes.sort_values(by='counts', ascending=False)
879
+ cluster_sizes = cluster_sizes[cluster_sizes['cluster'] != -1]
880
+ cluster_sizes = cluster_sizes.set_index('cluster').reset_index()
881
+ cluster_sizes = cluster_sizes.rename(columns={'cluster': 'Cluster', 'counts': 'Size'})
882
+ st.dataframe(cluster_sizes)
883
+
884
+ st.markdown("### Cluster Time Series")
885
+
886
+ # Dropdown to select variables
887
+ variable = st.selectbox('Select Variable', ['Likes', 'Comments', 'Followers at Posting', 'Total Interactions'])
888
+
889
+ # Map the resampling labels offered in the dropdown to the frequency strings passed to .resample()
890
+ resample_dict = {
891
+ 'Day': 'D',
892
+ 'Three Days': '3D',
893
+ 'Week': 'W',
894
+ 'Two Weeks': '2W',
895
+ 'Month': 'M',
896
+ 'Quarter': 'Q',
897
+ 'Year': 'Y'
898
+ }
899
+
900
+ # Dropdown to select time resampling
901
+ resample_time = st.selectbox('Select Time Resampling', list(resample_dict.keys()))
902
+
903
+ # Slider for date range selection
904
+ min_date = df_clustered['Post Created'].min().date()
905
+ max_date = df_clustered['Post Created'].max().date()
906
+
907
+ date_range = st.slider('Select Date Range', min_value=min_date, max_value=max_date, value=(min_date, max_date))
908
+
909
+ # Filter dataframe based on selected date range
910
+ df_resampled = df_clustered[(df_clustered['Post Created'].dt.date >= date_range[0]) & (df_clustered['Post Created'].dt.date <= date_range[1])]
911
+ df_resampled = df_resampled.set_index('Post Created')
912
+
913
+ # Get unique clusters and their sizes
914
+ cluster_sizes = df_resampled[df_resampled['cluster'] != -1]['cluster'].value_counts()
915
+ clusters = cluster_sizes.index
916
+
917
+ # Select the largest 5 clusters by default
918
+ default_clusters = cluster_sizes.sort_values(ascending=False).head(5).index.tolist()
919
+
920
+ # Multiselect widget to choose clusters
921
+ selected_clusters = st.multiselect('Select Clusters', options=clusters.tolist(), default=default_clusters)
922
+
923
+ # Create a new DataFrame for the plot
924
+ df_plot = pd.DataFrame()
925
+
926
+ # Loop through selected clusters
927
+ for cluster in selected_clusters:
928
+ # Create a separate DataFrame for each cluster, resample and add to the plot DataFrame
929
+ df_cluster = df_resampled[df_resampled['cluster'] == cluster][variable].resample(resample_dict[resample_time]).sum()
930
+ df_plot = pd.concat([df_plot, df_cluster], axis=1)
931
+
932
+ # Add legend (use cluster numbers as legend)
933
+ df_plot.columns = selected_clusters
934
 
935
+ # Create the line chart
936
+ st.line_chart(df_plot)
937
 
 
 
938
 
939
 
940
  elif selected_menu_option == "Stats":