Spaces:

rjadr
/

ditaduranuncamais_explorer

App Files Files Community

rjadr committed on Jul 22, 2023

Commit

e94d0a2

•

1 Parent(s): b751266

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -79

app.py CHANGED Viewed

@@ -27,6 +27,8 @@ from bokeh.plotting import figure
 from bokeh.models import ColumnDataSource
 from datetime import datetime
 model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
 @st.cache_data(show_spinner=True)
@@ -778,98 +780,161 @@ elif selected_menu_option == "Clustering":
         min_samples = None
     elif clustering_algo == "HDBSCAN":
         st.sidebar.markdown("### HDBSCAN Options")
-        min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://github.com/scikit-learn-contrib/hdbscan/blob/master/docs/parameter_selection.rst)", 2, 200, 5)
-        min_samples = st.sidebar.slider("Minimum samples", 2, 50, 5)
         n_clusters = None
     if dim_reduction == "UMAP":
         st.sidebar.markdown("### UMAP Options")
-        n_components = st.sidebar.slider("Number of dimensions", 2, 80, 50)
-        n_neighbors = st.sidebar.slider("Number of neighbors", 2, 20, 15)
-        min_dist = st.sidebar.slider("Minimum distance", 0.0, 1.0, 0.0)
     else:
         st.sidebar.markdown("### PCA Options")
-        n_components = st.sidebar.slider("Number of dimensions", 2, 80, 2)
         n_neighbors = None
         min_dist = None
-    if st.sidebar.button('Run clustering'):
-        st.markdown("### Clustering Results")
-        if type_embeddings == "Text":
-            embeddings = dataset['txt_embs']
-        elif type_embeddings == "Image":
-            embeddings = dataset['img_embs']
-        # Cluster embeddings
-        labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
-        st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
-        df_clustered = df.copy()
-        df_clustered['cluster'] = labels
-        df_clustered = df_clustered.set_index('cluster').reset_index()
-        st.dataframe(
-            data=filter_dataframe(df_clustered),
-        # use_container_width=True,
-            column_config={
-                "image": st.column_config.ImageColumn(
-                    "Image", help="Instagram image"
-                ),
-                "URL": st.column_config.LinkColumn(
-                    "Link", help="Instagram link", width="small"
-                )
-            },
-            hide_index=True,
-        )
-        st.download_button(
-            "Download dataset with labels",
-            df_clustered.to_csv(index=False).encode('utf-8'),
-            f'ditaduranuncamais_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv',
-            "text/csv",
-            key='download-csv'
-        )
-        st.markdown("### Cluster Plot")
-        # Plot the scatter plot in plotly with the cluster labels as colors reduce further to 2 dimensions if n_components > 2
-        if n_components > 2:
-            reducer = umap.UMAP(n_components=2, random_state=42)
-            reduced_embeddings = reducer.fit_transform(reduced_embeddings)
-            # set the labels to be the cluster labels dynamically
-        # visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
-        descriptions = df_clustered['Description'].tolist()
-        images = df_clustered['image'].tolist()
-        glasbey_colors = cc.glasbey_hv
-        color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
-        colors = [color_dict[label] for label in labels]
-        source = ColumnDataSource(data=dict(
-            x=reduced_embeddings[:, 0],
-            y=reduced_embeddings[:, 1],
-            desc=descriptions,
-            imgs=images,
-            colors=colors
-        ))
-        TOOLTIPS = """
             <div>
-                <div>
-                    <img
-                        src="@imgs" height="100" alt="@imgs" width="100"
-                        style="float: left; margin: 0px 15px 15px 0px;"
-                        border="2"
-                    ></img>
-                </div>
-                <div>
-                    <span style="font-size: 12px; font-weight: bold;">@desc</span>
-                </div>
             </div>
-        """
-        p = figure(width=800, height=800, tooltips=TOOLTIPS,
-                title="Mouse over the dots")
-        p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
-        st.bokeh_chart(p)
 elif selected_menu_option == "Stats":

 from bokeh.models import ColumnDataSource
 from datetime import datetime
+#st.set_page_config(layout="wide")
 model_dir = "./models/sbert.net_models_sentence-transformers_clip-ViT-B-32-multilingual-v1"
 @st.cache_data(show_spinner=True)
         min_samples = None
     elif clustering_algo == "HDBSCAN":
         st.sidebar.markdown("### HDBSCAN Options")
+        min_cluster_size = st.sidebar.slider("[Minimum cluster size](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-cluster-size)", 2, 200, 5)
+        min_samples = st.sidebar.slider("[Minimum samples](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#selecting-min-samples)", 2, 50, 5)
         n_clusters = None
     if dim_reduction == "UMAP":
         st.sidebar.markdown("### UMAP Options")
+        n_components = st.sidebar.slider("[Number of components](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-components)", 2, 80, 50)
+        n_neighbors = st.sidebar.slider("[Number of neighbors](https://umap-learn.readthedocs.io/en/latest/parameters.html#n-neighbors)", 2, 20, 15)
+        min_dist = st.sidebar.slider("[Minimum distance](https://umap-learn.readthedocs.io/en/latest/parameters.html#min-dist)", 0.0, 1.0, 0.0)
     else:
         st.sidebar.markdown("### PCA Options")
+        n_components = st.sidebar.slider("[Number of components](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)", 2, 80, 2)
         n_neighbors = None
         min_dist = None
+    st.markdown("### Clustering Results")
+    if type_embeddings == "Text":
+        embeddings = dataset['txt_embs']
+    elif type_embeddings == "Image":
+        embeddings = dataset['img_embs']
+    # Cluster embeddings
+    labels, reduced_embeddings = cluster_embeddings(embeddings, clustering_algo=clustering_algo, dim_reduction=dim_reduction, n_clusters=n_clusters, min_cluster_size=min_cluster_size, n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
+    st.markdown(f"Clustering {type_embeddings} embeddings using {clustering_algo} with {dim_reduction} dimensionality reduction method resulting in **{len(set(labels))}** clusters.")
+    df_clustered = df.copy()
+    df_clustered['cluster'] = labels
+    df_clustered = df_clustered.set_index('cluster').reset_index()
+    st.dataframe(
+        data=filter_dataframe(df_clustered),
+    # use_container_width=True,
+        column_config={
+            "image": st.column_config.ImageColumn(
+                "Image", help="Instagram image"
+            ),
+            "URL": st.column_config.LinkColumn(
+                "Link", help="Instagram link", width="small"
+            )
+        },
+        hide_index=True,
+    )
+    st.download_button(
+        "Download dataset with labels",
+        df_clustered.to_csv(index=False).encode('utf-8'),
+        f'ditaduranuncamais_{datetime.now().strftime("%Y%m%d-%H%M%S")}.csv',
+        "text/csv",
+        key='download-csv'
+    )
+    st.markdown("### Cluster Plot")
+    # Plot the clusters as a bokeh scatter plot with the cluster labels as colors; reduce further to 2 dimensions if n_components > 2
+    if n_components > 2:
+        reducer = umap.UMAP(n_components=2, random_state=42)
+        reduced_embeddings = reducer.fit_transform(reduced_embeddings)
+        # set the labels to be the cluster labels dynamically
+    # visualise with bokeh showing df_clustered['Description'] and df_clustered['image'] on hover
+    descriptions = df_clustered['Description'].tolist()
+    images = df_clustered['image'].tolist()
+    glasbey_colors = cc.glasbey_hv
+    color_dict = {n: rgb2hex(glasbey_colors[i % len(glasbey_colors)]) for i, n in enumerate(set(labels))}
+    colors = [color_dict[label] for label in labels]
+    source = ColumnDataSource(data=dict(
+        x=reduced_embeddings[:, 0],
+        y=reduced_embeddings[:, 1],
+        desc=descriptions,
+        imgs=images,
+        colors=colors
+    ))
+    TOOLTIPS = """
+        <div>
             <div>
+                <img
+                    src="@imgs" height="100" alt="@imgs" width="100"
+                    style="float: left; margin: 0px 15px 15px 0px;"
+                    border="2"
+                ></img>
             </div>
+            <div>
+                <span style="font-size: 12px; font-weight: bold;">@desc</span>
+            </div>
+        </div>
+    """
+    p = figure(width=800, height=800, tooltips=TOOLTIPS,
+            title="Mouse over the dots")
+    p.circle('x', 'y', size=10, source=source, color='colors', line_color=None)
+    st.bokeh_chart(p)
+    # Insert a time series graph for clusters sorted by size (excluding cluster -1; show the top 5 by default, but include a selectbox; reuse resample_dict for binning)
+    st.markdown("### Cluster Size")
+    cluster_sizes = df_clustered.groupby('cluster').size().reset_index(name='counts')
+    cluster_sizes = cluster_sizes.sort_values(by='counts', ascending=False)
+    cluster_sizes = cluster_sizes[cluster_sizes['cluster'] != -1]
+    cluster_sizes = cluster_sizes.set_index('cluster').reset_index()
+    cluster_sizes = cluster_sizes.rename(columns={'cluster': 'Cluster', 'counts': 'Size'})
+    st.dataframe(cluster_sizes)
+    st.markdown("### Cluster Time Series")
+    # Dropdown to select variables
+    variable = st.selectbox('Select Variable', ['Likes', 'Comments', 'Followers at Posting', 'Total Interactions'])
+    # Map the time-resampling labels shown in the dropdown to the frequency strings passed to resample()
+    resample_dict = {
+        'Day': 'D',
+        'Three Days': '3D',
+        'Week': 'W',
+        'Two Weeks': '2W',
+        'Month': 'M',
+        'Quarter': 'Q',
+        'Year': 'Y'
+    }
+    # Dropdown to select time resampling
+    resample_time = st.selectbox('Select Time Resampling', list(resample_dict.keys()))
+    # Slider for date range selection
+    min_date = df_clustered['Post Created'].min().date()
+    max_date = df_clustered['Post Created'].max().date()
+    date_range = st.slider('Select Date Range', min_value=min_date, max_value=max_date, value=(min_date, max_date))
+    # Filter dataframe based on selected date range
+    df_resampled = df_clustered[(df_clustered['Post Created'].dt.date >= date_range[0]) & (df_clustered['Post Created'].dt.date <= date_range[1])]
+    df_resampled = df_resampled.set_index('Post Created')
+    # Get unique clusters and their sizes
+    cluster_sizes = df_resampled[df_resampled['cluster'] != -1]['cluster'].value_counts()
+    clusters = cluster_sizes.index
+    # Select the largest 5 clusters by default
+    default_clusters = cluster_sizes.sort_values(ascending=False).head(5).index.tolist()
+    # Multiselect widget to choose clusters
+    selected_clusters = st.multiselect('Select Clusters', options=clusters.tolist(), default=default_clusters)
+    # Create a new DataFrame for the plot
+    df_plot = pd.DataFrame()
+    # Loop through selected clusters
+    for cluster in selected_clusters:
+        # Create a separate DataFrame for each cluster, resample and add to the plot DataFrame
+        df_cluster = df_resampled[df_resampled['cluster'] == cluster][variable].resample(resample_dict[resample_time]).sum()
+        df_plot = pd.concat([df_plot, df_cluster], axis=1)
+    # Add legend (use cluster numbers as legend)
+    df_plot.columns = selected_clusters
+    # Create the line chart
+    st.line_chart(df_plot)
 elif selected_menu_option == "Stats":