Spaces:

HUBioDataLab
/

ProtHGT

Sleeping

App Files Files Community

Erva Ulusoy committed on Mar 21

Commit

a1e2231

1 Parent(s): 14c3500

added filtering options for second-degree edge visualization

Browse files

Files changed (2) hide show

ProtHGT_app.py +48 -13
visualize_kg.py +26 -8

ProtHGT_app.py CHANGED Viewed

@@ -562,18 +562,28 @@ if st.session_state.submitted:
                 # Create visualizations in each tab
                 for idx, protein_id in enumerate(selected_proteins):
                     with protein_tabs[idx]:
-                        col1, col2 = st.columns([3, 1])
                         with col1:
                             max_node_count = st.slider(
-                                "Maximum neighbors per edge type",
                                 min_value=5,
                                 max_value=50,
                                 value=10,
                                 step=5,
-                                help="Control the maximum number of neighboring nodes shown for each relationship type",
                                 key=f"slider_{protein_id}"
                             )
                         # Check if both visualizations exist for this protein
                         viz_exists = (protein_id in st.session_state.protein_visualizations and
                                      'first_degree' in st.session_state.protein_visualizations[protein_id] and
@@ -592,6 +602,7 @@ if st.session_state.submitted:
                                     protein_id,
                                     st.session_state.predictions_df,
                                     limit=max_node_count,
                                     include_second_degree=False
                                 )
@@ -601,29 +612,48 @@ if st.session_state.submitted:
                                     protein_id,
                                     st.session_state.predictions_df,
                                     limit=max_node_count,
                                     include_second_degree=True
                                 )
                                 # Store both visualizations in session state
                                 st.session_state.protein_visualizations[protein_id]['first_degree'] = {
                                     'path': html_path_1st,
-                                    'edges': edges_1st
                                 }
                                 st.session_state.protein_visualizations[protein_id]['second_degree'] = {
                                     'path': html_path_2nd,
-                                    'edges': edges_2nd
                                 }
                                 st.rerun()
                         # If visualization exists, show the toggle and display appropriate version
                         if viz_exists:
-                            with col2:
-                                include_second_degree = st.checkbox(
-                                    "Include second-degree edges",
-                                    value=False,
-                                    key=f"second_degree_{protein_id}",
-                                    help="Show connections between neighbor nodes"
-                                )
                             # Get the appropriate visualization based on checkbox
                             viz_type = 'second_degree' if include_second_degree else 'first_degree'
@@ -663,6 +693,11 @@ if st.session_state.submitted:
                                         del st.session_state.protein_visualizations[protein_id]
                                     st.rerun()
                             # Display the appropriate visualization
                             with open(viz_info['path'], 'r', encoding='utf-8') as f:
                                 html_content = f.read()

                 # Create visualizations in each tab
                 for idx, protein_id in enumerate(selected_proteins):
                     with protein_tabs[idx]:
+                        col1, col2 = st.columns([1, 1])
                         with col1:
                             max_node_count = st.slider(
+                                "Maximum neighbors per edge type (first-degree)",
                                 min_value=5,
                                 max_value=50,
                                 value=10,
                                 step=5,
+                                help="Control the maximum number of direct neighbors of the query protein shown for each relationship type",
                                 key=f"slider_{protein_id}"
                             )
+                        with col2:
+                            second_degree_limit = st.slider(
+                                "Maximum neighbors per edge type (second-degree)",
+                                min_value=2,
+                                max_value=10,
+                                value=3,
+                                step=1,
+                                help="Control the maximum number of second-degree neighbors of the query protein shown for each relationship type. Second-degree edge limit is intentionally kept low to maintain visual clarity. Higher values may make the graph cluttered and difficult to interpret.",
+                                key=f"second_degree_slider_{protein_id}"
+                            )
                         # Check if both visualizations exist for this protein
                         viz_exists = (protein_id in st.session_state.protein_visualizations and
                                      'first_degree' in st.session_state.protein_visualizations[protein_id] and
                                     protein_id,
                                     st.session_state.predictions_df,
                                     limit=max_node_count,
+                                    second_degree_limit=second_degree_limit,
                                     include_second_degree=False
                                 )
                                     protein_id,
                                     st.session_state.predictions_df,
                                     limit=max_node_count,
+                                    second_degree_limit=second_degree_limit,
                                     include_second_degree=True
                                 )
                                 # Store both visualizations in session state
                                 st.session_state.protein_visualizations[protein_id]['first_degree'] = {
                                     'path': html_path_1st,
+                                    'edges': edges_1st,
+                                    'settings': {
+                                        'max_node_count': max_node_count,
+                                        'second_degree_limit': second_degree_limit
+                                    }
                                 }
                                 st.session_state.protein_visualizations[protein_id]['second_degree'] = {
                                     'path': html_path_2nd,
+                                    'edges': edges_2nd,
+                                    'settings': {
+                                        'max_node_count': max_node_count,
+                                        'second_degree_limit': second_degree_limit
+                                    }
                                 }
                                 st.rerun()
                         # If visualization exists, show the toggle and display appropriate version
                         if viz_exists:
+                            # Check if settings have changed, but handle cases where settings don't exist
+                            current_settings = {
+                                'max_node_count': max_node_count,
+                                'second_degree_limit': second_degree_limit
+                            }
+                            # Safely get stored settings or use None if they don't exist
+                            stored_settings = (st.session_state.protein_visualizations[protein_id]['first_degree'].get('settings')
+                                             if 'first_degree' in st.session_state.protein_visualizations[protein_id]
+                                             else None)
+                            include_second_degree = st.checkbox(
+                                "Include second-degree edges",
+                                value=False,
+                                key=f"second_degree_{protein_id}",
+                                help="Show connections between neighbor nodes"
+                            )
                             # Get the appropriate visualization based on checkbox
                             viz_type = 'second_degree' if include_second_degree else 'first_degree'
                                         del st.session_state.protein_visualizations[protein_id]
                                     st.rerun()
+                            if stored_settings is not None and current_settings != stored_settings:
+                                st.warning("⚠️ Settings have changed. Click 'Regenerate Visualization' to apply new settings.")
+                            elif stored_settings is None:
+                                st.warning("⚠️ Visualization was generated with default settings. Consider regenerating to apply custom settings.")
                             # Display the appropriate visualization
                             with open(viz_info['path'], 'r', encoding='utf-8') as f:
                                 html_content = f.read()

visualize_kg.py CHANGED Viewed

@@ -131,9 +131,23 @@ def _gather_protein_edges(data, protein_id):
     return protein_edges
-def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
     filtered_edges = {}
     prediction_categories = prediction_df['GO_category'].unique()
     prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
     go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
@@ -160,18 +174,18 @@ def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
                         edge = (protein_id, term)
                         is_ground_truth = edge in edges_set
                         valid_edges.append((edge, prob, is_ground_truth))
-                        if len(valid_edges) >= limit:
                             break
                     filtered_edges[edge_type] = valid_edges
                 else:
                     # If no predictions but it's a GO category in prediction_df
-                    filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
             else:
                 # For GO terms not in prediction_df, mark them as ground truth with blue color
-                filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
         else:
             # For non-GO edges, include all edges up to limit
-            filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:limit]]
     return filtered_edges
@@ -225,7 +239,8 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
     # Get the first-degree edges and filter them
     protein_edges = _gather_protein_edges(data, protein_id)
-    first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df, limit)
     # Initialize all_edges with first degree edges
     all_edges = first_degree_edges.copy()
@@ -243,11 +258,14 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
                 if target != protein_id:
                     neighbor_nodes.add((target, target_type))
-        # Gather and filter second-degree edges
         second_degree_edges = {}
         for neighbor_id, neighbor_type in neighbor_nodes:
             neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
-            filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df, second_degree_limit)
             # Merge filtered neighbor edges into second_degree_edges
             for edge_type, edges in filtered_neighbor_edges.items():

     return protein_edges
+def _filter_edges(protein_id, protein_edges, prediction_df, limit=10, is_second_degree=False, second_degree_limit=3):
+    """
+    Filter edges based on type and limit
+    Args:
+        protein_id: ID of the protein
+        protein_edges: Dictionary of edges to filter
+        prediction_df: DataFrame containing predictions
+        limit: Maximum number of edges to keep for first-degree connections
+        is_second_degree: Whether these are second-degree edges
+        second_degree_limit: Maximum number of edges to keep for second-degree connections
+    """
     filtered_edges = {}
+    # Use appropriate limit based on edge degree
+    current_limit = second_degree_limit if is_second_degree else limit
     prediction_categories = prediction_df['GO_category'].unique()
     prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
     go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
                         edge = (protein_id, term)
                         is_ground_truth = edge in edges_set
                         valid_edges.append((edge, prob, is_ground_truth))
+                        if len(valid_edges) >= current_limit:
                             break
                     filtered_edges[edge_type] = valid_edges
                 else:
                     # If no predictions but it's a GO category in prediction_df
+                    filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
             else:
                 # For GO terms not in prediction_df, mark them as ground truth with blue color
+                filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
         else:
             # For non-GO edges, include all edges up to limit
+            filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:current_limit]]
     return filtered_edges
     # Get the first-degree edges and filter them
     protein_edges = _gather_protein_edges(data, protein_id)
+    first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
+                                     limit=limit, is_second_degree=False)
     # Initialize all_edges with first degree edges
     all_edges = first_degree_edges.copy()
                 if target != protein_id:
                     neighbor_nodes.add((target, target_type))
+        # Gather and filter second-degree edges with the smaller limit
         second_degree_edges = {}
         for neighbor_id, neighbor_type in neighbor_nodes:
             neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
+            filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
+                                                 limit=limit,
+                                                 is_second_degree=True,
+                                                 second_degree_limit=second_degree_limit)
             # Merge filtered neighbor edges into second_degree_edges
             for edge_type, edges in filtered_neighbor_edges.items():