Erva Ulusoy committed on
Commit
a1e2231
·
1 Parent(s): 14c3500

added filtering options for second-degree edge visualization

Browse files
Files changed (2) hide show
  1. ProtHGT_app.py +48 -13
  2. visualize_kg.py +26 -8
ProtHGT_app.py CHANGED
@@ -562,18 +562,28 @@ if st.session_state.submitted:
562
  # Create visualizations in each tab
563
  for idx, protein_id in enumerate(selected_proteins):
564
  with protein_tabs[idx]:
565
- col1, col2 = st.columns([3, 1])
566
  with col1:
567
  max_node_count = st.slider(
568
- "Maximum neighbors per edge type",
569
  min_value=5,
570
  max_value=50,
571
  value=10,
572
  step=5,
573
- help="Control the maximum number of neighboring nodes shown for each relationship type",
574
  key=f"slider_{protein_id}"
575
  )
576
-
 
 
 
 
 
 
 
 
 
 
577
  # Check if both visualizations exist for this protein
578
  viz_exists = (protein_id in st.session_state.protein_visualizations and
579
  'first_degree' in st.session_state.protein_visualizations[protein_id] and
@@ -592,6 +602,7 @@ if st.session_state.submitted:
592
  protein_id,
593
  st.session_state.predictions_df,
594
  limit=max_node_count,
 
595
  include_second_degree=False
596
  )
597
 
@@ -601,29 +612,48 @@ if st.session_state.submitted:
601
  protein_id,
602
  st.session_state.predictions_df,
603
  limit=max_node_count,
 
604
  include_second_degree=True
605
  )
606
 
607
  # Store both visualizations in session state
608
  st.session_state.protein_visualizations[protein_id]['first_degree'] = {
609
  'path': html_path_1st,
610
- 'edges': edges_1st
 
 
 
 
611
  }
612
  st.session_state.protein_visualizations[protein_id]['second_degree'] = {
613
  'path': html_path_2nd,
614
- 'edges': edges_2nd
 
 
 
 
615
  }
616
  st.rerun()
617
 
618
  # If visualization exists, show the toggle and display appropriate version
619
  if viz_exists:
620
- with col2:
621
- include_second_degree = st.checkbox(
622
- "Include second-degree edges",
623
- value=False,
624
- key=f"second_degree_{protein_id}",
625
- help="Show connections between neighbor nodes"
626
- )
 
 
 
 
 
 
 
 
 
 
627
 
628
  # Get the appropriate visualization based on checkbox
629
  viz_type = 'second_degree' if include_second_degree else 'first_degree'
@@ -663,6 +693,11 @@ if st.session_state.submitted:
663
  del st.session_state.protein_visualizations[protein_id]
664
  st.rerun()
665
 
 
 
 
 
 
666
  # Display the appropriate visualization
667
  with open(viz_info['path'], 'r', encoding='utf-8') as f:
668
  html_content = f.read()
 
562
  # Create visualizations in each tab
563
  for idx, protein_id in enumerate(selected_proteins):
564
  with protein_tabs[idx]:
565
+ col1, col2 = st.columns([1, 1])
566
  with col1:
567
  max_node_count = st.slider(
568
+ "Maximum neighbors per edge type (first-degree)",
569
  min_value=5,
570
  max_value=50,
571
  value=10,
572
  step=5,
573
+ help="Control the maximum number of direct neighbors of the query protein shown for each relationship type",
574
  key=f"slider_{protein_id}"
575
  )
576
+ with col2:
577
+ second_degree_limit = st.slider(
578
+ "Maximum neighbors per edge type (second-degree)",
579
+ min_value=2,
580
+ max_value=10,
581
+ value=3,
582
+ step=1,
583
+ help="Control the maximum number of second-degree neighbors of the query protein shown for each relationship type. Second-degree edge limit is intentionally kept low to maintain visual clarity. Higher values may make the graph cluttered and difficult to interpret.",
584
+ key=f"second_degree_slider_{protein_id}"
585
+ )
586
+
587
  # Check if both visualizations exist for this protein
588
  viz_exists = (protein_id in st.session_state.protein_visualizations and
589
  'first_degree' in st.session_state.protein_visualizations[protein_id] and
 
602
  protein_id,
603
  st.session_state.predictions_df,
604
  limit=max_node_count,
605
+ second_degree_limit=second_degree_limit,
606
  include_second_degree=False
607
  )
608
 
 
612
  protein_id,
613
  st.session_state.predictions_df,
614
  limit=max_node_count,
615
+ second_degree_limit=second_degree_limit,
616
  include_second_degree=True
617
  )
618
 
619
  # Store both visualizations in session state
620
  st.session_state.protein_visualizations[protein_id]['first_degree'] = {
621
  'path': html_path_1st,
622
+ 'edges': edges_1st,
623
+ 'settings': {
624
+ 'max_node_count': max_node_count,
625
+ 'second_degree_limit': second_degree_limit
626
+ }
627
  }
628
  st.session_state.protein_visualizations[protein_id]['second_degree'] = {
629
  'path': html_path_2nd,
630
+ 'edges': edges_2nd,
631
+ 'settings': {
632
+ 'max_node_count': max_node_count,
633
+ 'second_degree_limit': second_degree_limit
634
+ }
635
  }
636
  st.rerun()
637
 
638
  # If visualization exists, show the toggle and display appropriate version
639
  if viz_exists:
640
+ # Check if settings have changed, but handle cases where settings don't exist
641
+ current_settings = {
642
+ 'max_node_count': max_node_count,
643
+ 'second_degree_limit': second_degree_limit
644
+ }
645
+
646
+ # Safely get stored settings or use None if they don't exist
647
+ stored_settings = (st.session_state.protein_visualizations[protein_id]['first_degree'].get('settings')
648
+ if 'first_degree' in st.session_state.protein_visualizations[protein_id]
649
+ else None)
650
+
651
+ include_second_degree = st.checkbox(
652
+ "Include second-degree edges",
653
+ value=False,
654
+ key=f"second_degree_{protein_id}",
655
+ help="Show connections between neighbor nodes"
656
+ )
657
 
658
  # Get the appropriate visualization based on checkbox
659
  viz_type = 'second_degree' if include_second_degree else 'first_degree'
 
693
  del st.session_state.protein_visualizations[protein_id]
694
  st.rerun()
695
 
696
+ if stored_settings is not None and current_settings != stored_settings:
697
+ st.warning("⚠️ Settings have changed. Click 'Regenerate Visualization' to apply new settings.")
698
+ elif stored_settings is None:
699
+ st.warning("⚠️ Visualization was generated with default settings. Consider regenerating to apply custom settings.")
700
+
701
  # Display the appropriate visualization
702
  with open(viz_info['path'], 'r', encoding='utf-8') as f:
703
  html_content = f.read()
visualize_kg.py CHANGED
@@ -131,9 +131,23 @@ def _gather_protein_edges(data, protein_id):
131
 
132
  return protein_edges
133
 
134
- def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
 
 
 
 
 
 
 
 
 
 
 
135
  filtered_edges = {}
136
 
 
 
 
137
  prediction_categories = prediction_df['GO_category'].unique()
138
  prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
139
  go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
@@ -160,18 +174,18 @@ def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
160
  edge = (protein_id, term)
161
  is_ground_truth = edge in edges_set
162
  valid_edges.append((edge, prob, is_ground_truth))
163
- if len(valid_edges) >= limit:
164
  break
165
  filtered_edges[edge_type] = valid_edges
166
  else:
167
  # If no predictions but it's a GO category in prediction_df
168
- filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
169
  else:
170
  # For GO terms not in prediction_df, mark them as ground truth with blue color
171
- filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
172
  else:
173
  # For non-GO edges, include all edges up to limit
174
- filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:limit]]
175
 
176
  return filtered_edges
177
 
@@ -225,7 +239,8 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
225
 
226
  # Get the first-degree edges and filter them
227
  protein_edges = _gather_protein_edges(data, protein_id)
228
- first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df, limit)
 
229
 
230
  # Initialize all_edges with first degree edges
231
  all_edges = first_degree_edges.copy()
@@ -243,11 +258,14 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
243
  if target != protein_id:
244
  neighbor_nodes.add((target, target_type))
245
 
246
- # Gather and filter second-degree edges
247
  second_degree_edges = {}
248
  for neighbor_id, neighbor_type in neighbor_nodes:
249
  neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
250
- filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df, second_degree_limit)
 
 
 
251
 
252
  # Merge filtered neighbor edges into second_degree_edges
253
  for edge_type, edges in filtered_neighbor_edges.items():
 
131
 
132
  return protein_edges
133
 
134
+ def _filter_edges(protein_id, protein_edges, prediction_df, limit=10, is_second_degree=False, second_degree_limit=3):
135
+ """
136
+ Filter edges based on type and limit
137
+
138
+ Args:
139
+ protein_id: ID of the protein
140
+ protein_edges: Dictionary of edges to filter
141
+ prediction_df: DataFrame containing predictions
142
+ limit: Maximum number of edges to keep for first-degree connections
143
+ is_second_degree: Whether these are second-degree edges
144
+ second_degree_limit: Maximum number of edges to keep for second-degree connections
145
+ """
146
  filtered_edges = {}
147
 
148
+ # Use appropriate limit based on edge degree
149
+ current_limit = second_degree_limit if is_second_degree else limit
150
+
151
  prediction_categories = prediction_df['GO_category'].unique()
152
  prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
153
  go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
 
174
  edge = (protein_id, term)
175
  is_ground_truth = edge in edges_set
176
  valid_edges.append((edge, prob, is_ground_truth))
177
+ if len(valid_edges) >= current_limit:
178
  break
179
  filtered_edges[edge_type] = valid_edges
180
  else:
181
  # If no predictions but it's a GO category in prediction_df
182
+ filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
183
  else:
184
  # For GO terms not in prediction_df, mark them as ground truth with blue color
185
+ filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
186
  else:
187
  # For non-GO edges, include all edges up to limit
188
+ filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:current_limit]]
189
 
190
  return filtered_edges
191
 
 
239
 
240
  # Get the first-degree edges and filter them
241
  protein_edges = _gather_protein_edges(data, protein_id)
242
+ first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
243
+ limit=limit, is_second_degree=False)
244
 
245
  # Initialize all_edges with first degree edges
246
  all_edges = first_degree_edges.copy()
 
258
  if target != protein_id:
259
  neighbor_nodes.add((target, target_type))
260
 
261
+ # Gather and filter second-degree edges with the smaller limit
262
  second_degree_edges = {}
263
  for neighbor_id, neighbor_type in neighbor_nodes:
264
  neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
265
+ filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
266
+ limit=limit,
267
+ is_second_degree=True,
268
+ second_degree_limit=second_degree_limit)
269
 
270
  # Merge filtered neighbor edges into second_degree_edges
271
  for edge_type, edges in filtered_neighbor_edges.items():