Spaces:
Sleeping
Sleeping
Erva Ulusoy
committed on
Commit
·
a1e2231
1
Parent(s):
14c3500
added filtering options for second-degree edge visualization
Browse files
- ProtHGT_app.py +48 -13
- visualize_kg.py +26 -8
ProtHGT_app.py
CHANGED
@@ -562,18 +562,28 @@ if st.session_state.submitted:
|
|
562 |
# Create visualizations in each tab
|
563 |
for idx, protein_id in enumerate(selected_proteins):
|
564 |
with protein_tabs[idx]:
|
565 |
-
col1, col2 = st.columns([
|
566 |
with col1:
|
567 |
max_node_count = st.slider(
|
568 |
-
"Maximum neighbors per edge type",
|
569 |
min_value=5,
|
570 |
max_value=50,
|
571 |
value=10,
|
572 |
step=5,
|
573 |
-
help="Control the maximum number of
|
574 |
key=f"slider_{protein_id}"
|
575 |
)
|
576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
577 |
# Check if both visualizations exist for this protein
|
578 |
viz_exists = (protein_id in st.session_state.protein_visualizations and
|
579 |
'first_degree' in st.session_state.protein_visualizations[protein_id] and
|
@@ -592,6 +602,7 @@ if st.session_state.submitted:
|
|
592 |
protein_id,
|
593 |
st.session_state.predictions_df,
|
594 |
limit=max_node_count,
|
|
|
595 |
include_second_degree=False
|
596 |
)
|
597 |
|
@@ -601,29 +612,48 @@ if st.session_state.submitted:
|
|
601 |
protein_id,
|
602 |
st.session_state.predictions_df,
|
603 |
limit=max_node_count,
|
|
|
604 |
include_second_degree=True
|
605 |
)
|
606 |
|
607 |
# Store both visualizations in session state
|
608 |
st.session_state.protein_visualizations[protein_id]['first_degree'] = {
|
609 |
'path': html_path_1st,
|
610 |
-
'edges': edges_1st
|
|
|
|
|
|
|
|
|
611 |
}
|
612 |
st.session_state.protein_visualizations[protein_id]['second_degree'] = {
|
613 |
'path': html_path_2nd,
|
614 |
-
'edges': edges_2nd
|
|
|
|
|
|
|
|
|
615 |
}
|
616 |
st.rerun()
|
617 |
|
618 |
# If visualization exists, show the toggle and display appropriate version
|
619 |
if viz_exists:
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
627 |
|
628 |
# Get the appropriate visualization based on checkbox
|
629 |
viz_type = 'second_degree' if include_second_degree else 'first_degree'
|
@@ -663,6 +693,11 @@ if st.session_state.submitted:
|
|
663 |
del st.session_state.protein_visualizations[protein_id]
|
664 |
st.rerun()
|
665 |
|
|
|
|
|
|
|
|
|
|
|
666 |
# Display the appropriate visualization
|
667 |
with open(viz_info['path'], 'r', encoding='utf-8') as f:
|
668 |
html_content = f.read()
|
|
|
562 |
# Create visualizations in each tab
|
563 |
for idx, protein_id in enumerate(selected_proteins):
|
564 |
with protein_tabs[idx]:
|
565 |
+
col1, col2 = st.columns([1, 1])
|
566 |
with col1:
|
567 |
max_node_count = st.slider(
|
568 |
+
"Maximum neighbors per edge type (first-degree)",
|
569 |
min_value=5,
|
570 |
max_value=50,
|
571 |
value=10,
|
572 |
step=5,
|
573 |
+
help="Control the maximum number of direct neighbors of the query protein shown for each relationship type",
|
574 |
key=f"slider_{protein_id}"
|
575 |
)
|
576 |
+
with col2:
|
577 |
+
second_degree_limit = st.slider(
|
578 |
+
"Maximum neighbors per edge type (second-degree)",
|
579 |
+
min_value=2,
|
580 |
+
max_value=10,
|
581 |
+
value=3,
|
582 |
+
step=1,
|
583 |
+
help="Control the maximum number of second-degree neighbors of the query protein shown for each relationship type. Second-degree edge limit is intentionally kept low to maintain visual clarity. Higher values may make the graph cluttered and difficult to interpret.",
|
584 |
+
key=f"second_degree_slider_{protein_id}"
|
585 |
+
)
|
586 |
+
|
587 |
# Check if both visualizations exist for this protein
|
588 |
viz_exists = (protein_id in st.session_state.protein_visualizations and
|
589 |
'first_degree' in st.session_state.protein_visualizations[protein_id] and
|
|
|
602 |
protein_id,
|
603 |
st.session_state.predictions_df,
|
604 |
limit=max_node_count,
|
605 |
+
second_degree_limit=second_degree_limit,
|
606 |
include_second_degree=False
|
607 |
)
|
608 |
|
|
|
612 |
protein_id,
|
613 |
st.session_state.predictions_df,
|
614 |
limit=max_node_count,
|
615 |
+
second_degree_limit=second_degree_limit,
|
616 |
include_second_degree=True
|
617 |
)
|
618 |
|
619 |
# Store both visualizations in session state
|
620 |
st.session_state.protein_visualizations[protein_id]['first_degree'] = {
|
621 |
'path': html_path_1st,
|
622 |
+
'edges': edges_1st,
|
623 |
+
'settings': {
|
624 |
+
'max_node_count': max_node_count,
|
625 |
+
'second_degree_limit': second_degree_limit
|
626 |
+
}
|
627 |
}
|
628 |
st.session_state.protein_visualizations[protein_id]['second_degree'] = {
|
629 |
'path': html_path_2nd,
|
630 |
+
'edges': edges_2nd,
|
631 |
+
'settings': {
|
632 |
+
'max_node_count': max_node_count,
|
633 |
+
'second_degree_limit': second_degree_limit
|
634 |
+
}
|
635 |
}
|
636 |
st.rerun()
|
637 |
|
638 |
# If visualization exists, show the toggle and display appropriate version
|
639 |
if viz_exists:
|
640 |
+
# Check if settings have changed, but handle cases where settings don't exist
|
641 |
+
current_settings = {
|
642 |
+
'max_node_count': max_node_count,
|
643 |
+
'second_degree_limit': second_degree_limit
|
644 |
+
}
|
645 |
+
|
646 |
+
# Safely get stored settings or use None if they don't exist
|
647 |
+
stored_settings = (st.session_state.protein_visualizations[protein_id]['first_degree'].get('settings')
|
648 |
+
if 'first_degree' in st.session_state.protein_visualizations[protein_id]
|
649 |
+
else None)
|
650 |
+
|
651 |
+
include_second_degree = st.checkbox(
|
652 |
+
"Include second-degree edges",
|
653 |
+
value=False,
|
654 |
+
key=f"second_degree_{protein_id}",
|
655 |
+
help="Show connections between neighbor nodes"
|
656 |
+
)
|
657 |
|
658 |
# Get the appropriate visualization based on checkbox
|
659 |
viz_type = 'second_degree' if include_second_degree else 'first_degree'
|
|
|
693 |
del st.session_state.protein_visualizations[protein_id]
|
694 |
st.rerun()
|
695 |
|
696 |
+
if stored_settings is not None and current_settings != stored_settings:
|
697 |
+
st.warning("⚠️ Settings have changed. Click 'Regenerate Visualization' to apply new settings.")
|
698 |
+
elif stored_settings is None:
|
699 |
+
st.warning("⚠️ Visualization was generated with default settings. Consider regenerating to apply custom settings.")
|
700 |
+
|
701 |
# Display the appropriate visualization
|
702 |
with open(viz_info['path'], 'r', encoding='utf-8') as f:
|
703 |
html_content = f.read()
|
visualize_kg.py
CHANGED
@@ -131,9 +131,23 @@ def _gather_protein_edges(data, protein_id):
|
|
131 |
|
132 |
return protein_edges
|
133 |
|
134 |
-
def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
filtered_edges = {}
|
136 |
|
|
|
|
|
|
|
137 |
prediction_categories = prediction_df['GO_category'].unique()
|
138 |
prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
|
139 |
go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
|
@@ -160,18 +174,18 @@ def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
|
|
160 |
edge = (protein_id, term)
|
161 |
is_ground_truth = edge in edges_set
|
162 |
valid_edges.append((edge, prob, is_ground_truth))
|
163 |
-
if len(valid_edges) >=
|
164 |
break
|
165 |
filtered_edges[edge_type] = valid_edges
|
166 |
else:
|
167 |
# If no predictions but it's a GO category in prediction_df
|
168 |
-
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:
|
169 |
else:
|
170 |
# For GO terms not in prediction_df, mark them as ground truth with blue color
|
171 |
-
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:
|
172 |
else:
|
173 |
# For non-GO edges, include all edges up to limit
|
174 |
-
filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:
|
175 |
|
176 |
return filtered_edges
|
177 |
|
@@ -225,7 +239,8 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
|
|
225 |
|
226 |
# Get the first-degree edges and filter them
|
227 |
protein_edges = _gather_protein_edges(data, protein_id)
|
228 |
-
first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
|
|
|
229 |
|
230 |
# Initialize all_edges with first degree edges
|
231 |
all_edges = first_degree_edges.copy()
|
@@ -243,11 +258,14 @@ def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10, second
|
|
243 |
if target != protein_id:
|
244 |
neighbor_nodes.add((target, target_type))
|
245 |
|
246 |
-
# Gather and filter second-degree edges
|
247 |
second_degree_edges = {}
|
248 |
for neighbor_id, neighbor_type in neighbor_nodes:
|
249 |
neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
|
250 |
-
filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
|
|
|
|
|
|
|
251 |
|
252 |
# Merge filtered neighbor edges into second_degree_edges
|
253 |
for edge_type, edges in filtered_neighbor_edges.items():
|
|
|
131 |
|
132 |
return protein_edges
|
133 |
|
134 |
+
def _filter_edges(protein_id, protein_edges, prediction_df, limit=10, is_second_degree=False, second_degree_limit=3):
|
135 |
+
"""
|
136 |
+
Filter edges based on type and limit
|
137 |
+
|
138 |
+
Args:
|
139 |
+
protein_id: ID of the protein
|
140 |
+
protein_edges: Dictionary of edges to filter
|
141 |
+
prediction_df: DataFrame containing predictions
|
142 |
+
limit: Maximum number of edges to keep for first-degree connections
|
143 |
+
is_second_degree: Whether these are second-degree edges
|
144 |
+
second_degree_limit: Maximum number of edges to keep for second-degree connections
|
145 |
+
"""
|
146 |
filtered_edges = {}
|
147 |
|
148 |
+
# Use appropriate limit based on edge degree
|
149 |
+
current_limit = second_degree_limit if is_second_degree else limit
|
150 |
+
|
151 |
prediction_categories = prediction_df['GO_category'].unique()
|
152 |
prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
|
153 |
go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}
|
|
|
174 |
edge = (protein_id, term)
|
175 |
is_ground_truth = edge in edges_set
|
176 |
valid_edges.append((edge, prob, is_ground_truth))
|
177 |
+
if len(valid_edges) >= current_limit:
|
178 |
break
|
179 |
filtered_edges[edge_type] = valid_edges
|
180 |
else:
|
181 |
# If no predictions but it's a GO category in prediction_df
|
182 |
+
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
|
183 |
else:
|
184 |
# For GO terms not in prediction_df, mark them as ground truth with blue color
|
185 |
+
filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:current_limit]]
|
186 |
else:
|
187 |
# For non-GO edges, include all edges up to limit
|
188 |
+
filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:current_limit]]
|
189 |
|
190 |
return filtered_edges
|
191 |
|
|
|
239 |
|
240 |
# Get the first-degree edges and filter them
|
241 |
protein_edges = _gather_protein_edges(data, protein_id)
|
242 |
+
first_degree_edges = _filter_edges(protein_id, protein_edges, prediction_df,
|
243 |
+
limit=limit, is_second_degree=False)
|
244 |
|
245 |
# Initialize all_edges with first degree edges
|
246 |
all_edges = first_degree_edges.copy()
|
|
|
258 |
if target != protein_id:
|
259 |
neighbor_nodes.add((target, target_type))
|
260 |
|
261 |
+
# Gather and filter second-degree edges with the smaller limit
|
262 |
second_degree_edges = {}
|
263 |
for neighbor_id, neighbor_type in neighbor_nodes:
|
264 |
neighbor_edges = _gather_neighbor_edges(data, neighbor_id, neighbor_type, protein_id)
|
265 |
+
filtered_neighbor_edges = _filter_edges(neighbor_id, neighbor_edges, prediction_df,
|
266 |
+
limit=limit,
|
267 |
+
is_second_degree=True,
|
268 |
+
second_degree_limit=second_degree_limit)
|
269 |
|
270 |
# Merge filtered neighbor edges into second_degree_edges
|
271 |
for edge_type, edges in filtered_neighbor_edges.items():
|