Spaces:

HUBioDataLab
/

ProtHGT

Running

ProtHGT / visualize_kg.py

Erva Ulusoy

updated edge coloring, modified edge labels to more clear ones

67e78f6 4 months ago

10.9 kB

	from pyvis.network import Network
	import os

	NODE_TYPE_COLORS = {
	'Disease': '#079dbb',
	'HPO': '#58d0e8',
	'Drug': '#815ac0',
	'Compound': '#d2b7e5',
	'Domain': '#6bbf59',
	'GO_term_P': '#ff8800',
	'GO_term_F': '#ffaa00',
	'GO_term_C': '#ffc300',
	'Pathway': '#720026',
	'kegg_Pathway': '#720026',
	'EC_number': '#ce4257',
	'Protein': '#3aa6a4'
	}

	EDGE_LABEL_TRANSLATION = {
	'Orthology': 'is ortholog to',
	'Pathway': 'takes part in',
	'kegg_path_prot': 'takes part in',
	'protein_domain': 'has',
	'PPI': 'interacts with',
	'HPO': 'is associated with',
	'kegg_dis_prot': 'is related to',
	'Disease': 'is related to',
	'Drug': 'targets',
	'protein_ec': 'catalyzes',
	'Chembl': 'targets',
	('protein_function', 'GO_term_F'): 'enables',
	('protein_function', 'GO_term_P'): 'is involved in',
	('protein_function', 'GO_term_C'): 'localizes to',
	}

	GO_CATEGORY_MAPPING = {
	'Biological Process': 'GO_term_P',
	'Molecular Function': 'GO_term_F',
	'Cellular Component': 'GO_term_C'
	}

	def _gather_protein_edges(data, protein_id):

	protein_idx = data['Protein']['id_mapping'][protein_id]
	reverse_id_mapping = {}
	for node_type in data.node_types:
	reverse_id_mapping[node_type] = {v:k for k, v in data[node_type]['id_mapping'].items()}

	protein_edges = {}

	print(f'Gathering edges for {protein_id}...')

	for edge_type in data.edge_types:
	if 'rev' not in edge_type[1]:
	if edge_type not in protein_edges:
	protein_edges[edge_type] = []
	if edge_type[0] == 'Protein':
	print(f'Gathering edges for {edge_type}...')
	# append the edges with protein_idx as source node
	edges = data[edge_type].edge_index[:, data[edge_type].edge_index[0] == protein_idx]
	protein_edges[edge_type].extend(edges.T.tolist())
	elif edge_type[2] == 'Protein':
	print(f'Gathering edges for {edge_type}...')
	# append the edges with protein_idx as target node
	edges = data[edge_type].edge_index[:, data[edge_type].edge_index[1] == protein_idx]
	protein_edges[edge_type].extend(edges.T.tolist())

	for edge_type in protein_edges.keys():
	if protein_edges[edge_type]:
	mapped_edges = set()
	for edge in protein_edges[edge_type]:
	# Get source and target node types from edge_type
	source_type, _, target_type = edge_type
	# Map indices back to original IDs
	source_id = reverse_id_mapping[source_type][edge[0]]
	target_id = reverse_id_mapping[target_type][edge[1]]
	mapped_edges.add((source_id, target_id))
	protein_edges[edge_type] = mapped_edges

	return protein_edges

	def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
	filtered_edges = {}

	prediction_categories = prediction_df['GO_category'].unique()
	prediction_categories = [GO_CATEGORY_MAPPING[category] for category in prediction_categories]
	go_category_reverse_mapping = {v:k for k, v in GO_CATEGORY_MAPPING.items()}

	for edge_type, edges in protein_edges.items():
	# Skip if edges is empty
	if edges is None or len(edges) == 0:
	continue

	if edge_type[2].startswith('GO_term'): # Check if it's any GO term edge
	if edge_type[2] in prediction_categories:
	# Handle edges for GO terms that are in prediction_df
	category_mask = (prediction_df['GO_category'] == go_category_reverse_mapping[edge_type[2]]) & (prediction_df['UniProt_ID'] == protein_id)
	category_predictions = prediction_df[category_mask]

	if len(category_predictions) > 0:
	category_predictions = category_predictions.sort_values(by='Probability', ascending=False)
	edges_set = set(edges) # Convert to set for O(1) lookup

	valid_edges = []
	for _, row in category_predictions.iterrows():
	term = row['GO_ID']
	prob = row['Probability']
	edge = (protein_id, term)
	is_ground_truth = edge in edges_set
	valid_edges.append((edge, prob, is_ground_truth))
	if len(valid_edges) >= limit:
	break
	filtered_edges[edge_type] = valid_edges
	else:
	# If no predictions but it's a GO category in prediction_df
	filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
	else:
	# For GO terms not in prediction_df, mark them as ground truth with blue color
	filtered_edges[edge_type] = [(edge, 'no_pred', True) for edge in list(edges)[:limit]]
	else:
	# For non-GO edges, include all edges up to limit
	filtered_edges[edge_type] = [(edge, None, True) for edge in list(edges)[:limit]]

	return filtered_edges


	def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10):
	protein_edges = _gather_protein_edges(data, protein_id)
	visualized_edges = _filter_edges(protein_id, protein_edges, prediction_df, limit)
	print(f'Edges to be visualized: {visualized_edges}')

	net = Network(height="600px", width="100%", directed=True, notebook=False)

	# Create groups configuration from NODE_TYPE_COLORS
	groups_config = {}
	for node_type, color in NODE_TYPE_COLORS.items():
	groups_config[node_type] = {
	"color": {"background": color, "border": color}
	}

	# Convert groups_config to a JSON-compatible string
	import json
	groups_json = json.dumps(groups_config)

	# Configure physics options with settings for better clustering
	net.set_options("""{
	"physics": {
	"enabled": true,
	"barnesHut": {
	"gravitationalConstant": -1000,
	"springLength": 250,
	"springConstant": 0.001,
	"damping": 0.09,
	"avoidOverlap": 0
	},
	"forceAtlas2Based": {
	"gravitationalConstant": -50,
	"centralGravity": 0.01,
	"springLength": 100,
	"springConstant": 0.08,
	"damping": 0.4,
	"avoidOverlap": 0
	},
	"solver": "barnesHut",
	"stabilization": {
	"enabled": true,
	"iterations": 1000,
	"updateInterval": 25
	}
	},
	"layout": {
	"improvedLayout": true,
	"hierarchical": {
	"enabled": false
	}
	},
	"interaction": {
	"hover": true,
	"navigationButtons": true,
	"multiselect": true
	},
	"configure": {
	"enabled": true,
	"filter": ["physics", "layout", "manipulation"],
	"showButton": true
	},
	"groups": """ + groups_json + "}")

	# Add the main protein node
	net.add_node(protein_id,
	label=f"Protein: {protein_id}",
	color={'background': 'white', 'border': '#c1121f'},
	borderWidth=4,
	shape="dot",
	font={'color': '#000000', 'size': 15},
	group='Protein',
	size=30,
	mass=2.5)

	# Track added nodes to avoid duplication
	added_nodes = {protein_id}

	# Add edges and target nodes
	for edge_type, edges in visualized_edges.items():
	source_type, relation_type, target_type = edge_type

	if relation_type == 'protein_function':
	relation_type = EDGE_LABEL_TRANSLATION[(relation_type, target_type)]
	else:
	relation_type = EDGE_LABEL_TRANSLATION[relation_type]

	for edge_info in edges:
	edge, probability, is_ground_truth = edge_info
	source, target = edge[0], edge[1]
	source_str = str(source)
	target_str = str(target)

	# Add source node if not present
	if source_str not in added_nodes:
	net.add_node(source_str,
	label=f"{source_str}",
	shape="dot",
	font={'color': '#000000', 'size': 12},
	title=f"{source_type}: {source_str}",
	group=source_type,
	size=15,
	mass=1.5)
	added_nodes.add(source_str)

	# Add target node if not present
	if target_str not in added_nodes:
	net.add_node(target_str,
	label=f"{target_str}",
	shape="dot",
	font={'color': '#000000', 'size': 12},
	title=f"{target_type}: {target_str}",
	group=target_type,
	size=15,
	mass=1.5)
	added_nodes.add(target_str)

	# Add edge with relationship type and probability as label
	edge_label = f"{relation_type}"
	if probability is not None:
	if probability == 'no_pred':
	edge_color = '#219ebc'
	edge_label += ' (P=Not generated)'
	else:
	edge_label += f" (P={probability:.2f})"
	edge_color = '#8338ec' if is_ground_truth else '#c1121f'
	# if validated prediction purple, if non-validated prediction red, if no prediction (directly from database) blue
	net.add_edge(source_str, target_str,
	label=edge_label,
	font={'size': 0},
	color=edge_color,
	title=edge_label,
	length=200,
	smooth={'type': 'curvedCW', 'roundness': 0.1})
	else:
	net.add_edge(source_str, target_str,
	label=edge_label,
	font={'size': 0},
	color='#666666', # Keep default gray for non-GO edges
	title=edge_label,
	length=200,
	smooth={'type': 'curvedCW', 'roundness': 0.1})

	# Save graph to a protein-specific file in a temporary directory
	os.makedirs('temp_viz', exist_ok=True)
	file_path = os.path.join('temp_viz', f'{protein_id}_graph.html')
	net.save_graph(file_path)

	return file_path, visualized_edges