Upload code

4636dc3 verified about 2 months ago

6.88 kB

	import multiprocessing
	import os
	from tqdm import tqdm
	from sklearn.preprocessing import MultiLabelBinarizer

	from torch_geometric.data import Data
	import torch

	import numpy as np

	from .conversion import convert_nx_to_pyg_data
	from graphein.protein.config import ProteinGraphConfig, DSSPConfig
	from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot, meiler_embedding, expasy_protein_scale, hydrogen_bond_acceptor, hydrogen_bond_donor
	from graphein.protein.features.nodes.dssp import phi, psi, asa, rsa, secondary_structure
	from graphein.protein.edges.distance import (add_peptide_bonds,
	add_hydrogen_bond_interactions,
	add_disulfide_interactions,
	add_ionic_interactions,
	add_delaunay_triangulation,
	add_distance_threshold,
	add_sequence_distance_edges,
	add_k_nn_edges)

	from functools import partial
	from .graphs import *
	from .utils_dataset import *
	import os
	import sys
	import subprocess
	import wget


	class PDB2Graph():
	    """Convert a directory of PDB files into torch geometric graphs.

	    Every ``<name>.pdb`` file found in ``root`` is turned into a
	    ``torch_geometric.data.Data`` object and saved as ``<name>.pt`` in
	    ``output_folder``. Structures whose ``.pt`` file already exists in the
	    output folder are skipped, so an interrupted run can be resumed.

	    Args:
	        root: Directory containing the raw ``.pdb`` files.
	        output_folder: Directory receiving the ``.pt`` files; created if
	            it does not exist.
	        config: Graph construction configuration, forwarded unchanged to
	            ``construct_graph`` (presumably a graphein
	            ``ProteinGraphConfig`` -- confirm against the caller).
	        n_processors: Maximum number of worker processes alive at the
	            same time in :meth:`process`.
	    """

	    # Suffix of the input structures and of the saved graphs.
	    _STRUCTURE_SUFFIX = '.pdb'
	    _GRAPH_SUFFIX = '.pt'

	    # Edge kinds known to the multi-hot edge-type encoder, in column order.
	    _EDGE_TYPES = ('peptide_bond', 'sequence_distance_2', 'sequence_distance_3',
	                   'distance_threshold', 'delaunay', 'hbond', 'k_nn')

	    def __init__(self, root, output_folder, config, n_processors=int(multiprocessing.cpu_count())):
	        self.root = root
	        self.output_folder = output_folder
	        # DSSP secondary-structure codes -> integer class (0 is also the fallback).
	        self.map_secondary_structure = {'-':0, 'H':1, 'B':2, 'E':3, 'G':4, 'I':5, 'T':6, 'S':7}
	        self.init_ohe_edge_type()
	        self.config = config
	        # Node attributes appended, in this order, after the amino-acid index column.
	        self.features = ['phi', 'psi', 'rsa', 'asa', 'ss', 'expasy']
	        self.n_processors = n_processors
	        self.raw_dir = root
	        self.processed_dir = self._processed_dir()
	        self.raw_file_names = self._raw_file_names()
	        self.processed_file_names = self._processed_file_names()

	    def _processed_dir(self):
	        """Create the output folder if needed and return its path."""
	        # exist_ok avoids the check-then-create race of exists() + makedirs().
	        os.makedirs(self.output_folder, exist_ok=True)
	        return self.output_folder

	    def _raw_file_names(self):
	        """Return every directory entry of ``raw_dir`` (unfiltered)."""
	        return os.listdir(self.raw_dir)

	    def _processed_file_names(self):
	        """Return the expected ``.pt`` file name for every raw ``.pdb`` file.

	        Only entries ending in ``.pdb`` are considered, because
	        :meth:`pdb2pathstructure` always appends that exact suffix. The
	        suffix is stripped from the end of the name, so dotted names such
	        as ``2xyz.A.pdb`` keep their full stem (``2xyz.A``).
	        """
	        suffix = self._STRUCTURE_SUFFIX
	        names = []
	        for file_name in self.raw_file_names:
	            if not file_name.endswith(suffix):
	                continue
	            stem = file_name[:-len(suffix)]
	            if stem:  # skip a bare ".pdb" entry, which has no name
	                names.append(self.pdb2pathdata(stem))
	        return names

	    def create_nx_graph(self, path_to_structure):
	        """Build the graph of one structure file with ``construct_graph``."""
	        return construct_graph(self.config, pdb_path = path_to_structure)

	    def create_pyg_graph(self, path_to_structure):
	        """Build the torch geometric ``Data`` object of one structure file.

	        The node feature matrix ``x`` has the argmax index of the amino-acid
	        one-hot encoding as first column, followed by every attribute
	        listed in ``self.features``. ``edge_type`` is the multi-hot
	        encoding of each edge's kinds. No label is attached.

	        Args:
	            path_to_structure: Path to the ``.pdb`` file.

	        Returns:
	            The assembled ``Data`` object.
	        """
	        pyg_graph = convert_nx_to_pyg_data(self.create_nx_graph(path_to_structure))

	        graph = Data(edge_index = pyg_graph.edge_index,
	                     num_nodes = len(pyg_graph.node_id),
	                     node_id = pyg_graph.node_id,
	                     name = pyg_graph.name[0],
	                     # Only the sequence of the first listed chain id is kept.
	                     sequence = getattr(pyg_graph, f"sequence_{pyg_graph.chain_id[0]}"),
	                     distance_matrix = pyg_graph.dist_mat,
	                     distance = pyg_graph.distance,
	                     coordinates = torch.FloatTensor(np.array(pyg_graph.coords[0])))

	        # First feature column: integer amino-acid index.
	        x = np.array([np.argmax(pyg_graph.amino_acid_one_hot, axis=1)]).reshape(-1,1)
	        for feat in self.features:
	            if feat == "ss":
	                # Categorical code -> integer; unknown codes fall back to 0.
	                feature = np.array([[self.map_secondary_structure.get(feat_node, 0)]
	                                    for feat_node in pyg_graph[feat]])
	            else:
	                feature = np.array(pyg_graph[feat])
	            if len(feature.shape) == 1:
	                # Scalar-per-node features become a single column.
	                feature = feature.reshape(-1,1)
	            x = np.concatenate((x, feature), axis = 1)
	        graph.edge_type = self.mlb.transform(pyg_graph.kind)
	        graph.x = torch.FloatTensor(x)
	        return graph

	    def init_ohe_edge_type(self):
	        """Create and fit ``self.mlb``, the multi-hot encoder of edge kinds."""
	        edge_types = list(self._EDGE_TYPES)
	        self.mlb = MultiLabelBinarizer(classes = edge_types)
	        self.mlb.fit([edge_types])

	    def process(self):
	        """Convert the PDB files into torch geometric graphs.

	        Each pending structure is handled in its own process, but at most
	        ``n_processors`` processes are alive at any time: structures are
	        launched in batches and a batch is joined before the next starts.
	        """
	        to_be_processed = self.get_files_to_process()
	        batch_size = max(1, int(self.n_processors))

	        # NOTE(review): plain Process objects are kept instead of a Pool
	        # (a Pool variant was previously disabled here) -- presumably to
	        # avoid pickling the instance; confirm before switching.
	        with tqdm(total=len(to_be_processed)) as progress:
	            for start in range(0, len(to_be_processed), batch_size):
	                workers = [
	                    multiprocessing.Process(target=self.graph_creation, args=(prot,))
	                    for prot in to_be_processed[start:start + batch_size]
	                ]
	                for worker in workers:
	                    worker.start()
	                for worker in workers:
	                    worker.join()
	                    progress.update(1)

	    def graph_creation(self, pdb):
	        """Create the graph of structure ``pdb`` and save it as a ``.pt`` file.

	        Args:
	            pdb: Structure name without the ``.pdb`` suffix.
	        """
	        # Define the path_to_structure from the pdb name file
	        path_to_structure = self.pdb2pathstructure(pdb)

	        # Convert the structure into a graph
	        g = self.create_pyg_graph(path_to_structure)
	        # Save the graph
	        torch.save(g, os.path.join(self.output_folder, self.pdb2pathdata(pdb)))

	        return None

	    def pdb2pathdata(self, pdb):
	        """Return the ``.pt`` file name of structure ``pdb``."""
	        return pdb + self._GRAPH_SUFFIX

	    def pdb2pathstructure(self, pdb):
	        """Return the path of the ``.pdb`` file of structure ``pdb``."""
	        return os.path.join(self.raw_dir, pdb + self._STRUCTURE_SUFFIX)

	    def get_files_to_process(self):
	        """Return the sorted names of structures that have no ``.pt`` file yet.

	        Names are returned without suffix, ready for :meth:`graph_creation`.
	        """
	        already_done = os.listdir(self.processed_dir)
	        pending = set(self.processed_file_names).difference(already_done)
	        # Strip only the trailing ".pt" so dotted stems stay intact; sort
	        # for a deterministic processing order.
	        return sorted(name[:-len(self._GRAPH_SUFFIX)] for name in pending)

	def download_alphafold_structure(
	    uniprot_id: str,
	    out_dir: str,
	    version: int = 4
	):
	    """Download the AlphaFold model of a UniProt entry as a PDB file.

	    The file ``AF-<UNIPROT_ID>-F1-model_v<version>.pdb`` is fetched from
	    the AlphaFold file server into ``out_dir``. If that file is already
	    present in ``out_dir`` nothing is downloaded.

	    Args:
	        uniprot_id: UniProt accession; upper-cased before use.
	        out_dir: Destination directory; created if it does not exist.
	        version: AlphaFold model version used in the file name.

	    Returns:
	        The path of the existing or freshly downloaded file, or ``None``
	        when the download failed (a message is printed in that case).
	    """
	    BASE_URL = "https://alphafold.ebi.ac.uk/files/"
	    uniprot_id = uniprot_id.upper()

	    file_name = f"AF-{uniprot_id}-F1-model_v{version}.pdb"
	    query_url = f"{BASE_URL}{file_name}"
	    structure_filename = os.path.join(out_dir, file_name)
	    if os.path.exists(structure_filename):
	        return structure_filename
	    try:
	        # wget only treats ``out`` as a directory when it already exists;
	        # otherwise it would save the download under the name ``out_dir``.
	        if out_dir:
	            os.makedirs(out_dir, exist_ok=True)
	        structure_filename = wget.download(query_url, out=out_dir)
	    except Exception:
	        # Best effort: report and carry on, but let KeyboardInterrupt and
	        # SystemExit propagate (a bare ``except:`` would swallow them).
	        print('Error.. could not download: ', f"AF-{uniprot_id}-F1-model_v{version}.pdb")
	        return None
	    return structure_filename