Spaces:

InstaDeepAI
/

folding-studio-demo

Running

App Files Files Community

folding-studio-demo / aggrescan3d /aggrescan /aggrescan_3d.py

jfaustin

secretion-scores (#4)

a3f3d91 verified 8 days ago

raw

history blame contribute delete

12.1 kB

	#!/usr/bin/env python
	# -- coding: utf-8 --

	import os
	import math
	import csv
	from . import logger

	# minimum and maximum surface exposition to be considered
	min_surf = 10
	max_surf = 55
	_name = "agg3D"


	class Residue:
	def __init__(self, chain, resi, resn, coords, rsa, score_matrix, ph):
	if chain == ' ':
	self.chain = '-'
	else:
	self.chain = chain
	self.resi = resi
	self.resn = get_one_letter(resn) # we store it in one-letter code
	self.coords = coords
	self.score_matrix = score_matrix
	self.agg_sup = self.calc_res_agg_fin(rsa=rsa)
	self.agg_fin = 0
	self.rsa = 0
	self.ph = ph
	self.const_ph_scores = {
	"A": 0.66,
	"N": -0.84,
	"C": 1.66,
	"Q": -0.87,
	"G": 0,
	"I": 2.75,
	"L": 1.77,
	"M": 1.30,
	"F": 3.99,
	"P": 1.69,
	"S": -0.99,
	"T": 0.12,
	"W": 3.29,
	"Y": 1.33,
	"V": 1.45
	}

	self.aa_data = {
	'R': {'pKa': 12.5, "R": 0.857546950885663, "PN": 10 (-3.66), "PI": 10 (-7.38)},
	'D': {'pKa': 3.5, "R": 0.622376926398327, "PN": 10 (-3.18), "PI": 10 (-8.54)},
	'E': {'pKa': 4.2, 'R': 0.940369170156725, 'PN': 10 (-3.79), 'PI': 10 (-6.2)},
	'H': {'pKa': 6.6, 'R': 0.812212007706225, 'PN': 10 (-4.67), 'PI': 10 (-5.97)},
	'K': {'pKa': 10.5, 'R': 0.9093172437, 'PN': 10 (-2.19), 'PI': 10 (-6.81)},
	}

	def __str__(self):
	return self.chain+self.resn

	def set_agg_fin(self, agg_dis_sum):
	self.agg_fin = self.agg_sup + agg_dis_sum

	def id(self):
	return self.chain+self.resn+self.resi

	def set_rsa(self, rsa):
	self.rsa = rsa
	self.agg_sup = self.calc_res_agg_fin(rsa=rsa)

	def calc_res_agg_fin(self, rsa):
	if rsa < min_surf:
	return 0
	else:
	if rsa > max_surf:
	rsa = max_surf
	if not self.ph:
	return float(self.score_matrix[self.resn]) * 0.0599 * math.exp(0.0521 * rsa)
	else:
	return self.calc_ph_score() * 0.0599 * math.exp(0.0521 * rsa)

	def calc_ph_score(self):
	correction_value = -3.13
	try:
	return self.const_ph_scores[self.resn]
	except KeyError:
	res_data = self.aa_data[self.resn]
	term1, term2 = self.calc_terms(res_data)
	corrected_1 = term1 * res_data['R']
	return (corrected_1 - term2) - correction_value

	def calc_terms(self, data):
	first = math.log10(data['PN'] + (data['PI'] * 10 ** (abs(data['pKa'] - self.ph))))
	second = math.log10(1 + 10 ** (abs(data['pKa'] - self.ph)))
	return first, second

	@property
	def score(self):
	return self.agg_fin


	class Protein:
	def __init__(self, pdb_code, residues, out_dir, max_dist):
	self.name = pdb_code
	self.residues = residues
	self.out_dir = out_dir
	self.max_dist = max_dist

	def __str__(self):
	return self.name

	def add_residue(self, res):
	self.residues.append(res)

	def change_res_rsa(self, res_number, res_id, rsa):
	if self.residues[res_number].id() == res_id:
	self.residues[res_number].set_rsa(rsa)
	else:
	logger.critical(module_name=_name,
	msg="Error while parsing freeSasa output. Probably freeSasa failed quietly.")
	logger.critical(module_name=_name,
	msg="Failed to assign rsa to %s proper id is: %s " %
	(res_id, self.residues[res_number].id()) +
	"(ChainID, residue's one letter code, residue's ID)")
	raise logger.AggrescanError("Failed to parse freeSasa output. Qutting.",
	module_name=_name)

	def calc_agg_fin(self):
	"""Calculate the agg_dis_sum for every Residue in the Protein"""
	dist_correction = -2.56 / self.max_dist
	for central in self.residues:
	agg_dis_sum = 0
	if not central.agg_sup == 0:
	for peripheric in self.residues:
	dist = get_distance(central.coords, peripheric.coords)
	if dist < self.max_dist and not dist == 0:
	agg_dis_sum += peripheric.agg_sup * 1.2915 * math.exp(dist_correction*dist)
	central.set_agg_fin(agg_dis_sum)

	def short_summary(self):
	"""Return a short summarized output and identify it with a leading #"""
	total_sum = 0
	maximum = 0
	minimum = 0
	pos_auc = 0
	neg_auc = 0
	prev_score = 0
	res_number = len(self.residues)
	for res in self.residues:
	total_sum += res.agg_fin
	if res.agg_fin > maximum:
	maximum = res.agg_fin
	if res.agg_fin < minimum:
	minimum = res.agg_fin
	if res.agg_fin != 0:
	if prev_score != 0:
	auc = (res.agg_fin + prev_score)/2
	if auc > 0:
	pos_auc += abs(auc)
	else:
	neg_auc += abs(auc)
	prev_score = res.agg_fin
	return '#' + self.name + " " + ' '.join(str("%.4f" % round(val, 4))
	for val in (total_sum,
	total_sum/res_number,
	maximum,
	minimum,
	pos_auc))

	def long_output(self):
	"""Return the complete output and identify it with a leading //"""
	output = ""
	for res in self.residues:
	output += '//' + " ".join((res.chain, res.resi, res.resn)) + " %.4f\n" % round(res.agg_fin, 4)
	return output

	def out_csv(self):
	"""Save the complete output in a csv file"""
	csv_file = os.path.join(self.out_dir, "A3D.csv")
	c = csv.writer(open(csv_file, "w"))
	if os.stat(csv_file).st_size == 0:
	c.writerow(["protein", "chain", "residue", "residue_name", "score"])
	for res in self.residues:
	c.writerow([self.name.split("/")[-1], res.chain, res.resi, res.resn, "%.4f" % round(res.agg_fin, 4)])

	def get_residues(self):
	return self.residues


	def get_one_letter(raw):
	"""Return the one letter code for a residue"""
	if len(raw) > 3:
	aa = raw[-3:]
	else:
	aa = raw

	three_letters = aa.capitalize()

	conversion = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
	"Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
	"Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
	"Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"}
	try:
	one_letter = conversion[three_letters]
	except KeyError:
	logger.warning(module_name=_name,
	msg='Could not recognize the following residue symbol: "%s"' % raw)
	one_letter = None
	return one_letter


	def get_distance(a, b):
	"""Return the distance between two cartesian tri dimensional points"""
	return math.sqrt((a[0] - b[0])**2 +
	(a[1] - b[1])**2 +
	(a[2] - b[2])**2)


	def parse_matrix(filename=''):
	"""Parse the matrix input into a dictionary"""
	with open(filename, 'r') as fh:
	matdict = {}
	for line in fh.readlines():
	if not line == '\n':
	pair = line.strip().split(" ")
	matdict[pair[0]] = pair[1]

	return matdict


	def parse_naccess(pdb_code, work_dir, score_matrix, max_dist, ph):
	"""Parse the output of naccess into an object of class Protein"""
	with open(os.path.join(work_dir,pdb_code) + ".rsa", "r") as fh:
	rsa_dict = {line[3:8].strip() + # resn
	line[8] + # chain
	line[9:16].strip(): # resi
	float(line[22:28].strip()) # rsa
	for line in fh.readlines()
	if line.startswith('RES')}

	my_protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir, max_dist=max_dist)
	center_atom = 'CA' # atom to be considered the center of the residue

	with open(os.path.join(work_dir,pdb_code) + '.asa', 'r') as fh:
	for line in fh.readlines():
	# use data only from alpha carbons
	if line[12:16].strip() == center_atom:
	# extract the data according to the characteristics of the pdb format
	my_chain = line[21]
	my_resi = line[22:29].strip()
	my_resn = line[17:20].strip()
	x = float(line[30:38].strip())
	y = float(line[38:46].strip())
	z = float(line[46:54].strip())
	res_id = my_resn + my_chain + my_resi
	my_rsa = rsa_dict[res_id]
	my_protein.add_residue(Residue(chain=my_chain,
	resi=my_resi,
	resn=my_resn,
	coords=(x, y, z),
	rsa=my_rsa,
	score_matrix=score_matrix,
	ph=ph))
	return my_protein


	def parse_freesasa(pdb_code, work_dir, filename, score_matrix, max_dist, ph):
	my_protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir, max_dist=max_dist)
	residue_count = 0
	central_atom = 'CA' # atom to be considered the center of the residue
	with open(os.path.join(work_dir, filename), 'r') as f:
	for line in f:
	if line.startswith("ATOM "): # this is the same as naccess asa file
	if line[12:16].strip() == central_atom:
	# extract the data according to the characteristics of the pdb format
	my_chain = line[21]
	my_resi = line[22:29].strip()
	my_resn = line[17:20].strip()
	x = float(line[30:38].strip())
	y = float(line[38:46].strip())
	z = float(line[46:54].strip())
	my_protein.add_residue(Residue(chain=my_chain,
	resi=my_resi,
	resn=my_resn,
	coords=(x, y, z),
	rsa=0,
	score_matrix=score_matrix,
	ph=ph))
	elif line.startswith("RES"): # equivalent to naccess rsa file
	this_id = line[8] + get_one_letter(line[3:8].strip()) + line[9:16].strip()
	rsa = float(line[22:28].strip())
	my_protein.change_res_rsa(res_number=residue_count, res_id=this_id, rsa=rsa)
	residue_count += 1

	return my_protein


	def run(pdb_file='', mat_file='', max_dist='', work_dir='', naccess=False, ph=None):
	score_matrix = parse_matrix(filename=mat_file)
	pdb_code = pdb_file.split(".")[0]
	if naccess:
	my_protein = parse_naccess(pdb_code=pdb_code, work_dir=work_dir,
	score_matrix=score_matrix, max_dist=max_dist, ph=ph)
	else:
	my_protein = parse_freesasa(pdb_code=pdb_code, work_dir=work_dir,
	filename="sasa.out", score_matrix=score_matrix, max_dist=max_dist, ph=ph)
	my_protein.calc_agg_fin()
	my_protein.out_csv()
	# short_summary = my_protein.short_summary()+"\n"
	# long_summary = my_protein.long_output()
	# return short_summary + long_summary
	return my_protein.residues