#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Per-residue aggregation scoring from naccess / freeSasa output.

The module parses solvent-accessibility output into ``Residue`` objects held
by a ``Protein``, computes a score for every residue from its own exposure
and from the exposure-weighted scores of neighbouring residues, and writes
the result to ``A3D.csv`` in the working directory.
"""

import os
import math
import csv

from . import logger

# minimum and maximum surface exposition to be considered
min_surf = 10
max_surf = 55

_name = "agg3D"

# Three-letter (capitalized) to one-letter residue codes.
_THREE_TO_ONE = {
    "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
    "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
    "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
    "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
}


def _normalize_chain(chain):
    """Return '-' for a blank chain identifier, the identifier otherwise."""
    return '-' if chain == ' ' else chain


class Residue:
    """A single residue with its coordinates and aggregation scores.

    Attributes:
        chain: chain identifier ('-' when the input chain was blank).
        resi: residue identifier as read from the input (a string).
        resn: one-letter residue code (None if the name was not recognized).
        coords: (x, y, z) tuple of the residue's central atom.
        score_matrix: mapping of one-letter code -> score (string or number).
        ph: pH used for scoring; a falsy value selects ``score_matrix``.
        rsa: last rsa value assigned to the residue.
        agg_sup: score derived from the residue's own rsa only.
        agg_fin: final score, i.e. ``agg_sup`` plus the neighbour contribution.
    """

    # Fixed scores used in pH mode for residues without an entry in aa_data.
    const_ph_scores = {
        "A": 0.66, "N": -0.84, "C": 1.66, "Q": -0.87, "G": 0,
        "I": 2.75, "L": 1.77, "M": 1.30, "F": 3.99, "P": 1.69,
        "S": -0.99, "T": 0.12, "W": 3.29, "Y": 1.33, "V": 1.45
    }

    # Per-residue parameters used by calc_terms()/calc_ph_score() in pH mode.
    aa_data = {
        'R': {'pKa': 12.5, 'R': 0.857546950885663,
              'PN': 10 ** (-3.66), 'PI': 10 ** (-7.38)},
        'D': {'pKa': 3.5, 'R': 0.622376926398327,
              'PN': 10 ** (-3.18), 'PI': 10 ** (-8.54)},
        'E': {'pKa': 4.2, 'R': 0.940369170156725,
              'PN': 10 ** (-3.79), 'PI': 10 ** (-6.2)},
        'H': {'pKa': 6.6, 'R': 0.812212007706225,
              'PN': 10 ** (-4.67), 'PI': 10 ** (-5.97)},
        'K': {'pKa': 10.5, 'R': 0.9093172437,
              'PN': 10 ** (-2.19), 'PI': 10 ** (-6.81)},
    }

    def __init__(self, chain, resi, resn, coords, rsa, score_matrix, ph):
        self.chain = _normalize_chain(chain)
        self.resi = resi
        self.resn = get_one_letter(resn)  # we store it in one-letter code
        self.coords = coords
        self.score_matrix = score_matrix
        # ph must be set before the first score is computed:
        # calc_res_agg_fin() reads it for every residue with rsa >= min_surf.
        self.ph = ph
        self.rsa = rsa
        self.agg_sup = self.calc_res_agg_fin(rsa=rsa)
        self.agg_fin = 0

    def __str__(self):
        return self.chain+self.resn

    def set_agg_fin(self, agg_dis_sum):
        """Set the final score to ``agg_sup`` plus the neighbour sum."""
        self.agg_fin = self.agg_sup + agg_dis_sum

    def id(self):
        """Return chain + one-letter code + residue identifier."""
        return self.chain+self.resn+self.resi

    def set_rsa(self, rsa):
        """Store a new rsa value and recompute ``agg_sup`` from it."""
        self.rsa = rsa
        self.agg_sup = self.calc_res_agg_fin(rsa=rsa)

    def calc_res_agg_fin(self, rsa):
        """Return the residue's own score for the given rsa.

        Returns 0 when ``rsa`` is below ``min_surf``; values above
        ``max_surf`` are clamped to ``max_surf`` before scoring.
        """
        if rsa < min_surf:
            return 0
        exposure = min(rsa, max_surf)
        if not self.ph:
            base = float(self.score_matrix[self.resn])
        else:
            base = self.calc_ph_score()
        return base * 0.0599 * math.exp(0.0521 * exposure)

    def calc_ph_score(self):
        """Return the pH-mode base score for this residue.

        Residues listed in ``const_ph_scores`` use that fixed value; all
        others are looked up in ``aa_data`` (KeyError if absent there too).
        """
        correction_value = -3.13
        if self.resn in self.const_ph_scores:
            return self.const_ph_scores[self.resn]
        res_data = self.aa_data[self.resn]
        term1, term2 = self.calc_terms(res_data)
        corrected_1 = term1 * res_data['R']
        return (corrected_1 - term2) - correction_value

    def calc_terms(self, data):
        """Return the two log10 terms combined by ``calc_ph_score``."""
        ph_gap = 10 ** (abs(data['pKa'] - self.ph))
        first = math.log10(data['PN'] + (data['PI'] * ph_gap))
        second = math.log10(1 + ph_gap)
        return first, second

    @property
    def score(self):
        return self.agg_fin


class Protein:
    """An ordered collection of ``Residue`` objects plus output settings."""

    def __init__(self, pdb_code, residues, out_dir, max_dist):
        self.name = pdb_code
        self.residues = residues
        self.out_dir = out_dir
        self.max_dist = max_dist

    def __str__(self):
        return self.name

    def add_residue(self, res):
        self.residues.append(res)

    def change_res_rsa(self, res_number, res_id, rsa):
        """Assign ``rsa`` to the residue at index ``res_number``.

        Raises:
            logger.AggrescanError: if that residue's id() differs from
                ``res_id``.
        """
        target = self.residues[res_number]
        if target.id() == res_id:
            target.set_rsa(rsa)
            return
        logger.critical(module_name=_name,
                        msg="Error while parsing freeSasa output. Probably freeSasa failed quietly.")
        logger.critical(module_name=_name,
                        msg="Failed to assign rsa to %s proper id is: %s " % (res_id, target.id()) +
                            "(ChainID, residue's one letter code, residue's ID)")
        raise logger.AggrescanError("Failed to parse freeSasa output. Qutting.",
                                    module_name=_name)

    def calc_agg_fin(self):
        """Calculate the agg_dis_sum for every Residue in the Protein"""
        dist_correction = -2.56 / self.max_dist
        for central in self.residues:
            agg_dis_sum = 0
            # Residues with no own score get no neighbour contribution.
            if not central.agg_sup == 0:
                for peripheric in self.residues:
                    dist = get_distance(central.coords, peripheric.coords)
                    # dist == 0 excludes the residue itself.
                    if dist < self.max_dist and not dist == 0:
                        agg_dis_sum += (peripheric.agg_sup * 1.2915 *
                                        math.exp(dist_correction * dist))
            central.set_agg_fin(agg_dis_sum)

    def short_summary(self):
        """Return a short summarized output and identify it with a leading #

        The fields are: total, average, maximum, minimum and the summed
        positive area between consecutive non-zero scores. An empty protein
        reports an average of 0 instead of dividing by zero.
        """
        total_sum = 0
        maximum = 0
        minimum = 0
        pos_auc = 0
        prev_score = 0
        res_number = len(self.residues)
        for res in self.residues:
            current = res.agg_fin
            total_sum += current
            if current > maximum:
                maximum = current
            if current < minimum:
                minimum = current
            if current != 0 and prev_score != 0:
                auc = (current + prev_score)/2
                if auc > 0:
                    pos_auc += auc
            # NOTE(review): prev_score is updated on every residue (the
            # original indentation was lost) - confirm against upstream.
            prev_score = current
        average = total_sum/res_number if res_number else 0
        fields = (total_sum, average, maximum, minimum, pos_auc)
        return '#' + self.name + " " + ' '.join(
            str("%.4f" % round(val, 4)) for val in fields)

    def long_output(self):
        """Return the complete output and identify it with a leading //"""
        rows = ['//' + " ".join((res.chain, res.resi, res.resn)) +
                " %.4f\n" % round(res.agg_fin, 4)
                for res in self.residues]
        return "".join(rows)

    def out_csv(self):
        """Save the complete output in a csv file

        ``A3D.csv`` in ``out_dir`` is overwritten on every call, so the
        header row is always written.
        """
        csv_file = os.path.join(self.out_dir, "A3D.csv")
        protein_name = self.name.split("/")[-1]
        # The context manager guarantees the rows are flushed and the
        # handle is closed.
        with open(csv_file, "w") as handle:
            writer = csv.writer(handle)
            writer.writerow(["protein", "chain", "residue", "residue_name", "score"])
            for res in self.residues:
                writer.writerow([protein_name, res.chain, res.resi, res.resn,
                                 "%.4f" % round(res.agg_fin, 4)])

    def get_residues(self):
        return self.residues


def get_one_letter(raw):
    """Return the one letter code for a residue

    Only the last three characters of ``raw`` are considered, matched
    case-insensitively. Unknown names are logged and yield None.
    """
    three_letters = raw[-3:].capitalize()
    try:
        return _THREE_TO_ONE[three_letters]
    except KeyError:
        logger.warning(module_name=_name,
                       msg='Could not recognize the following residue symbol: "%s"' % raw)
        return None


def get_distance(a, b):
    """Return the distance between two cartesian tri dimensional points"""
    return math.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2 + (a[2] - b[2])**2)


def parse_matrix(filename=''):
    """Parse the matrix input into a dictionary

    Each non-blank line contributes its first whitespace-separated field as
    key and its second as value (kept as a string).
    """
    matdict = {}
    with open(filename, 'r') as fh:
        for line in fh:
            pair = line.split()
            if not pair:
                # blank or whitespace-only line
                continue
            matdict[pair[0]] = pair[1]
    return matdict


def _read_atom_record(line):
    """Slice chain, residue id, residue name and coordinates from a line.

    The column positions follow the fixed-width layout used by both the
    naccess ``.asa`` file and the ATOM lines of the freeSasa output.
    """
    chain = line[21]
    resi = line[22:29].strip()
    resn = line[17:20].strip()
    coords = (float(line[30:38].strip()),
              float(line[38:46].strip()),
              float(line[46:54].strip()))
    return chain, resi, resn, coords


def parse_naccess(pdb_code, work_dir, score_matrix, max_dist, ph):
    """Parse the output of naccess into an object of class Protein"""
    base_path = os.path.join(work_dir, pdb_code)
    with open(base_path + ".rsa", "r") as fh:
        rsa_dict = {line[3:8].strip() +      # resn
                    line[8] +                # chain
                    line[9:16].strip():      # resi
                    float(line[22:28].strip())   # rsa
                    for line in fh if line.startswith('RES')}
    my_protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir,
                         max_dist=max_dist)
    center_atom = 'CA'  # atom to be considered the center of the residue
    with open(base_path + '.asa', 'r') as fh:
        for line in fh:
            # use data only from alpha carbons
            if line[12:16].strip() != center_atom:
                continue
            my_chain, my_resi, my_resn, coords = _read_atom_record(line)
            # keys of rsa_dict use the raw three-letter name and chain
            my_rsa = rsa_dict[my_resn + my_chain + my_resi]
            my_protein.add_residue(Residue(chain=my_chain, resi=my_resi,
                                           resn=my_resn, coords=coords,
                                           rsa=my_rsa,
                                           score_matrix=score_matrix, ph=ph))
    return my_protein


def parse_freesasa(pdb_code, work_dir, filename, score_matrix, max_dist, ph):
    """Parse a freeSasa output file into an object of class Protein

    ATOM lines for the central atom create the residues (with rsa 0); each
    RES line then assigns its rsa to the residue at the same ordinal
    position, after checking that the ids agree.
    """
    my_protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir,
                         max_dist=max_dist)
    residue_count = 0
    central_atom = 'CA'  # atom to be considered the center of the residue
    with open(os.path.join(work_dir, filename), 'r') as f:
        for line in f:
            if line.startswith("ATOM "):  # this is the same as naccess asa file
                if line[12:16].strip() == central_atom:
                    my_chain, my_resi, my_resn, coords = _read_atom_record(line)
                    my_protein.add_residue(Residue(chain=my_chain,
                                                   resi=my_resi,
                                                   resn=my_resn,
                                                   coords=coords, rsa=0,
                                                   score_matrix=score_matrix,
                                                   ph=ph))
            elif line.startswith("RES"):  # equivalent to naccess rsa file
                # Residue stores a blank chain as '-', so the id built here
                # must apply the same mapping or the check can never pass.
                this_id = (_normalize_chain(line[8]) +
                           get_one_letter(line[3:8].strip()) +
                           line[9:16].strip())
                rsa = float(line[22:28].strip())
                my_protein.change_res_rsa(res_number=residue_count,
                                          res_id=this_id, rsa=rsa)
                residue_count += 1
    return my_protein


def run(pdb_file='', mat_file='', max_dist='', work_dir='', naccess=False, ph=None):
    """Score a structure, write ``A3D.csv`` and return the residue list.

    ``naccess`` selects the naccess parser; otherwise ``sasa.out`` in
    ``work_dir`` is read as freeSasa output.
    """
    score_matrix = parse_matrix(filename=mat_file)
    pdb_code = pdb_file.split(".")[0]
    if naccess:
        my_protein = parse_naccess(pdb_code=pdb_code, work_dir=work_dir,
                                   score_matrix=score_matrix,
                                   max_dist=max_dist, ph=ph)
    else:
        my_protein = parse_freesasa(pdb_code=pdb_code, work_dir=work_dir,
                                    filename="sasa.out",
                                    score_matrix=score_matrix,
                                    max_dist=max_dist, ph=ph)
    my_protein.calc_agg_fin()
    my_protein.out_csv()
    return my_protein.residues