File size: 12,146 Bytes
a3f3d91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import math
import csv
from . import logger
# Minimum and maximum relative surface accessibility (rsa) taken into account
# by Residue.calc_res_agg_fin: a residue with rsa below min_surf scores 0, and
# an rsa above max_surf is capped to max_surf before scoring.
min_surf = 10
max_surf = 55
# Identifier passed as ``module_name`` to the logger calls in this module.
_name = "agg3D"
class Residue:
    """One residue of a protein together with its aggregation scores.

    Attributes set by the constructor:
        chain: chain identifier; a blank chain (' ') is stored as '-'.
        resi: residue number/identifier as given by the caller.
        resn: one-letter residue code (result of get_one_letter).
        coords: coordinates of the residue's reference atom.
        score_matrix: mapping from one-letter code to an intrinsic score
            (values are converted with float() when used).
        ph: pH value; a falsy value (None, 0) selects ``score_matrix``
            instead of the pH-dependent score.
        rsa: relative surface accessibility used for ``agg_sup``.
        agg_sup: surface-weighted score of this residue alone.
        agg_fin: final score (``agg_sup`` plus the neighbour contribution),
            0 until set_agg_fin is called.
    """

    # Scores used in pH mode for residues whose score is looked up directly
    # by one-letter code (no pH-dependent term is computed for them).
    const_ph_scores = {
        "A": 0.66,
        "N": -0.84,
        "C": 1.66,
        "Q": -0.87,
        "G": 0,
        "I": 2.75,
        "L": 1.77,
        "M": 1.30,
        "F": 3.99,
        "P": 1.69,
        "S": -0.99,
        "T": 0.12,
        "W": 3.29,
        "Y": 1.33,
        "V": 1.45
    }

    # Parameters for the residues that are scored as a function of pH.
    # NOTE(review): 'PN'/'PI' presumably describe the neutral/ionised forms
    # and 'R' is a per-residue scaling factor -- confirm against the method's
    # reference before relying on these names.
    aa_data = {
        'R': {'pKa': 12.5, "R": 0.857546950885663, "PN": 10 ** (-3.66), "PI": 10 ** (-7.38)},
        'D': {'pKa': 3.5, "R": 0.622376926398327, "PN": 10 ** (-3.18), "PI": 10 ** (-8.54)},
        'E': {'pKa': 4.2, 'R': 0.940369170156725, 'PN': 10 ** (-3.79), 'PI': 10 ** (-6.2)},
        'H': {'pKa': 6.6, 'R': 0.812212007706225, 'PN': 10 ** (-4.67), 'PI': 10 ** (-5.97)},
        'K': {'pKa': 10.5, 'R': 0.9093172437, 'PN': 10 ** (-2.19), 'PI': 10 ** (-6.81)},
    }

    def __init__(self, chain, resi, resn, coords, rsa, score_matrix, ph):
        """Store the residue data and compute its initial surface score."""
        self.chain = '-' if chain == ' ' else chain
        self.resi = resi
        self.resn = get_one_letter(resn)  # we store it in one-letter code
        self.coords = coords
        self.score_matrix = score_matrix
        # ph has to be assigned before calc_res_agg_fin runs: that method
        # reads self.ph whenever rsa >= min_surf.
        self.ph = ph
        # Keep the rsa that agg_sup is derived from, as set_rsa does.
        self.rsa = rsa
        self.agg_sup = self.calc_res_agg_fin(rsa=rsa)
        self.agg_fin = 0

    def __str__(self):
        """Return chain identifier followed by the one-letter code."""
        return self.chain + self.resn

    def set_agg_fin(self, agg_dis_sum):
        """Set the final score to ``agg_sup`` plus the neighbour sum."""
        self.agg_fin = self.agg_sup + agg_dis_sum

    def id(self):
        """Return chain + one-letter code + residue number as one string."""
        return self.chain + self.resn + self.resi

    def set_rsa(self, rsa):
        """Update the stored rsa and recompute ``agg_sup`` from it."""
        self.rsa = rsa
        self.agg_sup = self.calc_res_agg_fin(rsa=rsa)

    def calc_res_agg_fin(self, rsa):
        """Return the surface-weighted score of this residue for ``rsa``.

        Returns 0 when ``rsa`` is below ``min_surf``. Otherwise ``rsa`` is
        capped at ``max_surf`` and the residue's base score is multiplied by
        ``0.0599 * exp(0.0521 * rsa)``. The base score comes from
        ``score_matrix`` when ``self.ph`` is falsy and from calc_ph_score
        otherwise.
        """
        if rsa < min_surf:
            return 0
        rsa = min(rsa, max_surf)
        if not self.ph:
            base_score = float(self.score_matrix[self.resn])
        else:
            base_score = self.calc_ph_score()
        return base_score * 0.0599 * math.exp(0.0521 * rsa)

    def calc_ph_score(self):
        """Return the pH-mode base score of this residue.

        Residues listed in ``const_ph_scores`` return their tabulated value.
        Any other residue is looked up in ``aa_data`` and scored from
        calc_terms; a residue present in neither table raises KeyError.
        """
        correction_value = -3.13
        if self.resn in self.const_ph_scores:
            return self.const_ph_scores[self.resn]
        res_data = self.aa_data[self.resn]
        term1, term2 = self.calc_terms(res_data)
        corrected_1 = term1 * res_data['R']
        return (corrected_1 - term2) - correction_value

    def calc_terms(self, data):
        """Return the two log10 terms used by calc_ph_score.

        ``data`` is one entry of ``aa_data``; both terms depend on the
        absolute difference between its 'pKa' and ``self.ph``.
        """
        ionisation = 10 ** (abs(data['pKa'] - self.ph))
        first = math.log10(data['PN'] + (data['PI'] * ionisation))
        second = math.log10(1 + ionisation)
        return first, second

    @property
    def score(self):
        """Final score of the residue (same value as ``agg_fin``)."""
        return self.agg_fin
class Protein:
    """A named collection of Residue objects plus scoring and output helpers.

    Attributes:
        name: identifier given as ``pdb_code``.
        residues: list of residues, in the order they were added.
        out_dir: directory where out_csv writes ``A3D.csv``.
        max_dist: distance cutoff used by calc_agg_fin.
    """

    def __init__(self, pdb_code, residues, out_dir, max_dist):
        self.name = pdb_code
        self.residues = residues
        self.out_dir = out_dir
        self.max_dist = max_dist

    def __str__(self):
        """Return the protein name."""
        return self.name

    def add_residue(self, res):
        """Append ``res`` to the residue list."""
        self.residues.append(res)

    def change_res_rsa(self, res_number, res_id, rsa):
        """Assign ``rsa`` to the residue at index ``res_number``.

        ``res_id`` must equal that residue's ``id()``; on a mismatch two
        critical messages are logged and ``logger.AggrescanError`` is raised.
        """
        target = self.residues[res_number]
        if target.id() == res_id:
            target.set_rsa(rsa)
            return
        logger.critical(module_name=_name,
                        msg="Error while parsing freeSasa output. Probably freeSasa failed quietly.")
        logger.critical(module_name=_name,
                        msg="Failed to assign rsa to %s proper id is: %s " %
                            (res_id, target.id()) +
                            "(ChainID, residue's one letter code, residue's ID)")
        raise logger.AggrescanError("Failed to parse freeSasa output. Qutting.",
                                    module_name=_name)

    def calc_agg_fin(self):
        """Calculate the agg_dis_sum for every Residue in the Protein.

        For each residue with a non-zero ``agg_sup``, the ``agg_sup`` of
        every other residue closer than ``max_dist`` (and not at distance 0)
        is added, weighted by ``1.2915 * exp(-2.56 / max_dist * dist)``.
        Residues with ``agg_sup == 0`` get a neighbour sum of 0.
        """
        dist_correction = -2.56 / self.max_dist
        for central in self.residues:
            agg_dis_sum = 0
            if central.agg_sup != 0:
                for peripheric in self.residues:
                    dist = get_distance(central.coords, peripheric.coords)
                    # dist == 0 excludes the residue itself.
                    if dist < self.max_dist and dist != 0:
                        agg_dis_sum += peripheric.agg_sup * 1.2915 * math.exp(dist_correction * dist)
            central.set_agg_fin(agg_dis_sum)

    def short_summary(self):
        """Return a short summarized output and identify it with a leading #.

        The line holds the protein name followed by: total score, average
        score per residue, maximum, minimum and the positive area under the
        curve, each with four decimals. An empty protein reports an average
        of 0 instead of failing with a division by zero.
        """
        total_sum = 0
        maximum = 0
        minimum = 0
        pos_auc = 0
        prev_score = 0
        for res in self.residues:
            current = res.agg_fin
            total_sum += current
            if current > maximum:
                maximum = current
            if current < minimum:
                minimum = current
            # An area slice is only counted between two consecutive
            # non-zero scores, and only when their mean is positive.
            if current != 0 and prev_score != 0:
                auc = (current + prev_score) / 2
                if auc > 0:
                    pos_auc += auc
            prev_score = current
        res_number = len(self.residues)
        average = total_sum / res_number if res_number else 0
        values = (total_sum, average, maximum, minimum, pos_auc)
        return '#' + self.name + " " + ' '.join("%.4f" % round(val, 4) for val in values)

    def long_output(self):
        """Return the complete output and identify it with a leading //.

        One line per residue: chain, residue number, one-letter code and
        final score with four decimals.
        """
        return "".join(
            '//' + " ".join((res.chain, res.resi, res.resn)) + " %.4f\n" % round(res.agg_fin, 4)
            for res in self.residues
        )

    def out_csv(self):
        """Save the complete output in a csv file.

        Writes ``A3D.csv`` into ``out_dir`` (overwriting any existing file):
        a header row followed by one row per residue. The protein column
        holds the part of ``name`` after the last '/'.
        """
        csv_file = os.path.join(self.out_dir, "A3D.csv")
        protein_label = self.name.split("/")[-1]
        # The context manager guarantees the file is flushed and closed.
        with open(csv_file, "w") as handle:
            writer = csv.writer(handle)
            # The file was just truncated, so the header is always needed.
            writer.writerow(["protein", "chain", "residue", "residue_name", "score"])
            for res in self.residues:
                writer.writerow([protein_label, res.chain, res.resi, res.resn,
                                 "%.4f" % round(res.agg_fin, 4)])

    def get_residues(self):
        """Return the list of residues."""
        return self.residues
def get_one_letter(raw):
    """Translate a three-letter residue name into its one-letter code.

    Only the last three characters of ``raw`` are considered and the lookup
    ignores case. An unrecognised name is reported through the logger and
    None is returned.
    """
    codes = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
             "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
             "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
             "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"}
    # Slicing the tail is a no-op for names of three characters or fewer.
    key = raw[-3:].capitalize()
    letter = codes.get(key)
    if letter is None:
        logger.warning(module_name=_name,
                       msg='Could not recognize the following residue symbol: "%s"' % raw)
    return letter
def get_distance(a, b):
    """Euclidean distance between two points given as (x, y, z) sequences.

    Only the first three components of each point are used.
    """
    delta_x = a[0] - b[0]
    delta_y = a[1] - b[1]
    delta_z = a[2] - b[2]
    squared = delta_x**2 + delta_y**2 + delta_z**2
    return math.sqrt(squared)
def parse_matrix(filename=''):
    """Parse the matrix input into a dictionary.

    Each non-blank line is expected to hold a key followed by a value,
    separated by any amount of whitespace (spaces or tabs). Blank and
    whitespace-only lines are skipped. Values are kept as strings.

    Returns:
        dict mapping the first field of each line to the second one.

    Raises:
        IndexError: if a non-blank line holds fewer than two fields.
    """
    matdict = {}
    with open(filename, 'r') as fh:
        for line in fh:
            # split() without arguments tolerates tabs, repeated spaces and
            # '\r\n' line endings, and yields [] for whitespace-only lines.
            fields = line.split()
            if not fields:
                continue
            matdict[fields[0]] = fields[1]
    return matdict
def parse_naccess(pdb_code, work_dir, score_matrix, max_dist, ph):
    """Build a Protein from the naccess ``.rsa`` and ``.asa`` files.

    Both files are looked up as ``<work_dir>/<pdb_code>`` plus extension.
    The ``RES`` lines of the .rsa file provide one rsa value per residue;
    the CA lines of the .asa file provide the residues and their coordinates.
    """
    base_path = os.path.join(work_dir, pdb_code)

    # rsa values keyed by residue name + chain + residue number
    rsa_by_id = {}
    with open(base_path + ".rsa", "r") as rsa_file:
        for record in rsa_file:
            if not record.startswith('RES'):
                continue
            key = record[3:8].strip() + record[8] + record[9:16].strip()
            rsa_by_id[key] = float(record[22:28].strip())

    protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir, max_dist=max_dist)
    reference_atom = 'CA'  # atom taken as the position of the whole residue
    with open(base_path + '.asa', 'r') as asa_file:
        for record in asa_file:
            # only alpha-carbon lines describe a residue position
            if record[12:16].strip() != reference_atom:
                continue
            # fixed columns, following the pdb format
            chain_id = record[21]
            res_number = record[22:29].strip()
            res_name = record[17:20].strip()
            position = (float(record[30:38].strip()),
                        float(record[38:46].strip()),
                        float(record[46:54].strip()))
            res_rsa = rsa_by_id[res_name + chain_id + res_number]
            protein.add_residue(Residue(chain=chain_id,
                                        resi=res_number,
                                        resn=res_name,
                                        coords=position,
                                        rsa=res_rsa,
                                        score_matrix=score_matrix,
                                        ph=ph))
    return protein
def parse_freesasa(pdb_code, work_dir, filename, score_matrix, max_dist, ph):
    """Build a Protein from a freeSasa output file in ``work_dir``.

    ``ATOM`` lines for CA atoms create the residues (with rsa 0); each
    ``RES`` line then assigns its rsa to the residue at the matching
    position, counted in order of appearance of the ``RES`` lines.
    """
    protein = Protein(pdb_code=pdb_code, residues=[], out_dir=work_dir, max_dist=max_dist)
    reference_atom = 'CA'  # atom taken as the position of the whole residue
    rsa_index = 0  # index of the residue the next RES line refers to
    with open(os.path.join(work_dir, filename), 'r') as sasa_file:
        for record in sasa_file:
            if record.startswith("ATOM "):  # same layout as the naccess asa file
                if record[12:16].strip() != reference_atom:
                    continue
                # fixed columns, following the pdb format
                chain_id = record[21]
                res_number = record[22:29].strip()
                res_name = record[17:20].strip()
                position = (float(record[30:38].strip()),
                            float(record[38:46].strip()),
                            float(record[46:54].strip()))
                protein.add_residue(Residue(chain=chain_id,
                                            resi=res_number,
                                            resn=res_name,
                                            coords=position,
                                            rsa=0,
                                            score_matrix=score_matrix,
                                            ph=ph))
            elif record.startswith("RES"):  # same layout as the naccess rsa file
                expected_id = record[8] + get_one_letter(record[3:8].strip()) + record[9:16].strip()
                res_rsa = float(record[22:28].strip())
                protein.change_res_rsa(res_number=rsa_index, res_id=expected_id, rsa=res_rsa)
                rsa_index += 1
    return protein
def run(pdb_file='', mat_file='', max_dist='', work_dir='', naccess=False, ph=None):
    """Score a structure and return its list of residues.

    Reads the score matrix from ``mat_file``, parses the surface data found
    in ``work_dir`` (naccess files when ``naccess`` is true, otherwise the
    freeSasa file ``sasa.out``), computes the final scores and writes the
    csv output. The protein code is the part of ``pdb_file`` before the
    first '.'.
    """
    matrix = parse_matrix(filename=mat_file)
    code = pdb_file.split(".")[0]
    common = dict(pdb_code=code, work_dir=work_dir,
                  score_matrix=matrix, max_dist=max_dist, ph=ph)
    if naccess:
        protein = parse_naccess(**common)
    else:
        protein = parse_freesasa(filename="sasa.out", **common)
    protein.calc_agg_fin()
    protein.out_csv()
    # Text summaries (short_summary / long_output) are not returned here;
    # callers receive the scored residues instead.
    return protein.residues
|