Spaces:
No application file
No application file
# Copyright (C) 2002, Thomas Hamelryck ([email protected]) | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Polypeptide-related classes (construction and representation). | |
Simple example with multiple chains, | |
>>> from Bio.PDB.PDBParser import PDBParser | |
>>> from Bio.PDB.Polypeptide import PPBuilder | |
>>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') | |
>>> ppb=PPBuilder() | |
>>> for pp in ppb.build_peptides(structure): | |
... print(pp.get_sequence()) | |
LVFFAEDVGSNKGAIIGLMVGGVVIA | |
LVFFAEDVGSNKGAIIGLMVGGVVIA | |
LVFFAEDVGSNKGAIIGLMVGGVVIA | |
LVFFAEDVGSNKGAIIGLMVGGVVIA | |
LVFFAEDVGSNKGAIIGLMVGGVVIA | |
Example with non-standard amino acids using HETATM lines in the PDB file, | |
in this case selenomethionine (MSE): | |
>>> from Bio.PDB.PDBParser import PDBParser | |
>>> from Bio.PDB.Polypeptide import PPBuilder | |
>>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') | |
>>> ppb=PPBuilder() | |
>>> for pp in ppb.build_peptides(structure): | |
... print(pp.get_sequence()) | |
DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW | |
TETLLVQNANPDCKTILKALGPGATLEE | |
TACQG | |
If you want to, you can include non-standard amino acids in the peptides: | |
>>> for pp in ppb.build_peptides(structure, aa_only=False): | |
... print(pp.get_sequence()) | |
... print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname())) | |
... print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname())) | |
... print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname())) | |
MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG | |
M MSE | |
M MSE | |
M MSE | |
In this case the selenomethionines (the first and also seventh and sixth from | |
last residues) have been shown as M (methionine) by the get_sequence method. | |
""" | |
import warnings | |
from Bio import BiopythonDeprecationWarning | |
from Bio.Data.PDBData import nucleic_letters_3to1 | |
from Bio.Data.PDBData import nucleic_letters_3to1_extended | |
from Bio.Data.PDBData import protein_letters_3to1 | |
from Bio.Data.PDBData import protein_letters_3to1_extended | |
from Bio.PDB.PDBExceptions import PDBException | |
from Bio.PDB.vectors import calc_dihedral, calc_angle | |
from Bio.Seq import Seq | |
# Sorted by 1-letter code | |
aa3, aa1 = zip(*sorted(protein_letters_3to1.items(), key=lambda x: x[1])) | |
standard_aa_names = aa3 | |
d1_to_index = {} | |
dindex_to_1 = {} | |
d3_to_index = {} | |
dindex_to_3 = {} | |
# Create some lookup tables | |
for i in range(0, 20): | |
n1 = aa1[i] | |
n3 = aa3[i] | |
d1_to_index[n1] = i | |
dindex_to_1[i] = n1 | |
d3_to_index[n3] = i | |
dindex_to_3[i] = n3 | |
def index_to_one(index): | |
"""Index to corresponding one letter amino acid name. | |
>>> index_to_one(0) | |
'A' | |
>>> index_to_one(19) | |
'Y' | |
""" | |
return dindex_to_1[index] | |
def one_to_index(s): | |
"""One letter code to index. | |
>>> one_to_index('A') | |
0 | |
>>> one_to_index('Y') | |
19 | |
""" | |
return d1_to_index[s] | |
def index_to_three(i): | |
"""Index to corresponding three letter amino acid name. | |
>>> index_to_three(0) | |
'ALA' | |
>>> index_to_three(19) | |
'TYR' | |
""" | |
return dindex_to_3[i] | |
def three_to_index(s): | |
"""Three letter code to index. | |
>>> three_to_index('ALA') | |
0 | |
>>> three_to_index('TYR') | |
19 | |
""" | |
return d3_to_index[s] | |
def three_to_one(s): | |
"""Three letter code to one letter code. | |
>>> three_to_one('ALA') | |
'A' | |
>>> three_to_one('TYR') | |
'Y' | |
For non-standard amino acids, you get a KeyError: | |
>>> three_to_one('MSE') | |
Traceback (most recent call last): | |
... | |
KeyError: 'MSE' | |
""" | |
warnings.warn( | |
"'three_to_one' will be deprecated in a future release of Biopython " | |
"in favor of 'Bio.PDB.Polypeptide.protein_letters_3to1'.", | |
BiopythonDeprecationWarning, | |
) | |
i = d3_to_index[s] | |
return dindex_to_1[i] | |
def one_to_three(s): | |
"""One letter code to three letter code. | |
>>> one_to_three('A') | |
'ALA' | |
>>> one_to_three('Y') | |
'TYR' | |
""" | |
warnings.warn( | |
"'one_to_three' will be deprecated in a future release of Biopython " | |
"in favor of 'Bio.PDB.Polypeptide.protein_letters_1to3'.", | |
BiopythonDeprecationWarning, | |
) | |
i = d1_to_index[s] | |
return dindex_to_3[i] | |
def is_aa(residue, standard=False): | |
"""Return True if residue object/string is an amino acid. | |
:param residue: a L{Residue} object OR a three letter amino acid code | |
:type residue: L{Residue} or string | |
:param standard: flag to check for the 20 AA (default false) | |
:type standard: boolean | |
>>> is_aa('ALA') | |
True | |
Known three letter codes for modified amino acids are supported, | |
>>> is_aa('FME') | |
True | |
>>> is_aa('FME', standard=True) | |
False | |
""" | |
if not isinstance(residue, str): | |
residue = f"{residue.get_resname():<3s}" | |
residue = residue.upper() | |
if standard: | |
return residue in protein_letters_3to1 | |
else: | |
return residue in protein_letters_3to1_extended | |
def is_nucleic(residue, standard=False): | |
"""Return True if residue object/string is a nucleic acid. | |
:param residue: a L{Residue} object OR a three letter code | |
:type residue: L{Residue} or string | |
:param standard: flag to check for the 8 (DNA + RNA) canonical bases. | |
Default is False. | |
:type standard: boolean | |
>>> is_nucleic('DA ') | |
True | |
>>> is_nucleic('A ') | |
True | |
Known three letter codes for modified nucleotides are supported, | |
>>> is_nucleic('A2L') | |
True | |
>>> is_nucleic('A2L', standard=True) | |
False | |
""" | |
if not isinstance(residue, str): | |
residue = f"{residue.get_resname():<3s}" | |
residue = residue.upper() | |
if standard: | |
return residue in nucleic_letters_3to1 | |
else: | |
return residue in nucleic_letters_3to1_extended | |
class Polypeptide(list): | |
"""A polypeptide is simply a list of L{Residue} objects.""" | |
def get_ca_list(self): | |
"""Get list of C-alpha atoms in the polypeptide. | |
:return: the list of C-alpha atoms | |
:rtype: [L{Atom}, L{Atom}, ...] | |
""" | |
ca_list = [] | |
for res in self: | |
ca = res["CA"] | |
ca_list.append(ca) | |
return ca_list | |
def get_phi_psi_list(self): | |
"""Return the list of phi/psi dihedral angles.""" | |
ppl = [] | |
lng = len(self) | |
for i in range(0, lng): | |
res = self[i] | |
try: | |
n = res["N"].get_vector() | |
ca = res["CA"].get_vector() | |
c = res["C"].get_vector() | |
except Exception: | |
# Some atoms are missing | |
# Phi/Psi cannot be calculated for this residue | |
ppl.append((None, None)) | |
res.xtra["PHI"] = None | |
res.xtra["PSI"] = None | |
continue | |
# Phi | |
if i > 0: | |
rp = self[i - 1] | |
try: | |
cp = rp["C"].get_vector() | |
phi = calc_dihedral(cp, n, ca, c) | |
except Exception: | |
phi = None | |
else: | |
# No phi for residue 0! | |
phi = None | |
# Psi | |
if i < (lng - 1): | |
rn = self[i + 1] | |
try: | |
nn = rn["N"].get_vector() | |
psi = calc_dihedral(n, ca, c, nn) | |
except Exception: | |
psi = None | |
else: | |
# No psi for last residue! | |
psi = None | |
ppl.append((phi, psi)) | |
# Add Phi/Psi to xtra dict of residue | |
res.xtra["PHI"] = phi | |
res.xtra["PSI"] = psi | |
return ppl | |
def get_tau_list(self): | |
"""List of tau torsions angles for all 4 consecutive Calpha atoms.""" | |
ca_list = self.get_ca_list() | |
tau_list = [] | |
for i in range(0, len(ca_list) - 3): | |
atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2], ca_list[i + 3]) | |
v1, v2, v3, v4 = (a.get_vector() for a in atom_list) | |
tau = calc_dihedral(v1, v2, v3, v4) | |
tau_list.append(tau) | |
# Put tau in xtra dict of residue | |
res = ca_list[i + 2].get_parent() | |
res.xtra["TAU"] = tau | |
return tau_list | |
def get_theta_list(self): | |
"""List of theta angles for all 3 consecutive Calpha atoms.""" | |
theta_list = [] | |
ca_list = self.get_ca_list() | |
for i in range(0, len(ca_list) - 2): | |
atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2]) | |
v1, v2, v3 = (a.get_vector() for a in atom_list) | |
theta = calc_angle(v1, v2, v3) | |
theta_list.append(theta) | |
# Put tau in xtra dict of residue | |
res = ca_list[i + 1].get_parent() | |
res.xtra["THETA"] = theta | |
return theta_list | |
def get_sequence(self): | |
"""Return the AA sequence as a Seq object. | |
:return: polypeptide sequence | |
:rtype: L{Seq} | |
""" | |
s = "".join( | |
protein_letters_3to1_extended.get(res.get_resname(), "X") for res in self | |
) | |
return Seq(s) | |
def __repr__(self): | |
"""Return string representation of the polypeptide. | |
Return <Polypeptide start=START end=END>, where START | |
and END are sequence identifiers of the outer residues. | |
""" | |
start = self[0].get_id()[1] | |
end = self[-1].get_id()[1] | |
return f"<Polypeptide start={start} end={end}>" | |
class _PPBuilder: | |
"""Base class to extract polypeptides. | |
It checks if two consecutive residues in a chain are connected. | |
The connectivity test is implemented by a subclass. | |
This assumes you want both standard and non-standard amino acids. | |
""" | |
def __init__(self, radius): | |
"""Initialize the base class. | |
:param radius: distance | |
:type radius: float | |
""" | |
self.radius = radius | |
def _accept(self, residue, standard_aa_only): | |
"""Check if the residue is an amino acid (PRIVATE).""" | |
if is_aa(residue, standard=standard_aa_only): | |
return True | |
elif not standard_aa_only and "CA" in residue.child_dict: | |
# It has an alpha carbon... | |
# We probably need to update the hard coded list of | |
# non-standard residues, see function is_aa for details. | |
warnings.warn( | |
"Assuming residue %s is an unknown modified amino acid" | |
% residue.get_resname() | |
) | |
return True | |
else: | |
# not a standard AA so skip | |
return False | |
def build_peptides(self, entity, aa_only=1): | |
"""Build and return a list of Polypeptide objects. | |
:param entity: polypeptides are searched for in this object | |
:type entity: L{Structure}, L{Model} or L{Chain} | |
:param aa_only: if 1, the residue needs to be a standard AA | |
:type aa_only: int | |
""" | |
is_connected = self._is_connected | |
accept = self._accept | |
level = entity.get_level() | |
# Decide which entity we are dealing with | |
if level == "S": | |
model = entity[0] | |
chain_list = model.get_list() | |
elif level == "M": | |
chain_list = entity.get_list() | |
elif level == "C": | |
chain_list = [entity] | |
else: | |
raise PDBException("Entity should be Structure, Model or Chain.") | |
pp_list = [] | |
for chain in chain_list: | |
chain_it = iter(chain) | |
try: | |
prev_res = next(chain_it) | |
while not accept(prev_res, aa_only): | |
prev_res = next(chain_it) | |
except StopIteration: | |
# No interesting residues at all in this chain | |
continue | |
pp = None | |
for next_res in chain_it: | |
if ( | |
accept(prev_res, aa_only) | |
and accept(next_res, aa_only) | |
and is_connected(prev_res, next_res) | |
): | |
if pp is None: | |
pp = Polypeptide() | |
pp.append(prev_res) | |
pp_list.append(pp) | |
pp.append(next_res) | |
else: | |
# Either too far apart, or one of the residues is unwanted. | |
# End the current peptide | |
pp = None | |
prev_res = next_res | |
return pp_list | |
class CaPPBuilder(_PPBuilder): | |
"""Use CA--CA distance to find polypeptides.""" | |
def __init__(self, radius=4.3): | |
"""Initialize the class.""" | |
_PPBuilder.__init__(self, radius) | |
def _is_connected(self, prev_res, next_res): | |
for r in [prev_res, next_res]: | |
if not r.has_id("CA"): | |
return False | |
n = next_res["CA"] | |
p = prev_res["CA"] | |
# Unpack disordered | |
if n.is_disordered(): | |
nlist = n.disordered_get_list() | |
else: | |
nlist = [n] | |
if p.is_disordered(): | |
plist = p.disordered_get_list() | |
else: | |
plist = [p] | |
for nn in nlist: | |
for pp in plist: | |
if (nn - pp) < self.radius: | |
return True | |
return False | |
class PPBuilder(_PPBuilder): | |
"""Use C--N distance to find polypeptides.""" | |
def __init__(self, radius=1.8): | |
"""Initialize the class.""" | |
_PPBuilder.__init__(self, radius) | |
def _is_connected(self, prev_res, next_res): | |
if not prev_res.has_id("C"): | |
return False | |
if not next_res.has_id("N"): | |
return False | |
test_dist = self._test_dist | |
c = prev_res["C"] | |
n = next_res["N"] | |
# Test all disordered atom positions! | |
if c.is_disordered(): | |
clist = c.disordered_get_list() | |
else: | |
clist = [c] | |
if n.is_disordered(): | |
nlist = n.disordered_get_list() | |
else: | |
nlist = [n] | |
for nn in nlist: | |
for cc in clist: | |
# To form a peptide bond, N and C must be | |
# within radius and have the same altloc | |
# identifier or one altloc blank | |
n_altloc = nn.get_altloc() | |
c_altloc = cc.get_altloc() | |
if n_altloc == c_altloc or n_altloc == " " or c_altloc == " ": | |
if test_dist(nn, cc): | |
# Select the disordered atoms that | |
# are indeed bonded | |
if c.is_disordered(): | |
c.disordered_select(c_altloc) | |
if n.is_disordered(): | |
n.disordered_select(n_altloc) | |
return True | |
return False | |
def _test_dist(self, c, n): | |
"""Return 1 if distance between atoms<radius (PRIVATE).""" | |
if (c - n) < self.radius: | |
return 1 | |
else: | |
return 0 | |