aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright (C) 2002, Thomas Hamelryck ([email protected])
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Polypeptide-related classes (construction and representation).
Simple example with multiple chains,
>>> from Bio.PDB.PDBParser import PDBParser
>>> from Bio.PDB.Polypeptide import PPBuilder
>>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb')
>>> ppb=PPBuilder()
>>> for pp in ppb.build_peptides(structure):
... print(pp.get_sequence())
LVFFAEDVGSNKGAIIGLMVGGVVIA
LVFFAEDVGSNKGAIIGLMVGGVVIA
LVFFAEDVGSNKGAIIGLMVGGVVIA
LVFFAEDVGSNKGAIIGLMVGGVVIA
LVFFAEDVGSNKGAIIGLMVGGVVIA
Example with non-standard amino acids using HETATM lines in the PDB file,
in this case selenomethionine (MSE):
>>> from Bio.PDB.PDBParser import PDBParser
>>> from Bio.PDB.Polypeptide import PPBuilder
>>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb')
>>> ppb=PPBuilder()
>>> for pp in ppb.build_peptides(structure):
... print(pp.get_sequence())
DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW
TETLLVQNANPDCKTILKALGPGATLEE
TACQG
If you want to, you can include non-standard amino acids in the peptides:
>>> for pp in ppb.build_peptides(structure, aa_only=False):
... print(pp.get_sequence())
... print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname()))
... print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname()))
... print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname()))
MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG
M MSE
M MSE
M MSE
In this case the selenomethionines (the first and also seventh and sixth from
last residues) have been shown as M (methionine) by the get_sequence method.
"""
import warnings
from Bio import BiopythonDeprecationWarning
from Bio.Data.PDBData import nucleic_letters_3to1
from Bio.Data.PDBData import nucleic_letters_3to1_extended
from Bio.Data.PDBData import protein_letters_3to1
from Bio.Data.PDBData import protein_letters_3to1_extended
from Bio.PDB.PDBExceptions import PDBException
from Bio.PDB.vectors import calc_dihedral, calc_angle
from Bio.Seq import Seq
# Sorted by 1-letter code
aa3, aa1 = zip(*sorted(protein_letters_3to1.items(), key=lambda x: x[1]))
standard_aa_names = aa3
d1_to_index = {}
dindex_to_1 = {}
d3_to_index = {}
dindex_to_3 = {}
# Create some lookup tables
for i in range(0, 20):
n1 = aa1[i]
n3 = aa3[i]
d1_to_index[n1] = i
dindex_to_1[i] = n1
d3_to_index[n3] = i
dindex_to_3[i] = n3
def index_to_one(index):
"""Index to corresponding one letter amino acid name.
>>> index_to_one(0)
'A'
>>> index_to_one(19)
'Y'
"""
return dindex_to_1[index]
def one_to_index(s):
"""One letter code to index.
>>> one_to_index('A')
0
>>> one_to_index('Y')
19
"""
return d1_to_index[s]
def index_to_three(i):
"""Index to corresponding three letter amino acid name.
>>> index_to_three(0)
'ALA'
>>> index_to_three(19)
'TYR'
"""
return dindex_to_3[i]
def three_to_index(s):
"""Three letter code to index.
>>> three_to_index('ALA')
0
>>> three_to_index('TYR')
19
"""
return d3_to_index[s]
def three_to_one(s):
"""Three letter code to one letter code.
>>> three_to_one('ALA')
'A'
>>> three_to_one('TYR')
'Y'
For non-standard amino acids, you get a KeyError:
>>> three_to_one('MSE')
Traceback (most recent call last):
...
KeyError: 'MSE'
"""
warnings.warn(
"'three_to_one' will be deprecated in a future release of Biopython "
"in favor of 'Bio.PDB.Polypeptide.protein_letters_3to1'.",
BiopythonDeprecationWarning,
)
i = d3_to_index[s]
return dindex_to_1[i]
def one_to_three(s):
"""One letter code to three letter code.
>>> one_to_three('A')
'ALA'
>>> one_to_three('Y')
'TYR'
"""
warnings.warn(
"'one_to_three' will be deprecated in a future release of Biopython "
"in favor of 'Bio.PDB.Polypeptide.protein_letters_1to3'.",
BiopythonDeprecationWarning,
)
i = d1_to_index[s]
return dindex_to_3[i]
def is_aa(residue, standard=False):
"""Return True if residue object/string is an amino acid.
:param residue: a L{Residue} object OR a three letter amino acid code
:type residue: L{Residue} or string
:param standard: flag to check for the 20 AA (default false)
:type standard: boolean
>>> is_aa('ALA')
True
Known three letter codes for modified amino acids are supported,
>>> is_aa('FME')
True
>>> is_aa('FME', standard=True)
False
"""
if not isinstance(residue, str):
residue = f"{residue.get_resname():<3s}"
residue = residue.upper()
if standard:
return residue in protein_letters_3to1
else:
return residue in protein_letters_3to1_extended
def is_nucleic(residue, standard=False):
"""Return True if residue object/string is a nucleic acid.
:param residue: a L{Residue} object OR a three letter code
:type residue: L{Residue} or string
:param standard: flag to check for the 8 (DNA + RNA) canonical bases.
Default is False.
:type standard: boolean
>>> is_nucleic('DA ')
True
>>> is_nucleic('A ')
True
Known three letter codes for modified nucleotides are supported,
>>> is_nucleic('A2L')
True
>>> is_nucleic('A2L', standard=True)
False
"""
if not isinstance(residue, str):
residue = f"{residue.get_resname():<3s}"
residue = residue.upper()
if standard:
return residue in nucleic_letters_3to1
else:
return residue in nucleic_letters_3to1_extended
class Polypeptide(list):
"""A polypeptide is simply a list of L{Residue} objects."""
def get_ca_list(self):
"""Get list of C-alpha atoms in the polypeptide.
:return: the list of C-alpha atoms
:rtype: [L{Atom}, L{Atom}, ...]
"""
ca_list = []
for res in self:
ca = res["CA"]
ca_list.append(ca)
return ca_list
def get_phi_psi_list(self):
"""Return the list of phi/psi dihedral angles."""
ppl = []
lng = len(self)
for i in range(0, lng):
res = self[i]
try:
n = res["N"].get_vector()
ca = res["CA"].get_vector()
c = res["C"].get_vector()
except Exception:
# Some atoms are missing
# Phi/Psi cannot be calculated for this residue
ppl.append((None, None))
res.xtra["PHI"] = None
res.xtra["PSI"] = None
continue
# Phi
if i > 0:
rp = self[i - 1]
try:
cp = rp["C"].get_vector()
phi = calc_dihedral(cp, n, ca, c)
except Exception:
phi = None
else:
# No phi for residue 0!
phi = None
# Psi
if i < (lng - 1):
rn = self[i + 1]
try:
nn = rn["N"].get_vector()
psi = calc_dihedral(n, ca, c, nn)
except Exception:
psi = None
else:
# No psi for last residue!
psi = None
ppl.append((phi, psi))
# Add Phi/Psi to xtra dict of residue
res.xtra["PHI"] = phi
res.xtra["PSI"] = psi
return ppl
def get_tau_list(self):
"""List of tau torsions angles for all 4 consecutive Calpha atoms."""
ca_list = self.get_ca_list()
tau_list = []
for i in range(0, len(ca_list) - 3):
atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2], ca_list[i + 3])
v1, v2, v3, v4 = (a.get_vector() for a in atom_list)
tau = calc_dihedral(v1, v2, v3, v4)
tau_list.append(tau)
# Put tau in xtra dict of residue
res = ca_list[i + 2].get_parent()
res.xtra["TAU"] = tau
return tau_list
def get_theta_list(self):
"""List of theta angles for all 3 consecutive Calpha atoms."""
theta_list = []
ca_list = self.get_ca_list()
for i in range(0, len(ca_list) - 2):
atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2])
v1, v2, v3 = (a.get_vector() for a in atom_list)
theta = calc_angle(v1, v2, v3)
theta_list.append(theta)
# Put tau in xtra dict of residue
res = ca_list[i + 1].get_parent()
res.xtra["THETA"] = theta
return theta_list
def get_sequence(self):
"""Return the AA sequence as a Seq object.
:return: polypeptide sequence
:rtype: L{Seq}
"""
s = "".join(
protein_letters_3to1_extended.get(res.get_resname(), "X") for res in self
)
return Seq(s)
def __repr__(self):
"""Return string representation of the polypeptide.
Return <Polypeptide start=START end=END>, where START
and END are sequence identifiers of the outer residues.
"""
start = self[0].get_id()[1]
end = self[-1].get_id()[1]
return f"<Polypeptide start={start} end={end}>"
class _PPBuilder:
"""Base class to extract polypeptides.
It checks if two consecutive residues in a chain are connected.
The connectivity test is implemented by a subclass.
This assumes you want both standard and non-standard amino acids.
"""
def __init__(self, radius):
"""Initialize the base class.
:param radius: distance
:type radius: float
"""
self.radius = radius
def _accept(self, residue, standard_aa_only):
"""Check if the residue is an amino acid (PRIVATE)."""
if is_aa(residue, standard=standard_aa_only):
return True
elif not standard_aa_only and "CA" in residue.child_dict:
# It has an alpha carbon...
# We probably need to update the hard coded list of
# non-standard residues, see function is_aa for details.
warnings.warn(
"Assuming residue %s is an unknown modified amino acid"
% residue.get_resname()
)
return True
else:
# not a standard AA so skip
return False
def build_peptides(self, entity, aa_only=1):
"""Build and return a list of Polypeptide objects.
:param entity: polypeptides are searched for in this object
:type entity: L{Structure}, L{Model} or L{Chain}
:param aa_only: if 1, the residue needs to be a standard AA
:type aa_only: int
"""
is_connected = self._is_connected
accept = self._accept
level = entity.get_level()
# Decide which entity we are dealing with
if level == "S":
model = entity[0]
chain_list = model.get_list()
elif level == "M":
chain_list = entity.get_list()
elif level == "C":
chain_list = [entity]
else:
raise PDBException("Entity should be Structure, Model or Chain.")
pp_list = []
for chain in chain_list:
chain_it = iter(chain)
try:
prev_res = next(chain_it)
while not accept(prev_res, aa_only):
prev_res = next(chain_it)
except StopIteration:
# No interesting residues at all in this chain
continue
pp = None
for next_res in chain_it:
if (
accept(prev_res, aa_only)
and accept(next_res, aa_only)
and is_connected(prev_res, next_res)
):
if pp is None:
pp = Polypeptide()
pp.append(prev_res)
pp_list.append(pp)
pp.append(next_res)
else:
# Either too far apart, or one of the residues is unwanted.
# End the current peptide
pp = None
prev_res = next_res
return pp_list
class CaPPBuilder(_PPBuilder):
"""Use CA--CA distance to find polypeptides."""
def __init__(self, radius=4.3):
"""Initialize the class."""
_PPBuilder.__init__(self, radius)
def _is_connected(self, prev_res, next_res):
for r in [prev_res, next_res]:
if not r.has_id("CA"):
return False
n = next_res["CA"]
p = prev_res["CA"]
# Unpack disordered
if n.is_disordered():
nlist = n.disordered_get_list()
else:
nlist = [n]
if p.is_disordered():
plist = p.disordered_get_list()
else:
plist = [p]
for nn in nlist:
for pp in plist:
if (nn - pp) < self.radius:
return True
return False
class PPBuilder(_PPBuilder):
"""Use C--N distance to find polypeptides."""
def __init__(self, radius=1.8):
"""Initialize the class."""
_PPBuilder.__init__(self, radius)
def _is_connected(self, prev_res, next_res):
if not prev_res.has_id("C"):
return False
if not next_res.has_id("N"):
return False
test_dist = self._test_dist
c = prev_res["C"]
n = next_res["N"]
# Test all disordered atom positions!
if c.is_disordered():
clist = c.disordered_get_list()
else:
clist = [c]
if n.is_disordered():
nlist = n.disordered_get_list()
else:
nlist = [n]
for nn in nlist:
for cc in clist:
# To form a peptide bond, N and C must be
# within radius and have the same altloc
# identifier or one altloc blank
n_altloc = nn.get_altloc()
c_altloc = cc.get_altloc()
if n_altloc == c_altloc or n_altloc == " " or c_altloc == " ":
if test_dist(nn, cc):
# Select the disordered atoms that
# are indeed bonded
if c.is_disordered():
c.disordered_select(c_altloc)
if n.is_disordered():
n.disordered_select(n_altloc)
return True
return False
def _test_dist(self, c, n):
"""Return 1 if distance between atoms<radius (PRIVATE)."""
if (c - n) < self.radius:
return 1
else:
return 0