jfaustin's picture
secretion-scores (#4)
a3f3d91 verified
# -*- coding: utf-8 -*-
"""Module to handle pdb files."""
import os
from . import logger
import re
import gzip
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from io import StringIO
import json
_name = "PDB"
class Pdb:
"""
Pdb parser. Initialized by:
1. pdb filename
2. gzipped pdb filename
3. 4-letter pdb code
"""
def __init__(self,*args, **kwargs):
self.file_name = None
self.pdb_code = None
self.dir = os.getcwd()
self.loc = os.path.join(self.dir, "input_pdb")
self.codification = {"ALA" : 'A', "CYS" : 'C', "ASP" : 'D', "GLU" : 'E', "PHE" : 'F', "GLY" : 'G', "HIS" : 'H',
"ILE" : 'I', "LYS" : 'K', "LEU" : 'L', "MET" : 'M', "MSE" : 'M', "ASN" : 'N', "PYL" : 'O',
"PRO" : 'P', "GLN" : 'Q', "ARG" : 'R', "SER" : 'S', "THR" : 'T', "SEC" : 'U', "VAL" : 'V',
"TRP" : 'W', "5HP" : 'E', "ABA" : 'A', "AIB" : 'A', "BMT" : 'T', "CEA" : 'C', "CGU" : 'E',
"CME" : 'C', "CRO" : 'X', "CSD" : 'C', "CSO" : 'C', "CSS" : 'C', "CSW" : 'C', "CSX" : 'C',
"CXM" : 'M', "DAL" : 'A', "DAR" : 'R', "DCY" : 'C', "DGL" : 'E', "DGN" : 'Q', "DHI" : 'H',
"DIL" : 'I', "DIV" : 'V', "DLE" : 'L', "DLY" : 'K', "DPN" : 'F', "DPR" : 'P', "DSG" : 'N',
"DSN" : 'S', "DSP" : 'D', "DTH" : 'T', "DTR" : 'X', "DTY" : 'Y', "DVA" : 'V', "FME" : 'M',
"HYP" : 'P', "KCX" : 'K', "LLP" : 'K', "MLE" : 'L', "MVA" : 'V', "NLE" : 'L', "OCS" : 'C',
"ORN" : 'A', "PCA" : 'E', "PTR" : 'Y', "SAR" : 'G', "SEP" : 'S', "STY" : 'Y', "TPO" : 'T',
"TPQ" : 'F', "TYS" : 'Y', "TYR" : 'Y' }
keys = list(self.codification.keys())
self.sequences = {}
self.onlycalfa = ""
self.allatoms = ""
self.chain = ""
self.canumber = 0
self.allnumber = 0
if args and len(args) == 1:
if args[0] is None: raise logger.AggrescanError("No pdb code/file provided. Quitting.",
module_name=_name)
if os.path.isfile(args[0]):
self.file_name = args[0]
else:
self.pdb_code = args[0]
if kwargs:
self.loc = kwargs['output']
try:
self.chain = kwargs['chain']
except KeyError:
pass
if self.file_name:
try:
self.handler = gzip.GzipFile(filename=self.file_name)
self.data = self.handler.readlines()
logger.debug(module_name=_name, msg="Reading %s" % os.path.abspath(self.file_name))
except IOError:
try:
self.handler = open(self.file_name)
self.data = self.handler.readlines()
logger.debug(module_name=_name, msg="Reading %s" % os.path.abspath(self.file_name))
except IOError:
raise logger.AggrescanError("Couldnt open specified filename %s. Quitting.' % os.path.abspath(self.file_name)",
module_name=_name)
elif self.pdb_code:
self.handler = self.download_pdb()
self.data = self.handler.readlines()
seq = re.compile(r"^ATOM.{9}CA..(?P<seqid>.{3}).(?P<chain>.{1})(?P<resid>.{4})") # TODO zle dla alternatywnych
if self.chain != '':
atm = re.compile(r"^ATOM.{9}(.{2}).( |A).{4}" + self.chain + "(?P<resid>.{4})(?P<x>.{12})(?P<y>.{8})(?P<z>.{8})")
else:
atm = re.compile(r"^ATOM.{9}(.{2}).( |A).{5}(?P<resid>.{4})(?P<x>.{12})(?P<y>.{8})(?P<z>.{8})")
ter = re.compile(r'^END|^TER')
mod = re.compile(r"^ENDMDL")
self.trajectory = []
self.sequence = ""
lines = self.data
end = len(lines) - 1
counter = 0
self._chainsOrder(lines)
self._resIndexes(lines)
self.mutatedata = {}
for line in lines:
line = re.sub(r'^HETATM(.{11})MSE(.*$)', r'ATOM \1MET\2', line)
localData = atm.match(line)
data_seq = seq.match(line)
if data_seq:
seqid = data_seq.groups()[0].strip()
chainid = data_seq.groups()[1].strip()
resid = data_seq.groups()[2].strip()
if seqid in keys:
s = self.codification[seqid]
else:
s = "X"
self.sequence += s
# add to mutate page
if chainid in list(self.mutatedata.keys()):
self.mutatedata[chainid].append({'chain': chainid,
'resname': s,
'residx': resid})
else:
self.mutatedata[chainid] = [{'chain': chainid,
'resname': s,
'residx': resid}]
if chainid in list(self.sequences.keys()):
self.sequences[chainid] += s
else:
self.sequences[chainid] = s
if localData:
self.allnumber += 1
self.allatoms += line
dg = localData.groups()
if dg[0] == 'CA':
self.onlycalfa += line
self.canumber += 1
if counter == end:
self.onlycalfa += line
self.allatoms += line
if ter.match(line):
if self.chain:
if line[21] == self.chain:
self.onlycalfa += line
self.allatoms += line
else:
self.onlycalfa += line
self.allatoms += line
if (mod.match(line) and len(self.onlycalfa) > 1) or counter == end:
break
counter += 1
self.handler.close()
def _resIndexes(self, body):
atm = re.compile(r"^ATOM.{9}CA..(?P<seqid>.{3}).(?P<chain>.{1})(?P<resid>.{4})")
ter = re.compile(r'^END|^TER')
mod = re.compile(r"^ENDMDL")
self.numb = {}
for chain in self.chains_order:
self.numb[chain] = []
for line in body:
d = atm.match(line)
if d:
self.numb[d.group('chain').strip()].append(int(d.group('resid')))
if mod.match(line):
break
def _chainsOrder(self, body):
atm = re.compile(r"^ATOM.{9}CA..(?P<seqid>.{3}).(?P<chain>.{1})(?P<resid>.{4})")
self.chains_order = []
for line in body:
d = atm.match(line)
if d and d.group('chain') not in self.chains_order:
self.chains_order.append(d.group('chain'))
def isSingleChain(self):
if self.chain != '' or len(list(self.sequences.keys())) == 1:
return True
else:
return False
def containsOnlyCA(self):
if self.allnumber == self.canumber:
return True
else:
return False
def isBroken(self):
brk = []
if self.chain != '':
indexes = self.numb[self.chain]
first = indexes[0]
for i in range(1, len(indexes)):
if indexes[i] - 1 != first:
brk.append(str(first) + "-" + str(indexes[i]))
first = indexes[i]
else:
for chain in list(self.sequences.keys()):
indexes = self.numb[chain]
first = indexes[0]
for i in range(1, len(indexes)):
if indexes[i] - 1 != first:
brk.append(str(first) + "-" + str(indexes[i]))
first = indexes[i]
if len(brk) > 0:
return ", ".join(brk)
return False
def getResIndexes(self):
t = [str(i) for i in self.numb[self.chain]]
return ",".join(t)
def getBody(self):
return self.allatoms
def containsChain(self, chain):
if chain in list(self.sequences.keys()):
return True
def getSequenceNoHTML(self):
if self.chain != '':
return self.sequences[self.chain]
else:
out = ""
for k in list(self.sequences.keys()):
out += "".join(self.sequences[k])
return out
def getSequence(self):
if self.chain != '':
return "<strong>" + self.chain + "</strong>: " + self.sequences[self.chain]
else:
out = ""
for k in list(self.sequences.keys()):
out += "<strong>" + k + "</strong>: "
out += "".join(self.sequences[k])
out += "<br>"
return out
def getChainIdxResname(self):
if self.chain == '':
return json.dumps(self.mutatedata)
else:
return json.dumps({self.chain: self.mutatedata[self.chain]})
def savePdbFile(self,path=''):
if path:
logger.to_file(filename=path, content=self.allatoms, allow_err=True)
else:
logger.to_file(filename=self.loc, content=self.allatoms, allow_err=True)
def getPath(self):
if os.path.isfile(self.loc):
return self.loc
else:
raise logger.AggrescanError("Location for pdb file requested at: %s. The file was not found." % self.loc,
module_name=_name)
def download_pdb(self):
try:
gz_string = urlopen('http://www.rcsb.org/pdb/files/' + self.pdb_code.lower() + '.pdb.gz').read()
except HTTPError as e:
raise logger.AggrescanError("Could not download the pdb file. %s is not a valid pdb code/file. " % self.pdb_code,
module_name=_name)
except URLError as e:
raise logger.AggrescanError("Could not download the pdb file. Can't connect to the PDB database - quitting",
module_name=_name)
fileLike = StringIO(gz_string)
logger.debug(module_name=_name, msg="Successfully downloaded %s" % self.pdb_code.lower() + '.pdb.gz')
return gzip.GzipFile(fileobj=fileLike,mode="rb")
def validate(self):
logger.debug(module_name=_name,msg='Validating pdb file: %s' % self.loc)
if self.chain != '' and not self.containsChain(self.chain):
raise logger.AggrescanError("Selected chain: %s not found in the pdb file. Quitting." % self.chain,
module_name=_name)
seq = self.getSequence()
seq = re.sub("<strong>\w+</strong>:", "", seq)
seq = re.sub("<br>", "", seq)
seq = seq.replace(" ", "")
allowed_seq = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'Y']
if len(seq) < 4:
raise logger.AggrescanError("Sequence too short (perhaps something went wrong with pdb parsing).",
module_name=_name)
for e in seq:
if e not in allowed_seq:
raise logger.AggrescanError("Not supported amino acid: %s found in pdb file. Quitting." % e,
module_name=_name)
if __name__ == '__main__':
pass