Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /PDB /parse_pdb_header.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

11.7 kB

	#!/usr/bin/env python
	# Copyright 2004 Kristian Rother.
	# Revisions copyright 2004 Thomas Hamelryck.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.

	"""Parse header of PDB files into a python dictionary.

	Emerged from the Columba database project www.columba-db.de, original author
	Kristian Rother.
	"""


	import re

	from Bio import File


	def _get_journal(inl):
	# JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7
	journal = ""
	for line in inl:
	if re.search(r"\AJRNL", line):
	journal += line[19:72].lower()
	journal = re.sub(r"\s\s+", " ", journal)
	return journal


	def _get_references(inl):
	# REMARK 1 REFERENCE 1 1CSE 11
	# REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12
	references = []
	actref = ""
	for line in inl:
	if re.search(r"\AREMARK 1", line):
	if re.search(r"\AREMARK 1 REFERENCE", line):
	if actref != "":
	actref = re.sub(r"\s\s+", " ", actref)
	if actref != " ":
	references.append(actref)
	actref = ""
	else:
	actref += line[19:72].lower()

	if actref != "":
	actref = re.sub(r"\s\s+", " ", actref)
	if actref != " ":
	references.append(actref)
	return references


	# bring dates to format: 1909-01-08
	def _format_date(pdb_date):
	"""Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE)."""
	date = ""
	year = int(pdb_date[7:])
	if year < 50:
	century = 2000
	else:
	century = 1900
	date = str(century + year) + "-"
	all_months = [
	"xxx",
	"Jan",
	"Feb",
	"Mar",
	"Apr",
	"May",
	"Jun",
	"Jul",
	"Aug",
	"Sep",
	"Oct",
	"Nov",
	"Dec",
	]
	month = str(all_months.index(pdb_date[3:6]))
	if len(month) == 1:
	month = "0" + month
	date = date + month + "-" + pdb_date[:2]
	return date


	def _chop_end_codes(line):
	"""Chops lines ending with ' 1CSA 14' and the like (PRIVATE)."""
	return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)


	def _chop_end_misc(line):
	"""Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE)."""
	return re.sub(r"\s+\d\d-\w\w\w-\d\d\s+[1-9][0-9A-Z]{3}\s*\Z", "", line)


	def _nice_case(line):
	"""Make A Lowercase String With Capitals (PRIVATE)."""
	line_lower = line.lower()
	s = ""
	i = 0
	nextCap = 1
	while i < len(line_lower):
	c = line_lower[i]
	if c >= "a" and c <= "z" and nextCap:
	c = c.upper()
	nextCap = 0
	elif c in " .,;:\t-_":
	nextCap = 1
	s += c
	i += 1
	return s


	def parse_pdb_header(infile):
	    """Return the header lines of a pdb file as a dictionary.

	    ``infile`` is handed to ``File.as_handle`` (presumably a file name or
	    an already open handle - confirm against Bio.File). Lines are read
	    until the first ATOM, HETATM or MODEL record; everything before that
	    is passed to ``_parse_pdb_header_list``.

	    Dictionary keys are: head, deposition_date, release_date, structure_method,
	    resolution, structure_reference, journal_reference, author and
	    compound.
	    """
	    # Record names sit in the first six characters, padded with spaces.
	    # Comparing the stripped name avoids depending on the exact padding
	    # (a literal such as "ATOM " can never equal a six-character slice).
	    coordinate_records = ("ATOM", "HETATM", "MODEL")
	    header = []
	    with File.as_handle(infile) as f:
	        for line in f:
	            if line[0:6].rstrip() in coordinate_records:
	                break
	            header.append(line)
	    return _parse_pdb_header_list(header)


	def _parse_remark_465(line):
	"""Parse missing residue remarks.

	Returns a dictionary describing the missing residue.
	The specification for REMARK 465 at
	http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465
	only gives templates, but does not say they have to be followed.
	So we assume that not all pdb-files with a REMARK 465 can be understood.

	Returns a dictionary with the following keys:
	"model", "res_name", "chain", "ssseq", "insertion"
	"""
	if line:
	# Note that line has been stripped.
	assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped"
	pattern = re.compile(
	r"""
	(\d+\s[\sA-Z][\sA-Z][A-Z] \| # Either model number + residue name
	[A-Z]{1,3}) # Or only residue name with 1 (RNA) to 3 letters
	\s ([A-Za-z0-9]) # A single character chain
	\s+(-?\d+[A-Za-z]?)$ # Residue number: A digit followed by an optional
	# insertion code (Hetero-flags make no sense in
	# context with missing res)
	""",
	re.VERBOSE,
	)
	match = pattern.match(line)
	if match is None:
	return None
	residue = {}
	if " " in match.group(1):
	model, residue["res_name"] = match.group(1).split()
	residue["model"] = int(model)
	else:
	residue["model"] = None
	residue["res_name"] = match.group(1)
	residue["chain"] = match.group(2)
	try:
	residue["ssseq"] = int(match.group(3))
	except ValueError:
	residue["insertion"] = match.group(3)[-1]
	residue["ssseq"] = int(match.group(3)[:-1])
	else:
	residue["insertion"] = None
	return residue


	def _parse_pdb_header_list(header):
	    """Build the header dictionary from a list of PDB header lines (PRIVATE).

	    ``header`` is a sequence of raw lines (it is iterated several times).
	    The returned dictionary always contains the keys "name", "head",
	    "idcode", "deposition_date", "release_date", "structure_method",
	    "resolution", "structure_reference", "journal_reference", "author",
	    "compound", "source", "has_missing_residues" and "missing_residues".
	    The keys "keywords", "journal" and "astral" are only added when the
	    corresponding records (KEYWDS, JRNL, REMARK 99 ASTRAL) occur.

	    "compound" and "source" map a molecule id to a dictionary of the
	    ``key: value`` tokens found in COMPND / SOURCE records; text without
	    a key is appended to the most recently seen key (initially "misc").
	    """
	    # database fields
	    pdbh_dict = {
	        "name": "",
	        "head": "",
	        "idcode": "",
	        "deposition_date": "1909-01-08",
	        "release_date": "1909-01-08",
	        "structure_method": "unknown",
	        "resolution": None,
	        "structure_reference": "unknown",
	        "journal_reference": "unknown",
	        "author": "",
	        "compound": {"1": {"misc": ""}},
	        "source": {"1": {"misc": ""}},
	        "has_missing_residues": False,
	        "missing_residues": [],
	    }

	    pdbh_dict["structure_reference"] = _get_references(header)
	    pdbh_dict["journal_reference"] = _get_journal(header)

	    # COMPND and SOURCE each track their own current molecule id, so a
	    # SOURCE section without MOL_ID tokens never indexes a molecule that
	    # only exists in the "compound" dictionary.
	    comp_molid = "1"
	    src_molid = "1"
	    last_comp_key = "misc"
	    last_src_key = "misc"

	    # Remark numbers are right-justified in a fixed-width field, so the
	    # padding after "REMARK" must not be hard-coded.
	    resolution_tag = re.compile(r"REMARK\s+2\s+RESOLUTION.")
	    astral_tag = re.compile(r"REMARK\s+99\s+ASTRAL")

	    for hh in header:
	        h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
	        key = h[:6].strip()
	        tail = h[10:].strip()

	        # From here, all the keys from the header are being parsed.
	        # Record types not listed (including CAVEAT) are ignored.
	        if key == "TITLE":
	            name = _chop_end_codes(tail).lower()
	            pdbh_dict["name"] = " ".join([pdbh_dict["name"], name]).strip()
	        elif key == "HEADER":
	            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
	            if rr is not None:
	                pdbh_dict["deposition_date"] = _format_date(_nice_case(rr.group()))
	            rr = re.search(r"\s+([1-9][0-9A-Z]{3})\s*\Z", tail)
	            if rr is not None:
	                pdbh_dict["idcode"] = rr.group(1)
	            pdbh_dict["head"] = _chop_end_misc(tail).lower()
	        elif key == "COMPND":
	            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
	            # look for E.C. numbers in COMPND lines
	            rec = re.search(r"\d+\.\d+\.\d+\.\d+", tt)
	            if rec:
	                pdbh_dict["compound"][comp_molid]["ec_number"] = rec.group()
	                # Drop the parenthesised "(e.c.1.2.3.4)" form from the text.
	                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
	            tok = tt.split(":")
	            if len(tok) >= 2:
	                ckey = tok[0]
	                cval = re.sub(r"\A\s*", "", tok[1])
	                if ckey == "mol_id":
	                    # A new molecule starts; later tokens belong to it.
	                    pdbh_dict["compound"][cval] = {"misc": ""}
	                    comp_molid = cval
	                    last_comp_key = "misc"
	                else:
	                    pdbh_dict["compound"][comp_molid][ckey] = cval
	                    last_comp_key = ckey
	            else:
	                # Continuation text without a key of its own.
	                pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0] + " "
	        elif key == "SOURCE":
	            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
	            tok = tt.split(":")
	            if len(tok) >= 2:
	                ckey = tok[0]
	                cval = re.sub(r"\A\s*", "", tok[1])
	                if ckey == "mol_id":
	                    pdbh_dict["source"][cval] = {"misc": ""}
	                    src_molid = cval
	                    last_src_key = "misc"
	                else:
	                    pdbh_dict["source"][src_molid][ckey] = cval
	                    last_src_key = ckey
	            else:
	                pdbh_dict["source"][src_molid][last_src_key] += tok[0] + " "
	        elif key == "KEYWDS":
	            kwd = _chop_end_codes(tail).lower()
	            if "keywords" in pdbh_dict:
	                pdbh_dict["keywords"] += " " + kwd
	            else:
	                pdbh_dict["keywords"] = kwd
	        elif key == "EXPDTA":
	            expd = _chop_end_codes(tail)
	            # chop junk at end of lines for some structures
	            expd = re.sub(r"\s\s\s\s\s\s\s.*\Z", "", expd)
	            pdbh_dict["structure_method"] = expd.lower()
	        elif key == "REVDAT":
	            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
	            if rr is not None:
	                pdbh_dict["release_date"] = _format_date(_nice_case(rr.group()))
	        elif key == "JRNL":
	            if "journal" in pdbh_dict:
	                pdbh_dict["journal"] += tail
	            else:
	                pdbh_dict["journal"] = tail
	        elif key == "AUTHOR":
	            # "author" is pre-initialised to "", so it can always be extended.
	            pdbh_dict["author"] += _nice_case(_chop_end_codes(tail))
	        elif key == "REMARK":
	            if resolution_tag.search(hh):
	                r = _chop_end_codes(resolution_tag.sub("", hh))
	                r = re.sub(r"\s+ANGSTROM.*", "", r)
	                try:
	                    pdbh_dict["resolution"] = float(r)
	                except ValueError:
	                    # Non-numeric resolution text: leave it undefined.
	                    pdbh_dict["resolution"] = None
	            elif hh.startswith("REMARK 465"):
	                if tail:
	                    pdbh_dict["has_missing_residues"] = True
	                    missing_res_info = _parse_remark_465(tail)
	                    if missing_res_info:
	                        pdbh_dict["missing_residues"].append(missing_res_info)
	            elif astral_tag.match(hh):
	                if tail:
	                    remark_99_keyval = tail.replace("ASTRAL ", "").split(": ")
	                    if len(remark_99_keyval) == 2:
	                        astral_key, astral_value = remark_99_keyval
	                        pdbh_dict.setdefault("astral", {})[astral_key] = astral_value

	    # Without an EXPDTA record, a positive resolution is taken to mean
	    # the structure comes from X-ray diffraction.
	    if pdbh_dict["structure_method"] == "unknown":
	        res = pdbh_dict["resolution"]
	        if res is not None and res > 0.0:
	            pdbh_dict["structure_method"] = "x-ray diffraction"
	    return pdbh_dict