Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /PDB /mmcifio.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

15.4 kB

	# Copyright 2017 Joe Greener. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.

	"""Write an mmCIF file.

	See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax.
	"""

	import re
	from collections import defaultdict

	from Bio.PDB.StructureBuilder import StructureBuilder
	from Bio.PDB.PDBIO import Select, StructureIO

	# Preferred item order for specific mmCIF categories. Categories that are
	# not listed here are written in the order their keys occur.
	mmcif_order = {
	    "_atom_site": [
	        # Record type and atom identity
	        "group_PDB", "id", "type_symbol",
	        # "label" identifiers
	        "label_atom_id", "label_alt_id", "label_comp_id",
	        "label_asym_id", "label_entity_id", "label_seq_id",
	        "pdbx_PDB_ins_code",
	        # Coordinates and per-atom properties
	        "Cartn_x", "Cartn_y", "Cartn_z",
	        "occupancy", "B_iso_or_equiv", "pdbx_formal_charge",
	        # "auth" identifiers and model number
	        "auth_seq_id", "auth_comp_id", "auth_asym_id", "auth_atom_id",
	        "pdbx_PDB_model_num",
	    ],
	}


	# Module-level Select instance used as the default ``select`` argument of
	# MMCIFIO.save when the caller does not pass one.
	_select = Select()


	class MMCIFIO(StructureIO):
	    """Write a Structure object or a mmCIF dictionary as a mmCIF file.

	    Examples
	    --------
	    >>> from Bio.PDB import MMCIFParser
	    >>> from Bio.PDB.mmcifio import MMCIFIO
	    >>> parser = MMCIFParser()
	    >>> structure = parser.get_structure("1a8o", "PDB/1A8O.cif")
	    >>> io=MMCIFIO()
	    >>> io.set_structure(structure)
	    >>> io.save("bio-pdb-mmcifio-out.cif")
	    >>> import os
	    >>> os.remove("bio-pdb-mmcifio-out.cif") # tidy up

	    """

	    def __init__(self):
	        """Initialise."""

	    def set_dict(self, dic):
	        """Set the mmCIF dictionary to be written out.

	        Any structure set earlier is removed, because ``save`` gives a
	        structure priority over a dictionary.
	        """
	        self.dic = dic
	        # Remove self.structure if it has been set
	        if hasattr(self, "structure"):
	            delattr(self, "structure")

	    def save(self, filepath, select=_select, preserve_atom_numbering=False):
	        """Save the structure or the mmCIF dictionary to a file.

	        :param filepath: output file
	        :type filepath: string or filehandle

	        :param select: selects which entities will be written.
	        :type select: object

	        :param preserve_atom_numbering: if True, each atom is written with
	            its own serial number; otherwise atoms are renumbered from 1.
	            Only used when a structure is written.
	        :type preserve_atom_numbering: bool

	        Typically select is a subclass of L{Select}, it should
	        have the following methods:

	         - accept_model(model)
	         - accept_chain(chain)
	         - accept_residue(residue)
	         - accept_atom(atom)

	        These methods should return 1 if the entity is to be
	        written out, 0 otherwise.

	        :raises ValueError: if neither a structure nor a dictionary is set.
	        """
	        # Decide whether to save a Structure object or an mmCIF dictionary.
	        # This is checked before opening anything so that a misconfigured
	        # writer does not create or truncate the target file.
	        has_structure = hasattr(self, "structure")
	        if not has_structure and not hasattr(self, "dic"):
	            raise ValueError(
	                "Use set_structure or set_dict to set a structure or dictionary to write out"
	            )
	        # Similar to the PDBIO save method, we check if the filepath is a
	        # string for a filepath or an open file handle
	        if isinstance(filepath, str):
	            fp = open(filepath, "w")
	            close_file = True
	        else:
	            fp = filepath
	            close_file = False
	        try:
	            if has_structure:
	                self._save_structure(fp, select, preserve_atom_numbering)
	            else:
	                self._save_dict(fp)
	        finally:
	            # Only close handles opened here, even if writing failed
	            if close_file:
	                fp.close()

	    def _save_dict(self, out_file):
	        """Write ``self.dic`` to ``out_file`` in mmCIF syntax.

	        Apart from the optional ``"data_"`` entry, every key must have the
	        form ``category.item``. Within a category the values must be all
	        strings or all equally sized lists.

	        :raises ValueError: on a key that does not split into exactly two
	            parts, on inconsistent list sizes within a category, or on a
	            value that is neither a string nor a list.
	        """
	        # Form dictionary where key is first part of mmCIF key and value is
	        # list of corresponding second parts
	        data_val = None  # the "data_" entry is optional
	        key_lists = {}
	        for key in self.dic:
	            if key == "data_":
	                data_val = self.dic[key]
	                continue
	            parts = key.split(".")
	            if len(parts) != 2:
	                raise ValueError("Invalid key in mmCIF dictionary: " + key)
	            key_lists.setdefault(parts[0], []).append(parts[1])

	        # Re-order lists if an order has been specified
	        # Not all elements from the specified order are necessarily present
	        for category, items in key_lists.items():
	            if category not in mmcif_order:
	                continue
	            order = mmcif_order[category]
	            positions = {}
	            for position, name in enumerate(order):
	                positions.setdefault(name, position)
	            # Unrecognised items sort after all known ones, by name
	            key_lists[category] = sorted(
	                items, key=lambda item: (positions.get(item, len(order)), item)
	            )

	        # Write out top data_ line
	        if data_val:
	            out_file.write("data_" + data_val + "\n#\n")

	        for category, items in key_lists.items():
	            values = [self.dic[category + "." + item] for item in items]
	            # Pick a sample mmCIF value, which can be a list or a single value
	            sample_val = values[0]
	            if not isinstance(sample_val, (str, list)):
	                raise ValueError(
	                    "Invalid type in mmCIF dictionary: " + str(type(sample_val))
	                )
	            sample_is_list = isinstance(sample_val, list)
	            # Check the mmCIF dictionary has consistent list sizes
	            for item, val in zip(items, values):
	                if sample_is_list:
	                    consistent = not isinstance(val, str) and len(val) == len(
	                        sample_val
	                    )
	                else:
	                    consistent = not isinstance(val, list)
	                if not consistent:
	                    raise ValueError(
	                        "Inconsistent list sizes in mmCIF dictionary: "
	                        + category
	                        + "."
	                        + item
	                    )
	            if not sample_is_list or len(sample_val) == 1:
	                # Single values are written as aligned key-value pairs
	                key_width = len(category) + max(len(item) for item in items) + 4
	                for item, val in zip(items, values):
	                    # If the value is a single item list, just take the value
	                    single = val[0] if sample_is_list else val
	                    name = category + "." + item
	                    out_file.write(
	                        f"{name: <{key_width}}"
	                        + self._format_mmcif_col(single, len(single))
	                        + "\n"
	                    )
	            else:
	                # Several values are written as keys followed by a value table
	                out_file.write("loop_\n")
	                col_widths = []
	                # Write keys and find max widths for each set of values
	                for item, column in zip(items, values):
	                    out_file.write(category + "." + item + "\n")
	                    width = 0
	                    for val in column:
	                        len_val = len(val)
	                        # If the value requires quoting it will add 2 characters
	                        if self._requires_quote(val) and not self._requires_newline(
	                            val
	                        ):
	                            len_val += 2
	                        width = max(width, len_val)
	                    col_widths.append(width)
	                # Technically the max of the sum of the column widths is 2048

	                # Write the values as rows
	                for row in range(len(sample_val)):
	                    for column, width in zip(values, col_widths):
	                        out_file.write(
	                            self._format_mmcif_col(column[row], width + 1)
	                        )
	                    out_file.write("\n")
	            out_file.write("#\n")

	    @staticmethod
	    def _has_closing_quote(val, quote):
	        """Return True if ``quote`` is followed by a space or tab in ``val``.

	        Such a sequence would end a string delimited by ``quote`` early.
	        """
	        return quote + " " in val or quote + "\t" in val

	    def _format_mmcif_col(self, val, col_width):
	        """Return ``val`` as a mmCIF token left-justified to ``col_width``.

	        The value is enclosed in quotes or semicolon lines where needed. See
	        https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for
	        syntax.
	        """
	        # If there is a newline or quotes cannot be contained, use semicolon
	        # and newline construct
	        if self._requires_newline(val):
	            return "\n;" + val + "\n;\n"
	        if self._requires_quote(val):
	            # Choose the quote character that the value cannot close early
	            quote = '"' if self._has_closing_quote(val, "'") else "'"
	            quoted = quote + val + quote
	            return f"{quoted: <{col_width}}"
	        # Safe to not quote
	        # Numbers must not be quoted
	        return f"{val: <{col_width}}"

	    def _requires_newline(self, val):
	        """Return True if ``val`` must be written as a semicolon text field.

	        That is the case when it contains a newline, or when neither quote
	        character can delimit it because both occur before a space or tab.
	        """
	        return "\n" in val or (
	            self._has_closing_quote(val, "'") and self._has_closing_quote(val, '"')
	        )

	    def _requires_quote(self, val):
	        """Return True if ``val`` cannot be written as a bare mmCIF token."""
	        # An empty value has no bare representation, so it is quoted
	        if not val:
	            return True
	        # Reserved words are matched case-insensitively
	        lowered = val.lower()
	        return (
	            " " in val
	            or "\t" in val
	            or "'" in val
	            or '"' in val
	            or val[0] in "_#$[];"
	            or lowered.startswith(("data_", "save_"))
	            or lowered in ("loop_", "stop_", "global_")
	        )

	    def _get_label_asym_id(self, entity_id):
	        """Convert a positive integer into a chain ID.

	        Goes A to Z, then AA to ZA, AB to ZB etc, which is in line with
	        existing mmCIF files. Values below 1 give an empty string.
	        """
	        div = entity_id
	        out = ""
	        while div > 0:
	            mod = (div - 1) % 26
	            out += chr(65 + mod)
	            # Floor division keeps the arithmetic exact for any integer
	            div = (div - mod) // 26
	        return out

	    def _save_structure(self, out_file, select, preserve_atom_numbering):
	        """Build the ``_atom_site`` table from ``self.structure`` and write it.

	        Only models, chains, residues and atoms accepted by ``select`` are
	        included. The resulting dictionary is stored in ``self.dic`` and
	        written with ``_save_dict``.
	        """
	        atom_dict = defaultdict(list)

	        for model in self.structure.get_list():
	            if not select.accept_model(model):
	                continue
	            # mmCIF files with a single model have it specified as model 1
	            if model.serial_num == 0:
	                model_n = "1"
	            else:
	                model_n = str(model.serial_num)
	            # This is used to write label_entity_id and label_asym_id and
	            # increments from 1, changing with each molecule
	            entity_id = 0
	            if not preserve_atom_numbering:
	                atom_number = 1
	            for chain in model.get_list():
	                if not select.accept_chain(chain):
	                    continue
	                chain_id = chain.get_id()
	                if chain_id == " ":
	                    chain_id = "."
	                # This is used to write label_seq_id and increments from 1,
	                # remaining blank for hetero residues
	                residue_number = 1
	                prev_residue_type = ""
	                prev_resname = ""
	                for residue in chain.get_unpacked_list():
	                    if not select.accept_residue(residue):
	                        continue
	                    hetfield, resseq, icode = residue.get_id()
	                    if hetfield == " ":
	                        residue_type = "ATOM"
	                        label_seq_id = str(residue_number)
	                        residue_number += 1
	                    else:
	                        residue_type = "HETATM"
	                        label_seq_id = "."
	                    resseq = str(resseq)
	                    if icode == " ":
	                        icode = "?"
	                    resname = residue.get_resname()
	                    # Check if the molecule changes within the chain
	                    # This will always increment for the first residue in a
	                    # chain due to the starting values above
	                    if residue_type != prev_residue_type or (
	                        residue_type == "HETATM" and resname != prev_resname
	                    ):
	                        entity_id += 1
	                    prev_residue_type = residue_type
	                    prev_resname = resname
	                    label_asym_id = self._get_label_asym_id(entity_id)
	                    for atom in residue.get_unpacked_list():
	                        if not select.accept_atom(atom):
	                            continue
	                        atom_dict["_atom_site.group_PDB"].append(residue_type)
	                        if preserve_atom_numbering:
	                            atom_number = atom.get_serial_number()
	                        atom_dict["_atom_site.id"].append(str(atom_number))
	                        if not preserve_atom_numbering:
	                            atom_number += 1
	                        element = atom.element.strip()
	                        if element == "":
	                            element = "?"
	                        atom_dict["_atom_site.type_symbol"].append(element)
	                        atom_dict["_atom_site.label_atom_id"].append(
	                            atom.get_name().strip()
	                        )
	                        altloc = atom.get_altloc()
	                        if altloc == " ":
	                            altloc = "."
	                        atom_dict["_atom_site.label_alt_id"].append(altloc)
	                        atom_dict["_atom_site.label_comp_id"].append(
	                            resname.strip()
	                        )
	                        atom_dict["_atom_site.label_asym_id"].append(label_asym_id)
	                        # The entity ID should be the same for similar chains
	                        # However this is non-trivial to calculate so we write "?"
	                        atom_dict["_atom_site.label_entity_id"].append("?")
	                        atom_dict["_atom_site.label_seq_id"].append(label_seq_id)
	                        atom_dict["_atom_site.pdbx_PDB_ins_code"].append(icode)
	                        coord = atom.get_coord()
	                        atom_dict["_atom_site.Cartn_x"].append(f"{coord[0]:.3f}")
	                        atom_dict["_atom_site.Cartn_y"].append(f"{coord[1]:.3f}")
	                        atom_dict["_atom_site.Cartn_z"].append(f"{coord[2]:.3f}")
	                        atom_dict["_atom_site.occupancy"].append(
	                            str(atom.get_occupancy())
	                        )
	                        atom_dict["_atom_site.B_iso_or_equiv"].append(
	                            str(atom.get_bfactor())
	                        )
	                        atom_dict["_atom_site.auth_seq_id"].append(resseq)
	                        atom_dict["_atom_site.auth_asym_id"].append(chain_id)
	                        atom_dict["_atom_site.pdbx_PDB_model_num"].append(model_n)

	        # Data block name is the structure ID with special characters removed
	        structure_id = self.structure.id
	        for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
	            structure_id = structure_id.replace(c, "")
	        atom_dict["data_"] = structure_id

	        # Set the dictionary and write out using the generic dictionary method
	        self.dic = atom_dict
	        self._save_dict(out_file)