# Page header captured with this file (not part of the module):
# aakash0017 - "Upload folder using huggingface_hub" - b7731cd
# Copyright 2017 Joe Greener. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Write an mmCIF file.
See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax.
"""
import re
from collections import defaultdict
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.PDBIO import Select, StructureIO
# Preferred column order per mmCIF category. _save_dict consults this table:
# fields listed here are written in this order, and any field of the category
# that is not listed is written after them.
mmcif_order = {
    "_atom_site": [
        # Record type, serial number and element
        "group_PDB",
        "id",
        "type_symbol",
        # "label_*" identifiers
        "label_atom_id",
        "label_alt_id",
        "label_comp_id",
        "label_asym_id",
        "label_entity_id",
        "label_seq_id",
        "pdbx_PDB_ins_code",
        # Coordinates and per-atom values
        "Cartn_x",
        "Cartn_y",
        "Cartn_z",
        "occupancy",
        "B_iso_or_equiv",
        "pdbx_formal_charge",
        # "auth_*" identifiers and model number
        "auth_seq_id",
        "auth_comp_id",
        "auth_asym_id",
        "auth_atom_id",
        "pdbx_PDB_model_num",
    ],
}
# Module-level Select instance used as the default ``select`` argument of
# MMCIFIO.save.
_select = Select()
class MMCIFIO(StructureIO):
    """Write a Structure object or a mmCIF dictionary as a mmCIF file.

    Examples
    --------
    >>> from Bio.PDB import MMCIFParser
    >>> from Bio.PDB.mmcifio import MMCIFIO
    >>> parser = MMCIFParser()
    >>> structure = parser.get_structure("1a8o", "PDB/1A8O.cif")
    >>> io=MMCIFIO()
    >>> io.set_structure(structure)
    >>> io.save("bio-pdb-mmcifio-out.cif")
    >>> import os
    >>> os.remove("bio-pdb-mmcifio-out.cif") # tidy up

    """

    def __init__(self):
        """Initialise."""
        pass

    def set_dict(self, dic):
        """Set the mmCIF dictionary to be written out.

        Any previously set structure is discarded so that ``save`` writes
        the dictionary rather than the structure.
        """
        self.dic = dic
        # Remove self.structure if it has been set
        if hasattr(self, "structure"):
            delattr(self, "structure")

    def save(self, filepath, select=_select, preserve_atom_numbering=False):
        """Save the structure to a file.

        :param filepath: output file
        :type filepath: string or filehandle

        :param select: selects which entities will be written.
        :type select: object

        :param preserve_atom_numbering: if True, write each atom's own serial
            number as ``_atom_site.id``; otherwise number the written atoms
            consecutively from 1 within each model. Only used when a
            structure (not a dictionary) is written.
        :type preserve_atom_numbering: bool

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.

        :raises ValueError: if neither a structure nor a dictionary was set.
        """
        # Decide what to write before touching the output, so that a missing
        # structure/dictionary does not create or truncate the target file.
        has_structure = hasattr(self, "structure")
        if not has_structure and not hasattr(self, "dic"):
            raise ValueError(
                "Use set_structure or set_dict to set a structure or dictionary to write out"
            )
        # Similar to the PDBIO save method, we check if the filepath is a
        # string for a filepath or an open file handle
        if isinstance(filepath, str):
            fp = open(filepath, "w")
            close_file = True
        else:
            fp = filepath
            close_file = False
        try:
            # A structure takes precedence over a dictionary
            if has_structure:
                self._save_structure(fp, select, preserve_atom_numbering)
            else:
                self._save_dict(fp)
        finally:
            # Only close handles opened here, and do so even if writing fails
            if close_file:
                fp.close()

    def _save_dict(self, out_file):
        """Write ``self.dic`` to ``out_file`` in mmCIF syntax.

        Keys must be of the form ``category.field`` (exactly one dot), apart
        from the optional ``"data_"`` key holding the data block name. Values
        are strings or lists of strings. Categories whose values are strings
        or one-element lists are written as key-value pairs; categories with
        longer lists are written as a ``loop_`` table.

        :raises ValueError: on a malformed key, on inconsistent value
            shapes within a category, or on a value of unsupported type.
        """
        # The data block name is optional; without this default a dictionary
        # lacking "data_" would fail on the check further down.
        data_val = None
        # Map the first part of each mmCIF key to the list of second parts
        key_lists = {}
        for key in self.dic:
            if key == "data_":
                data_val = self.dic[key]
                continue
            parts = key.split(".")
            if len(parts) != 2:
                raise ValueError("Invalid key in mmCIF dictionary: " + key)
            key_lists.setdefault(parts[0], []).append(parts[1])

        # Re-order lists if an order has been specified
        # Not all elements from the specified order are necessarily present
        for category, fields in key_lists.items():
            if category not in mmcif_order:
                continue
            order = mmcif_order[category]
            rank = {}
            for pos, name in enumerate(order):
                rank.setdefault(name, pos)
            # Unrecognised fields sort after all known ones, by name
            key_lists[category] = sorted(
                fields, key=lambda f: (rank.get(f, len(order)), f)
            )

        # Write out top data_ line
        if data_val:
            out_file.write("data_" + data_val + "\n#\n")

        for category, fields in key_lists.items():
            # Pick a sample mmCIF value, which can be a list or a single value
            sample_val = self.dic[category + "." + fields[0]]
            sample_is_list = isinstance(sample_val, list)
            sample_is_str = isinstance(sample_val, str)
            # Reject unsupported types up front; taking len() of e.g. an int
            # would otherwise fail before this error could be reported.
            if not (sample_is_list or sample_is_str):
                raise ValueError(
                    "Invalid type in mmCIF dictionary: " + str(type(sample_val))
                )
            n_vals = len(sample_val)

            # Check the mmCIF dictionary has consistent list sizes
            for field in fields:
                val = self.dic[category + "." + field]
                if (
                    sample_is_list and (isinstance(val, str) or len(val) != n_vals)
                ) or (sample_is_str and isinstance(val, list)):
                    raise ValueError(
                        "Inconsistent list sizes in mmCIF dictionary: "
                        + category
                        + "."
                        + field
                    )

            if sample_is_str or n_vals == 1:
                # Single values: write as key-value pairs, with the keys
                # padded to a common width
                width = len(category) + max(len(field) for field in fields) + 4
                for field in fields:
                    value = self.dic[category + "." + field]
                    # If the value is a single item list, just take the value
                    if not sample_is_str:
                        value = value[0]
                    out_file.write(
                        "{k: <{width}}".format(k=category + "." + field, width=width)
                        + self._format_mmcif_col(value, len(value))
                        + "\n"
                    )
            else:
                # Several values: write the keys, then a table of values
                out_file.write("loop_\n")
                col_widths = {}
                # Write keys and find max widths for each set of values
                for field in fields:
                    out_file.write(category + "." + field + "\n")
                    widest = 0
                    for val in self.dic[category + "." + field]:
                        len_val = len(val)
                        # If the value requires quoting it will add 2 characters
                        if self._requires_quote(val) and not self._requires_newline(
                            val
                        ):
                            len_val += 2
                        if len_val > widest:
                            widest = len_val
                    col_widths[field] = widest
                # Technically the max of the sum of the column widths is 2048
                # Write the values as rows
                for row in range(n_vals):
                    cells = [
                        self._format_mmcif_col(
                            self.dic[category + "." + field][row],
                            col_widths[field] + 1,
                        )
                        for field in fields
                    ]
                    out_file.write("".join(cells) + "\n")
            out_file.write("#\n")

    def _format_mmcif_col(self, val, col_width):
        """Return ``val`` formatted as a mmCIF data value.

        The value is enclosed in quotes or in a semicolon-delimited text
        field where required and, unless a text field is used, left-aligned
        and padded with spaces to ``col_width`` characters. See
        https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for
        syntax.
        """
        # If there is a newline or quotes cannot be contained, use semicolon
        # and newline construct
        if self._requires_newline(val):
            return "\n;" + val + "\n;\n"
        if self._requires_quote(val):
            # A single quote followed by whitespace would end a single-quoted
            # string early, so use double quotes for such values
            if "' " in val or "'\t" in val:
                quote = '"'
            else:
                quote = "'"
            return "{v: <{width}}".format(v=quote + val + quote, width=col_width)
        # Safe to not quote
        # Numbers must not be quoted
        return "{v: <{width}}".format(v=val, width=col_width)

    def _requires_newline(self, val):
        """Return True if ``val`` must be written as a semicolon text field.

        This is the case when the value contains a newline, or contains both
        a single and a double quote each followed by a space or tab, so that
        neither quote character could delimit it.
        """
        single_then_space = "' " in val or "'\t" in val
        double_then_space = '" ' in val or '"\t' in val
        return "\n" in val or (single_then_space and double_then_space)

    def _requires_quote(self, val):
        """Return True if ``val`` cannot be written as a bare mmCIF word.

        Quoting is required for empty values, values containing a space, tab
        or quote character, values starting with a character that is special
        at the start of a word, and the reserved words (compared
        case-insensitively).
        """
        # An empty value has no first character to inspect and would vanish
        # from the output if written bare
        if not val:
            return True
        lowered = val.lower()
        return (
            " " in val
            or "\t" in val
            or "'" in val
            or '"' in val
            or val[0] in ["_", "#", "$", "[", "]", ";"]
            or lowered.startswith(("data_", "save_"))
            or lowered in ["loop_", "stop_", "global_"]
        )

    def _get_label_asym_id(self, entity_id):
        """Convert a positive integer into a chain ID.

        Goes A to Z, then AA to ZA, AB to ZB etc, with the fastest-changing
        letter first. This is in line with existing mmCIF files. Values
        below 1 give an empty string.
        """
        out = ""
        div = entity_id
        while div > 0:
            # Integer arithmetic keeps this exact for any size of input
            div, mod = divmod(div - 1, 26)
            out += chr(65 + mod)
        return out

    def _save_structure(self, out_file, select, preserve_atom_numbering):
        """Write ``self.structure`` to ``out_file`` as ``_atom_site`` records.

        Builds an mmCIF dictionary holding one ``_atom_site`` entry per atom
        accepted by ``select``, stores it as ``self.dic`` and writes it with
        ``_save_dict``.
        """
        atom_dict = defaultdict(list)

        for model in self.structure.get_list():
            if not select.accept_model(model):
                continue
            # mmCIF files with a single model have it specified as model 1
            if model.serial_num == 0:
                model_n = "1"
            else:
                model_n = str(model.serial_num)
            # This is used to write label_asym_id and increments from 1,
            # changing with each molecule
            entity_id = 0
            if not preserve_atom_numbering:
                atom_number = 1
            for chain in model.get_list():
                if not select.accept_chain(chain):
                    continue
                chain_id = chain.get_id()
                if chain_id == " ":
                    chain_id = "."
                # This is used to write label_seq_id and increments from 1,
                # remaining blank for hetero residues
                residue_number = 1
                prev_residue_type = ""
                prev_resname = ""
                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue
                    hetfield, resseq, icode = residue.get_id()
                    if hetfield == " ":
                        residue_type = "ATOM"
                        label_seq_id = str(residue_number)
                        residue_number += 1
                    else:
                        residue_type = "HETATM"
                        label_seq_id = "."
                    resseq = str(resseq)
                    if icode == " ":
                        icode = "?"
                    resname = residue.get_resname()
                    # Check if the molecule changes within the chain
                    # This will always increment for the first residue in a
                    # chain due to the starting values above
                    if residue_type != prev_residue_type or (
                        residue_type == "HETATM" and resname != prev_resname
                    ):
                        entity_id += 1
                    prev_residue_type = residue_type
                    prev_resname = resname
                    label_asym_id = self._get_label_asym_id(entity_id)
                    for atom in residue.get_unpacked_list():
                        if not select.accept_atom(atom):
                            continue
                        atom_dict["_atom_site.group_PDB"].append(residue_type)
                        if preserve_atom_numbering:
                            atom_number = atom.get_serial_number()
                        atom_dict["_atom_site.id"].append(str(atom_number))
                        if not preserve_atom_numbering:
                            atom_number += 1
                        element = atom.element.strip()
                        if element == "":
                            element = "?"
                        atom_dict["_atom_site.type_symbol"].append(element)
                        atom_dict["_atom_site.label_atom_id"].append(
                            atom.get_name().strip()
                        )
                        altloc = atom.get_altloc()
                        if altloc == " ":
                            altloc = "."
                        atom_dict["_atom_site.label_alt_id"].append(altloc)
                        atom_dict["_atom_site.label_comp_id"].append(resname.strip())
                        atom_dict["_atom_site.label_asym_id"].append(label_asym_id)
                        # The entity ID should be the same for similar chains
                        # However this is non-trivial to calculate so we write "?"
                        atom_dict["_atom_site.label_entity_id"].append("?")
                        atom_dict["_atom_site.label_seq_id"].append(label_seq_id)
                        atom_dict["_atom_site.pdbx_PDB_ins_code"].append(icode)
                        coord = atom.get_coord()
                        atom_dict["_atom_site.Cartn_x"].append(f"{coord[0]:.3f}")
                        atom_dict["_atom_site.Cartn_y"].append(f"{coord[1]:.3f}")
                        atom_dict["_atom_site.Cartn_z"].append(f"{coord[2]:.3f}")
                        atom_dict["_atom_site.occupancy"].append(
                            str(atom.get_occupancy())
                        )
                        atom_dict["_atom_site.B_iso_or_equiv"].append(
                            str(atom.get_bfactor())
                        )
                        atom_dict["_atom_site.auth_seq_id"].append(resseq)
                        atom_dict["_atom_site.auth_asym_id"].append(chain_id)
                        atom_dict["_atom_site.pdbx_PDB_model_num"].append(model_n)

        # Data block name is the structure ID with special characters removed
        structure_id = self.structure.id
        for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
            structure_id = structure_id.replace(c, "")
        atom_dict["data_"] = structure_id

        # Set the dictionary and write out using the generic dictionary method
        self.dic = atom_dict
        self._save_dict(out_file)