aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2019 Joe Greener. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Write a MMTF file."""
import itertools
from collections import defaultdict
from string import ascii_uppercase
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.PDBIO import Select, StructureIO
from mmtf.api.mmtf_writer import MMTFEncoder
from Bio.SeqUtils import seq1
from Bio.Data.PDBData import protein_letters_3to1_extended
# Shared default selector used by MMTFIO.save when the caller passes no
# ``select`` argument (presumably the base Select accepts every entity --
# confirm against Bio.PDB.PDBIO.Select).
_select = Select()
class MMTFIO(StructureIO):
    """Write a Structure object as a MMTF file.

    Examples
    --------
    >>> from Bio.PDB import MMCIFParser
    >>> from Bio.PDB.mmtf import MMTFIO
    >>> parser = MMCIFParser()
    >>> structure = parser.get_structure("1a8o", "PDB/1A8O.cif")
    >>> io=MMTFIO()
    >>> io.set_structure(structure)
    >>> io.save("bio-pdb-mmtf-out.mmtf")
    >>> import os
    >>> os.remove("bio-pdb-mmtf-out.mmtf")  # tidy up

    """

    def __init__(self):
        """Initialise."""
        pass

    def save(self, filepath, select=_select):
        """Save the structure to a file.

        :param filepath: output file
        :type filepath: string

        :param select: selects which entities will be written.
        :type select: object

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.

        :raises ValueError: if ``filepath`` is not a string, or if no
            structure has been set on this object yet.
        """
        # Similar to the PDBIO save method, we check if the filepath is a
        # string for a filepath or an open file handle
        if not isinstance(filepath, str):
            raise ValueError(
                "Writing to a file handle is not supported for MMTF, filepath must be a string"
            )
        if not hasattr(self, "structure"):
            raise ValueError("Use set_structure to set a structure to write out")
        self._save_structure(filepath, select)

    def _chain_id_iterator(self):
        """Label chains sequentially: A, B, ..., Z, AA, AB etc."""
        for size in itertools.count(1):
            for letters in itertools.product(ascii_uppercase, repeat=size):
                yield "".join(letters)

    def _save_structure(self, filepath, select):
        """Encode ``self.structure`` with an MMTFEncoder and write it out.

        Models, chains, residues and atoms rejected by ``select`` are
        skipped. Within a chain of the structure, a new MMTF chain/entity
        is started every time the residue kind changes (ATOM vs HETATM) or,
        for HETATM residues, every time the residue name changes.

        :param filepath: path of the MMTF file to write
        :type filepath: string

        :param select: object providing the accept_model, accept_chain,
            accept_residue and accept_atom methods
        :type select: object
        """
        count_models, count_chains, count_groups, count_atoms = 0, 0, 0, 0

        # If atom serials are missing, renumber atoms starting from 1
        atom_serials = [a.serial_number for a in self.structure.get_atoms()]
        renumber_atoms = None in atom_serials

        encoder = MMTFEncoder()
        # The counts are set to 0 here and changed later once we have the values
        encoder.init_structure(
            total_num_bonds=0,
            total_num_atoms=0,
            total_num_groups=0,
            total_num_chains=0,
            total_num_models=0,
            structure_id=self.structure.id,
        )

        encoder.set_xtal_info(space_group="", unit_cell=None)

        # The header information is missing for some structure objects,
        # so absent keys fall back to the empty string
        header_dict = defaultdict(str, self.structure.header)
        if header_dict["resolution"] == "":
            header_dict["resolution"] = None
        if header_dict["structure_method"] == "":
            header_dict["structure_method"] = []
        else:
            header_dict["structure_method"] = [header_dict["structure_method"]]

        encoder.set_header_info(
            r_free=None,
            r_work=None,
            resolution=header_dict["resolution"],
            title=header_dict["name"],
            deposition_date=header_dict["deposition_date"],
            release_date=header_dict["release_date"],
            experimental_methods=header_dict["structure_method"],
        )

        # Tracks values to replace them at the end
        chains_per_model = []
        groups_per_chain = []

        for mi, model in enumerate(self.structure.get_models()):
            if not select.accept_model(model):
                continue

            # Chain labels restart from "A" in every model
            chain_id_iterator = self._chain_id_iterator()

            count_models += 1
            encoder.set_model_info(
                model_id=mi,  # According to mmtf-python this is meaningless
                chain_count=0,  # Set to 0 here and changed later
            )

            for chain in model.get_chains():
                if not select.accept_chain(chain):
                    continue

                # A blank chain identifier is stored as a null character
                chain_name = chain.get_id() if chain.get_id().strip() else "\x00"

                # One sequence per entity started within this chain
                seqs = []
                seq = ""
                prev_residue_type = ""
                prev_resname = ""
                # Becomes True once this chain has produced its first entity
                entity_started = False

                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue

                    count_groups += 1
                    hetfield, _, _ = residue.get_id()
                    if hetfield == " ":
                        residue_type = "ATOM"
                        entity_type = "polymer"
                    elif hetfield == "W":
                        residue_type = "HETATM"
                        entity_type = "water"
                    else:
                        residue_type = "HETATM"
                        entity_type = "non-polymer"
                    resname = residue.get_resname()
                    one_letter = seq1(
                        resname, custom_map=protein_letters_3to1_extended
                    )

                    # Check if the molecule changes within the chain
                    # This will always increment for the first residue in a
                    # chain due to the starting values above
                    # Checking for similar entities is non-trivial from the
                    # structure object so we treat each molecule as a separate
                    # entity
                    if residue_type != prev_residue_type or (
                        residue_type == "HETATM" and resname != prev_resname
                    ):
                        encoder.set_entity_info(
                            chain_indices=[count_chains],
                            sequence="",  # Set to empty here and changed later
                            description="",
                            entity_type=entity_type,
                        )
                        encoder.set_chain_info(
                            chain_id=next(chain_id_iterator),
                            chain_name=chain_name,
                            num_groups=0,  # Set to 0 here and changed later
                        )
                        if count_chains > 0:
                            # Close the previous MMTF chain; the -1 excludes
                            # the current residue, which is already counted
                            groups_per_chain.append(
                                count_groups - sum(groups_per_chain) - 1
                            )
                        if entity_started:
                            seqs.append(seq)
                        entity_started = True
                        count_chains += 1
                        seq = ""

                    if entity_type == "polymer":
                        seq += one_letter

                    prev_residue_type = residue_type
                    prev_resname = resname

                    # Query the selector once per atom so that the declared
                    # atom count always matches the atoms actually written
                    accepted_atoms = [
                        a for a in residue.get_unpacked_list() if select.accept_atom(a)
                    ]

                    encoder.set_group_info(
                        group_name=resname,
                        group_number=residue.id[1],
                        insertion_code="\x00"
                        if residue.id[2] == " "
                        else residue.id[2],
                        group_type="",  # Value in the chemcomp dictionary, which is unknown here
                        atom_count=len(accepted_atoms),
                        bond_count=0,
                        single_letter_code=one_letter,
                        sequence_index=len(seq) - 1 if entity_type == "polymer" else -1,
                        secondary_structure_type=-1,
                    )

                    for atom in accepted_atoms:
                        count_atoms += 1
                        encoder.set_atom_info(
                            atom_name=atom.name,
                            serial_number=count_atoms
                            if renumber_atoms
                            else atom.serial_number,
                            alternative_location_id="\x00"
                            if atom.altloc == " "
                            else atom.altloc,
                            x=atom.coord[0],
                            y=atom.coord[1],
                            z=atom.coord[2],
                            occupancy=atom.occupancy,
                            temperature_factor=atom.bfactor,
                            element=atom.element,
                            charge=0,
                        )

                if not entity_started:
                    # No residue of this chain was written, so no entity was
                    # created for it. Writing back a sequence here would
                    # clobber the last entity of a previous chain (or index
                    # into an empty entity list).
                    continue

                seqs.append(seq)
                # Now that we have the sequences, edit the entities to add them
                start_ind = len(encoder.entity_list) - len(seqs)
                for i, entity_seq in enumerate(seqs):
                    encoder.entity_list[start_ind + i]["sequence"] = entity_seq

            chains_per_model.append(count_chains - sum(chains_per_model))

        # Close the final MMTF chain
        groups_per_chain.append(count_groups - sum(groups_per_chain))

        encoder.chains_per_model = chains_per_model
        encoder.groups_per_chain = groups_per_chain
        encoder.num_atoms = count_atoms
        encoder.num_groups = count_groups
        encoder.num_chains = count_chains
        encoder.num_models = count_models

        encoder.finalize_structure()
        encoder.write_file(filepath)