File size: 10,193 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# Copyright 2019 Joe Greener. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Write a MMTF file."""

import itertools
from collections import defaultdict
from string import ascii_uppercase
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.PDBIO import Select, StructureIO
from mmtf.api.mmtf_writer import MMTFEncoder
from Bio.SeqUtils import seq1
from Bio.Data.PDBData import protein_letters_3to1_extended

_select = Select()


class MMTFIO(StructureIO):
    """Write a Structure object as a MMTF file.

    Examples
    --------
        >>> from Bio.PDB import MMCIFParser
        >>> from Bio.PDB.mmtf import MMTFIO
        >>> parser = MMCIFParser()
        >>> structure = parser.get_structure("1a8o", "PDB/1A8O.cif")
        >>> io=MMTFIO()
        >>> io.set_structure(structure)
        >>> io.save("bio-pdb-mmtf-out.mmtf")
        >>> import os
        >>> os.remove("bio-pdb-mmtf-out.mmtf")  # tidy up

    """

    def __init__(self):
        """Initialise."""
        pass

    def save(self, filepath, select=_select):
        """Save the structure to a file.

        :param filepath: output file
        :type filepath: string

        :param select: selects which entities will be written.
        :type select: object

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.
        """
        # Similar to the PDBIO save method, we check if the filepath is a
        # string for a filepath or an open file handle
        if not isinstance(filepath, str):
            raise ValueError(
                "Writing to a file handle is not supported for MMTF, filepath must be a string"
            )
        if hasattr(self, "structure"):
            self._save_structure(filepath, select)
        else:
            raise ValueError("Use set_structure to set a structure to write out")

    def _chain_id_iterator(self):
        """Label chains sequentially: A, B, ..., Z, AA, AB etc."""
        for size in itertools.count(1):
            for s in itertools.product(ascii_uppercase, repeat=size):
                yield "".join(s)

    def _save_structure(self, filepath, select):
        count_models, count_chains, count_groups, count_atoms = 0, 0, 0, 0

        # If atom serials are missing, renumber atoms starting from 1
        atom_serials = [a.serial_number for a in self.structure.get_atoms()]
        renumber_atoms = None in atom_serials

        encoder = MMTFEncoder()
        # The counts are set to 0 here and changed later once we have the values
        encoder.init_structure(
            total_num_bonds=0,
            total_num_atoms=0,
            total_num_groups=0,
            total_num_chains=0,
            total_num_models=0,
            structure_id=self.structure.id,
        )

        encoder.set_xtal_info(space_group="", unit_cell=None)

        # The header information is missing for some structure objects
        header_dict = defaultdict(str, self.structure.header)
        if header_dict["resolution"] == "":
            header_dict["resolution"] = None
        if header_dict["structure_method"] == "":
            header_dict["structure_method"] = []
        else:
            header_dict["structure_method"] = [header_dict["structure_method"]]

        encoder.set_header_info(
            r_free=None,
            r_work=None,
            resolution=header_dict["resolution"],
            title=header_dict["name"],
            deposition_date=header_dict["deposition_date"],
            release_date=header_dict["release_date"],
            experimental_methods=header_dict["structure_method"],
        )

        # Tracks values to replace them at the end
        chains_per_model = []
        groups_per_chain = []

        for mi, model in enumerate(self.structure.get_models()):
            if not select.accept_model(model):
                continue

            chain_id_iterator = self._chain_id_iterator()

            count_models += 1
            encoder.set_model_info(
                model_id=mi,  # According to mmtf-python this is meaningless
                chain_count=0,  # Set to 0 here and changed later
            )
            for chain in model.get_chains():
                if not select.accept_chain(chain):
                    continue

                seqs = []
                seq = ""
                prev_residue_type = ""
                prev_resname = ""
                first_chain = True

                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue

                    count_groups += 1
                    hetfield, resseq, icode = residue.get_id()
                    if hetfield == " ":
                        residue_type = "ATOM"
                        entity_type = "polymer"
                    elif hetfield == "W":
                        residue_type = "HETATM"
                        entity_type = "water"
                    else:
                        residue_type = "HETATM"
                        entity_type = "non-polymer"
                    resname = residue.get_resname()

                    # Check if the molecule changes within the chain
                    # This will always increment for the first residue in a
                    #  chain due to the starting values above
                    # Checking for similar entities is non-trivial from the
                    #  structure object so we treat each molecule as a separate
                    #  entity
                    if residue_type != prev_residue_type or (
                        residue_type == "HETATM" and resname != prev_resname
                    ):
                        encoder.set_entity_info(
                            chain_indices=[count_chains],
                            sequence="",  # Set to empty here and changed later
                            description="",
                            entity_type=entity_type,
                        )
                        encoder.set_chain_info(
                            chain_id=next(chain_id_iterator),
                            chain_name="\x00"
                            if len(chain.get_id().strip()) == 0
                            else chain.get_id(),
                            num_groups=0,  # Set to 0 here and changed later
                        )
                        if count_chains > 0:
                            groups_per_chain.append(
                                count_groups - sum(groups_per_chain) - 1
                            )
                        if not first_chain:
                            seqs.append(seq)
                        first_chain = False
                        count_chains += 1
                        seq = ""

                    if entity_type == "polymer":
                        seq += seq1(resname, custom_map=protein_letters_3to1_extended)

                    prev_residue_type = residue_type
                    prev_resname = resname

                    encoder.set_group_info(
                        group_name=resname,
                        group_number=residue.id[1],
                        insertion_code="\x00"
                        if residue.id[2] == " "
                        else residue.id[2],
                        group_type="",  # Value in the chemcomp dictionary, which is unknown here
                        atom_count=sum(
                            1
                            for a in residue.get_unpacked_list()
                            if select.accept_atom(a)
                        ),
                        bond_count=0,
                        single_letter_code=seq1(
                            resname, custom_map=protein_letters_3to1_extended
                        ),
                        sequence_index=len(seq) - 1 if entity_type == "polymer" else -1,
                        secondary_structure_type=-1,
                    )

                    for atom in residue.get_unpacked_list():
                        if select.accept_atom(atom):
                            count_atoms += 1
                            encoder.set_atom_info(
                                atom_name=atom.name,
                                serial_number=count_atoms
                                if renumber_atoms
                                else atom.serial_number,
                                alternative_location_id="\x00"
                                if atom.altloc == " "
                                else atom.altloc,
                                x=atom.coord[0],
                                y=atom.coord[1],
                                z=atom.coord[2],
                                occupancy=atom.occupancy,
                                temperature_factor=atom.bfactor,
                                element=atom.element,
                                charge=0,
                            )

                seqs.append(seq)
                # Now that we have the sequences, edit the entities to add them
                start_ind = len(encoder.entity_list) - len(seqs)
                for i, seq in enumerate(seqs):
                    encoder.entity_list[start_ind + i]["sequence"] = seq

            chains_per_model.append(count_chains - sum(chains_per_model))

        groups_per_chain.append(count_groups - sum(groups_per_chain))

        encoder.chains_per_model = chains_per_model
        encoder.groups_per_chain = groups_per_chain
        encoder.num_atoms = count_atoms
        encoder.num_groups = count_groups
        encoder.num_chains = count_chains
        encoder.num_models = count_models

        encoder.finalize_structure()
        encoder.write_file(filepath)