File size: 8,376 Bytes
89c0b51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# Copyright 2024 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

import biotite.structure as struc
import pandas as pd
from joblib import Parallel, delayed

from protenix.data.ccd import get_one_letter_code
from protenix.data.parser import MMCIFParser

pd.options.mode.copy_on_write = True


def _mol_type_from_poly_type(poly_type):
    """Map an ``_entity_poly.type`` string to ``"na"``, ``"protein"`` or ``"other"``."""
    # https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
    if "ribonucleotide" in poly_type:
        return "na"
    if "polypeptide" in poly_type:
        return "protein"
    return "other"


def _one_letter_seq(res_ids, res_names):
    """Build a one-letter sequence, keeping only the first residue per res_id.

    Args:
        res_ids: residue numbers of one entity, in ``entity_poly_seq`` order.
        res_names: residue (component) names aligned with ``res_ids``.

    Returns:
        A ``(seq, id_2_name)`` tuple: the one-letter sequence and a dict
        mapping each residue number to the residue name that was kept.
    """
    letters = []
    id_2_name = {}
    pre_res_id = 0
    for res_id, res_name in zip(res_ids, res_names):
        # Consecutive rows sharing a res_id are alternative residues at the
        # same position; only the first one contributes to the sequence.
        if res_id == pre_res_id:
            continue
        id_2_name[res_id] = res_name
        one = get_one_letter_code(res_name)
        # Unknown components and multi-letter codes both collapse to "X".
        if one is None or len(one) > 1:
            one = "X"
        letters.append(one)
        pre_res_id = res_id
    return "".join(letters), id_2_name


def get_seqs(mmcif_file):
    """Extract per-entity polymer sequence information from one mmCIF file.

    Args:
        mmcif_file: path-like object with a ``.name`` attribute (e.g. a
            ``pathlib.Path``) pointing to the mmCIF file.

    Returns:
        A ``(pdb_id, info_df)`` tuple. ``pdb_id`` is the file name up to the
        first ``"."``. ``info_df`` is ``None`` when the file has no
        ``entity_poly`` category; otherwise it is a DataFrame with one row per
        polymer entity and the columns listed in ``columns`` below.

    Raises:
        ValueError: if an entity has no ``entity_poly_seq`` rows, or if the
            length of the rebuilt sequence differs from its largest residue
            number.
    """
    # Resolve the id first so that every return path can use it.
    pdb_id = mmcif_file.name.split(".")[0]
    mmcif_parser = MMCIFParser(mmcif_file)

    entity_poly = mmcif_parser.get_category_table("entity_poly")
    if entity_poly is None:
        return pdb_id, None
    entity_poly["mmcif_seq_old"] = entity_poly.pdbx_seq_one_letter_code_can.str.replace(
        "\n", ""
    )
    entity_poly["pdbx_type"] = entity_poly.type
    entity_poly["mol_type"] = [
        _mol_type_from_poly_type(poly_type) for poly_type in entity_poly.type
    ]

    entity = mmcif_parser.get_category_table("entity")
    info_df = pd.merge(
        entity, entity_poly, left_on="id", right_on="entity_id", how="inner"
    )
    info_df["pdb_id"] = pdb_id

    if "pdbx_audit_revision_history" in mmcif_parser.cif.block:
        history = mmcif_parser.cif.block["pdbx_audit_revision_history"]
        info_df["release_date"] = history["revision_date"].as_array()[0]
    else:
        # The column is always selected below, so it must exist even when the
        # revision history category is missing.
        info_df["release_date"] = None

    info_df["release_date_retrace_obsolete"] = mmcif_parser.release_date

    entity_poly_seq = mmcif_parser.get_category_table("entity_poly_seq")

    seq_from_resname = []
    diff_seq_mmcif_vs_atom_site = []
    has_alt_res = []
    diff_alt_res_seq_vs_atom_site = []

    # Built lazily and at most once, shared by all entities that need it.
    # NOTE(review): assumes get_structure() returns the same result on every
    # call for a given parser - confirm.
    atom_array = None
    res_starts = None

    for entity_id, mmcif_seq_old in zip(info_df.entity_id, info_df.mmcif_seq_old):
        chain_mask = entity_poly_seq.entity_id == entity_id
        res_names = entity_poly_seq.mon_id[chain_mask].to_numpy(dtype=str)
        res_ids = entity_poly_seq.num[chain_mask].to_numpy(dtype=int)

        seq, id_2_name = _one_letter_seq(res_ids, res_names)
        # Explicit check instead of ``assert`` so it also runs under ``-O``.
        if len(res_ids) == 0 or len(seq) != max(res_ids):
            raise ValueError(
                f"{pdb_id} entity {entity_id}: sequence length {len(seq)} "
                "does not match the residue numbering in entity_poly_seq"
            )

        diff_seq_mmcif_vs_atom_site.append(seq != mmcif_seq_old)
        has_alt = False
        mismatch_res_name = False
        if len(seq) < len(res_ids):  # has altloc residue in same res_id
            has_alt = True
            if atom_array is None:
                # Original author's note: get_structure() returns an atom
                # array that only keeps the first altloc residue.
                atom_array = mmcif_parser.get_structure()
                res_starts = struc.get_residue_starts(atom_array)
            for start in res_starts:
                if atom_array.label_entity_id[start] == entity_id:
                    first_res_in_seq = id_2_name[atom_array.res_id[start]]
                    first_res_in_atom = atom_array.res_name[start]
                    if first_res_in_seq != first_res_in_atom:
                        mismatch_res_name = True
                        break

        has_alt_res.append(has_alt)
        diff_alt_res_seq_vs_atom_site.append(mismatch_res_name)
        seq_from_resname.append(seq)

    info_df["seq"] = seq_from_resname
    info_df["length"] = [len(s) for s in info_df.seq]
    info_df["diff_seq_mmcif_vs_atom_site"] = diff_seq_mmcif_vs_atom_site
    info_df["has_alt_res"] = has_alt_res
    info_df["diff_alt_res_seq_vs_atom_site"] = diff_alt_res_seq_vs_atom_site
    info_df["auth_asym_id"] = info_df["pdbx_strand_id"]

    columns = [
        "pdb_id",
        "entity_id",
        "mol_type",
        "pdbx_type",
        "length",
        "mmcif_seq_old",
        "seq",
        "diff_seq_mmcif_vs_atom_site",
        "has_alt_res",
        "diff_alt_res_seq_vs_atom_site",
        "pdbx_description",
        "auth_asym_id",
        "release_date",
        "release_date_retrace_obsolete",
    ]
    info_df = info_df[columns]
    return pdb_id, info_df


def try_get_seqs(cif_file):
    """Run ``get_seqs`` on one file without letting an exception escape.

    Args:
        cif_file: path-like object with a ``.name`` attribute.

    Returns:
        The result of ``get_seqs(cif_file)`` on success. If any ``Exception``
        is raised, the failure is printed and ``(pdb_id, "Error:<message>")``
        is returned instead, where ``pdb_id`` is the file name up to the
        first ``"."``.
    """
    pdb_id, _, _ = cif_file.name.partition(".")
    try:
        outcome = get_seqs(cif_file)
    except Exception as err:
        # Best effort: report the failing entry and keep the batch going.
        print("skip", pdb_id, err)
        outcome = (pdb_id, f"Error:{err}")
    return outcome


def export_to_fasta(df, filename):
    """Write the unique protein sequences of ``df`` to a FASTA file.

    Args:
        df: DataFrame with at least the columns ``mol_type``, ``seq``,
            ``pdb_id`` and ``entity_id``.
        filename: path of the FASTA file to (over)write.

    Only rows whose ``mol_type`` equals ``"protein"`` are written. For
    repeated sequences only the first row is kept, and each record header is
    ``>{pdb_id}_{entity_id}``.
    """
    is_protein = df["mol_type"] == "protein"
    # drop duplicate sequences to avoid running the same MSA search twice
    unique_proteins = df.loc[is_protein].drop_duplicates(subset=["seq"])
    with open(filename, "w") as handle:
        for record in unique_proteins.to_dict(orient="records"):
            handle.write(f">{record['pdb_id']}_{record['entity_id']}\n")
            handle.write(f"{record['seq']}\n")


def mapping_seqs_to_pdb_entity_id(df, output_json_file):
    """Group protein rows by sequence and dump the grouping as JSON.

    Args:
        df: DataFrame with at least the columns ``mol_type``, ``seq``,
            ``pdb_id`` and ``entity_id``.
        output_json_file: path of the JSON file to (over)write.

    Returns:
        A dict mapping each protein sequence to a list of
        ``[pdb_id, entity_id]`` pairs, in row order. The same dict is
        written to ``output_json_file`` with an indent of 4.
    """
    proteins = df.loc[df["mol_type"] == "protein"]

    sequence_mapping = {}
    for record in proteins.to_dict(orient="records"):
        sequence = record["seq"]
        pair = [record["pdb_id"], record["entity_id"]]
        sequence_mapping.setdefault(sequence, []).append(pair)

    with open(output_json_file, "w") as handle:
        json.dump(sequence_mapping, handle, indent=4)
    return sequence_mapping


def mapping_seqs_to_integer_identifiers(
    sequence_mapping,
    pdb_index_to_seq_path,
    seq_to_pdb_index_path,
):
    """Assign an integer index to every sequence and save both directions.

    Args:
        sequence_mapping: mapping whose keys are sequences; values are unused.
        pdb_index_to_seq_path: output JSON path for ``index -> sequence``.
        seq_to_pdb_index_path: output JSON path for ``sequence -> index``.

    Indices are the positions of the sequences in sorted order, starting at
    0. Both files are written with an indent of 4; nothing is returned.
    """
    ordered_seqs = sorted(sequence_mapping.keys())
    seq_to_pdb_index = {seq: position for position, seq in enumerate(ordered_seqs)}
    pdb_index_to_seq = dict(enumerate(ordered_seqs))

    outputs = (
        (pdb_index_to_seq_path, pdb_index_to_seq),
        (seq_to_pdb_index_path, seq_to_pdb_index),
    )
    for path, payload in outputs:
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=4)


if __name__ == "__main__":
    # Demo entry point: parse every mmCIF file in ``cif_dir`` in parallel and
    # write the sequence table, the FASTA file and the JSON mappings to
    # ``out_dir``.
    cif_dir = Path("./scripts/msa/data/mmcif")
    cif_files = [x for x in cif_dir.iterdir() if x.is_file()]

    info_dfs = []
    none_list = []
    error_list = []
    with Parallel(n_jobs=-2, verbose=10) as parallel:
        for pdb_id, info_df in parallel([delayed(try_get_seqs)(f) for f in cif_files]):
            if info_df is None:
                none_list.append(pdb_id)
            elif isinstance(info_df, str) and info_df.startswith("Error:"):
                error_list.append((pdb_id, info_df))
            else:
                info_dfs.append(info_df)

    # Report what was skipped instead of silently dropping it.
    if none_list:
        print(f"{len(none_list)} entries returned no sequence table: {none_list}")
    if error_list:
        print(f"{len(error_list)} entries failed:")
        for pdb_id, message in error_list:
            print(" ", pdb_id, message)

    # pd.concat raises an unhelpful ValueError on an empty list.
    if not info_dfs:
        raise SystemExit(f"No sequence information could be extracted from {cif_dir}")

    out_df = pd.concat(info_dfs)
    out_df = out_df.sort_values(["pdb_id", "entity_id"])

    out_dir = Path("./scripts/msa/data/pdb_seqs")
    out_dir.mkdir(parents=True, exist_ok=True)
    # 1. extract pdb sequence info
    seq_file = out_dir / "pdb_seq.csv"
    seq_df = out_df[out_df.mol_type != "other"]
    seq_df.to_csv(seq_file, index=False)
    # 2. generate protein fasta file as MSA input
    fasta_file = out_dir / "pdb_seq.fasta"
    export_to_fasta(seq_df, fasta_file)

    # 3. get seq_to_pdb_id_entity_id mapping
    seq_to_pdb_id_entity_id_json = out_dir / "seq_to_pdb_id_entity_id.json"
    sequence_mapping = mapping_seqs_to_pdb_entity_id(
        seq_df, seq_to_pdb_id_entity_id_json
    )

    # 4. map each sequence to an integer identifier for saving MSAs.
    # When the MSAs are actually stored, a short integer is a much better
    # identifier than the sequence itself, especially for long sequences.
    pdb_index_to_seq_path = out_dir / "pdb_index_to_seq.json"
    seq_to_pdb_index_path = out_dir / "seq_to_pdb_index.json"
    mapping_seqs_to_integer_identifiers(
        sequence_mapping, pdb_index_to_seq_path, seq_to_pdb_index_path
    )