# Copyright 2024 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
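"""Extract polymer sequences from mmCIF files and build the sequence and
integer-identifier mappings used for MSA search and storage."""
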
import json
from pathlib import Path

import biotite.structure as struc
import pandas as pd
from joblib import Parallel, delayed

from protenix.data.ccd import get_one_letter_code
from protenix.data.parser import MMCIFParser

pd.options.mode.copy_on_write = True


def get_seqs(mmcif_file):
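    """Extract per-entity polymer sequence info from one mmCIF file.

    Returns a (pdb_id, info_df) tuple; info_df has one row per polymer
    entity and is None when the file has no entity_poly category.
    """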
    # derive pdb_id before the early return below, so it is always defined
    pdb_id = mmcif_file.name.split(".")[0]
    mmcif_parser = MMCIFParser(mmcif_file)
    entity_poly = mmcif_parser.get_category_table("entity_poly")
    if entity_poly is None:
        return pdb_id, None
entity_poly["mmcif_seq_old"] = entity_poly.pdbx_seq_one_letter_code_can.str.replace(
"\n", ""
)
entity_poly["pdbx_type"] = entity_poly.type
mol_type = []
# https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
for i in entity_poly.type:
if "ribonucleotide" in i:
mol_type.append("na")
elif "polypeptide" in i:
mol_type.append("protein")
else:
mol_type.append("other")
entity_poly["mol_type"] = mol_type
    entity = mmcif_parser.get_category_table("entity")
    info_df = pd.merge(
        entity, entity_poly, left_on="id", right_on="entity_id", how="inner"
    )
    info_df["pdb_id"] = pdb_id
if "pdbx_audit_revision_history" in mmcif_parser.cif.block:
history = mmcif_parser.cif.block["pdbx_audit_revision_history"]
info_df["release_date"] = history["revision_date"].as_array()[0]
info_df["release_date_retrace_obsolete"] = mmcif_parser.release_date
    entity_poly_seq = mmcif_parser.get_category_table("entity_poly_seq")
    seq_from_resname = []
    diff_seq_mmcif_vs_atom_site = []
    has_alt_res = []
    diff_alt_res_seq_vs_atom_site = []
    for entity_id, mmcif_seq_old in zip(info_df.entity_id, info_df.mmcif_seq_old):
        chain_mask = entity_poly_seq.entity_id == entity_id
        res_names = entity_poly_seq.mon_id[chain_mask].to_numpy(dtype=str)
        res_ids = entity_poly_seq.num[chain_mask].to_numpy(dtype=int)
        seq = ""
        pre_res_id = 0
        id_2_name = {}
        for res_id, res_name in zip(res_ids, res_names):
            if res_id == pre_res_id:
                # duplicated res_id: alternative residue, keep the first one
                continue
            id_2_name[res_id] = res_name
            one = get_one_letter_code(res_name)
            if one is None or len(one) > 1:
                # unknown or multi-letter codes are recorded as "X"
                one = "X"
            seq += one
            pre_res_id = res_id
        assert len(seq) == max(res_ids)
        diff_seq_mmcif_vs_atom_site.append(seq != mmcif_seq_old)
        has_alt = False
        mismatch_res_name = False
        if len(seq) < len(res_ids):  # altloc residues share the same res_id
            has_alt = True
            # get_structure() returns an atom array that keeps only the
            # first altloc residue
            atom_array = mmcif_parser.get_structure()
            res_starts = struc.get_residue_starts(atom_array)
            for start in res_starts:
                if atom_array.label_entity_id[start] == entity_id:
                    first_res_in_seq = id_2_name[atom_array.res_id[start]]
                    first_res_in_atom = atom_array.res_name[start]
                    if first_res_in_seq != first_res_in_atom:
                        mismatch_res_name = True
                        break
        has_alt_res.append(has_alt)
        diff_alt_res_seq_vs_atom_site.append(mismatch_res_name)
        seq_from_resname.append(seq)
info_df["seq"] = seq_from_resname
info_df["length"] = [len(s) for s in info_df.seq]
info_df["diff_seq_mmcif_vs_atom_site"] = diff_seq_mmcif_vs_atom_site
info_df["has_alt_res"] = has_alt_res
info_df["diff_alt_res_seq_vs_atom_site"] = diff_alt_res_seq_vs_atom_site
info_df["auth_asym_id"] = info_df["pdbx_strand_id"]
    columns = [
        "pdb_id",
        "entity_id",
        "mol_type",
        "pdbx_type",
        "length",
        "mmcif_seq_old",
        "seq",
        "diff_seq_mmcif_vs_atom_site",
        "has_alt_res",
        "diff_alt_res_seq_vs_atom_site",
        "pdbx_description",
        "auth_asym_id",
        "release_date",
        "release_date_retrace_obsolete",
    ]
    info_df = info_df[columns]
    return pdb_id, info_df


def try_get_seqs(cif_file):
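    """Run get_seqs() on one file, turning any exception into an
    "Error:..." string so a bad file does not abort the whole batch."""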
    pdb_id = cif_file.name.split(".")[0]
    try:
        return get_seqs(cif_file)
    except Exception as e:
        print("skip", pdb_id, e)
        return pdb_id, "Error:" + str(e)


def export_to_fasta(df, filename):
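    """Write the unique protein sequences of df to a FASTA file with
    ">{pdb_id}_{entity_id}" headers."""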
    df_protein = df[df["mol_type"] == "protein"]
    # drop duplicate sequences to avoid redundant MSA searches
    df_protein = df_protein.drop_duplicates(subset=["seq"])
    with open(filename, "w") as fasta_file:
        for _, row in df_protein.iterrows():
            header = f">{row['pdb_id']}_{row['entity_id']}\n"
            sequence = f"{row['seq']}\n"
            fasta_file.write(header)
            fasta_file.write(sequence)


def mapping_seqs_to_pdb_entity_id(df, output_json_file):
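    """Build a {seq: [[pdb_id, entity_id], ...]} mapping over the protein
    rows of df, dump it to output_json_file, and return it."""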
    df_protein = df[df["mol_type"] == "protein"]
    sequence_mapping = {}
    for _, row in df_protein.iterrows():
        seq = row["seq"]
        key = row["pdb_id"]
        value = row["entity_id"]
        if seq not in sequence_mapping:
            sequence_mapping[seq] = []
        sequence_mapping[seq].append([key, value])
    with open(output_json_file, "w") as json_file:
        json.dump(sequence_mapping, json_file, indent=4)
    return sequence_mapping


def mapping_seqs_to_integer_identifiers(
    sequence_mapping,
    pdb_index_to_seq_path,
    seq_to_pdb_index_path,
):
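    """Assign each unique sequence an integer id (in sorted order) and
    dump both directions of the mapping to JSON."""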
    seq_to_pdb_index = {}
    for idx, seq in enumerate(sorted(sequence_mapping.keys())):
        seq_to_pdb_index[seq] = idx
    pdb_index_to_seq = {v: k for k, v in seq_to_pdb_index.items()}
    with open(pdb_index_to_seq_path, "w") as f:
        json.dump(pdb_index_to_seq, f, indent=4)
    with open(seq_to_pdb_index_path, "w") as f:
        json.dump(seq_to_pdb_index, f, indent=4)


if __name__ == "__main__":
    # Demo driver: run the pipeline on a small directory of mmCIF files
    cif_dir = Path("./scripts/msa/data/mmcif")
    cif_files = [x for x in cif_dir.iterdir() if x.is_file()]
    info_dfs = []
    none_list = []
    error_list = []
    with Parallel(n_jobs=-2, verbose=10) as parallel:
        for pdb_id, info_df in parallel([delayed(try_get_seqs)(f) for f in cif_files]):
            if info_df is None:
                none_list.append(pdb_id)
            elif isinstance(info_df, str) and info_df.startswith("Error:"):
                error_list.append((pdb_id, info_df))
            else:
                info_dfs.append(info_df)
    out_df = pd.concat(info_dfs)
    out_df = out_df.sort_values(["pdb_id", "entity_id"])
    out_dir = Path("./scripts/msa/data/pdb_seqs")
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1. extract PDB sequence info
    seq_file = out_dir / "pdb_seq.csv"
    seq_df = out_df[out_df.mol_type != "other"]
    seq_df.to_csv(seq_file, index=False)

    # 2. generate a protein FASTA file as MSA input
    fasta_file = out_dir / "pdb_seq.fasta"
    export_to_fasta(seq_df, fasta_file)

    # 3. get the seq -> (pdb_id, entity_id) mapping
    seq_to_pdb_id_entity_id_json = out_dir / "seq_to_pdb_id_entity_id.json"
    sequence_mapping = mapping_seqs_to_pdb_entity_id(
        seq_df, seq_to_pdb_id_entity_id_json
    )

    # 4. map each sequence to an integer identifier for storing MSAs.
    # Integer identifiers make simpler storage keys than the sequences
    # themselves, which can be very long.
    pdb_index_to_seq_path = out_dir / "pdb_index_to_seq.json"
    seq_to_pdb_index_path = out_dir / "seq_to_pdb_index.json"
    mapping_seqs_to_integer_identifiers(
        sequence_mapping, pdb_index_to_seq_path, seq_to_pdb_index_path
    )