# Copyright 2024 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

import biotite.structure as struc
import pandas as pd
from joblib import Parallel, delayed

from protenix.data.ccd import get_one_letter_code
from protenix.data.parser import MMCIFParser

pd.options.mode.copy_on_write = True


def _classify_mol_type(poly_type):
    """Map an ``_entity_poly.type`` value to "na", "protein" or "other"."""
    # https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
    # "ribonucleotide" is a substring of both the polyribonucleotide and the
    # polydeoxyribonucleotide type names, so it is tested first.
    if "ribonucleotide" in poly_type:
        return "na"
    if "polypeptide" in poly_type:
        return "protein"
    return "other"


def _sequence_from_residues(res_ids, res_names):
    """Build a one-letter sequence from parallel residue id / name arrays.

    Consecutive rows that repeat the previous residue id are skipped, so only
    the first residue name listed for an id contributes to the sequence.

    Returns:
        tuple: ``(seq, id_2_name)`` where ``seq`` is the one-letter sequence
        and ``id_2_name`` maps each kept residue id to its residue name.
    """
    codes = []
    id_2_name = {}
    pre_res_id = 0
    for res_id, res_name in zip(res_ids, res_names):
        if res_id == pre_res_id:
            continue
        id_2_name[res_id] = res_name
        one = get_one_letter_code(res_name)
        # Unknown residues and multi-character codes collapse to "X".
        if one is None or len(one) > 1:
            one = "X"
        codes.append(one)
        pre_res_id = res_id
    return "".join(codes), id_2_name


def _first_residue_mismatch(atom_array, res_starts, entity_id, id_2_name):
    """Report whether the first atom-array residue of an entity differs from
    the residue name recorded for the same residue id in ``id_2_name``."""
    for start in res_starts:
        if atom_array.label_entity_id[start] == entity_id:
            first_res_in_seq = id_2_name[atom_array.res_id[start]]
            first_res_in_atom = atom_array.res_name[start]
            # Only the first residue belonging to the entity is inspected.
            return first_res_in_seq != first_res_in_atom
    return False


def get_seqs(mmcif_file):
    """Extract per-entity polymer sequence information from one mmCIF file.

    Args:
        mmcif_file: path-like object with a ``name`` attribute (for example a
            ``pathlib.Path``); the text before the first "." of the file name
            is used as the PDB id.

    Returns:
        tuple: ``(pdb_id, info_df)``. ``info_df`` is ``None`` when the file has
        no ``entity_poly`` category; otherwise it is a DataFrame with one row
        per polymer entity and the columns listed in ``columns`` below.

    Raises:
        AssertionError: if the length of a rebuilt sequence differs from the
            largest residue number of its entity.
    """
    mmcif_parser = MMCIFParser(mmcif_file)
    # The id must be known before the early return below; it was previously
    # assigned only later, so that return raised UnboundLocalError.
    pdb_id = mmcif_file.name.split(".")[0]

    entity_poly = mmcif_parser.get_category_table("entity_poly")
    if entity_poly is None:
        return pdb_id, None

    entity_poly["mmcif_seq_old"] = entity_poly[
        "pdbx_seq_one_letter_code_can"
    ].str.replace("\n", "")
    entity_poly["pdbx_type"] = entity_poly["type"]
    entity_poly["mol_type"] = [_classify_mol_type(t) for t in entity_poly["type"]]

    entity = mmcif_parser.get_category_table("entity")
    info_df = pd.merge(
        entity, entity_poly, left_on="id", right_on="entity_id", how="inner"
    )
    info_df["pdb_id"] = pdb_id

    if "pdbx_audit_revision_history" in mmcif_parser.cif.block:
        history = mmcif_parser.cif.block["pdbx_audit_revision_history"]
        info_df["release_date"] = history["revision_date"].as_array()[0]
    else:
        # Keep the column present so the final column selection cannot fail
        # with a KeyError when the file carries no revision history.
        info_df["release_date"] = None
    info_df["release_date_retrace_obsolete"] = mmcif_parser.release_date

    entity_poly_seq = mmcif_parser.get_category_table("entity_poly_seq")

    seq_from_resname = []
    diff_seq_mmcif_vs_atom_site = []
    has_alt_res = []
    diff_alt_res_seq_vs_atom_site = []
    # The atom array is only needed for entities with alternative residues;
    # it is built lazily and at most once per file.
    atom_array = None
    res_starts = None
    for entity_id, mmcif_seq_old in zip(info_df.entity_id, info_df.mmcif_seq_old):
        chain_mask = entity_poly_seq["entity_id"] == entity_id
        res_names = entity_poly_seq["mon_id"][chain_mask].to_numpy(dtype=str)
        res_ids = entity_poly_seq["num"][chain_mask].to_numpy(dtype=int)

        seq, id_2_name = _sequence_from_residues(res_ids, res_names)
        assert len(seq) == max(res_ids)
        diff_seq_mmcif_vs_atom_site.append(seq != mmcif_seq_old)

        # Fewer letters than rows means some residue id is listed more than
        # once, i.e. there are alternative residues at the same position.
        has_alt = len(seq) < len(res_ids)
        mismatch_res_name = False
        if has_alt:
            if atom_array is None:
                # get_structure() return atom array only keep first altloc residue
                atom_array = mmcif_parser.get_structure()
                res_starts = struc.get_residue_starts(atom_array)
            mismatch_res_name = _first_residue_mismatch(
                atom_array, res_starts, entity_id, id_2_name
            )

        has_alt_res.append(has_alt)
        diff_alt_res_seq_vs_atom_site.append(mismatch_res_name)
        seq_from_resname.append(seq)

    info_df["seq"] = seq_from_resname
    info_df["length"] = [len(s) for s in info_df.seq]
    info_df["diff_seq_mmcif_vs_atom_site"] = diff_seq_mmcif_vs_atom_site
    info_df["has_alt_res"] = has_alt_res
    info_df["diff_alt_res_seq_vs_atom_site"] = diff_alt_res_seq_vs_atom_site
    info_df["auth_asym_id"] = info_df["pdbx_strand_id"]
    columns = [
        "pdb_id",
        "entity_id",
        "mol_type",
        "pdbx_type",
        "length",
        "mmcif_seq_old",
        "seq",
        "diff_seq_mmcif_vs_atom_site",
        "has_alt_res",
        "diff_alt_res_seq_vs_atom_site",
        "pdbx_description",
        "auth_asym_id",
        "release_date",
        "release_date_retrace_obsolete",
    ]
    info_df = info_df[columns]
    return pdb_id, info_df


def try_get_seqs(cif_file):
    """Run :func:`get_seqs`, turning any exception into an error marker.

    Returns:
        tuple: the result of ``get_seqs(cif_file)`` on success, otherwise
        ``(pdb_id, "Error:<message>")``.
    """
    pdb_id = cif_file.name.split(".")[0]
    # Broad catch is deliberate: one unreadable file must not abort the batch.
    try:
        return get_seqs(cif_file)
    except Exception as e:
        print("skip", pdb_id, e)
        return pdb_id, "Error:" + str(e)


def export_to_fasta(df, filename):
    """Write the unique protein sequences of ``df`` to a FASTA file.

    Only rows whose ``mol_type`` is "protein" are written, each under the
    header ``>{pdb_id}_{entity_id}``.
    """
    df_protein = df[df["mol_type"] == "protein"]
    # drop duplicates sequence for avoiding duplicate msa search
    df_protein = df_protein.drop_duplicates(subset=["seq"])
    with open(filename, "w") as fasta_file:
        for pdb_id, entity_id, seq in zip(
            df_protein["pdb_id"], df_protein["entity_id"], df_protein["seq"]
        ):
            fasta_file.write(f">{pdb_id}_{entity_id}\n")
            fasta_file.write(f"{seq}\n")


def mapping_seqs_to_pdb_entity_id(df, output_json_file):
    """Group protein rows by sequence and dump the mapping as JSON.

    Returns:
        dict: sequence -> list of ``[pdb_id, entity_id]`` pairs, in row order.
    """
    df_protein = df[df["mol_type"] == "protein"]
    sequence_mapping = {}
    for seq, pdb_id, entity_id in zip(
        df_protein["seq"], df_protein["pdb_id"], df_protein["entity_id"]
    ):
        sequence_mapping.setdefault(seq, []).append([pdb_id, entity_id])
    with open(output_json_file, "w") as json_file:
        json.dump(sequence_mapping, json_file, indent=4)
    return sequence_mapping


def mapping_seqs_to_integer_identifiers(
    sequence_mapping,
    pdb_index_to_seq_path,
    seq_to_pdb_index_path,
):
    """Assign each sequence an integer index and write both lookup tables.

    Indices follow the sorted order of the sequences, so the numbering does
    not depend on the insertion order of ``sequence_mapping``.
    """
    seq_to_pdb_index = {
        seq: idx for idx, seq in enumerate(sorted(sequence_mapping.keys()))
    }
    pdb_index_to_seq = {idx: seq for seq, idx in seq_to_pdb_index.items()}
    with open(pdb_index_to_seq_path, "w") as f:
        json.dump(pdb_index_to_seq, f, indent=4)
    with open(seq_to_pdb_index_path, "w") as f:
        json.dump(seq_to_pdb_index, f, indent=4)


def main():
    """Demo pipeline: parse every mmCIF file and write the sequence tables."""
    # It's a demo here
    cif_dir = Path("./scripts/msa/data/mmcif")
    cif_files = [x for x in cif_dir.iterdir() if x.is_file()]

    info_dfs = []
    none_list = []
    error_list = []
    with Parallel(n_jobs=-2, verbose=10) as parallel:
        for pdb_id, info_df in parallel(delayed(try_get_seqs)(f) for f in cif_files):
            if info_df is None:
                none_list.append(pdb_id)
            elif isinstance(info_df, str) and info_df.startswith("Error:"):
                error_list.append((pdb_id, info_df))
            else:
                info_dfs.append(info_df)

    print(
        f"parsed: {len(info_dfs)}, without entity_poly: {len(none_list)}, "
        f"failed: {len(error_list)}"
    )
    if not info_dfs:
        # pd.concat raises ValueError on an empty list; stop with a clear
        # message instead.
        print("no sequence could be extracted from", cif_dir)
        return

    out_df = pd.concat(info_dfs)
    out_df = out_df.sort_values(["pdb_id", "entity_id"])
    out_dir = Path("./scripts/msa/data/pdb_seqs")
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1. extract pdb sequence info
    seq_file = out_dir / "pdb_seq.csv"
    seq_df = out_df[out_df.mol_type != "other"]
    seq_df.to_csv(seq_file, index=False)

    # 2. generate protein fasta file as MSA input
    fasta_file = out_dir / "pdb_seq.fasta"
    export_to_fasta(seq_df, fasta_file)

    # 3. get seq_to_pdb_id_entity_id mapping
    seq_to_pdb_id_entity_id_json = out_dir / "seq_to_pdb_id_entity_id.json"
    sequence_mapping = mapping_seqs_to_pdb_entity_id(
        seq_df, seq_to_pdb_id_entity_id_json
    )

    # 4. mapping sequence with integers identifiers for saving MSA.
    # When we actually store MSA, we need to use simpler integers as
    # identifiers, It's much better than directly use the sequence as identifiers,
    # if there exists long sequences.
    pdb_index_to_seq_path = out_dir / "pdb_index_to_seq.json"
    seq_to_pdb_index_path = out_dir / "seq_to_pdb_index.json"
    mapping_seqs_to_integer_identifiers(
        sequence_mapping, pdb_index_to_seq_path, seq_to_pdb_index_path
    )


if __name__ == "__main__":
    main()