# FoldMark/scripts/msa/step1-get_prot_seq.py
# Copyright 2024 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
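"""Step 1 of the MSA pipeline: extract polymer sequences from mmCIF files.

For each mmCIF file this script collects per-entity sequence metadata, then
writes a CSV of all polymer sequences, a FASTA of unique protein sequences
(the MSA search input), and JSON mappings between sequences, pdb_id/entity_id
pairs, and integer identifiers.
"""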
import json
from pathlib import Path
import biotite.structure as struc
import pandas as pd
from joblib import Parallel, delayed
from protenix.data.ccd import get_one_letter_code
from protenix.data.parser import MMCIFParser
pd.options.mode.copy_on_write = True
def get_seqs(mmcif_file):
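    """Parse one mmCIF file and return (pdb_id, info_df).

    info_df holds one row per polymer entity; it is None when the file
    has no entity_poly category.
    """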
    # derive pdb_id up front so the early return below does not hit a NameError
    pdb_id = mmcif_file.name.split(".")[0]
    mmcif_parser = MMCIFParser(mmcif_file)
    entity_poly = mmcif_parser.get_category_table("entity_poly")
    if entity_poly is None:
        return pdb_id, None
entity_poly["mmcif_seq_old"] = entity_poly.pdbx_seq_one_letter_code_can.str.replace(
"\n", ""
)
entity_poly["pdbx_type"] = entity_poly.type
mol_type = []
# https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_entity_poly.type.html
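    # e.g. "polypeptide(L)" -> "protein"; "polyribonucleotide" and
    # "polydeoxyribonucleotide" -> "na" (both contain "ribonucleotide")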
for i in entity_poly.type:
if "ribonucleotide" in i:
mol_type.append("na")
elif "polypeptide" in i:
mol_type.append("protein")
else:
mol_type.append("other")
entity_poly["mol_type"] = mol_type
entity = mmcif_parser.get_category_table("entity")
info_df = pd.merge(
entity, entity_poly, left_on="id", right_on="entity_id", how="inner"
)
    info_df["pdb_id"] = pdb_id
    if "pdbx_audit_revision_history" in mmcif_parser.cif.block:
        history = mmcif_parser.cif.block["pdbx_audit_revision_history"]
        info_df["release_date"] = history["revision_date"].as_array()[0]
    else:
        # keep the column so the column selection below cannot raise a KeyError
        info_df["release_date"] = None
info_df["release_date_retrace_obsolete"] = mmcif_parser.release_date
entity_poly_seq = mmcif_parser.get_category_table("entity_poly_seq")
seq_from_resname = []
diff_seq_mmcif_vs_atom_site = []
has_alt_res = []
diff_alt_res_seq_vs_atom_site = []
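    # Rebuild each entity's sequence from entity_poly_seq residue names and
    # flag entities whose rebuilt sequence disagrees with entity_poly, or whose
    # altloc residues disagree with the atom records.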
for entity_id, mmcif_seq_old in zip(info_df.entity_id, info_df.mmcif_seq_old):
chain_mask = entity_poly_seq.entity_id == entity_id
res_names = entity_poly_seq.mon_id[chain_mask].to_numpy(dtype=str)
res_ids = entity_poly_seq.num[chain_mask].to_numpy(dtype=int)
seq = ""
pre_res_id = 0
id_2_name = {}
for res_id, res_name in zip(res_ids, res_names):
if res_id == pre_res_id:
continue
id_2_name[res_id] = res_name
one = get_one_letter_code(res_name)
if one is None:
one = "X"
if len(one) > 1:
one = "X"
seq += one
pre_res_id = res_id
        assert len(seq) == max(res_ids), "res_ids should run 1..N without gaps"
diff_seq_mmcif_vs_atom_site.append(seq != mmcif_seq_old)
has_alt = False
mismatch_res_name = False
        if len(seq) < len(res_ids):  # some res_ids repeat: altloc residues at one position
            has_alt = True
            # get_structure() returns an atom array that keeps only the first altloc residue
atom_array = mmcif_parser.get_structure()
res_starts = struc.get_residue_starts(atom_array)
for start in res_starts:
if atom_array.label_entity_id[start] == entity_id:
first_res_in_seq = id_2_name[atom_array.res_id[start]]
first_res_in_atom = atom_array.res_name[start]
if first_res_in_seq != first_res_in_atom:
mismatch_res_name = True
break
has_alt_res.append(has_alt)
diff_alt_res_seq_vs_atom_site.append(mismatch_res_name)
seq_from_resname.append(seq)
info_df["seq"] = seq_from_resname
info_df["length"] = [len(s) for s in info_df.seq]
info_df["diff_seq_mmcif_vs_atom_site"] = diff_seq_mmcif_vs_atom_site
info_df["has_alt_res"] = has_alt_res
info_df["diff_alt_res_seq_vs_atom_site"] = diff_alt_res_seq_vs_atom_site
info_df["auth_asym_id"] = info_df["pdbx_strand_id"]
columns = [
"pdb_id",
"entity_id",
"mol_type",
"pdbx_type",
"length",
"mmcif_seq_old",
"seq",
"diff_seq_mmcif_vs_atom_site",
"has_alt_res",
"diff_alt_res_seq_vs_atom_site",
"pdbx_description",
"auth_asym_id",
"release_date",
"release_date_retrace_obsolete",
]
info_df = info_df[columns]
return pdb_id, info_df
def try_get_seqs(cif_file):
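    """Call get_seqs(); on failure, return (pdb_id, "Error:...") instead of raising."""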
pdb_id = cif_file.name.split(".")[0]
try:
return get_seqs(cif_file)
except Exception as e:
print("skip", pdb_id, e)
return pdb_id, "Error:" + str(e)
def export_to_fasta(df, filename):
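    """Write unique protein sequences to a FASTA file (input for MSA search)."""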
df_protein = df[df["mol_type"] == "protein"]
    # drop duplicate sequences to avoid redundant MSA searches
df_protein = df_protein.drop_duplicates(subset=["seq"])
with open(filename, "w") as fasta_file:
for _, row in df_protein.iterrows():
header = f">{row['pdb_id']}_{row['entity_id']}\n"
sequence = f"{row['seq']}\n"
fasta_file.write(header)
fasta_file.write(sequence)
def mapping_seqs_to_pdb_entity_id(df, output_json_file):
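    """Map each unique protein sequence to its [pdb_id, entity_id] pairs."""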
df_protein = df[df["mol_type"] == "protein"]
sequence_mapping = {}
for _, row in df_protein.iterrows():
seq = row["seq"]
key = row["pdb_id"]
value = row["entity_id"]
if seq not in sequence_mapping:
sequence_mapping[seq] = []
sequence_mapping[seq].append([key, value])
with open(output_json_file, "w") as json_file:
json.dump(sequence_mapping, json_file, indent=4)
return sequence_mapping
def mapping_seqs_to_integer_identifiers(
sequence_mapping,
pdb_index_to_seq_path,
seq_to_pdb_index_path,
):
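    """Give each sequence a stable integer id and dump both mappings as JSON."""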
    seq_to_pdb_index = {
        seq: idx for idx, seq in enumerate(sorted(sequence_mapping.keys()))
    }
pdb_index_to_seq = {v: k for k, v in seq_to_pdb_index.items()}
with open(pdb_index_to_seq_path, "w") as f:
json.dump(pdb_index_to_seq, f, indent=4)
with open(seq_to_pdb_index_path, "w") as f:
json.dump(seq_to_pdb_index, f, indent=4)
if __name__ == "__main__":
    # Demo entry point: process every mmCIF file under cif_dir.
cif_dir = Path("./scripts/msa/data/mmcif")
cif_files = [x for x in cif_dir.iterdir() if x.is_file()]
info_dfs = []
none_list = []
error_list = []
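    # n_jobs=-2 asks joblib to use all but one CPU core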
with Parallel(n_jobs=-2, verbose=10) as parallel:
for pdb_id, info_df in parallel([delayed(try_get_seqs)(f) for f in cif_files]):
if info_df is None:
none_list.append(pdb_id)
elif isinstance(info_df, str) and info_df.startswith("Error:"):
error_list.append((pdb_id, info_df))
else:
info_dfs.append(info_df)
out_df = pd.concat(info_dfs)
out_df = out_df.sort_values(["pdb_id", "entity_id"])
out_dir = Path("./scripts/msa/data/pdb_seqs")
    out_dir.mkdir(parents=True, exist_ok=True)
# 1. extract pdb sequence info
seq_file = out_dir / "pdb_seq.csv"
seq_df = out_df[out_df.mol_type != "other"]
seq_df.to_csv(seq_file, index=False)
# 2. generate protein fasta file as MSA input
fasta_file = out_dir / "pdb_seq.fasta"
export_to_fasta(seq_df, fasta_file)
# 3. get seq_to_pdb_id_entity_id mapping
seq_to_pdb_id_entity_id_json = out_dir / "seq_to_pdb_id_entity_id.json"
sequence_mapping = mapping_seqs_to_pdb_entity_id(
seq_df, seq_to_pdb_id_entity_id_json
)
    # 4. map each sequence to an integer identifier for saving MSAs.
    # When MSAs are actually stored, short integer identifiers are much
    # better keys than the sequences themselves, which can be very long.
pdb_index_to_seq_path = out_dir / "pdb_index_to_seq.json"
seq_to_pdb_index_path = out_dir / "seq_to_pdb_index.json"
mapping_seqs_to_integer_identifiers(
sequence_mapping, pdb_index_to_seq_path, seq_to_pdb_index_path
)
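    # Resulting files under ./scripts/msa/data/pdb_seqs/: pdb_seq.csv,
    # pdb_seq.fasta, seq_to_pdb_id_entity_id.json, pdb_index_to_seq.json,
    # seq_to_pdb_index.json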