Spaces:
Sleeping
Sleeping
# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Standard residue names for polymers of different types (e.g., L- or D-amino acid proteins,
mixed-chirality proteins, DNA/RNA, etc.)
"""
from enum import Enum | |
class polymerType(Enum):
    """Kinds of polymer handled by this module.

    Each member's integer value is used as the position of that polymer
    type's entry in the module-level residue tables.
    """

    LPROT = 0  # protein built from L-amino acids
    DPROT = 1  # protein built from D-amino acids
    LDPROT = 2  # mixed-chirality protein
    DNA = 3
    RNA = 4
def polymer_type_name(ptype: polymerType):
    """Return the descriptive name string for a polymer type.

    Args:
        ptype: The polymer type to describe.

    Returns:
        One of ``"polypeptide(L)"``, ``"polypeptide(D)"``,
        ``"polypeptide(L,D)"``, ``"polydeoxyribonucleotide"`` or
        ``"polyribonucleotide"``.

    Raises:
        Exception: If ``ptype`` does not compare equal to any known member.
    """
    labels = (
        (polymerType.LPROT, "polypeptide(L)"),
        (polymerType.DPROT, "polypeptide(D)"),
        (polymerType.LDPROT, "polypeptide(L,D)"),
        (polymerType.DNA, "polydeoxyribonucleotide"),
        (polymerType.RNA, "polyribonucleotide"),
    )
    # Scan with ``==`` (rather than hashing) so any argument is accepted and
    # an unrecognised one falls through to the explicit error below.
    for member, label in labels:
        if ptype == member:
            return label
    raise Exception(f"unknown polymer type {ptype}")
# Per-polymer-type lookup tables. Every table has one slot per polymerType
# member and is addressed with ``ptype.value``.
# Three-letter residue names, in token-index order.
_res3 = [[] for _ in polymerType]
# Single-letter residue codes, parallel to ``_res3``.
_res1 = [[] for _ in polymerType]
# Residue name (three-letter or single-letter) -> token index.
_res_to_idx = [{} for _ in polymerType]
# Token indices registered under the name "UNK".
_unk_idx = [set() for _ in polymerType]
# Token indices registered under the name "---".
_gap_idx = [set() for _ in polymerType]
# Token indices registered under the name "STP".
_stp_idx = [set() for _ in polymerType]
def _add_residue(ptype: polymerType, res3, res1):
    """Register a residue token in the module-level tables.

    Args:
        ptype: A polymer type, or a list of polymer types. When a list is
            given, ``res3`` and ``res1`` must be parallel lists and one
            residue is registered per polymer type.
        res3: Three-letter residue name (or list of names).
        res1: Single-letter residue code (or list of codes).
    """
    if isinstance(ptype, list):
        for one_type, one_res3, one_res1 in zip(ptype, res3, res1):
            _add_residue(one_type, one_res3, one_res1)
        return

    slot = ptype.value
    lookup = _res_to_idx[slot]
    new_index = len(_res3[slot])

    # The three-letter name always points at the newest token, so registering
    # the same name twice re-targets it to the later entry.
    lookup[res3] = new_index
    # single-letter code is ambiguous, so take the first residue when going
    # from single-letter code to index
    lookup.setdefault(res1, new_index)

    _res3[slot].append(res3)
    _res1[slot].append(res1)

    # Remember the indices of the special gap / unknown / stop tokens.
    special_sets = {"---": _gap_idx, "UNK": _unk_idx, "STP": _stp_idx}
    special = special_sets.get(res3)
    if special is not None:
        special[slot].add(new_index)
def num_tokens(ptype=polymerType.LPROT):
    """Return the number of tokens registered for ``ptype``.

    The count covers every registered entry, including the unknown, stop and
    gap tokens.
    """
    registered = _res3[ptype.value]
    return len(registered)
def num_known_molecular_tokens(ptype=polymerType.LPROT):
    """Return the number of tokens of ``ptype`` that denote a known residue.

    A token counts when it is neither punctuation (gap or stop) nor the
    unknown-residue token.

    Args:
        ptype: Polymer type whose alphabet is inspected.

    Returns:
        The number of qualifying token indices.
    """
    # Classify each index against the alphabet of ``ptype`` itself. The
    # index-based predicates are required here: ``is_unknown`` expects a
    # residue *name*, and an integer passed to it is never found and is
    # therefore always reported as unknown.
    return sum(
        1
        for idx in range(len(_res3[ptype.value]))
        if not is_punctuation_index(idx, ptype)
        and not is_unknown_index(idx, ptype)
    )
def res_to_index(res: str, ptype=polymerType.LPROT):
    """Return the token index of residue ``res`` in the alphabet of ``ptype``.

    Args:
        res: Three-letter name or single-letter code.
        ptype: Polymer type whose alphabet is used.

    Returns:
        The registered index, or the index of an unknown-residue token when
        ``res`` is not registered.
    """
    lookup = _res_to_idx[ptype.value]
    # The fallback is resolved up front, so a polymer type with no registered
    # "UNK" token fails here regardless of ``res``.
    fallback = next(iter(_unk_idx[ptype.value]))
    return lookup.get(res, fallback)
def index_to_single(idx: int, ptype=polymerType.LPROT):
    """Return the single-letter code stored at token index ``idx`` for ``ptype``."""
    codes = _res1[ptype.value]
    return codes[idx]
def index_to_triple(idx: int, ptype=polymerType.LPROT):
    """Return the three-letter name stored at token index ``idx`` for ``ptype``."""
    names = _res3[ptype.value]
    return names[idx]
def to_single(res: str, ptype=polymerType.LPROT):
    """Convert a residue name to its single-letter code within ``ptype``.

    Args:
        res: Three-letter name or single-letter code.
        ptype: Polymer type whose alphabet is used.

    Returns:
        The single-letter code of the matching token; unregistered names
        yield the code of the unknown-residue token.
    """
    # The index is only meaningful in the alphabet it was looked up in, so
    # the same ``ptype`` must be used to decode it.
    return index_to_single(res_to_index(res, ptype), ptype)
def to_triple(res: str, ptype=polymerType.LPROT):
    """Convert a residue name to its three-letter name within ``ptype``.

    Args:
        res: Three-letter name or single-letter code.
        ptype: Polymer type whose alphabet is used.

    Returns:
        The three-letter name of the matching token; unregistered names
        yield the name of the unknown-residue token.
    """
    # The index is only meaningful in the alphabet it was looked up in, so
    # the same ``ptype`` must be used to decode it.
    return index_to_triple(res_to_index(res, ptype), ptype)
def is_gap_index(idx: int, ptype=polymerType.LPROT):
    """Return True when ``idx`` is a gap token of ``ptype``."""
    gap_positions = _gap_idx[ptype.value]
    return idx in gap_positions
def is_stop_index(idx: int, ptype=polymerType.LPROT):
    """Return True when ``idx`` is a stop token of ``ptype``."""
    stop_positions = _stp_idx[ptype.value]
    return idx in stop_positions
def is_unknown(res: str, ptype=polymerType.LPROT):
    """Return True when residue name ``res`` resolves to an unknown token of ``ptype``.

    Names that are not registered for ``ptype`` resolve to the unknown token
    and therefore also return True.
    """
    position = res_to_index(res, ptype)
    return is_unknown_index(position, ptype)
def is_unknown_index(idx: int, ptype=polymerType.LPROT):
    """Return True when ``idx`` is an unknown-residue token of ``ptype``."""
    unknown_positions = _unk_idx[ptype.value]
    return idx in unknown_positions
def is_polymer_residue(res: str, ptype: polymerType):
    """Return True when ``res`` is a registered residue name.

    Args:
        res: Three-letter name or single-letter code.
        ptype: Polymer type to check against, or ``None`` to accept a match
            in the alphabet of any polymer type.
    """
    if ptype is not None:
        return res in _res_to_idx[ptype.value]
    # determine if this is a polymer residue for any known polymer
    return any(res in _res_to_idx[candidate.value] for candidate in polymerType)
def is_punctuation_index(idx: int, ptype=polymerType.LPROT):
    """Return True when ``idx`` is a gap or a stop token of ``ptype``."""
    if is_gap_index(idx, ptype):
        return True
    return is_stop_index(idx, ptype)
def is_canonical(res: str, ptype=polymerType.LPROT):
    """Return True when ``res`` is one of the canonical amino acids of ``ptype``.

    Args:
        res: Three-letter name or single-letter code.
        ptype: ``LPROT``, ``DPROT`` or ``LDPROT``.

    Returns:
        For ``LPROT`` / ``DPROT``: whether ``res`` resolves to one of the
        first 20 tokens of that alphabet. For ``LDPROT``: whether ``res`` is
        canonical in either the L or the D alphabet.

    Raises:
        Exception: For any other polymer type.
    """
    if ptype == polymerType.LPROT or ptype == polymerType.DPROT:
        # The 20 canonical amino acids occupy token indices 0..19 in both
        # single-chirality alphabets; everything registered later
        # (protonation variants, modified residues, UNK, STP, gaps) does not
        # count. Unregistered names resolve to the UNK index.
        idx = res_to_index(res, ptype)
        return 0 <= idx < 20
    if ptype == polymerType.LDPROT:
        # Test the name directly against both single-chirality alphabets.
        # Mirroring first and then looking the mirrored name up in the
        # opposite-chirality table can never match.
        return is_canonical(res, polymerType.LPROT) or is_canonical(
            res, polymerType.DPROT
        )
    raise Exception(f"do not known how to deal with polymer type {ptype}")
def canonical_amino_acids(ptype=polymerType.LPROT):
    """Return the three-letter names of ``ptype`` that are canonical amino acids.

    Names are returned in token-index order.
    """
    return [name for name in _res3[ptype.value] if is_canonical(name, ptype)]
# Populate the L- and D-protein alphabets in lockstep: row k of this table
# becomes token k in both alphabets. Columns are
# (L three-letter, D three-letter, L single-letter, D single-letter).
for _l3, _d3, _l1, _d1 in (
    # the 20 canonical amino acids (token indices 0..19)
    ("ALA", "DAL", "A", "a"),
    ("CYS", "DCY", "C", "c"),
    ("ASP", "DAS", "D", "d"),
    ("GLU", "DGL", "E", "e"),
    ("PHE", "DPN", "F", "f"),
    ("GLY", "GLY", "G", "G"),
    ("HIS", "DHI", "H", "h"),
    ("ILE", "DIL", "I", "i"),
    ("LYS", "DLY", "K", "k"),
    ("LEU", "DLE", "L", "l"),
    ("MET", "MED", "M", "m"),
    ("ASN", "DSG", "N", "n"),
    ("PRO", "DPR", "P", "p"),
    ("GLN", "DGN", "Q", "q"),
    ("ARG", "DAR", "R", "r"),
    ("SER", "DSN", "S", "s"),
    ("THR", "DTH", "T", "t"),
    ("VAL", "DVA", "V", "v"),
    ("TRP", "DTR", "W", "w"),
    ("TYR", "DTY", "Y", "y"),
    # additional residue names that share a single-letter code with an
    # entry above
    ("HSD", "DSD", "H", "h"),
    ("HSE", "DSE", "H", "h"),
    ("HSC", "DSC", "H", "h"),
    ("HSP", "DSP", "H", "h"),
    ("MSE", "DMS", "M", "m"),
    ("CSO", "DCS", "C", "c"),
    ("SEC", "DEC", "C", "c"),
    ("CSX", "DCX", "C", "c"),
    ("HIP", "DHP", "H", "h"),
    ("SEP", "DEP", "S", "s"),
    ("TPO", "DTP", "T", "t"),
    ("PTR", "DPT", "Y", "y"),
    # unknown residue, stop, and two spellings of the gap token
    ("UNK", "UNK", "X", "X"),
    ("STP", "STP", "*", "*"),
    ("---", "---", "-", "-"),
    ("---", "---", ".", "."),
):
    _add_residue(
        [polymerType.LPROT, polymerType.DPROT], [_l3, _d3], [_l1, _d1]
    )
# Drop the loop temporaries so the module namespace is left unchanged.
del _l3, _d3, _l1, _d1
# Build the mixed-chirality (LDPROT) alphabet from the L and D alphabets in
# three passes, so that tokens end up ordered as: ordinary residues (all L,
# then all D), then the unknown token, then punctuation. A name that is
# already present in the LDPROT table is not added again.
for grp in [1, 2, 3]:
    for tp in [polymerType.LPROT, polymerType.DPROT]:
        for idx in range(num_tokens(tp)):
            if grp == 1:
                # pass 1: ordinary residues only
                if is_punctuation_index(idx, tp) or is_unknown_index(idx, tp):
                    continue
            elif grp == 2:
                # pass 2: the unknown-residue token only
                if not is_unknown_index(idx, tp):
                    continue
            elif not is_punctuation_index(idx, tp):
                # pass 3: punctuation (gap / stop) only
                continue
            if _res3[tp.value][idx] in _res3[polymerType.LDPROT.value]:
                continue
            _add_residue(
                polymerType.LDPROT,
                _res3[tp.value][idx],
                _res1[tp.value][idx],
            )
def mirror_amino_acid(res: str):
    """Return the name of the residue obtained by mirroring ``res``.

    The lookup and the mirroring are done in the mixed-chirality (LDPROT)
    alphabet.

    Args:
        res: Three-letter name or single-letter code.

    Returns:
        A single-letter code when ``res`` has length 1, otherwise a
        three-letter name.
    """
    mixed = polymerType.LDPROT
    idx = mirror_amino_acid_index(res_to_index(res, mixed))
    # ``idx`` is an LDPROT token index, so it has to be decoded with the
    # LDPROT tables rather than the default L-protein ones.
    if len(res) == 1:
        return index_to_single(idx, mixed)
    return index_to_triple(idx, mixed)
def mirror_amino_acid_index(idx: int):
    """Return the LDPROT token index of the opposite-chirality partner of ``idx``.

    Args:
        idx: Token index in the mixed-chirality (LDPROT) alphabet.

    Returns:
        The LDPROT index of the partner residue. Tokens whose name is the
        same in the L and D alphabets (e.g. "GLY", "UNK", "STP", "---"), and
        indices outside the LDPROT alphabet, are returned unchanged.
    """
    mixed = polymerType.LDPROT.value
    names = _res3[mixed]
    if not 0 <= idx < len(names):
        return idx

    # The L and D alphabets are registered pairwise, so position k in one
    # table is the mirror image of position k in the other. Going through
    # the names avoids assuming a constant index offset between the L and D
    # halves of the LDPROT table, which does not hold because names shared
    # by both alphabets are stored only once there.
    name = names[idx]
    l_names = _res3[polymerType.LPROT.value]
    d_names = _res3[polymerType.DPROT.value]
    if name in l_names:
        partner = d_names[l_names.index(name)]
    elif name in d_names:
        partner = l_names[d_names.index(name)]
    else:
        return idx
    return _res_to_idx[mixed].get(partner, idx)