Spaces:
Sleeping
Sleeping
File size: 2,680 Bytes
ce7bf5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Constants used across protein representations.
These constants standardize protein tokenization alphabets, ideal structure
geometries and topologies, etc.
"""
from chroma.constants.geometry import AA_GEOMETRY
# Standard tokenization for Omniprot and Omniprot-interacting models.
# The single-character special tokens are declared first so that the
# composite alphabets further down can be assembled from them.
PAD = "-"
START = "@"
STOP = "*"
MASK = "#"

# Nucleic-acid alphabets and the 20-letter protein alphabet.
DNA_TOKENS = "ACGT"
RNA_TOKENS = "AGCU"
PROTEIN_TOKENS = "ACDEFGHIKLMNPQRSTVWY"

# Uppercase letters (no "J"; note that "Y" precedes "X") followed by the
# stop, pad and mask symbols, i.e. "ABCDEFGHIKLMNOPQRSTUVWYXZ*-#".
# NOTE(review): character positions presumably serve as token indices, so
# the order must not be changed — confirm with the tokenizer code.
OMNIPROT_TOKENS = "ABCDEFGHIKLMNOPQRSTUVWYXZ" + STOP + PAD + MASK

# The 20-letter protein alphabet followed by the pad, stop and mask symbols,
# i.e. "ACDEFGHIKLMNPQRSTVWY-*#".
POTTS_EXTENDED_TOKENS = PROTEIN_TOKENS + PAD + STOP + MASK
# Minimal 20-letter alphabet and corresponding triplet codes
AA20 = "ACDEFGHIKLMNPQRSTVWY"

# Three-letter code -> one-letter code, ordered alphabetically by
# three-letter code.
AA20_3_TO_1 = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
}

# One-letter code -> three-letter code. Built by inverting the table above so
# the two mappings cannot drift apart; entries keep the same order.
AA20_1_TO_3 = {letter: triplet for triplet, letter in AA20_3_TO_1.items()}

# Three-letter codes listed in the order of the letters in AA20.
AA20_3 = [AA20_1_TO_3[letter] for letter in AA20]

# Adding noncanonical amino acids
# NOTE(review): these are presumably three-letter residue names for modified
# or alternately protonated residues as they appear in structure files —
# confirm the intended meaning of each code with the parsers that use them.
NONCANON_AA = [
    "HSD",
    "HSE",
    "HSC",
    "HSP",
    "MSE",
    "CSO",
    "SEC",
    "CSX",
    "HIP",
    "SEP",
    "TPO",
]

# The 20 canonical codes followed by the 11 noncanonical ones (new list).
AA31_3 = [*AA20_3, *NONCANON_AA]
# Chain alphabet for PDB chain naming: an underscore, then the uppercase
# letters, the lowercase letters and the digits (63 characters in total).
CHAIN_ALPHABET = (
    "_"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
)
# Standard atom indexing: the four backbone atom names.
ATOMS_BB = ["N", "CA", "C", "O"]

# Per-residue lists of atom-name pairs.
# NOTE(review): as the name suggests, each pair presumably marks two atoms
# that are interchangeable by symmetry — confirm with the code that reads
# this table.
ATOM_SYMMETRIES = {
    # Correct handling of NH1 and NH2 is relabeling
    "ARG": [("NH1", "NH2")],
    "ASP": [("OD1", "OD2")],
    "GLU": [("OE1", "OE2")],
    # PHE and TYR carry the same two pairs; the comprehension builds a
    # separate list for each key, so the entries are not aliased.
    **{
        residue: [("CD1", "CD2"), ("CE1", "CE2")]
        for residue in ("PHE", "TYR")
    },
}
# Per-residue atom counts, one entry per three-letter code in AA20_3.
# NOTE(review): the constant 4 presumably accounts for the backbone atoms
# (cf. ATOMS_BB), with AA_GEOMETRY[...]["atoms"] assumed to list only the
# remaining atoms — confirm against chroma.constants.geometry.
AA20_NUM_ATOMS = [len(AA_GEOMETRY[code]["atoms"]) + 4 for code in AA20_3]
# Per-residue number of "chi_indices" entries, in the same order.
AA20_NUM_CHI = [len(AA_GEOMETRY[code]["chi_indices"]) for code in AA20_3]
|