# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Constants used across protein representations.

These constants standardize protein tokenization alphabets, ideal structure
geometries and topologies, etc.
"""
from chroma.constants.geometry import AA_GEOMETRY
# Standard tokenization for Omniprot and Omniprot-interacting models.
# Each alphabet is a plain string of single-character tokens.
OMNIPROT_TOKENS = "ABCDEFGHIKLMNOPQRSTUVWYXZ*-#"
POTTS_EXTENDED_TOKENS = "ACDEFGHIKLMNPQRSTVWY-*#"

# Special single-character tokens. PAD, STOP and MASK occur in both alphabets
# above; START ("@") is not a member of either.
PAD = "-"
START = "@"
STOP = "*"
MASK = "#"

# Residue alphabets per polymer type.
DNA_TOKENS = "ACGT"
RNA_TOKENS = "AGCU"
PROTEIN_TOKENS = "ACDEFGHIKLMNPQRSTVWY"
# Minimal 20-letter alphabet and corresponding triplet codes
AA20 = "ACDEFGHIKLMNPQRSTVWY"

# Three-letter residue code -> one-letter code for the 20 residues in AA20.
# Entries are ordered alphabetically by three-letter code.
AA20_3_TO_1 = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
}

# One-letter code -> three-letter code. Derived by inverting AA20_3_TO_1 so
# the two tables cannot drift apart; dicts preserve insertion order, so the
# key order is the same as the order of entries in AA20_3_TO_1.
AA20_1_TO_3 = {one: three for three, one in AA20_3_TO_1.items()}

# Three-letter codes listed in the order of the one-letter alphabet AA20.
AA20_3 = [AA20_1_TO_3[aa] for aa in AA20]
# Adding noncanonical amino acids: three-letter residue codes recognized in
# addition to the 20 canonical ones.
NONCANON_AA = [
    "HSD",
    "HSE",
    "HSC",
    "HSP",
    "MSE",
    "CSO",
    "SEC",
    "CSX",
    "HIP",
    "SEP",
    "TPO",
]
# Extended three-letter alphabet: the canonical codes in AA20_3 followed by
# the noncanonical codes in NONCANON_AA.
AA31_3 = AA20_3 + NONCANON_AA
# Chain alphabet for PDB chain naming: an underscore, then the uppercase
# letters, the lowercase letters, and the digits.
CHAIN_ALPHABET = "_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
# Standard atom indexing: backbone atom names in index order.
ATOMS_BB = ["N", "CA", "C", "O"]

# Three-letter residue code -> list of (atom, atom) name pairs. Judging by the
# name, each pair lists atoms that are symmetry-equivalent within the residue.
ATOM_SYMMETRIES = {
    "ARG": [("NH1", "NH2")],  # Correct handling of NH1 and NH2 is relabeling
    "ASP": [("OD1", "OD2")],
    "GLU": [("OE1", "OE2")],
    "PHE": [("CD1", "CD2"), ("CE1", "CE2")],
    "TYR": [("CD1", "CD2"), ("CE1", "CE2")],
}
# Per-residue counts, listed in AA20_3 order.
# Atom count is 4 plus the number of entries in the residue's "atoms" list;
# NOTE(review): the 4 presumably stands for the backbone atoms in ATOMS_BB,
# with AA_GEOMETRY listing only the remaining atoms - confirm.
AA20_NUM_ATOMS = [4 + len(AA_GEOMETRY[aa]["atoms"]) for aa in AA20_3]
# Number of entries in each residue's "chi_indices" list.
AA20_NUM_CHI = [len(AA_GEOMETRY[aa]["chi_indices"]) for aa in AA20_3]