# Spaces: Running  (appears to be a page-status banner captured with the file; not part of the module)
from functools import cache
from pathlib import Path

from huggingface_hub import snapshot_download
# Integer ids of the special tokens on the sequence track.
SEQUENCE_BOS_TOKEN = 0
SEQUENCE_PAD_TOKEN = 1
SEQUENCE_EOS_TOKEN = 2
SEQUENCE_CHAINBREAK_TOKEN = 31
SEQUENCE_MASK_TOKEN = 32
# Number of regular codes in the VQ-VAE codebook.
VQVAE_CODEBOOK_SIZE = 4096
# Special structure tokens take the five ids that immediately follow the
# codebook ids, i.e. VQVAE_CODEBOOK_SIZE .. VQVAE_CODEBOOK_SIZE + 4.
VQVAE_SPECIAL_TOKENS = {
    "MASK": VQVAE_CODEBOOK_SIZE,
    "EOS": VQVAE_CODEBOOK_SIZE + 1,
    "BOS": VQVAE_CODEBOOK_SIZE + 2,
    "PAD": VQVAE_CODEBOOK_SIZE + 3,
    "CHAINBREAK": VQVAE_CODEBOOK_SIZE + 4,
}
# Bin counts / limits used by the VQ-VAE heads.
VQVAE_DIRECTION_LOSS_BINS = 16
VQVAE_PAE_BINS = 64
VQVAE_MAX_PAE_BIN = 31.0
VQVAE_PLDDT_BINS = 50

# Flat aliases for the structure-track special tokens defined above.
STRUCTURE_MASK_TOKEN = VQVAE_SPECIAL_TOKENS["MASK"]
STRUCTURE_BOS_TOKEN = VQVAE_SPECIAL_TOKENS["BOS"]
STRUCTURE_EOS_TOKEN = VQVAE_SPECIAL_TOKENS["EOS"]
STRUCTURE_PAD_TOKEN = VQVAE_SPECIAL_TOKENS["PAD"]
STRUCTURE_CHAINBREAK_TOKEN = VQVAE_SPECIAL_TOKENS["CHAINBREAK"]
# NOTE(review): unlike the other structure tokens, 955 lies inside the regular
# codebook id range [0, VQVAE_CODEBOOK_SIZE); its meaning is not evident from
# this file — confirm against the tokenizer before changing.
STRUCTURE_UNDEFINED_TOKEN = 955
# Unknown / padding token ids for the SASA and SS8 tracks.
SASA_UNK_TOKEN = 2
SASA_PAD_TOKEN = 0
SS8_UNK_TOKEN = 2
SS8_PAD_TOKEN = 0
# Padding token ids for the InterPro and residue-annotation tracks.
INTERPRO_PAD_TOKEN = 0
RESIDUE_PAD_TOKEN = 0

# String forms of special tokens.
CHAIN_BREAK_STR = "|"
SEQUENCE_BOS_STR = "<cls>"
SEQUENCE_EOS_STR = "<eos>"
MASK_STR_SHORT = "_"
SEQUENCE_MASK_STR = "<mask>"
# The SASA and SS8 tracks use "<unk>" as their mask string.
SASA_MASK_STR = "<unk>"
SS8_MASK_STR = "<unk>"
# Sequence-track vocabulary; a token's id is its index in this list
# (e.g. "<cls>" is 0, "|" is 31, "<mask>" is 32), so the order must not change.
# fmt: off
SEQUENCE_VOCAB = [
    "<cls>", "<pad>", "<eos>", "<unk>",
    "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K",
    "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z",
    "O", ".", "-", "|",
    "<mask>",
]
# fmt: on
# Secondary-structure alphabets, one character per class
# (presumably DSSP-style codes — confirm against the tokenizer).
SSE_8CLASS_VOCAB = "GHITEBSC"
SSE_3CLASS_VOCAB = "HEC"
# Collapses each 8-class label onto a 3-class label:
# G/H/I -> H, E/B -> E, T/S/C -> C.
SSE_8CLASS_TO_3CLASS_MAP = {
    "G": "H",
    "H": "H",
    "I": "H",
    "T": "C",
    "E": "E",
    "B": "E",
    "S": "C",
    "C": "C",
}
# Fifteen strictly increasing cut points used to discretize SASA values
# (units are not stated in this file).
SASA_DISCRETIZATION_BOUNDARIES = [
    0.8,
    4.0,
    9.6,
    16.4,
    24.5,
    32.9,
    42.0,
    51.5,
    61.2,
    70.9,
    81.6,
    93.3,
    107.2,
    125.4,
    151.4,
]

MAX_RESIDUE_ANNOTATIONS = 16

TFIDF_VECTOR_SIZE = 58641
@cache
def data_root() -> Path:
    """Return the directory that contains the ``data/`` folder.

    A local ``esm/data`` directory (relative to the current working
    directory) is preferred; when it exists, its parent (``esm``) is
    returned. Otherwise the ``EvolutionaryScale/esm3-sm-open-v1`` snapshot is
    fetched with ``snapshot_download`` and its local path is returned.

    The result is memoized, so the filesystem probe and any download happen
    at most once per process; later calls return the same ``Path`` object.

    Returns:
        Path: root directory under which the ``data/...`` files live.
    """
    # Try a few default directories.
    for candidate in ("esm/data",):
        if (local := Path(candidate)).exists():
            return local.parent
    # Fall back to downloading from Hugging Face if no local copy exists.
    return Path(snapshot_download(repo_id="EvolutionaryScale/esm3-sm-open-v1"))
# Relative locations of the bundled data files
# (presumably joined onto data_root() by callers — confirm at the call sites).
INTERPRO_ENTRY = "data/entry_list_safety_29026.list"
INTERPRO_HIERARCHY = "data/ParentChildTreeFile.txt"
# NOTE(review): same path as INTERPRO_HIERARCHY — confirm whether a dedicated
# InterPro-to-GO file was intended before relying on this constant.
INTERPRO2GO = "data/ParentChildTreeFile.txt"
INTERPRO_2ID = "data/tag_dict_4_safety_filtered.json"

# LSH hyperplane tables, keyed by bit width.
LSH_TABLE_PATHS = {
    "8bit": "data/hyperplanes_8bit_58641.npz",
}

KEYWORDS_VOCABULARY = "data/keyword_vocabulary_safety_filtered_58641.txt"
KEYWORDS_IDF = "data/keyword_idf_safety_filtered_58641.npy"
RESID_CSV = "data/uniref90_and_mgnify90_residue_annotations_gt_1k_proteins.csv"
INTERPRO2KEYWORDS = "data/interpro_29026_to_keywords_58641.csv"