Spaces:
Sleeping
Sleeping
File size: 2,752 Bytes
224a33f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
from functools import cache
from pathlib import Path
from huggingface_hub import snapshot_download
# Special-token ids for the sequence track. Each id equals the position of the
# token string noted beside it in SEQUENCE_VOCAB.
SEQUENCE_BOS_TOKEN: int = 0  # "<cls>"
SEQUENCE_PAD_TOKEN: int = 1  # "<pad>"
SEQUENCE_EOS_TOKEN: int = 2  # "<eos>"
SEQUENCE_CHAINBREAK_TOKEN: int = 31  # "|"
SEQUENCE_MASK_TOKEN: int = 32  # "<mask>"
# Number of entries in the structure codebook.
VQVAE_CODEBOOK_SIZE: int = 4096

# Special structure tokens are numbered consecutively starting at
# VQVAE_CODEBOOK_SIZE, in exactly this order:
# MASK=4096, EOS=4097, BOS=4098, PAD=4099, CHAINBREAK=4100.
VQVAE_SPECIAL_TOKENS: dict[str, int] = {
    name: VQVAE_CODEBOOK_SIZE + offset
    for offset, name in enumerate(("MASK", "EOS", "BOS", "PAD", "CHAINBREAK"))
}

# Bin counts / limits for the auxiliary outputs named in each constant.
VQVAE_DIRECTION_LOSS_BINS: int = 16
VQVAE_PAE_BINS: int = 64
VQVAE_MAX_PAE_BIN: float = 31.0
VQVAE_PLDDT_BINS: int = 50

# Flat aliases for the special structure-token ids defined above.
STRUCTURE_MASK_TOKEN: int = VQVAE_SPECIAL_TOKENS["MASK"]
STRUCTURE_BOS_TOKEN: int = VQVAE_SPECIAL_TOKENS["BOS"]
STRUCTURE_EOS_TOKEN: int = VQVAE_SPECIAL_TOKENS["EOS"]
STRUCTURE_PAD_TOKEN: int = VQVAE_SPECIAL_TOKENS["PAD"]
STRUCTURE_CHAINBREAK_TOKEN: int = VQVAE_SPECIAL_TOKENS["CHAINBREAK"]
# NOTE(review): unlike the ids above, 955 is below VQVAE_CODEBOOK_SIZE, i.e.
# inside the regular codebook range -- confirm this is intentional.
STRUCTURE_UNDEFINED_TOKEN: int = 955
# Pad / unknown token ids for the remaining tracks. Every track pads with 0;
# the SASA and SS8 tracks both use 2 for "unknown".
SASA_PAD_TOKEN: int = 0
SASA_UNK_TOKEN: int = 2
SS8_PAD_TOKEN: int = 0
SS8_UNK_TOKEN: int = 2
INTERPRO_PAD_TOKEN: int = 0
RESIDUE_PAD_TOKEN: int = 0

# String spellings of special tokens.
CHAIN_BREAK_STR: str = "|"
SEQUENCE_BOS_STR: str = "<cls>"
SEQUENCE_EOS_STR: str = "<eos>"
SEQUENCE_MASK_STR: str = "<mask>"
MASK_STR_SHORT: str = "_"
# The SASA and SS8 tracks use the "<unk>" string as their mask string.
SASA_MASK_STR: str = "<unk>"
SS8_MASK_STR: str = "<unk>"
# Sequence-track vocabulary (33 entries). The SEQUENCE_*_TOKEN ids in this
# module are positions in this list, so the order must not change.
SEQUENCE_VOCAB: list[str] = [
    "<cls>",
    "<pad>",
    "<eos>",
    "<unk>",
    # One single-character token per symbol, occupying positions 4..31.
    *"LAGVSERTIDPKQNFYMHWCXBUZO.-|",
    "<mask>",
]
# Label alphabets for 8-class and 3-class secondary structure.
SSE_8CLASS_VOCAB: str = "GHITEBSC"
SSE_3CLASS_VOCAB: str = "HEC"

# Collapse each 8-class label to a 3-class label. The two strings are aligned
# column by column: G,H,I -> H;  E,B -> E;  T,S,C -> C.
SSE_8CLASS_TO_3CLASS_MAP: dict[str, str] = dict(
    zip(
        SSE_8CLASS_VOCAB,  # "GHITEBSC"
        "HHHCEECC",
    )
)
# Bin edges for discretizing SASA values: 15 strictly increasing boundaries.
# fmt: off
SASA_DISCRETIZATION_BOUNDARIES: list[float] = [
    0.8, 4.0, 9.6, 16.4, 24.5,
    32.9, 42.0, 51.5, 61.2, 70.9,
    81.6, 93.3, 107.2, 125.4, 151.4,
]
# fmt: on
# Upper bound on residue annotations (presumably per position -- TODO confirm
# against the code that consumes it).
MAX_RESIDUE_ANNOTATIONS: int = 16
# Matches the "58641" suffix in the keyword/LSH data file names in this module.
TFIDF_VECTOR_SIZE: int = 58641
@cache
def data_root() -> Path:
    """Return the root directory for this module's data files.

    If ``esm/data`` exists relative to the current working directory, its
    parent (``esm``) is returned. Otherwise the
    ``EvolutionaryScale/esm3-sm-open-v1`` repository is fetched with
    ``snapshot_download`` and the local snapshot path is returned.
    Presumably the ``data/...`` path constants in this module are meant to be
    joined onto this root -- verify against callers.

    A successful result is memoized by ``functools.cache``, so the filesystem
    probe (and any download) is not repeated on later calls.

    This is a plain module-level function. It must not be wrapped in
    ``staticmethod``: outside a class body that wrapper has no purpose and
    makes the name uncallable on Python versions before 3.10.

    Returns:
        The root directory as a ``Path``.
    """
    # Try a few default directories.
    for candidate in ("esm/data",):
        local_data_dir = Path(candidate)
        if local_data_dir.exists():
            return local_data_dir.parent
    # No local copy found: download from Hugging Face instead.
    return Path(snapshot_download(repo_id="EvolutionaryScale/esm3-sm-open-v1"))
# Locations of the bundled data files, written as relative paths that start
# with "data/".
INTERPRO_ENTRY = "data/entry_list_safety_29026.list"
INTERPRO_HIERARCHY = "data/ParentChildTreeFile.txt"
# NOTE(review): this is the same file as INTERPRO_HIERARCHY. Confirm whether
# that is intended or whether a dedicated InterPro-to-GO file should be named.
INTERPRO2GO = "data/ParentChildTreeFile.txt"
INTERPRO_2ID = "data/tag_dict_4_safety_filtered.json"

# LSH hyperplane tables, keyed by a bit-width label.
LSH_TABLE_PATHS = {
    "8bit": "data/hyperplanes_8bit_58641.npz",
}

KEYWORDS_VOCABULARY = "data/keyword_vocabulary_safety_filtered_58641.txt"
KEYWORDS_IDF = "data/keyword_idf_safety_filtered_58641.npy"
RESID_CSV = "data/uniref90_and_mgnify90_residue_annotations_gt_1k_proteins.csv"
INTERPRO2KEYWORDS = "data/interpro_29026_to_keywords_58641.csv"