# anonymousforpaper's picture
# Upload 103 files
# 224a33f verified
from functools import cache
from pathlib import Path
from huggingface_hub import snapshot_download
# ---------------------------------------------------------------------------
# Sequence track: ids of the special tokens. Each id equals the position of
# the corresponding string ("<cls>", "<pad>", "<eos>", "|", "<mask>") in
# SEQUENCE_VOCAB.
# ---------------------------------------------------------------------------
SEQUENCE_BOS_TOKEN = 0
SEQUENCE_PAD_TOKEN = 1
SEQUENCE_EOS_TOKEN = 2
SEQUENCE_CHAINBREAK_TOKEN = 31
SEQUENCE_MASK_TOKEN = 32

# ---------------------------------------------------------------------------
# Structure track (VQ-VAE): the special tokens are numbered consecutively
# starting at VQVAE_CODEBOOK_SIZE, i.e. directly after the codebook indices
# 0 .. VQVAE_CODEBOOK_SIZE - 1.
# ---------------------------------------------------------------------------
VQVAE_CODEBOOK_SIZE = 4096
VQVAE_SPECIAL_TOKENS = {
    # The tuple order fixes both the ids and the dict's iteration order.
    name: VQVAE_CODEBOOK_SIZE + offset
    for offset, name in enumerate(("MASK", "EOS", "BOS", "PAD", "CHAINBREAK"))
}
VQVAE_DIRECTION_LOSS_BINS = 16
VQVAE_PAE_BINS = 64
VQVAE_MAX_PAE_BIN = 31.0
VQVAE_PLDDT_BINS = 50

# Flat aliases for the entries of VQVAE_SPECIAL_TOKENS.
STRUCTURE_MASK_TOKEN = VQVAE_SPECIAL_TOKENS["MASK"]
STRUCTURE_BOS_TOKEN = VQVAE_SPECIAL_TOKENS["BOS"]
STRUCTURE_EOS_TOKEN = VQVAE_SPECIAL_TOKENS["EOS"]
STRUCTURE_PAD_TOKEN = VQVAE_SPECIAL_TOKENS["PAD"]
STRUCTURE_CHAINBREAK_TOKEN = VQVAE_SPECIAL_TOKENS["CHAINBREAK"]
# Unlike the aliases above, this value lies inside the regular codebook range
# (955 < VQVAE_CODEBOOK_SIZE).
STRUCTURE_UNDEFINED_TOKEN = 955

# ---------------------------------------------------------------------------
# Remaining tracks: unknown / padding ids.
# ---------------------------------------------------------------------------
SASA_UNK_TOKEN = 2
SASA_PAD_TOKEN = 0

SS8_UNK_TOKEN = 2
SS8_PAD_TOKEN = 0

INTERPRO_PAD_TOKEN = 0
RESIDUE_PAD_TOKEN = 0

# ---------------------------------------------------------------------------
# String forms of special tokens.
# ---------------------------------------------------------------------------
CHAIN_BREAK_STR = "|"
SEQUENCE_BOS_STR = "<cls>"
SEQUENCE_EOS_STR = "<eos>"
MASK_STR_SHORT = "_"
SEQUENCE_MASK_STR = "<mask>"
# The SASA and SS8 tracks share the same mask string.
SASA_MASK_STR = SS8_MASK_STR = "<unk>"
# Sequence-track vocabulary. The SEQUENCE_*_TOKEN ids defined in this module
# are positions in this list, so the order must not change.
# Layout: 4 leading special tokens, 28 single-character symbols, "<mask>".
# fmt: off
SEQUENCE_VOCAB = [
    "<cls>", "<pad>", "<eos>", "<unk>",
    *"LAGVSERTIDPKQNFYMHWCXBUZO.-|",  # one vocabulary entry per character
    "<mask>",
]
# fmt: on
# Label alphabets for the 8-class and 3-class SSE schemes (one character per
# class).
SSE_8CLASS_VOCAB = "GHITEBSC"
SSE_3CLASS_VOCAB = "HEC"

# Reduction from each 8-class label to its 3-class label. The second string is
# aligned position-by-position with SSE_8CLASS_VOCAB:
#   G, H, I -> H      E, B -> E      T, S, C -> C
SSE_8CLASS_TO_3CLASS_MAP = dict(zip(SSE_8CLASS_VOCAB, "HHHCEECC"))
# Cut points used to discretize SASA values: 15 strictly increasing
# boundaries, which partition the number line into 16 intervals.
# fmt: off
SASA_DISCRETIZATION_BOUNDARIES = [
    0.8, 4.0, 9.6, 16.4, 24.5,
    32.9, 42.0, 51.5, 61.2, 70.9,
    81.6, 93.3, 107.2, 125.4, 151.4,
]
# fmt: on
# Upper bound on residue annotations (exact usage is defined by callers).
MAX_RESIDUE_ANNOTATIONS = 16
# Matches the "58641" suffix carried by the keyword data file names in this
# module.
TFIDF_VECTOR_SIZE = 58_641
# NOTE: deliberately NOT decorated with @staticmethod. This is a module-level
# function; a staticmethod object is not callable before Python 3.10 and, on
# newer versions, hides the cache helpers (cache_clear / cache_info).
@cache
def data_root() -> Path:
    """Return the directory that contains the ``data/`` resource folder.

    Resolution order:

    1. If ``esm/data`` exists relative to the current working directory,
       return its parent (``Path("esm")``).
    2. Otherwise download a snapshot of the
       ``EvolutionaryScale/esm3-sm-open-v1`` repository via
       ``huggingface_hub.snapshot_download`` and return the snapshot path.

    The result is memoized with ``functools.cache``, so the lookup (and any
    download) happens at most once per process; call
    ``data_root.cache_clear()`` to force it to be resolved again.

    Returns:
        Path to the root directory under which ``data/...`` files live.
    """
    # Try a few default local directories first.
    local_candidates = ("esm/data",)
    for candidate in local_candidates:
        candidate_path = Path(candidate)
        if candidate_path.exists():
            return candidate_path.parent

    # Nothing found locally: download from Hugging Face.
    return Path(snapshot_download(repo_id="EvolutionaryScale/esm3-sm-open-v1"))
# Relative locations of the bundled resource files. All of them live under a
# "data/" folder; presumably callers join them onto data_root() -- verify at
# the call sites.

# -- InterPro resources --
INTERPRO_ENTRY = "data/entry_list_safety_29026.list"
INTERPRO_HIERARCHY = "data/ParentChildTreeFile.txt"
# NOTE(review): identical to INTERPRO_HIERARCHY. Possibly a copy-paste slip;
# confirm which file the InterPro-to-GO mapping should point to.
INTERPRO2GO = "data/ParentChildTreeFile.txt"
INTERPRO_2ID = "data/tag_dict_4_safety_filtered.json"
INTERPRO2KEYWORDS = "data/interpro_29026_to_keywords_58641.csv"

# -- LSH hyperplane tables, keyed by bit width --
LSH_TABLE_PATHS = {"8bit": "data/hyperplanes_8bit_58641.npz"}

# -- Keyword vocabulary and IDF weights --
KEYWORDS_VOCABULARY = "data/keyword_vocabulary_safety_filtered_58641.txt"
KEYWORDS_IDF = "data/keyword_idf_safety_filtered_58641.npy"

# -- Residue annotations --
RESID_CSV = "data/uniref90_and_mgnify90_residue_annotations_gt_1k_proteins.csv"