File size: 2,597 Bytes
52da96f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import itertools
# One-letter residue codes, in alphabetical order (20 letters).
aa_list = list("ACDEFGHIKLMNPQRSTVWY")
# Same letters as a set, for O(1) membership tests.
aa_set = set(aa_list)

# Sequence and structure alphabets; both end with an extra "#" symbol.
# NOTE(review): names suggest these are the Foldseek residue / 3Di token
# alphabets and that "#" is a placeholder token -- confirm against callers.
foldseek_seq_vocab = "".join(aa_list) + "#"
foldseek_struc_vocab = "pynwrqhgdlvtmfsaeikc#"
# Characters from which vocabulary tokens are assembled.
struc_unit = "abcdefghijklmnopqrstuvwxyz"


def create_vocab(size: int) -> dict:
    """Build an index -> token vocabulary with ``size`` tokens plus a final "#".

    Tokens are fixed-length strings over ``struc_unit``, taken in
    ``itertools.product`` order. The token length is the smallest one for
    which ``len(struc_unit) ** token_len`` is at least ``size``.

    Args:
        size: Number of tokens to generate; the extra "#" entry is not
            counted in this number.

    Returns:
        vocab: Dict mapping ``0 .. size - 1`` to tokens and ``size`` to "#".
            For ``size == 0`` this is ``{0: "#"}``.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    # Grow the token length until enough distinct tokens exist.
    token_len = 1
    while size > len(struc_unit) ** token_len:
        token_len += 1

    vocab = {}
    for i, token in enumerate(itertools.product(struc_unit, repeat=token_len)):
        # Stop before storing, so size == 0 yields no tokens instead of
        # running through every combination and never returning a dict.
        if i >= size:
            break
        vocab[i] = "".join(token)

    # The "#" symbol always takes the index right after the last token.
    vocab[len(vocab)] = "#"
    return vocab
# ProTrek
# Annotation names grouped under "residue level".
# NOTE(review): these look like UniProt feature types tied to specific
# residues or ranges -- confirm against the code that consumes this set.
residue_level = {
    "Active site",
    "Binding site",
    "Site",
    "DNA binding",
    "Natural variant",
    "Mutagenesis",
    "Transmembrane",
    "Topological domain",
    "Intramembrane",
    "Signal peptide",
    "Propeptide",
    "Transit peptide",
    "Chain",
    "Peptide",
    "Modified residue",
    "Lipidation",
    "Glycosylation",
    "Disulfide bond",
    "Cross-link",
    "Domain",
    "Repeat",
    "Compositional bias",
    "Region",
    "Coiled coil",
    "Motif",
}
# Annotation names grouped under "sequence level".
# NOTE(review): these look like UniProt annotation sections that describe a
# whole entry rather than specific positions -- confirm against the consumer.
sequence_level = {
    "Function",
    "Miscellaneous",
    "Caution",
    "Catalytic activity",
    "Cofactor",
    "Activity regulation",
    "Biophysicochemical properties",
    "Pathway",
    "Involvement in disease",
    "Allergenic properties",
    "Toxic dose",
    "Pharmaceutical use",
    "Disruption phenotype",
    "Subcellular location",
    "Post-translational modification",
    "Subunit",
    "Domain (non-positional annotation)",
    "Sequence similarities",
    "RNA Editing",
    "Tissue specificity",
    "Developmental stage",
    "Induction",
    "Biotechnology",
    "Polymorphism",
    "GO annotation",
    "Proteomes",
    "Protein names",
    "Gene names",
    "Organism",
    "Taxonomic lineage",
    "Virus host",
}
# Annotation names grouped under "raw text level".
# NOTE(review): presumably the sections whose content is free-form text --
# confirm against the code that consumes this set.
raw_text_level = {
    "Function",
    "Subunit",
    "Tissue specificity",
    "Disruption phenotype",
    "Post-translational modification",
    "Induction",
    "Miscellaneous",
    "Sequence similarities",
    "Developmental stage",
    "Domain (non-positional annotation)",
    "Activity regulation",
    "Caution",
    "Polymorphism",
    "Toxic dose",
    "Allergenic properties",
    "Pharmaceutical use",
    "Cofactor",
    "Biophysicochemical properties",
    "Subcellular location",
    "RNA Editing",
}