File size: 2,523 Bytes

e048d40

from fuson_plm.utils.logging import CustomParams

# Clustering Parameters
# Need to be stacked, because there are 4 properties
CLUSTER = CustomParams(
    # --- MMSeqs2 parameters: see GitHub or the MMSeqs2 Wiki for guidance ---
    # Minimum sequence identity, written as a fraction (0.3 = 30%).
    MIN_SEQ_ID=0.3,
    # Minimum sequence-length overlap (coverage), written as a fraction
    # (0.5 = 50%).
    C=0.5,
    # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage,
    # 3 = target-in-query length coverage.
    COV_MODE=1,
    # cluster-mode, per the MMSeqs2 docs: 0 = set cover (greedy),
    # 1 = connected component, 2/3 = greedy clustering by sequence length.
    CLUSTER_MODE=2,
    # --- File paths ---
    # Input CSV for clustering (the file name suggests sequences plus their
    # properties -- confirm against the clustering script).
    INPUT_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    # Path to where you installed MMSeqs2.
    PATH_TO_MMSEQS='../../mmseqs',
)

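# Split settings.
# NOTE(review): this note originally read "we'll be splitting the train set into
# train and val; we aren't touching test", but the active parameters below
# describe a two-stage split (all data -> train/other, then other -> val/test).
# Confirm which description is current.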
SPLIT = CustomParams(
    # Same CSV that CLUSTER.INPUT_PATH points at.
    IDR_DB_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    # Clustering results CSV (presumably produced by the clustering step --
    # confirm against the clustering script).
    CLUSTER_OUTPUT_PATH='clustering/mmseqs_full_results.csv',

    # Disabled single-split settings, kept for reference:
    #RANDOM_STATE = 7,    # random state for the split
    #VAL_SIZE = 0.10,     # val size for data -> train/val split, e.g. 0.20 means 80% clusters in train, 20% clusters in val

    # Split 1: all data -> train & other.
    # Random state for split 1.
    RANDOM_STATE_1=2,
    # Held-out size for split 1, as a fraction of clusters,
    # e.g. 0.20 means 80% clusters in train, 20% clusters in other.
    TEST_SIZE_1=0.21,

    # Split 2: "other" from split 1 -> val & test.
    # Random state for split 2.
    RANDOM_STATE_2=6,
    # Held-out size for split 2, as a fraction of clusters,
    # e.g. 0.50 means 50% clusters in val, 50% clusters in test.
    # NOTE(review): the previous comment described this as a train/val split,
    # which contradicts RANDOM_STATE_2's description -- confirm.
    TEST_SIZE_2=0.50,
)

# Which models to benchmark
TRAIN = CustomParams(
    # --- Model selection ---
    # Whether to include FusOn-pLM in the benchmark.
    BENCHMARK_FUSONPLM=True,
    # Either a dictionary (key = run name, values = epochs) or the string
    # "FusOn-pLM".
    FUSONPLM_CKPTS="FusOn-pLM",
    # Whether to include ESM in the benchmark.
    BENCHMARK_ESM=True,

    # --- GPU configs ---
    CUDA_VISIBLE_DEVICES="0",

    # --- Overwriting configs ---
    # If False, the script will halt if it believes these embeddings have
    # already been made.
    PERMISSION_TO_OVERWRITE_EMBEDDINGS=False,
    # Presumably the same guard for trained models: if False, the script will
    # halt if it believes these models have already been made.
    # NOTE(review): the previous comment said "embeddings" here -- confirm.
    PERMISSION_TO_OVERWRITE_MODELS=False,
)