'''
Evaluate predictive performance of predictors in parallel with multiprocessing.
'''
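
# Example:
#   python evaluate.py sarkisyan onehot_ridge --predictor_params reg_coef=0.01
#
# Flow: each of --n_threads worker processes consumes evaluation tasks from a
# shared JoinableQueue and writes per-task results to files matching
# f'{outpath}*'; the main process then merges them into a single CSV.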

import argparse
import os
from multiprocessing import JoinableQueue, Process, set_start_method

from evaluate_multiprocessing import run_from_queue
from predictors import get_predictor_names
from utils import merge_dfs, parse_vars


def main():
    parser = argparse.ArgumentParser(
        description='Example: python evaluate.py sarkisyan onehot_ridge '
                    '--predictor_params reg_coef=0.01')
    parser.add_argument('dataset_name', type=str,
                        help='Dataset name. Folders of the same name are '
                             'expected under the data and inference '
                             'directories for looking up files. The data '
                             'will be loaded from the `seq` and '
                             '`log_fitness` columns of '
                             'data/{dataset_name}/data.csv.')
    parser.add_argument('predictor_name', type=str,
                        help='Predictor name, or `all` to run all '
                             'predictors.')
    parser.add_argument('--n_threads', type=int, default=20,
                        help='Number of worker processes.')
    parser.add_argument('--n_train', type=int, default=96)
    parser.add_argument('--max_n_mut', type=int, default=5)
    parser.add_argument('--joint_training', dest='joint_training',
                        action='store_true')
    parser.add_argument('--boosting', dest='joint_training',
                        action='store_false')
    parser.set_defaults(joint_training=True)
    parser.add_argument('--train_on_single', dest='train_on_single',
                        action='store_true')
    parser.add_argument('--train_on_all', dest='train_on_single',
                        action='store_false')
    parser.set_defaults(train_on_single=True)
    parser.add_argument('--ignore_gaps', dest='ignore_gaps',
                        action='store_true')
    parser.set_defaults(ignore_gaps=False)
    parser.add_argument('--n_seeds', type=int, default=20,
                        help='Number of random train/test splits used to '
                             'compute a confidence interval.')
    parser.add_argument('--metric_topk', type=int, default=96,
                        help='Cutoff k used when evaluating hit rate and '
                             'top-k mean.')
    parser.add_argument('--predictor_params',
                        metavar='KEY=VALUE',
                        nargs='+',
                        help='Set a number of key-value pairs '
                             '(do not put spaces before or after the = '
                             'sign). If a value contains spaces, define it '
                             'with double quotes: foo="this is a sentence". '
                             'Note that values are always treated as '
                             'floats.')
    parser.add_argument('--results_suffix', type=str, default='')
    args = parser.parse_args()
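
    # parse_vars (from utils) turns ['reg_coef=0.01', ...] into a dict like
    # {'reg_coef': 0.01}; per the --predictor_params help text, values are
    # always cast to float. A minimal sketch of such a parser (hypothetical;
    # the real implementation lives in utils.py):
    #
    #   def parse_vars(items):
    #       return {key: float(value)
    #               for key, value in (s.split('=', 1) for s in items or [])}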
    predictor_params = parse_vars(args.predictor_params)
    if args.ignore_gaps:
        predictor_params['ignore_gaps'] = args.ignore_gaps
    print(args)

    outdir = os.path.join('results', args.dataset_name)
    # makedirs with exist_ok=True also creates 'results' itself when missing
    # and avoids a race between an existence check and the mkdir call.
    os.makedirs(outdir, exist_ok=True)
    outpath = os.path.join(outdir, f'results{args.results_suffix}.csv')

    # Spawn a pool of worker processes, each pulling evaluation tasks off a
    # shared JoinableQueue.
    queue = JoinableQueue()
    workers = []
    for i in range(args.n_threads):
        p = Process(target=run_from_queue, args=(i, queue))
        workers.append(p)
        p.start()
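
    # run_from_queue (from evaluate_multiprocessing) is expected to loop
    # indefinitely: pull a task tuple off the queue, evaluate it, and
    # acknowledge it with task_done(). A hypothetical sketch of that contract
    # (`evaluate_task` stands in for the real per-task logic):
    #
    #   def run_from_queue(worker_id, queue):
    #       while True:
    #           task = queue.get()
    #           try:
    #               evaluate_task(*task)
    #           finally:
    #               queue.task_done()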

    # Enqueue one evaluation task per (predictor, seed) pair.
    predictors = get_predictor_names(args.predictor_name)
    for pn in predictors:
        for seed in range(args.n_seeds):
            queue.put((args.dataset_name, pn, args.joint_training,
                       args.n_train, args.metric_topk, args.max_n_mut,
                       args.train_on_single, args.ignore_gaps, seed,
                       predictor_params, outpath))

    # Block until every queued task has been matched by a task_done() call in
    # a worker, then tear down the workers, which do not exit on their own.
    queue.join()
    for p in workers:
        p.terminate()
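
    # merge_dfs (from utils) presumably globs the per-task result files
    # matching f'{outpath}*', drops duplicate rows on index_cols, aggregates
    # across seeds within groupby_cols, and writes the merged table to
    # outpath.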
    merge_dfs(f'{outpath}*', outpath,
              index_cols=['dataset', 'predictor', 'predictor_params', 'seed'],
              groupby_cols=['predictor', 'predictor_params', 'n_train', 'topk'],
              ignore_cols=['seed'])


if __name__ == '__main__':
    # Use the 'spawn' start method so each worker begins with a clean
    # interpreter state; 'spawn' is already the default on macOS and Windows.
    set_start_method('spawn')
    main()