# Training run configuration.
# The values describe a very small run: 10 training samples, 1 epoch,
# 5 evaluation samples, artifacts written under outputs/test_training.
#
# Keep one "key: value" pair per line. Several pairs on a single line are
# not a valid block mapping and are rejected by YAML parsers.
batch_size: 2
controller_dropout: 0.1
controller_layers: 2
controller_lr: 0.0001
dataset_name: gsm8k
epochs: 1
eval_baseline: true
eval_interval: 1
eval_samples: 5
# NOTE(review): presumably has to agree with the hidden width of the model
# named in model_name below; verify against the code that reads this file.
hidden_size: 2560
lambda_accuracy: 1.0
lambda_flops: 0.005
log_interval: 10
max_ponder_steps: 3
model_name: microsoft/phi-2
output_dir: outputs/test_training
save_interval: 1
threshold: 0.3
train_samples: 10
# NOTE(review): 100 is larger than train_samples * epochs / batch_size
# (10 * 1 / 2 = 5). If warmup is counted in optimizer steps, it would never
# finish during this run; confirm that is intended.
warmup_steps: 100