diff --git "a/mrpc-log.txt" "b/mrpc-log.txt" new file mode 100644--- /dev/null +++ "b/mrpc-log.txt" @@ -0,0 +1,9183 @@ +/home/aiscuser/.local/lib/python3.8/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +2023/07/19 14:23:22 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of transformers. If you encounter errors during autologging, try upgrading / downgrading transformers to a supported version, or try upgrading MLflow. +2023/07/19 14:23:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn. +2023/07/19 14:23:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers. +Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none). +Downloading and preparing dataset glue/mrpc to /home/aiscuser/.cache/huggingface/datasets/glue/mrpc/1.0.0/a420f5e518f42454003587c47467370329f9fc0c6508d1ae0c45b58ea266a353... + Downloading data files: 0%| | 0/3 [00:00 +Training Arguments +TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +bf16=False, +bf16_full_eval=False, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_pin_memory=True, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +debug=[], +deepspeed=None, +disable_tqdm=False, +do_eval=True, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_steps=50, +evaluation_strategy=IntervalStrategy.STEPS, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +gradient_accumulation_steps=1, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=6e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=-1, +log_level=40, +log_level_replica=-1, +log_on_each_node=True, +logging_dir=/mnt/data/device-aware-bert/token_pruning/experiments/MRPC/reproduce1/s0.67_lr6e-05_reglr0.01_alpha0.0002_warmup150_bin50/runs/Jul19_14-23-23_node-0, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=25, +logging_strategy=IntervalStrategy.STEPS, +lr_scheduler_type=SchedulerType.LINEAR, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=200.0, +optim=OptimizerNames.ADAMW_HF, +output_dir=/mnt/data/device-aware-bert/token_pruning/experiments/MRPC/reproduce1/s0.67_lr6e-05_reglr0.01_alpha0.0002_warmup150_bin50, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=32, +per_device_train_batch_size=32, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +remove_unused_columns=True, +report_to=['mlflow'], +resume_from_checkpoint=None, +run_name=/mnt/data/device-aware-bert/token_pruning/experiments/MRPC/reproduce1/s0.67_lr6e-05_reglr0.01_alpha0.0002_warmup150_bin50, +save_on_each_node=False, +save_steps=0, +save_strategy=IntervalStrategy.STEPS, +save_total_limit=None, +seed=57, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +tpu_metrics_debug=False, +tpu_num_cores=None, 
+use_legacy_prediction_loop=False,
+warmup_ratio=0.0,
+warmup_steps=0,
+weight_decay=0.0,
+xpu_backend=None,
+)
+Additional Arguments
+AdditionalArguments(test=False, ex_name='s0.67_lr6e-05_reglr0.01_alpha0.0002_warmup150_bin50', pruning_type='token+pruner', reg_learning_rate=0.01, scheduler_type='linear', freeze_embeddings=True, pretrained_pruned_model=None, droprate_init=0.01, temperature=0.6666666666666666, prepruning_finetune_epochs=1, lagrangian_warmup_epochs=150, target_sparsity=0.67, sparsity_epsilon=0, distillation_path='/mnt/data/device-aware-bert/token_pruning/teachers/MRPC', do_distill=True, do_layer_distill=False, layer_distill_version=4, distill_loss_alpha=0.9, distill_ce_loss_alpha=0.0002, distill_temp=2.0, use_mac_l0=True, prune_location=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11], bin_num=50, topk=20)
+----------------------------------------------------------------------
+time: 2023-07-19 14:23:56
+Evaluating: f1: 0.8981, eval_loss: 0.4779, step: 0
+lambda_1: 0.0000, lambda_2: 0.0000 lambda_3: 0.0000
+Starting l0 regularization! temperature: 0.67, init drop rate: 0.01 token_loga shape: [10, 50] prune location: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+NDCG TOPK= 20
+loss: 0.844315, lagrangian_loss: -0.000053, attention_score_distillation_loss: 0.001977
+----------------------------------------------------------------------
+time: 2023-07-19 14:24:11
+Evaluating: f1: 0.8975, eval_loss: 0.4174, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0019, step: 50
+lambda_1: -0.1599, lambda_2: 0.4514 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.927653, lagrangian_loss: 0.002207, attention_score_distillation_loss: 0.001969 +ETA: 1:48:18 | Epoch 0 finished. Took 32.66 seconds. +loss: 0.056119, lagrangian_loss: 0.002750, attention_score_distillation_loss: 0.001965 +---------------------------------------------------------------------- +time: 2023-07-19 14:24:40 +Evaluating: f1: 0.881, eval_loss: 0.7317, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0057, step: 150 +lambda_1: -1.5601, lambda_2: 1.9629 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.200542, lagrangian_loss: 0.002356, attention_score_distillation_loss: 0.001962 +loss: 0.186767, lagrangian_loss: 0.000804, attention_score_distillation_loss: 0.001959 +---------------------------------------------------------------------- +time: 2023-07-19 14:24:54 +Evaluating: f1: 0.8981, eval_loss: 0.5048, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0077, step: 200 +lambda_1: -1.7949, lambda_2: 2.1305 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.057280, lagrangian_loss: -0.001841, attention_score_distillation_loss: 0.001957 +loss: 0.205831, lagrangian_loss: -0.004043, attention_score_distillation_loss: 0.001953 +ETA: 1:47:39 | Epoch 1 finished. Took 32.59 seconds. 
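+Note on the gates being trained above: the "Starting l0 regularization!" banner (temperature 0.67, init drop rate 0.01, token_loga shape [10, 50]) is consistent with hard-concrete L0 gates in the style of Louizos et al., one 50-bin gate vector per prune location. A minimal sketch under that assumption (names are illustrative, not the repo's actual code):
+
+import torch
+
+LIMIT_L, LIMIT_R = -0.1, 1.1   # stretch interval; a common hard-concrete default
+BETA = 2.0 / 3.0               # the "temperature: 0.67" in the log
+
+def init_token_loga(num_locations=10, num_bins=50, droprate_init=0.01):
+    # mean chosen so the initial keep probability is 1 - droprate_init
+    mean = torch.log(torch.tensor((1.0 - droprate_init) / droprate_init))
+    return (mean + 0.01 * torch.randn(num_locations, num_bins)).requires_grad_()
+
+def sample_z(loga):
+    # stochastic gate used during training
+    u = torch.rand_like(loga).clamp(1e-6, 1.0 - 1e-6)
+    s = torch.sigmoid((torch.log(u) - torch.log(1.0 - u) + loga) / BETA)
+    return (s * (LIMIT_R - LIMIT_L) + LIMIT_L).clamp(0.0, 1.0)
+
+def deterministic_z(loga):
+    # inference-time gate; thresholding it yields 0/1 rows like those printed above
+    s = torch.sigmoid(loga)
+    return (s * (LIMIT_R - LIMIT_L) + LIMIT_L).clamp(0.0, 1.0)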
+---------------------------------------------------------------------- +time: 2023-07-19 14:25:09 +Evaluating: f1: 0.8988, eval_loss: 0.4738, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0096, step: 250 +lambda_1: -1.1944, lambda_2: 2.8107 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.99 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.085462, lagrangian_loss: -0.005739, attention_score_distillation_loss: 0.001950 +loss: 0.047988, lagrangian_loss: -0.004037, attention_score_distillation_loss: 0.001947 +---------------------------------------------------------------------- +time: 2023-07-19 14:25:23 +Evaluating: f1: 0.8912, eval_loss: 0.4565, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0116, step: 300 +lambda_1: -0.0979, lambda_2: 4.2245 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.98 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.102424, lagrangian_loss: -0.000476, attention_score_distillation_loss: 0.001944 +loss: 0.050687, lagrangian_loss: 0.002352, attention_score_distillation_loss: 0.001941 +ETA: 1:46:59 | Epoch 2 finished. Took 32.51 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:25:38 +Evaluating: f1: 0.9016, eval_loss: 0.4389, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0135, step: 350 +lambda_1: 0.7380, lambda_2: 5.1095 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.99 1. 
] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.027416, lagrangian_loss: 0.002573, attention_score_distillation_loss: 0.001939 +loss: 0.040867, lagrangian_loss: 0.000737, attention_score_distillation_loss: 0.001938 +---------------------------------------------------------------------- +time: 2023-07-19 14:25:52 +Evaluating: f1: 0.913, eval_loss: 0.3877, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0155, step: 400 +lambda_1: 0.9770, lambda_2: 5.2526 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.99 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.029649, lagrangian_loss: -0.001590, attention_score_distillation_loss: 0.001933 +loss: 0.028804, lagrangian_loss: -0.002968, attention_score_distillation_loss: 0.001931 +---------------------------------------------------------------------- +time: 2023-07-19 14:26:06 +Evaluating: f1: 0.8932, eval_loss: 0.4944, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0174, step: 450 +lambda_1: 0.5783, lambda_2: 5.5013 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 1. 1. 1. 0.99 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.148262, lagrangian_loss: -0.002896, attention_score_distillation_loss: 0.001929 +ETA: 1:48:10 | Epoch 3 finished. Took 34.71 seconds. 
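+The lambda_1/lambda_2 traces and the sign-flipping lagrangian_loss above match a CoFi-style Lagrangian constraint on expected sparsity, loss_lag = lambda_1 * (s - t) + lambda_2 * (s - t)^2, with the multipliers trained adversarially (gradient ascent, presumably at the reg_learning_rate=0.01 from AdditionalArguments). A sketch under that assumption:
+
+import torch
+
+lambda_1 = torch.zeros(1, requires_grad=True)  # both start at 0.0000, as logged at step 0
+lambda_2 = torch.zeros(1, requires_grad=True)
+
+def lagrangian_regularization(expected_sparsity: torch.Tensor, target_sparsity: float):
+    gap = expected_sparsity - target_sparsity
+    # ascent on the multipliers, descent on the gates; the term can go
+    # negative while lambda_1 drifts below zero, exactly as in the log
+    return lambda_1 * gap + lambda_2 * gap * gap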
+loss: 0.027041, lagrangian_loss: -0.001112, attention_score_distillation_loss: 0.001927 +---------------------------------------------------------------------- +time: 2023-07-19 14:26:21 +Evaluating: f1: 0.898, eval_loss: 0.6168, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0193, step: 500 +lambda_1: -0.2234, lambda_2: 6.3774 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 1. 1. 1. 0.99 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.120027, lagrangian_loss: 0.002113, attention_score_distillation_loss: 0.001923 +loss: 0.027218, lagrangian_loss: 0.006355, attention_score_distillation_loss: 0.001919 +---------------------------------------------------------------------- +time: 2023-07-19 14:26:35 +Evaluating: f1: 0.8897, eval_loss: 0.5281, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0213, step: 550 +lambda_1: -1.1509, lambda_2: 7.5535 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 1. 1. 1. 0.99 1. ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.028274, lagrangian_loss: 0.010838, attention_score_distillation_loss: 0.001915 +ETA: 1:47:17 | Epoch 4 finished. Took 32.59 seconds. +loss: 0.086222, lagrangian_loss: 0.014110, attention_score_distillation_loss: 0.001913 +---------------------------------------------------------------------- +time: 2023-07-19 14:26:50 +Evaluating: f1: 0.8862, eval_loss: 0.5352, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0232, step: 600 +lambda_1: -1.9918, lambda_2: 8.5427 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 
0.99 0.98 0.99] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.228697, lagrangian_loss: 0.013522, attention_score_distillation_loss: 0.001910 +loss: 0.092849, lagrangian_loss: 0.005699, attention_score_distillation_loss: 0.001909 +---------------------------------------------------------------------- +time: 2023-07-19 14:27:04 +Evaluating: f1: 0.8998, eval_loss: 0.5424, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0253, expected_sparsity: 0.0219, expected_sequence_sparsity: 0.5996, target_sparsity: 0.0252, step: 650 +lambda_1: -2.3202, lambda_2: 8.8889 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.94 0.96] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111011111111100 +11111111111111111111111111111111111111111111111111 +loss: 0.073946, lagrangian_loss: -0.020756, attention_score_distillation_loss: 0.001913 +loss: 0.016344, lagrangian_loss: -0.046345, attention_score_distillation_loss: 0.001901 +ETA: 1:46:31 | Epoch 5 finished. Took 32.63 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:27:19 +Evaluating: f1: 0.877, eval_loss: 0.5658, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0761, expected_sparsity: 0.075, expected_sequence_sparsity: 0.6213, target_sparsity: 0.0271, step: 700 +lambda_1: -0.8683, lambda_2: 10.9040 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.99 0.99 0.98 0.85 0.82] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.68] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.56] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011111110100 +11111000111101011111111101011011110011110100011001 +loss: 0.030512, lagrangian_loss: -0.016740, attention_score_distillation_loss: 0.001901 +loss: 0.043250, lagrangian_loss: 0.017966, attention_score_distillation_loss: 0.001897 +---------------------------------------------------------------------- +time: 2023-07-19 14:27:33 +Evaluating: f1: 0.8679, eval_loss: 0.498, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0337, expected_sparsity: 0.0306, expected_sequence_sparsity: 0.6031, target_sparsity: 0.0291, step: 750 +lambda_1: 0.5728, lambda_2: 12.4660 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 0.99 0.98 0.89 0.9 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.138288, lagrangian_loss: 0.020294, attention_score_distillation_loss: 0.001894 +loss: 0.012806, lagrangian_loss: 0.009389, attention_score_distillation_loss: 0.001895 +---------------------------------------------------------------------- +time: 2023-07-19 14:27:48 +Evaluating: f1: 0.8822, eval_loss: 0.5555, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0211, expected_sparsity: 0.0175, expected_sequence_sparsity: 0.5978, target_sparsity: 0.031, step: 800 +lambda_1: 1.0413, lambda_2: 12.7199 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.94 0.96] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111011111111110 +11111111111111111111111111111111111111111111111111 +loss: 0.075717, lagrangian_loss: -0.000495, attention_score_distillation_loss: 0.001889 +ETA: 1:46:49 | Epoch 6 finished. Took 34.79 seconds. 
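+The per-layer summaries above can be read straight off the printed mask rows: each 50-character row appears to be one prune location's keep(1)/drop(0) decision over the 50 token bins (bin_num=50), "infer remain" is each row's fraction of 1s, and "layerwise remain" is the running product over all 12 layers (the two leading 1.0s are layers 0-1, which are not in prune_location). A small illustrative helper:
+
+def summarize(mask_rows):
+    # mask_rows: list of "0"/"1" strings, one per prune location
+    infer_remain = [row.count("1") / len(row) for row in mask_rows]
+    layerwise, acc = [1.0, 1.0], 1.0   # layers 0-1 are never pruned
+    for r in infer_remain:
+        acc *= r                        # a token dropped earlier stays dropped
+        layerwise.append(round(acc, 2))
+    return infer_remain, layerwise
+
+At step 700 the last two rows keep 41/50 and 34/50 bins (0.82 and 0.68), and the running product ends at 0.82 * 0.68 = 0.5576, i.e. the 0.56 shown in "layerwise remain".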
+loss: 0.146381, lagrangian_loss: -0.006434, attention_score_distillation_loss: 0.001886 +---------------------------------------------------------------------- +time: 2023-07-19 14:28:02 +Evaluating: f1: 0.8846, eval_loss: 0.6134, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0169, expected_sparsity: 0.0131, expected_sequence_sparsity: 0.596, target_sparsity: 0.0329, step: 850 +lambda_1: 0.9062, lambda_2: 12.7535 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.97 0.98] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.94] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111110111011111111110 +11111111111111111111111111111111111111111111111111 +loss: 0.114676, lagrangian_loss: -0.008415, attention_score_distillation_loss: 0.001884 +loss: 0.013280, lagrangian_loss: -0.007489, attention_score_distillation_loss: 0.001882 +---------------------------------------------------------------------- +time: 2023-07-19 14:28:17 +Evaluating: f1: 0.8893, eval_loss: 0.5554, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0349, step: 900 +lambda_1: 0.4872, lambda_2: 12.9000 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.97 0.98] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.076615, lagrangian_loss: -0.004464, attention_score_distillation_loss: 0.001879 +ETA: 1:46:01 | Epoch 7 finished. Took 32.58 seconds. +loss: 0.166178, lagrangian_loss: -0.000106, attention_score_distillation_loss: 0.001874 +---------------------------------------------------------------------- +time: 2023-07-19 14:28:31 +Evaluating: f1: 0.8915, eval_loss: 0.5416, token_prune_loc: [False, False, False, False, False, False, False, False, False, False], macs_sparsity: 0.0, expected_sparsity: 0.0, expected_sequence_sparsity: 0.5906, target_sparsity: 0.0368, step: 950 +lambda_1: -0.0527, lambda_2: 13.1468 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 
0.99 0.97 0.98] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +loss: 0.028054, lagrangian_loss: 0.004809, attention_score_distillation_loss: 0.001872 +loss: 0.064744, lagrangian_loss: 0.009240, attention_score_distillation_loss: 0.001867 +---------------------------------------------------------------------- +time: 2023-07-19 14:28:45 +Evaluating: f1: 0.8916, eval_loss: 0.5736, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0211, expected_sparsity: 0.0175, expected_sequence_sparsity: 0.5978, target_sparsity: 0.0388, step: 1000 +lambda_1: -0.6015, lambda_2: 13.4139 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.99 0.96 0.97] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111011111111110 +11111111111111111111111111111111111111111111111111 +loss: 0.018114, lagrangian_loss: 0.011986, attention_score_distillation_loss: 0.001863 +loss: 0.016452, lagrangian_loss: 0.011391, attention_score_distillation_loss: 0.001861 +ETA: 1:45:14 | Epoch 8 finished. Took 32.47 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:29:00 +Evaluating: f1: 0.8792, eval_loss: 0.5706, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0253, expected_sparsity: 0.0219, expected_sequence_sparsity: 0.5996, target_sparsity: 0.0407, step: 1050 +lambda_1: -1.0153, lambda_2: 13.5753 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 
0.99 0.94 0.95] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111011111110110 +11111111111111111111111111111111111111111111111111 +loss: 0.152066, lagrangian_loss: 0.006394, attention_score_distillation_loss: 0.001865 +loss: 0.022090, lagrangian_loss: -0.002813, attention_score_distillation_loss: 0.001855 +---------------------------------------------------------------------- +time: 2023-07-19 14:29:14 +Evaluating: f1: 0.89, eval_loss: 0.6287, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0337, expected_sparsity: 0.0306, expected_sequence_sparsity: 0.6031, target_sparsity: 0.0426, step: 1100 +lambda_1: -1.0236, lambda_2: 13.6107 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 0.99 0.98 0.9 0.88] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.014114, lagrangian_loss: -0.011022, attention_score_distillation_loss: 0.001853 +loss: 0.095718, lagrangian_loss: -0.010398, attention_score_distillation_loss: 0.001850 +---------------------------------------------------------------------- +time: 2023-07-19 14:29:29 +Evaluating: f1: 0.884, eval_loss: 0.6404, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0719, expected_sparsity: 0.0693, expected_sequence_sparsity: 0.619, target_sparsity: 0.0446, step: 1150 +lambda_1: -0.4384, lambda_2: 13.9504 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.98 0.88 0.84] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.7] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.59] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111000111101011111111101011011110011110110011001 +ETA: 1:45:10 | Epoch 9 finished. Took 34.59 seconds. 
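+The target_sparsity column ramps linearly (0.0019 at step 50, 0.0038 at step 100, ...), consistent with a linear Lagrangian warmup toward target_sparsity=0.67 over lagrangian_warmup_epochs=150. MRPC has 3,668 training pairs, so batch size 32 gives roughly 115 steps per epoch (which is also why "Epoch 0 finished" lands between the step-100 and step-150 evaluations). A sketch of that schedule, with the step count per epoch an assumption:
+
+def warmed_up_target(step, final=0.67, warmup_epochs=150, steps_per_epoch=115):
+    warmup_steps = warmup_epochs * steps_per_epoch   # ~17,250
+    return final * min(1.0, step / warmup_steps)
+
+warmed_up_target(50) and warmed_up_target(1000) give ~0.0019 and ~0.0388, matching the logged ramp to within a rounding digit.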
+loss: 0.014720, lagrangian_loss: -0.002838, attention_score_distillation_loss: 0.001850 +loss: 0.014295, lagrangian_loss: 0.002671, attention_score_distillation_loss: 0.001843 +---------------------------------------------------------------------- +time: 2023-07-19 14:29:43 +Evaluating: f1: 0.8976, eval_loss: 0.499, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0337, expected_sparsity: 0.0306, expected_sequence_sparsity: 0.6031, target_sparsity: 0.0465, step: 1200 +lambda_1: 0.1805, lambda_2: 14.3359 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.98 0.89 0.88] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.041405, lagrangian_loss: 0.003423, attention_score_distillation_loss: 0.001841 +loss: 0.103779, lagrangian_loss: 0.001181, attention_score_distillation_loss: 0.001838 +---------------------------------------------------------------------- +time: 2023-07-19 14:29:58 +Evaluating: f1: 0.8716, eval_loss: 0.5361, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0295, expected_sparsity: 0.0263, expected_sequence_sparsity: 0.6013, target_sparsity: 0.0485, step: 1250 +lambda_1: 0.3764, lambda_2: 14.3985 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.98 0.91 0.92] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110110 +11111111111111111111111111111111111111111111111111 +loss: 0.079066, lagrangian_loss: -0.000929, attention_score_distillation_loss: 0.001835 +ETA: 1:44:27 | Epoch 10 finished. Took 32.69 seconds. +loss: 0.007130, lagrangian_loss: -0.001459, attention_score_distillation_loss: 0.001835 +---------------------------------------------------------------------- +time: 2023-07-19 14:30:12 +Evaluating: f1: 0.8835, eval_loss: 0.5822, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0295, expected_sparsity: 0.0263, expected_sequence_sparsity: 0.6013, target_sparsity: 0.0504, step: 1300 +lambda_1: 0.2121, lambda_2: 14.4316 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 
0.98 0.92 0.93] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110110 +11111111111111111111111111111111111111111111111111 +loss: 0.036706, lagrangian_loss: -0.000766, attention_score_distillation_loss: 0.001830 +loss: 0.175276, lagrangian_loss: 0.000557, attention_score_distillation_loss: 0.001829 +---------------------------------------------------------------------- +time: 2023-07-19 14:30:27 +Evaluating: f1: 0.8822, eval_loss: 0.5845, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0295, expected_sparsity: 0.0263, expected_sequence_sparsity: 0.6013, target_sparsity: 0.0524, step: 1350 +lambda_1: -0.0910, lambda_2: 14.5182 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 0.98 0.92 0.92] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110110 +11111111111111111111111111111111111111111111111111 +loss: 0.017843, lagrangian_loss: 0.001768, attention_score_distillation_loss: 0.001826 +loss: 0.020944, lagrangian_loss: 0.001759, attention_score_distillation_loss: 0.001823 +ETA: 1:43:45 | Epoch 11 finished. Took 32.55 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:30:41 +Evaluating: f1: 0.8843, eval_loss: 0.567, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0337, expected_sparsity: 0.0306, expected_sequence_sparsity: 0.6031, target_sparsity: 0.0543, step: 1400 +lambda_1: -0.3260, lambda_2: 14.5733 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.99 1. 
0.98 0.9 0.89] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.013917, lagrangian_loss: 0.000552, attention_score_distillation_loss: 0.001820 +loss: 0.032516, lagrangian_loss: -0.000790, attention_score_distillation_loss: 0.001817 +---------------------------------------------------------------------- +time: 2023-07-19 14:30:56 +Evaluating: f1: 0.8889, eval_loss: 0.6394, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0379, expected_sparsity: 0.035, expected_sequence_sparsity: 0.6049, target_sparsity: 0.0562, step: 1450 +lambda_1: -0.2910, lambda_2: 14.5836 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.98 0.89 0.86] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.84] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.012617, lagrangian_loss: -0.001138, attention_score_distillation_loss: 0.001813 +loss: 0.014826, lagrangian_loss: -0.000614, attention_score_distillation_loss: 0.001809 +ETA: 1:43:08 | Epoch 12 finished. Took 32.86 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:31:10 +Evaluating: f1: 0.8931, eval_loss: 0.5337, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0719, expected_sparsity: 0.0693, expected_sequence_sparsity: 0.619, target_sparsity: 0.0582, step: 1500 +lambda_1: -0.0762, lambda_2: 14.6249 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 
0.98 0.88 0.85] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.7] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.59] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111100111101011111111101011011110011110100011001 +loss: 0.016289, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.001807 +loss: 0.007013, lagrangian_loss: 0.000163, attention_score_distillation_loss: 0.001805 +---------------------------------------------------------------------- +time: 2023-07-19 14:31:25 +Evaluating: f1: 0.8901, eval_loss: 0.4902, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0379, expected_sparsity: 0.035, expected_sequence_sparsity: 0.6049, target_sparsity: 0.0601, step: 1550 +lambda_1: 0.0650, lambda_2: 14.6452 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.98 0.89 0.86] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.84] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.007331, lagrangian_loss: -0.000001, attention_score_distillation_loss: 0.001800 +loss: 0.008404, lagrangian_loss: -0.000064, attention_score_distillation_loss: 0.001799 +---------------------------------------------------------------------- +time: 2023-07-19 14:31:39 +Evaluating: f1: 0.8985, eval_loss: 0.5508, token_prune_loc: [False, False, False, False, False, False, False, False, True, False], macs_sparsity: 0.0337, expected_sparsity: 0.0306, expected_sequence_sparsity: 0.6031, target_sparsity: 0.0621, step: 1600 +lambda_1: 0.0176, lambda_2: 14.6497 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.98 0.89 0.86] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 1.0] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101111011111110111011111110100 +11111111111111111111111111111111111111111111111111 +loss: 0.023607, lagrangian_loss: 0.000078, attention_score_distillation_loss: 0.001797 +ETA: 1:42:59 | Epoch 13 finished. Took 34.89 seconds. 
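+On the attention_score_distillation_loss that decays slowly throughout: together with the "NDCG TOPK= 20" banner and topk=20 in AdditionalArguments, it suggests a ranking-style distillation in which the student is pushed to preserve the teacher's ordering of the ~20 most-attended tokens, so that rank-based token pruning keeps the tokens the teacher found important. A generic listwise sketch, not the repo's exact objective:
+
+import torch.nn.functional as F
+
+def attention_rank_distill(student_attn, teacher_attn, topk=20):
+    # attn: [batch, heads, seq, seq]; a token's importance = attention it
+    # receives, averaged over heads and query positions
+    s_imp = student_attn.mean(dim=1).mean(dim=1)    # [batch, seq]
+    t_imp = teacher_attn.mean(dim=1).mean(dim=1)
+    idx = t_imp.topk(topk, dim=-1).indices          # teacher's top-k tokens
+    s_sel = s_imp.gather(-1, idx)
+    t_sel = t_imp.gather(-1, idx)
+    # match the student's distribution over those tokens (ListNet-style)
+    return F.kl_div(F.log_softmax(s_sel, dim=-1), F.softmax(t_sel, dim=-1),
+                    reduction="batchmean")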
+loss: 0.010771, lagrangian_loss: 0.000304, attention_score_distillation_loss: 0.001793 +---------------------------------------------------------------------- +time: 2023-07-19 14:31:54 +Evaluating: f1: 0.8865, eval_loss: 0.5213, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0719, expected_sparsity: 0.0693, expected_sequence_sparsity: 0.619, target_sparsity: 0.064, step: 1650 +lambda_1: -0.1047, lambda_2: 14.6620 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.98 0.89 0.85] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.7] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.59] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111100111101011111111101011011110011110100011001 +loss: 0.079100, lagrangian_loss: 0.000255, attention_score_distillation_loss: 0.001788 +loss: 0.018883, lagrangian_loss: 0.000027, attention_score_distillation_loss: 0.001787 +---------------------------------------------------------------------- +time: 2023-07-19 14:32:08 +Evaluating: f1: 0.879, eval_loss: 0.6022, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0745, expected_sparsity: 0.0715, expected_sequence_sparsity: 0.6199, target_sparsity: 0.066, step: 1700 +lambda_1: -0.1376, lambda_2: 14.6646 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.98 0.88 0.83] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.68] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.57] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111000111101011111111101011011110011110100011001 +loss: 0.007429, lagrangian_loss: -0.000146, attention_score_distillation_loss: 0.001785 +ETA: 1:42:18 | Epoch 14 finished. Took 32.66 seconds. +loss: 0.006198, lagrangian_loss: -0.000159, attention_score_distillation_loss: 0.001782 +---------------------------------------------------------------------- +time: 2023-07-19 14:32:23 +Evaluating: f1: 0.8858, eval_loss: 0.6018, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0745, expected_sparsity: 0.0715, expected_sequence_sparsity: 0.6199, target_sparsity: 0.0679, step: 1750 +lambda_1: -0.0622, lambda_2: 14.6694 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 
0.97 0.87 0.82] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.68] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.57] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111100111101011111111101011011110010110100011001 +loss: 0.007924, lagrangian_loss: -0.000064, attention_score_distillation_loss: 0.001779 +loss: 0.016277, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.001777 +---------------------------------------------------------------------- +time: 2023-07-19 14:32:37 +Evaluating: f1: 0.8983, eval_loss: 0.6013, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0745, expected_sparsity: 0.0715, expected_sequence_sparsity: 0.6199, target_sparsity: 0.0698, step: 1800 +lambda_1: 0.0089, lambda_2: 14.6740 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.87 0.82] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.68] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.57] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111100111101011111111101011011110010110100011001 +loss: 0.007964, lagrangian_loss: -0.000001, attention_score_distillation_loss: 0.001772 +loss: 0.009511, lagrangian_loss: 0.000016, attention_score_distillation_loss: 0.001771 +ETA: 1:41:39 | Epoch 15 finished. Took 32.66 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:32:52 +Evaluating: f1: 0.8962, eval_loss: 0.5568, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0745, expected_sparsity: 0.0715, expected_sequence_sparsity: 0.6199, target_sparsity: 0.0718, step: 1850 +lambda_1: -0.0248, lambda_2: 14.6755 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 
0.97 0.87 0.82] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.68] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.57] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011111110111011111110100 +11111100111101011111111101011011110010110100011001 +loss: 0.093316, lagrangian_loss: 0.000047, attention_score_distillation_loss: 0.001766 +loss: 0.004602, lagrangian_loss: -0.000002, attention_score_distillation_loss: 0.001766 +---------------------------------------------------------------------- +time: 2023-07-19 14:33:06 +Evaluating: f1: 0.8889, eval_loss: 0.4992, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0787, expected_sparsity: 0.0772, expected_sequence_sparsity: 0.6222, target_sparsity: 0.0737, step: 1900 +lambda_1: -0.0447, lambda_2: 14.6765 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.86 0.8 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.66] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.54] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011111110100 +11111000111101011111111101011011110010110100011001 +loss: 0.284785, lagrangian_loss: -0.000015, attention_score_distillation_loss: 0.001761 +loss: 0.023119, lagrangian_loss: 0.000001, attention_score_distillation_loss: 0.001757 +---------------------------------------------------------------------- +time: 2023-07-19 14:33:21 +Evaluating: f1: 0.8938, eval_loss: 0.6179, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0787, expected_sparsity: 0.0772, expected_sequence_sparsity: 0.6222, target_sparsity: 0.0757, step: 1950 +lambda_1: -0.0378, lambda_2: 14.6768 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.86 0.79] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.66] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.54] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011111110100 +11111100111101011101111101011011110010110100011001 +loss: 0.015058, lagrangian_loss: 0.000037, attention_score_distillation_loss: 0.001754 +ETA: 1:41:26 | Epoch 16 finished. Took 34.96 seconds. 
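+For the printed "loss" itself: with do_distill=True the student is presumably trained against the frozen MRPC teacher (distillation_path above) at distill_temp=2.0, and the three printed terms are summed using the AdditionalArguments weights. Which alpha scales which term is an assumption here (distill_loss_alpha=0.9, distill_ce_loss_alpha=0.0002):
+
+import torch.nn.functional as F
+
+def blended_loss(student_logits, teacher_logits, rank_distill_loss, lagrangian,
+                 T=2.0, alpha=0.9, ce_alpha=0.0002):
+    # temperature-scaled KD against the teacher's soft labels (Hinton-style)
+    kd = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
+                  F.softmax(teacher_logits / T, dim=-1),
+                  reduction="batchmean") * (T * T)
+    return alpha * rank_distill_loss + ce_alpha * kd + lagrangian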
+loss: 0.170983, lagrangian_loss: 0.000048, attention_score_distillation_loss: 0.001751 +---------------------------------------------------------------------- +time: 2023-07-19 14:33:35 +Evaluating: f1: 0.8991, eval_loss: 0.5506, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0787, expected_sparsity: 0.0772, expected_sequence_sparsity: 0.6222, target_sparsity: 0.0776, step: 2000 +lambda_1: -0.0687, lambda_2: 14.6775 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.85 0.78] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.66] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.54] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011111110100 +11111100111101011101111101011011110010110100011001 +loss: 0.004562, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.001748 +loss: 0.008097, lagrangian_loss: -0.000053, attention_score_distillation_loss: 0.001746 +---------------------------------------------------------------------- +time: 2023-07-19 14:33:50 +Evaluating: f1: 0.8956, eval_loss: 0.5843, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0855, expected_sparsity: 0.0828, expected_sequence_sparsity: 0.6245, target_sparsity: 0.0795, step: 2050 +lambda_1: -0.0333, lambda_2: 14.6790 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.84 0.78] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.64] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.51] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110100 +11111100111101011101111101011001110010110100011001 +loss: 0.003550, lagrangian_loss: -0.000019, attention_score_distillation_loss: 0.001744 +ETA: 1:40:47 | Epoch 17 finished. Took 32.7 seconds. +loss: 0.005303, lagrangian_loss: -0.000002, attention_score_distillation_loss: 0.001742 +---------------------------------------------------------------------- +time: 2023-07-19 14:34:04 +Evaluating: f1: 0.8981, eval_loss: 0.5534, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0855, expected_sparsity: 0.0828, expected_sequence_sparsity: 0.6245, target_sparsity: 0.0815, step: 2100 +lambda_1: -0.0039, lambda_2: 14.6798 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 
0.97 0.84 0.77] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.64] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.51] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110100 +11111100111101010101111101011011110010110100011001 +loss: 0.005728, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.001740 +loss: 0.004771, lagrangian_loss: 0.000045, attention_score_distillation_loss: 0.001736 +---------------------------------------------------------------------- +time: 2023-07-19 14:34:19 +Evaluating: f1: 0.8873, eval_loss: 0.5838, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0855, expected_sparsity: 0.0828, expected_sequence_sparsity: 0.6245, target_sparsity: 0.0834, step: 2150 +lambda_1: -0.0537, lambda_2: 14.6816 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.84 0.77] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.64] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.51] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110100 +11111100111101010101111101011011110010110100011001 +loss: 0.010478, lagrangian_loss: 0.000068, attention_score_distillation_loss: 0.001733 +loss: 0.004719, lagrangian_loss: 0.000008, attention_score_distillation_loss: 0.001729 +ETA: 1:40:08 | Epoch 18 finished. Took 32.7 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:34:33 +Evaluating: f1: 0.892, eval_loss: 0.5872, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0855, expected_sparsity: 0.085, expected_sequence_sparsity: 0.6254, target_sparsity: 0.0854, step: 2200 +lambda_1: -0.0719, lambda_2: 14.6826 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 
0.97 0.83 0.75] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.62] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.5] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110100 +11111100111101010101111101011001110010110100011001 +loss: 0.003216, lagrangian_loss: -0.000042, attention_score_distillation_loss: 0.001727 +loss: 0.005995, lagrangian_loss: -0.000029, attention_score_distillation_loss: 0.001724 +---------------------------------------------------------------------- +time: 2023-07-19 14:34:48 +Evaluating: f1: 0.895, eval_loss: 0.6068, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0855, expected_sparsity: 0.085, expected_sequence_sparsity: 0.6254, target_sparsity: 0.0873, step: 2250 +lambda_1: -0.0158, lambda_2: 14.6845 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.83 0.75] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.62] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.5] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110100 +11111100111101010101111101011001110010110100011001 +loss: 0.002338, lagrangian_loss: -0.000004, attention_score_distillation_loss: 0.001723 +loss: 0.004482, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.001719 +---------------------------------------------------------------------- +time: 2023-07-19 14:35:02 +Evaluating: f1: 0.8978, eval_loss: 0.6296, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0923, expected_sparsity: 0.0883, expected_sequence_sparsity: 0.6268, target_sparsity: 0.0893, step: 2300 +lambda_1: -0.0236, lambda_2: 14.6852 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.97 0.82 0.74] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.62] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.48] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110000 +11111100111101010101111101011001110010110100011001 +ETA: 1:39:49 | Epoch 19 finished. Took 34.79 seconds. 
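+The target_sparsity column climbs by roughly 0.002 every 50 steps, consistent with a linear Lagrangian warmup toward this run's final target of 0.67 spread over 150 epochs. Below is a sketch of that schedule under the assumption of about 115 optimizer steps per MRPC epoch at batch size 32; the per-epoch step count is never printed, so treat that constant as an estimate.
+
+def warmup_target(step, final_sparsity=0.67, warmup_epochs=150, steps_per_epoch=115):
+    # Linear ramp from 0 to final_sparsity across the warmup window.
+    warmup_steps = warmup_epochs * steps_per_epoch
+    return final_sparsity * min(step, warmup_steps) / warmup_steps
+
+print(warmup_target(2000))  # ~0.0777; the step 2000 block logs 0.0776
+print(warmup_target(4000))  # ~0.1554; step 4000 later in the log shows 0.1553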
+loss: 0.002314, lagrangian_loss: 0.000034, attention_score_distillation_loss: 0.001715 +loss: 0.002819, lagrangian_loss: 0.000034, attention_score_distillation_loss: 0.001712 +---------------------------------------------------------------------- +time: 2023-07-19 14:35:16 +Evaluating: f1: 0.8942, eval_loss: 0.5841, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0923, expected_sparsity: 0.0905, expected_sequence_sparsity: 0.6277, target_sparsity: 0.0912, step: 2350 +lambda_1: -0.0553, lambda_2: 14.6860 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 1. 0.96 0.82 0.73] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.6] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.47] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110000 +11111100111101010101111101011001110010100100011001 +loss: 0.005633, lagrangian_loss: -0.000027, attention_score_distillation_loss: 0.001710 +loss: 0.004544, lagrangian_loss: -0.000016, attention_score_distillation_loss: 0.001706 +---------------------------------------------------------------------- +time: 2023-07-19 14:35:31 +Evaluating: f1: 0.9059, eval_loss: 0.5423, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0923, expected_sparsity: 0.0905, expected_sequence_sparsity: 0.6277, target_sparsity: 0.0931, step: 2400 +lambda_1: -0.0270, lambda_2: 14.6865 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.81 0.72] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.6] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.47] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110000 +11111100111101010101111101011001110010100100011001 +loss: 0.006385, lagrangian_loss: -0.000005, attention_score_distillation_loss: 0.001705 +ETA: 1:39:11 | Epoch 20 finished. Took 32.65 seconds. +loss: 0.003317, lagrangian_loss: 0.000005, attention_score_distillation_loss: 0.001701 +---------------------------------------------------------------------- +time: 2023-07-19 14:35:45 +Evaluating: f1: 0.8928, eval_loss: 0.56, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0923, expected_sparsity: 0.0905, expected_sequence_sparsity: 0.6277, target_sparsity: 0.0951, step: 2450 +lambda_1: -0.0217, lambda_2: 14.6867 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.96 0.81 0.72] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.6] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.47] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011110110111011011110000 +11111100111101010101111101011001110010100100011001 +loss: 0.002210, lagrangian_loss: 0.000012, attention_score_distillation_loss: 0.001698 +loss: 0.009325, lagrangian_loss: 0.000014, attention_score_distillation_loss: 0.001696 +---------------------------------------------------------------------- +time: 2023-07-19 14:36:00 +Evaluating: f1: 0.8941, eval_loss: 0.5961, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0991, expected_sparsity: 0.0958, expected_sequence_sparsity: 0.6298, target_sparsity: 0.097, step: 2500 +lambda_1: -0.0413, lambda_2: 14.6869 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.8 0.71] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.58] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.44] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011100110111011011110000 +11111000111101010101111101011001110010100100011001 +loss: 0.004997, lagrangian_loss: -0.000005, attention_score_distillation_loss: 0.001695 +loss: 0.006099, lagrangian_loss: -0.000009, attention_score_distillation_loss: 0.001689 +ETA: 1:38:33 | Epoch 21 finished. Took 32.7 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:36:14 +Evaluating: f1: 0.8881, eval_loss: 0.563, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0991, expected_sparsity: 0.0958, expected_sequence_sparsity: 0.6298, target_sparsity: 0.099, step: 2550 +lambda_1: -0.0289, lambda_2: 14.6871 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.96 0.8 0.7 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.58] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.44] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011100110111011011110000 +11111000111101010101111101011001110010100100011001 +loss: 0.002765, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.001687 +loss: 0.005608, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.001684 +---------------------------------------------------------------------- +time: 2023-07-19 14:36:29 +Evaluating: f1: 0.8982, eval_loss: 0.5326, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.0991, expected_sparsity: 0.0958, expected_sequence_sparsity: 0.6298, target_sparsity: 0.1009, step: 2600 +lambda_1: -0.0375, lambda_2: 14.6873 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.79 0.69] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.58] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.44] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101101011100110111011011110000 +11111000111101010101111101011001110010100100011001 +loss: 0.159170, lagrangian_loss: 0.000017, attention_score_distillation_loss: 0.001681 +loss: 0.005047, lagrangian_loss: -0.000019, attention_score_distillation_loss: 0.001678 +ETA: 1:37:55 | Epoch 22 finished. Took 32.61 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:36:43 +Evaluating: f1: 0.8908, eval_loss: 0.552, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1033, expected_sparsity: 0.101, expected_sequence_sparsity: 0.632, target_sparsity: 0.1028, step: 2650 +lambda_1: -0.0014, lambda_2: 14.6887 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.96 0.78 0.68] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.56] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.41] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011011110000 +11111100111101010101110101011001110010100100001001 +loss: 0.003213, lagrangian_loss: 0.000001, attention_score_distillation_loss: 0.001674 +loss: 0.002689, lagrangian_loss: 0.000005, attention_score_distillation_loss: 0.001671 +---------------------------------------------------------------------- +time: 2023-07-19 14:36:58 +Evaluating: f1: 0.895, eval_loss: 0.5904, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1033, expected_sparsity: 0.101, expected_sequence_sparsity: 0.632, target_sparsity: 0.1048, step: 2700 +lambda_1: -0.0231, lambda_2: 14.6899 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.78 0.67] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.56] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.41] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011011110000 +11111100111101010101110101011001110010100100001001 +loss: 0.002950, lagrangian_loss: 0.000036, attention_score_distillation_loss: 0.001669 +loss: 0.004284, lagrangian_loss: 0.000017, attention_score_distillation_loss: 0.001666 +---------------------------------------------------------------------- +time: 2023-07-19 14:37:13 +Evaluating: f1: 0.898, eval_loss: 0.5928, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1033, expected_sparsity: 0.101, expected_sequence_sparsity: 0.632, target_sparsity: 0.1067, step: 2750 +lambda_1: -0.0542, lambda_2: 14.6908 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.77 0.66] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.56] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.41] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011110110111011010110000 +11111100111101010101110101011001110010100100001001 +loss: 0.001743, lagrangian_loss: -0.000025, attention_score_distillation_loss: 0.001664 +ETA: 1:37:35 | Epoch 23 finished. Took 34.99 seconds. 
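+The ten 50-character bit rows in each block are the hard token masks: one row per prunable layer, ordered by prune location so the deepest layers sit at the bottom, and one column per sequence-length bin, with 1 meaning tokens in that bin survive the layer. Each "infer remain" entry is simply the density of ones in its row. A quick check against the two pruned rows of the step 2750 block above:
+
+rows = {
+    10: "11111111111111111111101001011110110111011010110000",
+    11: "11111100111101010101110101011001110010100100001001",
+}
+for layer, row in rows.items():
+    # 37/50 = 0.74 for layer 10 and 28/50 = 0.56 for layer 11,
+    # matching the last two "infer remain" entries at step 2750.
+    print(layer, row.count("1") / len(row))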
+loss: 0.003533, lagrangian_loss: -0.000015, attention_score_distillation_loss: 0.001661 +---------------------------------------------------------------------- +time: 2023-07-19 14:37:27 +Evaluating: f1: 0.8956, eval_loss: 0.5784, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1033, expected_sparsity: 0.101, expected_sequence_sparsity: 0.632, target_sparsity: 0.1087, step: 2800 +lambda_1: -0.0046, lambda_2: 14.6921 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.77 0.66] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.56] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.41] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011110110111011010110000 +11111100111101010101110101011001110010100100001001 +loss: 0.003537, lagrangian_loss: -0.000000, attention_score_distillation_loss: 0.001657 +loss: 0.002826, lagrangian_loss: 0.000012, attention_score_distillation_loss: 0.001653 +---------------------------------------------------------------------- +time: 2023-07-19 14:37:42 +Evaluating: f1: 0.8983, eval_loss: 0.5987, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1075, expected_sparsity: 0.1042, expected_sequence_sparsity: 0.6333, target_sparsity: 0.1106, step: 2850 +lambda_1: -0.0286, lambda_2: 14.6929 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.76 0.65] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.56] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.4] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011010110000 +11111100111101010101110101011001110010100100001001 +loss: 0.002361, lagrangian_loss: 0.000017, attention_score_distillation_loss: 0.001651 +ETA: 1:36:59 | Epoch 24 finished. Took 32.82 seconds. +loss: 0.004175, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.001648 +---------------------------------------------------------------------- +time: 2023-07-19 14:37:56 +Evaluating: f1: 0.8916, eval_loss: 0.5692, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1075, expected_sparsity: 0.1061, expected_sequence_sparsity: 0.6341, target_sparsity: 0.1126, step: 2900 +lambda_1: -0.0400, lambda_2: 14.6933 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.96 0.75 0.64] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.54] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.39] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011010110000 +11111100111101010101110101011001010010100100001001 +loss: 0.003659, lagrangian_loss: -0.000018, attention_score_distillation_loss: 0.001645 +loss: 0.002306, lagrangian_loss: -0.000005, attention_score_distillation_loss: 0.001643 +---------------------------------------------------------------------- +time: 2023-07-19 14:38:11 +Evaluating: f1: 0.8988, eval_loss: 0.5674, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1117, expected_sparsity: 0.1092, expected_sequence_sparsity: 0.6353, target_sparsity: 0.1145, step: 2950 +lambda_1: -0.0080, lambda_2: 14.6938 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.74 0.64] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.54] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.38] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011000110000 +11111100111101010101110101011001010010100100001001 +loss: 0.003555, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.001640 +loss: 0.001858, lagrangian_loss: 0.000015, attention_score_distillation_loss: 0.001638 +ETA: 1:36:23 | Epoch 25 finished. Took 32.87 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:38:25 +Evaluating: f1: 0.897, eval_loss: 0.5436, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1117, expected_sparsity: 0.1092, expected_sequence_sparsity: 0.6353, target_sparsity: 0.1164, step: 3000 +lambda_1: -0.0374, lambda_2: 14.6944 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.96 0.74 0.63] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.54] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.38] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011000110000 +11111100111101010101110101011001010010100100001001 +loss: 0.006852, lagrangian_loss: -0.000012, attention_score_distillation_loss: 0.001634 +loss: 0.004874, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.001632 +---------------------------------------------------------------------- +time: 2023-07-19 14:38:40 +Evaluating: f1: 0.8966, eval_loss: 0.5696, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1117, expected_sparsity: 0.1092, expected_sequence_sparsity: 0.6353, target_sparsity: 0.1184, step: 3050 +lambda_1: -0.0029, lambda_2: 14.6952 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.73 0.62] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.54] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.38] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011000110000 +11111000111101010101110101011001110010100100001001 +loss: 0.002537, lagrangian_loss: 0.000000, attention_score_distillation_loss: 0.001629 +loss: 0.002500, lagrangian_loss: 0.000018, attention_score_distillation_loss: 0.001626 +---------------------------------------------------------------------- +time: 2023-07-19 14:38:54 +Evaluating: f1: 0.8985, eval_loss: 0.5933, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1117, expected_sparsity: 0.1092, expected_sequence_sparsity: 0.6353, target_sparsity: 0.1203, step: 3100 +lambda_1: -0.0462, lambda_2: 14.6963 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.96 0.73 0.61] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.54] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.38] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110111011000110000 +11111000111101010101110101011001110010100100001001 +loss: 0.004406, lagrangian_loss: 0.000035, attention_score_distillation_loss: 0.001624 +ETA: 1:36:01 | Epoch 26 finished. Took 34.92 seconds. 
+loss: 0.001663, lagrangian_loss: -0.000024, attention_score_distillation_loss: 0.001621 +---------------------------------------------------------------------- +time: 2023-07-19 14:39:09 +Evaluating: f1: 0.8958, eval_loss: 0.5927, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1158, expected_sparsity: 0.1141, expected_sequence_sparsity: 0.6374, target_sparsity: 0.1223, step: 3150 +lambda_1: -0.0270, lambda_2: 14.6974 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.95 0.71 0.61] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.35] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110101011000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.001565, lagrangian_loss: -0.000012, attention_score_distillation_loss: 0.001617 +loss: 0.002438, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.001616 +---------------------------------------------------------------------- +time: 2023-07-19 14:39:23 +Evaluating: f1: 0.8974, eval_loss: 0.556, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1158, expected_sparsity: 0.1141, expected_sequence_sparsity: 0.6374, target_sparsity: 0.1242, step: 3200 +lambda_1: -0.0168, lambda_2: 14.6987 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.95 0.71 0.6 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.35] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100110101011000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.002271, lagrangian_loss: 0.000023, attention_score_distillation_loss: 0.001612 +ETA: 1:35:24 | Epoch 27 finished. Took 32.8 seconds. +loss: 0.002666, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.001610 +---------------------------------------------------------------------- +time: 2023-07-19 14:39:38 +Evaluating: f1: 0.9014, eval_loss: 0.5909, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.12, expected_sparsity: 0.1172, expected_sequence_sparsity: 0.6386, target_sparsity: 0.1262, step: 3250 +lambda_1: -0.0499, lambda_2: 14.7000 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.95 0.7 0.6 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.34] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101011000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.005432, lagrangian_loss: -0.000031, attention_score_distillation_loss: 0.001612 +loss: 0.002152, lagrangian_loss: -0.000004, attention_score_distillation_loss: 0.001604 +---------------------------------------------------------------------- +time: 2023-07-19 14:39:53 +Evaluating: f1: 0.897, eval_loss: 0.567, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.12, expected_sparsity: 0.1172, expected_sequence_sparsity: 0.6386, target_sparsity: 0.1281, step: 3300 +lambda_1: 0.0031, lambda_2: 14.7013 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.95 0.7 0.59] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.34] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101011000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.069087, lagrangian_loss: 0.000001, attention_score_distillation_loss: 0.001601 +loss: 0.003866, lagrangian_loss: 0.000169, attention_score_distillation_loss: 0.001598 +ETA: 1:34:48 | Epoch 28 finished. Took 32.81 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:40:07 +Evaluating: f1: 0.8808, eval_loss: 0.7724, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1216, expected_sparsity: 0.1202, expected_sequence_sparsity: 0.6399, target_sparsity: 0.13, step: 3350 +lambda_1: -0.1138, lambda_2: 14.7086 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.95 0.68 0.59] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.33] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101001000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.002763, lagrangian_loss: -0.000086, attention_score_distillation_loss: 0.001596 +loss: 0.001701, lagrangian_loss: -0.000053, attention_score_distillation_loss: 0.001594 +---------------------------------------------------------------------- +time: 2023-07-19 14:40:22 +Evaluating: f1: 0.8889, eval_loss: 0.5801, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1216, expected_sparsity: 0.1202, expected_sequence_sparsity: 0.6399, target_sparsity: 0.132, step: 3400 +lambda_1: 0.0304, lambda_2: 14.7178 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.95 0.68 0.58] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.33] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101001000110000 +11111000111101010101110101011001010010100100001001 +loss: 0.004504, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.001591 +loss: 0.001661, lagrangian_loss: -0.000002, attention_score_distillation_loss: 0.001587 +---------------------------------------------------------------------- +time: 2023-07-19 14:40:36 +Evaluating: f1: 0.8881, eval_loss: 0.5988, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1216, expected_sparsity: 0.1202, expected_sequence_sparsity: 0.6399, target_sparsity: 0.1339, step: 3450 +lambda_1: -0.0536, lambda_2: 14.7237 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.95 0.67 0.59] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.52] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.64, 0.33] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101001000110000 +11111000111101010101110101011001010010100100001001 +ETA: 1:34:24 | Epoch 29 finished. Took 34.95 seconds. 
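+lambda_1 and lambda_2 behave like the learned Lagrange multipliers of the sparsity constraint: lambda_2 creeps upward monotonically, lambda_1 oscillates around zero, and lagrangian_loss stays within a few 1e-4 of zero while expected_sparsity tracks target_sparsity. The printed values are consistent with the CoFi-style two-term penalty sketched below; this form is an assumption reconstructed from the logged quantities, not a quote of the training code.
+
+def lagrangian_penalty(expected_sparsity, target_sparsity, lambda_1, lambda_2):
+    # The multipliers are trained by gradient ascent on this same term
+    # (at reg_learning_rate), so training is a min-max game that pins
+    # the expected sparsity to the warmed-up target.
+    gap = expected_sparsity - target_sparsity
+    return lambda_1 * gap + lambda_2 * gap ** 2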
+loss: 0.005415, lagrangian_loss: 0.000080, attention_score_distillation_loss: 0.001591 +loss: 0.002078, lagrangian_loss: -0.000039, attention_score_distillation_loss: 0.001581 +---------------------------------------------------------------------- +time: 2023-07-19 14:40:51 +Evaluating: f1: 0.8955, eval_loss: 0.5882, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1258, expected_sparsity: 0.125, expected_sequence_sparsity: 0.6418, target_sparsity: 0.1359, step: 3500 +lambda_1: -0.0622, lambda_2: 14.7285 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.94 0.66 0.58] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.62, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.62, 0.31] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101001000100000 +11111000111101010101110101011001010000100100001001 +loss: 0.001670, lagrangian_loss: -0.000063, attention_score_distillation_loss: 0.001579 +loss: 0.005558, lagrangian_loss: 0.000005, attention_score_distillation_loss: 0.001577 +---------------------------------------------------------------------- +time: 2023-07-19 14:41:05 +Evaluating: f1: 0.9048, eval_loss: 0.5794, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1258, expected_sparsity: 0.125, expected_sequence_sparsity: 0.6418, target_sparsity: 0.1378, step: 3550 +lambda_1: 0.0152, lambda_2: 14.7327 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.94 0.66 0.58] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.62, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.62, 0.31] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100010101001000100000 +11111000111101010101110101011001010000100100001001 +loss: 0.002640, lagrangian_loss: 0.000001, attention_score_distillation_loss: 0.001573 +ETA: 1:33:48 | Epoch 30 finished. Took 32.83 seconds. +loss: 0.003458, lagrangian_loss: 0.000069, attention_score_distillation_loss: 0.001569 +---------------------------------------------------------------------- +time: 2023-07-19 14:41:20 +Evaluating: f1: 0.8946, eval_loss: 0.5559, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.13, expected_sparsity: 0.1279, expected_sequence_sparsity: 0.643, target_sparsity: 0.1397, step: 3600 +lambda_1: -0.0892, lambda_2: 14.7369 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.93 0.65 0.57] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.3] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000100000 +11111000111101010101110101011001010000100100001001 +loss: 0.033720, lagrangian_loss: -0.000037, attention_score_distillation_loss: 0.001567 +loss: 0.003488, lagrangian_loss: -0.000040, attention_score_distillation_loss: 0.001563 +---------------------------------------------------------------------- +time: 2023-07-19 14:41:34 +Evaluating: f1: 0.8885, eval_loss: 0.5886, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.13, expected_sparsity: 0.1279, expected_sequence_sparsity: 0.643, target_sparsity: 0.1417, step: 3650 +lambda_1: 0.0249, lambda_2: 14.7421 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.93 0.64 0.57] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.3] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000100000 +11111000111101010101110101011001010000100100001001 +loss: 0.003652, lagrangian_loss: 0.000012, attention_score_distillation_loss: 0.001560 +loss: 0.003023, lagrangian_loss: -0.000018, attention_score_distillation_loss: 0.001557 +ETA: 1:33:13 | Epoch 31 finished. Took 32.9 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:41:49 +Evaluating: f1: 0.8752, eval_loss: 0.629, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.13, expected_sparsity: 0.1279, expected_sequence_sparsity: 0.643, target_sparsity: 0.1436, step: 3700 +lambda_1: -0.0541, lambda_2: 14.7484 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.93 0.64 0.57] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6, 0.3] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000100000 +11111000111101010101110101011001010000100100001001 +loss: 0.005066, lagrangian_loss: 0.000121, attention_score_distillation_loss: 0.001553 +loss: 0.004385, lagrangian_loss: -0.000082, attention_score_distillation_loss: 0.001551 +---------------------------------------------------------------------- +time: 2023-07-19 14:42:04 +Evaluating: f1: 0.8878, eval_loss: 0.6474, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1315, expected_sparsity: 0.1309, expected_sequence_sparsity: 0.6442, target_sparsity: 0.1456, step: 3750 +lambda_1: -0.0542, lambda_2: 14.7586 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.92 0.62 0.56] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.29] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000000000 +11111000111101010101110101011001010000100100001001 +loss: 0.003426, lagrangian_loss: -0.000040, attention_score_distillation_loss: 0.001553 +loss: 0.006244, lagrangian_loss: 0.000020, attention_score_distillation_loss: 0.001548 +ETA: 1:32:38 | Epoch 32 finished. Took 32.95 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:42:18 +Evaluating: f1: 0.8877, eval_loss: 0.6504, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1315, expected_sparsity: 0.1309, expected_sequence_sparsity: 0.6442, target_sparsity: 0.1475, step: 3800 +lambda_1: 0.0491, lambda_2: 14.7781 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.93 0.63 0.57] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.29] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000000000 +11111000111101010101110101011001010000100100001001 +loss: 0.008981, lagrangian_loss: 0.000010, attention_score_distillation_loss: 0.001545 +loss: 0.000917, lagrangian_loss: 0.000260, attention_score_distillation_loss: 0.001540 +---------------------------------------------------------------------- +time: 2023-07-19 14:42:33 +Evaluating: f1: 0.8815, eval_loss: 0.6344, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1704, expected_sparsity: 0.1683, expected_sequence_sparsity: 0.6596, target_sparsity: 0.1495, step: 3850 +lambda_1: -0.1891, lambda_2: 14.8085 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.9 0.61 0.56] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.56, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.45, 0.22] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111111111100011110 +11111111111111111111101001011000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001451, lagrangian_loss: -0.000490, attention_score_distillation_loss: 0.001538 +loss: 0.004553, lagrangian_loss: 0.000359, attention_score_distillation_loss: 0.001535 +---------------------------------------------------------------------- +time: 2023-07-19 14:42:48 +Evaluating: f1: 0.8694, eval_loss: 0.6309, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1315, expected_sparsity: 0.1309, expected_sequence_sparsity: 0.6442, target_sparsity: 0.1514, step: 3900 +lambda_1: 0.2065, lambda_2: 14.8672 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.91 0.62 0.56] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.29] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000000000 +11111000111101010101110101011001010000100100001001 +loss: 0.002135, lagrangian_loss: -0.000447, attention_score_distillation_loss: 0.001531 +ETA: 1:32:14 | Epoch 33 finished. Took 35.14 seconds. 
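+token_prune_loc marks which layers actually drop tokens at inference: an entry flips to True as soon as that layer's mask row contains a zero. The step 3850 block above is the first with a third True entry; the eighth mask row (layer 9) picks up zeros there, macs_sparsity jumps from 0.1315 to 0.1704, and the multipliers push it back at step 3900. A one-line reading of the flags, again illustrative rather than lifted from the code:
+
+def token_prune_loc(mask_rows):
+    # True wherever at least one length bin is dropped at that layer.
+    return ["0" in row for row in mask_rows]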
+loss: 0.003687, lagrangian_loss: 0.000761, attention_score_distillation_loss: 0.001528 +---------------------------------------------------------------------- +time: 2023-07-19 14:43:02 +Evaluating: f1: 0.8919, eval_loss: 0.6513, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1357, expected_sparsity: 0.1354, expected_sequence_sparsity: 0.6461, target_sparsity: 0.1533, step: 3950 +lambda_1: -0.3649, lambda_2: 14.9930 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.91 0.61 0.55] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.56, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.56, 0.27] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001582, lagrangian_loss: 0.000110, attention_score_distillation_loss: 0.001526 +loss: 0.002874, lagrangian_loss: -0.001265, attention_score_distillation_loss: 0.001523 +---------------------------------------------------------------------- +time: 2023-07-19 14:43:16 +Evaluating: f1: 0.8935, eval_loss: 0.6236, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1777, expected_sparsity: 0.1739, expected_sequence_sparsity: 0.6619, target_sparsity: 0.1553, step: 4000 +lambda_1: 0.2139, lambda_2: 15.2060 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.88 0.59 0.54] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.54, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.42, 0.2] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001010000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001701, lagrangian_loss: 0.001790, attention_score_distillation_loss: 0.001520 +ETA: 1:31:38 | Epoch 34 finished. Took 32.8 seconds. +loss: 0.041286, lagrangian_loss: -0.002232, attention_score_distillation_loss: 0.001521 +---------------------------------------------------------------------- +time: 2023-07-19 14:43:31 +Evaluating: f1: 0.8946, eval_loss: 0.6305, token_prune_loc: [False, False, False, False, False, False, False, False, True, True], macs_sparsity: 0.1315, expected_sparsity: 0.1309, expected_sequence_sparsity: 0.6442, target_sparsity: 0.1572, step: 4050 +lambda_1: 0.0101, lambda_2: 15.4687 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.92 0.63 0.56] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.58, 0.29] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111101001011100000101001000000000 +11111000111101010101110101011001010000100100001001 +loss: 0.004622, lagrangian_loss: 0.002105, attention_score_distillation_loss: 0.001514 +loss: 0.003563, lagrangian_loss: 0.001391, attention_score_distillation_loss: 0.001512 +---------------------------------------------------------------------- +time: 2023-07-19 14:43:46 +Evaluating: f1: 0.8901, eval_loss: 0.5755, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1803, expected_sparsity: 0.1762, expected_sequence_sparsity: 0.6628, target_sparsity: 0.1592, step: 4100 +lambda_1: -0.5897, lambda_2: 15.7903 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.97 0.98 0.87 0.58 0.53] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.41, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.002368, lagrangian_loss: -0.003550, attention_score_distillation_loss: 0.001509 +loss: 0.001664, lagrangian_loss: 0.000405, attention_score_distillation_loss: 0.001506 +ETA: 1:31:03 | Epoch 35 finished. Took 32.94 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:44:00 +Evaluating: f1: 0.8866, eval_loss: 0.6434, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1803, expected_sparsity: 0.1762, expected_sequence_sparsity: 0.6628, target_sparsity: 0.1611, step: 4150 +lambda_1: 0.3783, lambda_2: 16.2755 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.97 0.99 0.87 0.58 0.54] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.41, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.002583, lagrangian_loss: 0.002737, attention_score_distillation_loss: 0.001503 +loss: 0.002735, lagrangian_loss: -0.002541, attention_score_distillation_loss: 0.001501 +---------------------------------------------------------------------- +time: 2023-07-19 14:44:15 +Evaluating: f1: 0.8904, eval_loss: 0.6076, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1704, expected_sparsity: 0.1671, expected_sequence_sparsity: 0.6591, target_sparsity: 0.1631, step: 4200 +lambda_1: 0.2527, lambda_2: 16.5210 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.98 0.99 0.9 0.62 0.57] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.56, 0.5] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.45, 0.22] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111111111100011110 +11111111111111111111101001011000000101001000000000 +11011000111101010101110101011001010000100100011001 +loss: 0.002468, lagrangian_loss: -0.000532, attention_score_distillation_loss: 0.001498 +loss: 0.004049, lagrangian_loss: 0.004055, attention_score_distillation_loss: 0.001495 +---------------------------------------------------------------------- +time: 2023-07-19 14:44:30 +Evaluating: f1: 0.8927, eval_loss: 0.624, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1803, expected_sparsity: 0.1762, expected_sequence_sparsity: 0.6628, target_sparsity: 0.165, step: 4250 +lambda_1: -0.6266, lambda_2: 17.0369 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.97 0.99 0.87 0.58 0.53] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.41, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001979, lagrangian_loss: -0.000864, attention_score_distillation_loss: 0.001492 +ETA: 1:30:38 | Epoch 36 finished. Took 35.15 seconds. 
+loss: 0.001414, lagrangian_loss: -0.003321, attention_score_distillation_loss: 0.001490 +---------------------------------------------------------------------- +time: 2023-07-19 14:44:44 +Evaluating: f1: 0.9038, eval_loss: 0.5644, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1876, expected_sparsity: 0.1826, expected_sequence_sparsity: 0.6654, target_sparsity: 0.1669, step: 4300 +lambda_1: -0.0964, lambda_2: 17.3405 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.97 0.98 0.85 0.55 0.52] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.38, 0.17] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.002169, lagrangian_loss: 0.000855, attention_score_distillation_loss: 0.001486 +loss: 0.001136, lagrangian_loss: 0.001979, attention_score_distillation_loss: 0.001483 +---------------------------------------------------------------------- +time: 2023-07-19 14:44:59 +Evaluating: f1: 0.8988, eval_loss: 0.5826, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1803, expected_sparsity: 0.1762, expected_sequence_sparsity: 0.6628, target_sparsity: 0.1689, step: 4350 +lambda_1: 0.4671, lambda_2: 17.6359 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.97 0.99 0.88 0.58 0.54] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.41, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001803, lagrangian_loss: -0.001633, attention_score_distillation_loss: 0.001481 +ETA: 1:30:02 | Epoch 37 finished. Took 32.78 seconds. +loss: 0.002306, lagrangian_loss: -0.000777, attention_score_distillation_loss: 0.001478 +---------------------------------------------------------------------- +time: 2023-07-19 14:45:13 +Evaluating: f1: 0.8873, eval_loss: 0.5692, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1803, expected_sparsity: 0.1762, expected_sequence_sparsity: 0.6628, target_sparsity: 0.1708, step: 4400 +lambda_1: -0.1490, lambda_2: 17.9658 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 
0.98 0.99 0.88 0.58 0.54] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.41, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111100011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.002278, lagrangian_loss: 0.002348, attention_score_distillation_loss: 0.001475 +loss: 0.001694, lagrangian_loss: 0.000603, attention_score_distillation_loss: 0.001471 +---------------------------------------------------------------------- +time: 2023-07-19 14:45:28 +Evaluating: f1: 0.8908, eval_loss: 0.5759, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1892, expected_sparsity: 0.1857, expected_sequence_sparsity: 0.6667, target_sparsity: 0.1728, step: 4450 +lambda_1: -0.4619, lambda_2: 18.1359 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.97 0.98 0.85 0.55 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.37, 0.17] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110111000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.001542, lagrangian_loss: -0.001897, attention_score_distillation_loss: 0.001469 +loss: 0.002186, lagrangian_loss: -0.000558, attention_score_distillation_loss: 0.001467 +ETA: 1:29:27 | Epoch 38 finished. Took 33.03 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:45:43 +Evaluating: f1: 0.8901, eval_loss: 0.5611, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1892, expected_sparsity: 0.1857, expected_sequence_sparsity: 0.6667, target_sparsity: 0.1747, step: 4500 +lambda_1: 0.1087, lambda_2: 18.3987 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.97 0.98 0.85 0.54 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.37, 0.17] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110111000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.147403, lagrangian_loss: 0.001157, attention_score_distillation_loss: 0.001464 +loss: 0.002329, lagrangian_loss: -0.000249, attention_score_distillation_loss: 0.001461 +---------------------------------------------------------------------- +time: 2023-07-19 14:45:57 +Evaluating: f1: 0.8831, eval_loss: 0.6108, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1835, expected_sparsity: 0.1793, expected_sequence_sparsity: 0.6641, target_sparsity: 0.1766, step: 4550 +lambda_1: 0.2380, lambda_2: 18.4993 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 1. 0.97 0.99 0.87 0.56 0.53] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.52, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.4, 0.19] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111000011110 +11111111111111111111101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +loss: 0.001284, lagrangian_loss: -0.000760, attention_score_distillation_loss: 0.001456 +loss: 0.003949, lagrangian_loss: 0.001004, attention_score_distillation_loss: 0.001453 +---------------------------------------------------------------------- +time: 2023-07-19 14:46:12 +Evaluating: f1: 0.8958, eval_loss: 0.5906, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.185, expected_sparsity: 0.1816, expected_sequence_sparsity: 0.665, target_sparsity: 0.1786, step: 4600 +lambda_1: -0.2839, lambda_2: 18.7176 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.97 0.98 0.86 0.55 0.52] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.5, 0.48] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.38, 0.18] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110111000011110 +11111111111111111110101001000000000101001000000000 +11011000111101010101110101011001010000100100001001 +ETA: 1:29:02 | Epoch 39 finished. Took 35.3 seconds. 
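+The `layerwise remain` vector printed with each eval reads as the running
+product of the per-layer `infer remain` ratios across all 12 encoder layers,
+with layers outside prune_location=[2..11] counted as 1.0. A minimal sketch of
+that bookkeeping (an assumed reading of the log, not the project's own code):
+
+    import numpy as np
+
+    def layerwise_remain(infer_remain, prune_location, num_layers=12):
+        # kept-token fraction per layer; unpruned layers keep every token
+        per_layer = np.ones(num_layers)
+        per_layer[prune_location] = infer_remain
+        return np.cumprod(per_layer)
+
+    # step-4600 values from the block above:
+    print(np.round(layerwise_remain(
+        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.5, 0.48],
+        list(range(2, 12))), 2))
+    # -> [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.76 0.38 0.18]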
+loss: 0.002275, lagrangian_loss: 0.000923, attention_score_distillation_loss: 0.001451 +loss: 0.002025, lagrangian_loss: -0.000761, attention_score_distillation_loss: 0.001449 +---------------------------------------------------------------------- +time: 2023-07-19 14:46:27 +Evaluating: f1: 0.8869, eval_loss: 0.5931, token_prune_loc: [False, False, False, False, False, True, False, True, True, True], macs_sparsity: 0.213, expected_sparsity: 0.208, expected_sequence_sparsity: 0.6758, target_sparsity: 0.1805, step: 4650 +lambda_1: -0.2657, lambda_2: 18.7785 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.98 0.83 0.54 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0, 0.74, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.68, 0.34, 0.16] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110011000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.007277, lagrangian_loss: -0.000894, attention_score_distillation_loss: 0.001445 +loss: 0.001671, lagrangian_loss: 0.000343, attention_score_distillation_loss: 0.001442 +---------------------------------------------------------------------- +time: 2023-07-19 14:46:41 +Evaluating: f1: 0.8794, eval_loss: 0.6231, token_prune_loc: [False, False, False, False, False, True, False, True, True, True], macs_sparsity: 0.2067, expected_sparsity: 0.2024, expected_sequence_sparsity: 0.6735, target_sparsity: 0.1825, step: 4700 +lambda_1: 0.1626, lambda_2: 18.9286 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.98 0.84 0.54 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 1.0, 0.74, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.94, 0.7, 0.35, 0.16] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111111111111110110 +11111111111111111111111111111111111111111111111111 +11111111111011111111100001111101111110011000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.002219, lagrangian_loss: 0.000162, attention_score_distillation_loss: 0.001441 +ETA: 1:28:27 | Epoch 40 finished. Took 33.07 seconds. +loss: 0.002402, lagrangian_loss: -0.000377, attention_score_distillation_loss: 0.001436 +---------------------------------------------------------------------- +time: 2023-07-19 14:46:56 +Evaluating: f1: 0.8976, eval_loss: 0.6232, token_prune_loc: [False, False, False, False, False, False, False, True, True, True], macs_sparsity: 0.1892, expected_sparsity: 0.1857, expected_sequence_sparsity: 0.6667, target_sparsity: 0.1844, step: 4750 +lambda_1: 0.0243, lambda_2: 18.9833 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.97 0.98 0.85 0.54 0.52] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.37, 0.17] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110111000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.001965, lagrangian_loss: 0.000308, attention_score_distillation_loss: 0.001433 +loss: 0.008746, lagrangian_loss: 0.000630, attention_score_distillation_loss: 0.001430 +---------------------------------------------------------------------- +time: 2023-07-19 14:47:10 +Evaluating: f1: 0.8955, eval_loss: 0.589, token_prune_loc: [False, False, False, False, False, True, False, True, True, True], macs_sparsity: 0.2172, expected_sparsity: 0.2108, expected_sequence_sparsity: 0.677, target_sparsity: 0.1864, step: 4800 +lambda_1: -0.2657, lambda_2: 19.0625 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.98 0.83 0.53 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0, 0.72, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.66, 0.33, 0.15] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110011000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.001323, lagrangian_loss: -0.000299, attention_score_distillation_loss: 0.001428 +loss: 0.004584, lagrangian_loss: -0.000480, attention_score_distillation_loss: 0.001425 +ETA: 1:27:52 | Epoch 41 finished. Took 32.92 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:47:25 +Evaluating: f1: 0.8851, eval_loss: 0.6961, token_prune_loc: [False, False, False, False, False, True, False, True, True, True], macs_sparsity: 0.2187, expected_sparsity: 0.2127, expected_sequence_sparsity: 0.6778, target_sparsity: 0.1883, step: 4850 +lambda_1: -0.0282, lambda_2: 19.1229 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.96 0.97 0.82 0.52 0.5 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0, 0.72, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.66, 0.32, 0.15] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110011000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.003234, lagrangian_loss: 0.000107, attention_score_distillation_loss: 0.001422 +loss: 0.002721, lagrangian_loss: 0.000015, attention_score_distillation_loss: 0.001420 +---------------------------------------------------------------------- +time: 2023-07-19 14:47:40 +Evaluating: f1: 0.8907, eval_loss: 0.718, token_prune_loc: [False, False, False, False, False, True, False, True, True, True], macs_sparsity: 0.2172, expected_sparsity: 0.2108, expected_sequence_sparsity: 0.677, target_sparsity: 0.1902, step: 4900 +lambda_1: 0.0893, lambda_2: 19.1595 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.98 0.83 0.53 0.51] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0, 0.72, 0.5, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.66, 0.33, 0.15] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111111111111111111111111111111 +11111111111011111101100001111101111110011000011110 +11111111111111111110101001000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.004011, lagrangian_loss: -0.000099, attention_score_distillation_loss: 0.001416 +loss: 0.002088, lagrangian_loss: 0.000784, attention_score_distillation_loss: 0.001413 +ETA: 1:27:16 | Epoch 42 finished. Took 32.79 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:47:54 +Evaluating: f1: 0.8756, eval_loss: 0.6902, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2282, expected_sparsity: 0.2233, expected_sequence_sparsity: 0.6821, target_sparsity: 0.1922, step: 4950 +lambda_1: -0.2617, lambda_2: 19.2591 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.96 0.97 0.83 0.52 0.5 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.94, 0.72, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.86, 0.62, 0.3, 0.14] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111111111110110 +11111111111111111111111111111111011111111111110110 +11111111111011111101100001111101111110011000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.004913, lagrangian_loss: 0.000569, attention_score_distillation_loss: 0.001410 +loss: 0.130653, lagrangian_loss: -0.000810, attention_score_distillation_loss: 0.001407 +---------------------------------------------------------------------- +time: 2023-07-19 14:48:09 +Evaluating: f1: 0.8811, eval_loss: 0.6308, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2355, expected_sparsity: 0.2293, expected_sequence_sparsity: 0.6846, target_sparsity: 0.1941, step: 5000 +lambda_1: -0.1604, lambda_2: 19.3247 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.96 0.81 0.51 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.7, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.59, 0.28, 0.13] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111111111110110 +11111111111111111111111011111111011111111111110110 +11111111111011111101100001111101111100011000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.006452, lagrangian_loss: -0.000297, attention_score_distillation_loss: 0.001406 +loss: 0.002083, lagrangian_loss: 0.000354, attention_score_distillation_loss: 0.001403 +---------------------------------------------------------------------- +time: 2023-07-19 14:48:23 +Evaluating: f1: 0.8835, eval_loss: 0.6476, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2282, expected_sparsity: 0.2233, expected_sequence_sparsity: 0.6821, target_sparsity: 0.1961, step: 5050 +lambda_1: 0.1716, lambda_2: 19.4326 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.96 0.82 0.52 0.5 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.94, 0.72, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.86, 0.62, 0.3, 0.14] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111111111110110 +11111111111111111111111111111111011111111111110110 +11111111111011111101100001111101111110011000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.002256, lagrangian_loss: -0.000220, attention_score_distillation_loss: 0.001399 +ETA: 1:26:49 | Epoch 43 finished. Took 34.97 seconds. 
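+The ten 0/1 rows under each eval appear to be the hard token-keep masks over
+the 50 length bins (bin_num=50), one row per prune location 2-11 with the top
+row for layer 2; a layer's `infer remain` is then simply the density of ones
+in its row. A quick check of that reading against the step-5050 block above
+(rows copied verbatim):
+
+    rows = {
+        7: "11111111111111111111111111011111110111111111110110",
+        8: "11111111111111111111111111111111011111111111110110",
+    }
+    for layer, mask in rows.items():
+        print(layer, mask.count("1") / len(mask))
+    # -> 0.92 for layer 7 and 0.94 for layer 8, matching `infer remain`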
+loss: 0.002974, lagrangian_loss: 0.000100, attention_score_distillation_loss: 0.001395 +---------------------------------------------------------------------- +time: 2023-07-19 14:48:38 +Evaluating: f1: 0.8644, eval_loss: 0.6402, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2282, expected_sparsity: 0.2233, expected_sequence_sparsity: 0.6821, target_sparsity: 0.198, step: 5100 +lambda_1: -0.1291, lambda_2: 19.5192 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.96 0.96 0.81 0.52 0.5 ] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.94, 0.72, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.86, 0.62, 0.3, 0.14] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111110111111111110110 +11111111111111111111111111111111011111111111110110 +11111111111011111101100001111101111110011000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.008504, lagrangian_loss: 0.000511, attention_score_distillation_loss: 0.001394 +loss: 0.003168, lagrangian_loss: -0.000217, attention_score_distillation_loss: 0.001390 +---------------------------------------------------------------------- +time: 2023-07-19 14:48:52 +Evaluating: f1: 0.8726, eval_loss: 0.6804, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2355, expected_sparsity: 0.2293, expected_sequence_sparsity: 0.6846, target_sparsity: 0.2, step: 5150 +lambda_1: -0.1670, lambda_2: 19.5615 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.95 0.79 0.51 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.7, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.59, 0.28, 0.13] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111011111111011111111111110110 +11111111111011111111100001111101111100010000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.006599, lagrangian_loss: -0.000354, attention_score_distillation_loss: 0.001387 +ETA: 1:26:13 | Epoch 44 finished. Took 32.81 seconds. +loss: 0.119889, lagrangian_loss: 0.000104, attention_score_distillation_loss: 0.001384 +---------------------------------------------------------------------- +time: 2023-07-19 14:49:07 +Evaluating: f1: 0.8799, eval_loss: 0.7258, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2355, expected_sparsity: 0.2293, expected_sequence_sparsity: 0.6846, target_sparsity: 0.2019, step: 5200 +lambda_1: 0.0916, lambda_2: 19.6263 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.96 0.96 0.8 0.51 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.7, 0.48, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.59, 0.28, 0.13] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111111011111111011111111111110110 +11111111111011111111100001111101111100010000011110 +11111111111111111110101000000000000101001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.007245, lagrangian_loss: -0.000100, attention_score_distillation_loss: 0.001382 +loss: 0.125823, lagrangian_loss: 0.000018, attention_score_distillation_loss: 0.001379 +---------------------------------------------------------------------- +time: 2023-07-19 14:49:22 +Evaluating: f1: 0.887, eval_loss: 0.6445, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2355, expected_sparsity: 0.231, expected_sequence_sparsity: 0.6853, target_sparsity: 0.2038, step: 5250 +lambda_1: -0.0573, lambda_2: 19.6480 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.96 0.8 0.5 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.7, 0.46, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.59, 0.27, 0.13] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111111100001111101111100010000011110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.002161, lagrangian_loss: 0.000049, attention_score_distillation_loss: 0.001377 +loss: 0.002037, lagrangian_loss: -0.000026, attention_score_distillation_loss: 0.001374 +ETA: 1:25:39 | Epoch 45 finished. Took 33.07 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:49:36 +Evaluating: f1: 0.8859, eval_loss: 0.6789, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2355, expected_sparsity: 0.231, expected_sequence_sparsity: 0.6853, target_sparsity: 0.2058, step: 5300 +lambda_1: -0.0599, lambda_2: 19.6521 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.95 0.96 0.79 0.49 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.7, 0.46, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.59, 0.27, 0.13] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111111100001111101111100010000011110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.001981, lagrangian_loss: -0.000029, attention_score_distillation_loss: 0.001370 +loss: 0.002279, lagrangian_loss: -0.000011, attention_score_distillation_loss: 0.001367 +---------------------------------------------------------------------- +time: 2023-07-19 14:49:51 +Evaluating: f1: 0.8754, eval_loss: 0.6692, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2371, expected_sparsity: 0.2335, expected_sequence_sparsity: 0.6863, target_sparsity: 0.2077, step: 5350 +lambda_1: -0.0195, lambda_2: 19.6541 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.96 0.79 0.49 0.49] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.68, 0.46, 0.46] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.58, 0.26, 0.12] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111101100001111101111100010000011110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101011001010000100100001001 +loss: 0.008156, lagrangian_loss: 0.000034, attention_score_distillation_loss: 0.001365 +loss: 0.002996, lagrangian_loss: 0.000086, attention_score_distillation_loss: 0.001362 +---------------------------------------------------------------------- +time: 2023-07-19 14:50:06 +Evaluating: f1: 0.8912, eval_loss: 0.6219, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2371, expected_sparsity: 0.2342, expected_sequence_sparsity: 0.6866, target_sparsity: 0.2097, step: 5400 +lambda_1: -0.1236, lambda_2: 19.6646 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.95 0.78 0.48 0.48] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.68, 0.46, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.58, 0.26, 0.12] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111101100001111101111100010000011110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101010001010000100100001001 +loss: 0.001552, lagrangian_loss: 0.000226, attention_score_distillation_loss: 0.001359 +ETA: 1:25:12 | Epoch 46 finished. Took 35.33 seconds. 
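+The lambda_1/lambda_2 columns and the small, sign-flipping `lagrangian_loss`
+values fit the usual constrained-L0 Lagrangian, in which the multipliers are
+trained adversarially (reg_learning_rate=0.01) to pin the expected sparsity
+to the warm-up target. A hedged sketch of that term -- the run's actual loss
+code is not shown in this log:
+
+    def lagrangian_loss(expected_sparsity, target_sparsity, lambda_1, lambda_2):
+        # penalty vanishes when the expected sparsity hits the target;
+        # lambda_1 drives the linear term, lambda_2 the quadratic one
+        gap = expected_sparsity - target_sparsity
+        return lambda_1 * gap + lambda_2 * gap ** 2
+
+With the optimizer keeping `gap` oscillating around zero, the logged term
+staying within about 1e-3 of zero and changing sign along with lambda_1 is
+exactly the behaviour this form would produce.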
+loss: 0.002585, lagrangian_loss: -0.000138, attention_score_distillation_loss: 0.001357 +---------------------------------------------------------------------- +time: 2023-07-19 14:50:20 +Evaluating: f1: 0.8801, eval_loss: 0.5993, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2371, expected_sparsity: 0.2342, expected_sequence_sparsity: 0.6866, target_sparsity: 0.2116, step: 5450 +lambda_1: -0.0930, lambda_2: 19.6823 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.95 0.76 0.48 0.48] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.68, 0.46, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.58, 0.26, 0.12] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111111100001111101111100010000010110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101010001010000100100001001 +loss: 0.078087, lagrangian_loss: -0.000103, attention_score_distillation_loss: 0.001354 +loss: 0.004137, lagrangian_loss: 0.000084, attention_score_distillation_loss: 0.001352 +---------------------------------------------------------------------- +time: 2023-07-19 14:50:35 +Evaluating: f1: 0.8873, eval_loss: 0.6385, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2371, expected_sparsity: 0.2342, expected_sequence_sparsity: 0.6866, target_sparsity: 0.2135, step: 5500 +lambda_1: 0.0919, lambda_2: 19.7210 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.95 0.95 0.77 0.48 0.48] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.68, 0.46, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.58, 0.26, 0.12] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110111111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111101100001111101111100010000011110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101010001010000100100001001 +loss: 0.004202, lagrangian_loss: -0.000107, attention_score_distillation_loss: 0.001349 +ETA: 1:24:38 | Epoch 47 finished. Took 33.18 seconds. +loss: 0.007732, lagrangian_loss: 0.000392, attention_score_distillation_loss: 0.001346 +---------------------------------------------------------------------- +time: 2023-07-19 14:50:50 +Evaluating: f1: 0.8821, eval_loss: 0.6445, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2412, expected_sparsity: 0.2367, expected_sequence_sparsity: 0.6876, target_sparsity: 0.2155, step: 5550 +lambda_1: -0.2254, lambda_2: 19.7955 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.95 0.95 0.76 0.47 0.48] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.66, 0.46, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.56, 0.26, 0.11] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111111011111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111101100001111101111100010000010110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101010001010000100100001001 +loss: 0.002439, lagrangian_loss: 0.000167, attention_score_distillation_loss: 0.001344 +loss: 0.003180, lagrangian_loss: -0.000499, attention_score_distillation_loss: 0.001340 +---------------------------------------------------------------------- +time: 2023-07-19 14:51:05 +Evaluating: f1: 0.8866, eval_loss: 0.6246, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2507, expected_sparsity: 0.2464, expected_sequence_sparsity: 0.6916, target_sparsity: 0.2174, step: 5600 +lambda_1: -0.0218, lambda_2: 19.8628 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.94 0.94 0.75 0.47 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.66, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.53, 0.24, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011111101100001111101111100010000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002003, lagrangian_loss: 0.000167, attention_score_distillation_loss: 0.001338 +loss: 0.001499, lagrangian_loss: 0.000071, attention_score_distillation_loss: 0.001334 +ETA: 1:24:03 | Epoch 48 finished. Took 33.02 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:51:19 +Evaluating: f1: 0.887, eval_loss: 0.6624, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2412, expected_sparsity: 0.2367, expected_sequence_sparsity: 0.6876, target_sparsity: 0.2194, step: 5650 +lambda_1: 0.1345, lambda_2: 19.9195 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.94 0.95 0.76 0.48 0.48] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.92, 0.66, 0.46, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.85, 0.56, 0.26, 0.11] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111111011111110110 +11111111111111111111101111111111011111111111110110 +11111111111011111101100001111101111100010000010110 +11111111111111111110101000000000000100001000000000 +11010000111101010101110101010001010000100100001001 +loss: 0.004546, lagrangian_loss: -0.000222, attention_score_distillation_loss: 0.001331 +loss: 0.009626, lagrangian_loss: 0.000608, attention_score_distillation_loss: 0.001327 +---------------------------------------------------------------------- +time: 2023-07-19 14:51:34 +Evaluating: f1: 0.8873, eval_loss: 0.6521, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2507, expected_sparsity: 0.2464, expected_sequence_sparsity: 0.6916, target_sparsity: 0.2213, step: 5700 +lambda_1: -0.2832, lambda_2: 20.0430 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.94 0.95 0.74 0.47 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.66, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.53, 0.24, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011110111100001111101111100010000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002739, lagrangian_loss: 0.000332, attention_score_distillation_loss: 0.001325 +loss: 0.001201, lagrangian_loss: -0.000605, attention_score_distillation_loss: 0.001323 +---------------------------------------------------------------------- +time: 2023-07-19 14:51:48 +Evaluating: f1: 0.8847, eval_loss: 0.6252, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2523, expected_sparsity: 0.2487, expected_sequence_sparsity: 0.6925, target_sparsity: 0.2233, step: 5750 +lambda_1: -0.0978, lambda_2: 20.1130 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.93 0.94 0.72 0.46 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.64, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.52, 0.23, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101111111111011111111011110110 +11111111111011110111100001111101111100000000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +ETA: 1:23:35 | Epoch 49 finished. Took 35.24 seconds. 
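+The target_sparsity column climbs by ~0.0019-0.0020 every 50 steps, i.e. a
+linear ramp toward the final 0.67 across lagrangian_warmup_epochs=150. A
+sketch of that schedule (115 steps/epoch is an assumption based on MRPC's
+3,668 training pairs at batch size 32):
+
+    def target_sparsity(step, final=0.67, warmup_epochs=150, steps_per_epoch=115):
+        # linear warm-up; the target is held at `final` once warm-up ends
+        return final * min(step / (warmup_epochs * steps_per_epoch), 1.0)
+
+    print(round(target_sparsity(5750), 4))  # 0.2233, as logged at step 5750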
+loss: 0.002118, lagrangian_loss: -0.000030, attention_score_distillation_loss: 0.001320 +loss: 0.001744, lagrangian_loss: 0.000472, attention_score_distillation_loss: 0.001316 +---------------------------------------------------------------------- +time: 2023-07-19 14:52:03 +Evaluating: f1: 0.8881, eval_loss: 0.5954, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2507, expected_sparsity: 0.2464, expected_sequence_sparsity: 0.6916, target_sparsity: 0.2252, step: 5800 +lambda_1: 0.2381, lambda_2: 20.2204 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.94 0.94 0.74 0.47 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.66, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.53, 0.24, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011110111100001111101111100010000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.001065, lagrangian_loss: -0.000535, attention_score_distillation_loss: 0.001314 +loss: 0.002065, lagrangian_loss: 0.000404, attention_score_distillation_loss: 0.001311 +---------------------------------------------------------------------- +time: 2023-07-19 14:52:18 +Evaluating: f1: 0.8832, eval_loss: 0.6782, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2523, expected_sparsity: 0.2487, expected_sequence_sparsity: 0.6925, target_sparsity: 0.2271, step: 5850 +lambda_1: -0.2462, lambda_2: 20.4010 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.94 0.94 0.73 0.46 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.64, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.52, 0.23, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011110111100001111101111100000000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.005399, lagrangian_loss: 0.000750, attention_score_distillation_loss: 0.001307 +ETA: 1:23:01 | Epoch 50 finished. Took 33.06 seconds. +loss: 0.002068, lagrangian_loss: -0.000690, attention_score_distillation_loss: 0.001304 +---------------------------------------------------------------------- +time: 2023-07-19 14:52:32 +Evaluating: f1: 0.8773, eval_loss: 0.7855, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.258, expected_sparsity: 0.2524, expected_sequence_sparsity: 0.694, target_sparsity: 0.2291, step: 5900 +lambda_1: -0.2118, lambda_2: 20.4835 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.93 0.93 0.71 0.45 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.62, 0.42, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.5, 0.21, 0.09] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111111011111111011111111011110110 +11111111111011110101100001111101111100000000010110 +11111111111111011110101000000000000100000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.001006, lagrangian_loss: -0.000531, attention_score_distillation_loss: 0.001302 +loss: 0.002221, lagrangian_loss: 0.000464, attention_score_distillation_loss: 0.001299 +---------------------------------------------------------------------- +time: 2023-07-19 14:52:47 +Evaluating: f1: 0.8789, eval_loss: 0.6777, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2523, expected_sparsity: 0.2487, expected_sequence_sparsity: 0.6925, target_sparsity: 0.231, step: 5950 +lambda_1: 0.2473, lambda_2: 20.6397 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.93 0.93 0.72 0.46 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.64, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.52, 0.23, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011110111100001111101111100000000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.004002, lagrangian_loss: -0.000142, attention_score_distillation_loss: 0.001297 +loss: 0.003563, lagrangian_loss: -0.000285, attention_score_distillation_loss: 0.001295 +ETA: 1:22:27 | Epoch 51 finished. Took 33.2 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:53:02 +Evaluating: f1: 0.8754, eval_loss: 0.7229, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2523, expected_sparsity: 0.2487, expected_sequence_sparsity: 0.6925, target_sparsity: 0.233, step: 6000 +lambda_1: -0.1442, lambda_2: 20.8115 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.93 0.94 0.72 0.46 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.64, 0.44, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.52, 0.23, 0.1] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111101011111111011111111111110110 +11111111111011110111100001111101111100000000010110 +11111111111111011110101000000000000100001000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.003513, lagrangian_loss: 0.001245, attention_score_distillation_loss: 0.001290 +loss: 0.003007, lagrangian_loss: 0.000043, attention_score_distillation_loss: 0.001288 +---------------------------------------------------------------------- +time: 2023-07-19 14:53:17 +Evaluating: f1: 0.8758, eval_loss: 0.724, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2612, expected_sparsity: 0.2555, expected_sequence_sparsity: 0.6953, target_sparsity: 0.2349, step: 6050 +lambda_1: -0.3557, lambda_2: 20.9263 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.92 0.92 0.69 0.44 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.88, 0.62, 0.42, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.79, 0.49, 0.21, 0.09] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111111011111111010111111011110110 +11111111111011110101100001111101111100000000010110 +11111111111111011110101000000000000100000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002983, lagrangian_loss: -0.001101, attention_score_distillation_loss: 0.001286 +loss: 0.001251, lagrangian_loss: -0.000030, attention_score_distillation_loss: 0.001283 +ETA: 1:21:53 | Epoch 52 finished. Took 33.42 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:53:32 +Evaluating: f1: 0.8836, eval_loss: 0.676, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2612, expected_sparsity: 0.2555, expected_sequence_sparsity: 0.6953, target_sparsity: 0.2368, step: 6100 +lambda_1: 0.1605, lambda_2: 21.1270 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.92 0.92 0.69 0.44 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.88, 0.62, 0.42, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.79, 0.49, 0.21, 0.09] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111111011111111010111111011110110 +11111111111011110101100001111101111100000000010110 +11111111111111011110101000000000000100000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.004098, lagrangian_loss: 0.000299, attention_score_distillation_loss: 0.001280 +loss: 0.003099, lagrangian_loss: -0.000458, attention_score_distillation_loss: 0.001277 +---------------------------------------------------------------------- +time: 2023-07-19 14:53:46 +Evaluating: f1: 0.8807, eval_loss: 0.7365, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.258, expected_sparsity: 0.2524, expected_sequence_sparsity: 0.694, target_sparsity: 0.2388, step: 6150 +lambda_1: 0.0093, lambda_2: 21.2409 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.93 0.92 0.7 0.45 0.47] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 0.62, 0.42, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.5, 0.21, 0.09] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111111111111111010111111011110110 +11111111111011110101100001111101111100000000010110 +11111111111111011110101000000000000100000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002106, lagrangian_loss: 0.000492, attention_score_distillation_loss: 0.001274 +loss: 0.003035, lagrangian_loss: 0.000563, attention_score_distillation_loss: 0.001271 +---------------------------------------------------------------------- +time: 2023-07-19 14:54:01 +Evaluating: f1: 0.8788, eval_loss: 0.6134, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2627, expected_sparsity: 0.2577, expected_sequence_sparsity: 0.6962, target_sparsity: 0.2407, step: 6200 +lambda_1: -0.3376, lambda_2: 21.3858 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.92 0.91 0.68 0.44 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.88, 0.6, 0.42, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.79, 0.48, 0.2, 0.09] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011111110110 +11111111111111111111111011111111010111111011110110 +11111111111011010101100001111101111100000000010110 +11111111111111011110101000000000000100000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002766, lagrangian_loss: -0.000765, attention_score_distillation_loss: 0.001269 +ETA: 1:21:25 | Epoch 53 finished. Took 35.54 seconds. 
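+`train remain` is fractional while `infer remain` always lands on multiples
+of 1/50, which matches hard concrete L0 gates over the [10, 50] token_loga
+table: training reports each layer's mean expected open probability, while
+evaluation binarizes the deterministic gate. A sketch under that assumption
+(the stretch limits gamma/zeta are the common -0.1/1.1, not confirmed by this
+log; the temperature 0.67 is from the run header):
+
+    import numpy as np
+
+    GAMMA, ZETA, BETA = -0.1, 1.1, 0.67
+
+    def train_remain(token_loga):
+        # mean P(gate > 0) per layer for hard concrete gates (Louizos et al., 2018)
+        p_open = 1.0 / (1.0 + np.exp(-(token_loga - BETA * np.log(-GAMMA / ZETA))))
+        return p_open.mean(axis=-1)
+
+    def infer_remain(token_loga):
+        # deterministic gate, thresholded to the 0/1 masks printed above
+        s = 1.0 / (1.0 + np.exp(-token_loga)) * (ZETA - GAMMA) + GAMMA
+        return (np.clip(s, 0.0, 1.0) > 0.5).mean(axis=-1)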
+loss: 0.001574, lagrangian_loss: -0.000224, attention_score_distillation_loss: 0.001266 +---------------------------------------------------------------------- +time: 2023-07-19 14:54:16 +Evaluating: f1: 0.8803, eval_loss: 0.6397, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2636, expected_sequence_sparsity: 0.6986, target_sparsity: 0.2427, step: 6250 +lambda_1: 0.0915, lambda_2: 21.5435 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.92 0.92 0.68 0.43 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.6, 0.4, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.46, 0.19, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011010111100001111101101100000000010110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.001186, lagrangian_loss: 0.000041, attention_score_distillation_loss: 0.001264 +loss: 0.001851, lagrangian_loss: -0.000062, attention_score_distillation_loss: 0.001260 +---------------------------------------------------------------------- +time: 2023-07-19 14:54:30 +Evaluating: f1: 0.877, eval_loss: 0.6131, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2636, expected_sequence_sparsity: 0.6986, target_sparsity: 0.2446, step: 6300 +lambda_1: -0.1096, lambda_2: 21.6312 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.92 0.92 0.68 0.43 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.6, 0.4, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.46, 0.19, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011010111100001111101101100000000010110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.001941, lagrangian_loss: 0.000522, attention_score_distillation_loss: 0.001257 +ETA: 1:20:51 | Epoch 54 finished. Took 33.03 seconds. +loss: 0.008417, lagrangian_loss: -0.000074, attention_score_distillation_loss: 0.001256 +---------------------------------------------------------------------- +time: 2023-07-19 14:54:45 +Evaluating: f1: 0.8786, eval_loss: 0.7099, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2657, expected_sequence_sparsity: 0.6995, target_sparsity: 0.2466, step: 6350 +lambda_1: -0.2436, lambda_2: 21.6846 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
0.99 0.91 0.91 0.66 0.42 0.45] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.58, 0.4, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.45, 0.18, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011110111100001011101101100000000000110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002144, lagrangian_loss: -0.000519, attention_score_distillation_loss: 0.001252 +loss: 0.002197, lagrangian_loss: 0.000012, attention_score_distillation_loss: 0.001249 +---------------------------------------------------------------------- +time: 2023-07-19 14:55:00 +Evaluating: f1: 0.8801, eval_loss: 0.6969, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2657, expected_sequence_sparsity: 0.6995, target_sparsity: 0.2485, step: 6400 +lambda_1: 0.1225, lambda_2: 21.7899 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.91 0.92 0.66 0.43 0.45] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.58, 0.4, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.45, 0.18, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011010111100001011101111100000000000110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.002499, lagrangian_loss: 0.000054, attention_score_distillation_loss: 0.001245 +loss: 0.000613, lagrangian_loss: -0.000180, attention_score_distillation_loss: 0.001243 +ETA: 1:20:16 | Epoch 55 finished. Took 33.01 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 14:55:14 +Evaluating: f1: 0.8708, eval_loss: 0.6674, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2636, expected_sequence_sparsity: 0.6986, target_sparsity: 0.2504, step: 6450 +lambda_1: -0.0735, lambda_2: 21.8731 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 
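+The per-epoch ETA lines are consistent with (total_epochs - epochs_finished)
+times the recent average epoch duration, with 0-indexed epoch numbers. For
+the line above:
+
+    finished, total = 57, 200      # "Epoch 56 finished" -> 57 epochs done
+    mean_epoch_seconds = 33.5      # recent epochs took ~33-35 s
+    eta = (total - finished) * mean_epoch_seconds
+    print(f"{eta // 3600:.0f}:{eta % 3600 // 60:02.0f}")  # ~1:19, as logged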
0.99 0.91 0.92 0.66 0.43 0.46] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.6, 0.4, 0.44] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.46, 0.19, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011110111100001011101111100000000000110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101011001010000100100001000 +loss: 0.000863, lagrangian_loss: 0.000551, attention_score_distillation_loss: 0.001240 +loss: 0.003027, lagrangian_loss: 0.000119, attention_score_distillation_loss: 0.001237 +---------------------------------------------------------------------- +time: 2023-07-19 14:55:29 +Evaluating: f1: 0.8793, eval_loss: 0.6707, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2662, expected_sequence_sparsity: 0.6997, target_sparsity: 0.2524, step: 6500 +lambda_1: -0.2447, lambda_2: 21.9424 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.9 0.91 0.64 0.42 0.45] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.58, 0.4, 0.42] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.45, 0.18, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011110111100001011101101100000000000110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101010001010000100100001000 +loss: 0.000859, lagrangian_loss: -0.000530, attention_score_distillation_loss: 0.001233 +loss: 0.001205, lagrangian_loss: 0.000070, attention_score_distillation_loss: 0.001230 +---------------------------------------------------------------------- +time: 2023-07-19 14:55:44 +Evaluating: f1: 0.8792, eval_loss: 0.6639, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2662, expected_sequence_sparsity: 0.6997, target_sparsity: 0.2543, step: 6550 +lambda_1: 0.1470, lambda_2: 22.0688 lambda_3: 0.0000 +train remain: [1. 1. 1. 1. 0.99 0.9 0.91 0.64 0.42 0.45] +infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.58, 0.4, 0.42] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.45, 0.18, 0.08] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111111110011011110110 +11111111111111111111111011111111010111111011110110 +11111111111011110111100001011101101100000000000110 +11111111111111011110101000000000000000000000000000 +11010000111101010101110101010001010000100100001000 +loss: 0.001461, lagrangian_loss: 0.000101, attention_score_distillation_loss: 0.001229 +ETA: 1:19:47 | Epoch 56 finished. Took 35.15 seconds. 
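The ten digit rows printed under each evaluation are the per-layer token keep masks: one row per prune location (layers 2-11), one character per token bin (bin_num=50), with 1 = bin kept and 0 = bin pruned. Each entry of "infer remain" is simply the fraction of 1s in the corresponding row; e.g. the 0.88 rows above have 6 of their 50 bins zeroed. A minimal sketch of that bookkeeping (hypothetical helper name, not this repo's code):

    def keep_fraction(mask_row: str) -> float:
        # fraction of token bins kept in one pruned layer
        return mask_row.count("1") / len(mask_row)

    row = "11111111111111111111111111011111111110011011110110"
    print(keep_fraction(row))  # 0.88, matching "infer remain" for that layer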
+loss: 0.001796, lagrangian_loss: -0.000160, attention_score_distillation_loss: 0.001224
+----------------------------------------------------------------------
+time: 2023-07-19 14:55:58
+Evaluating: f1: 0.8737, eval_loss: 0.6714, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.269, expected_sparsity: 0.2657, expected_sequence_sparsity: 0.6995, target_sparsity: 0.2563, step: 6600
+lambda_1: -0.0954, lambda_2: 22.1629 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.9 0.91 0.65 0.42 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.88, 0.58, 0.4, 0.44]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.45, 0.18, 0.08]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110110
+11111111111111111111111011111111010111111011110110
+11111111111011110111100001011101101100000000000110
+11111111111111011110101000000000000000000000000000
+11010000111101010101110101010001010000100100001001
+loss: 0.019113, lagrangian_loss: 0.000455, attention_score_distillation_loss: 0.001222
+loss: 0.001869, lagrangian_loss: -0.000118, attention_score_distillation_loss: 0.001220
+----------------------------------------------------------------------
+time: 2023-07-19 14:56:13
+Evaluating: f1: 0.8811, eval_loss: 0.6534, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2811, expected_sparsity: 0.2766, expected_sequence_sparsity: 0.704, target_sparsity: 0.2582, step: 6650
+lambda_1: -0.2236, lambda_2: 22.2402 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.89 0.9 0.62 0.41 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.56, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.41, 0.16, 0.07]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000110
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.004571, lagrangian_loss: -0.000478, attention_score_distillation_loss: 0.001218
+ETA: 1:19:12 | Epoch 57 finished. Took 32.99 seconds.
+loss: 0.001692, lagrangian_loss: 0.000172, attention_score_distillation_loss: 0.001215
+----------------------------------------------------------------------
+time: 2023-07-19 14:56:27
+Evaluating: f1: 0.8815, eval_loss: 0.6591, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2811, expected_sparsity: 0.2755, expected_sequence_sparsity: 0.7035, target_sparsity: 0.2602, step: 6700
+lambda_1: 0.1908, lambda_2: 22.3762 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.89 0.91 0.63 0.41 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.56, 0.4, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.41, 0.17, 0.07]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000110
+11111111111111011110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.000950, lagrangian_loss: 0.000116, attention_score_distillation_loss: 0.001212
+loss: 0.001666, lagrangian_loss: -0.000258, attention_score_distillation_loss: 0.001208
+----------------------------------------------------------------------
+time: 2023-07-19 14:56:42
+Evaluating: f1: 0.8739, eval_loss: 0.6494, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2811, expected_sparsity: 0.2755, expected_sequence_sparsity: 0.7035, target_sparsity: 0.2621, step: 6750
+lambda_1: -0.1156, lambda_2: 22.5201 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.89 0.91 0.63 0.42 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.56, 0.4, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.41, 0.17, 0.07]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000110
+11111111111111011110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001611, lagrangian_loss: 0.000619, attention_score_distillation_loss: 0.001206
+loss: 0.001479, lagrangian_loss: -0.000047, attention_score_distillation_loss: 0.001201
+ETA: 1:18:37 | Epoch 58 finished. Took 32.83 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 14:56:57
+Evaluating: f1: 0.8803, eval_loss: 0.6538, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2842, expected_sparsity: 0.2786, expected_sequence_sparsity: 0.7048, target_sparsity: 0.264, step: 6800
+lambda_1: -0.2579, lambda_2: 22.6256 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.87 0.9 0.6 0.41 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.54, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.4, 0.15, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001876, lagrangian_loss: -0.000645, attention_score_distillation_loss: 0.001199
+loss: 0.002166, lagrangian_loss: 0.000364, attention_score_distillation_loss: 0.001197
+----------------------------------------------------------------------
+time: 2023-07-19 14:57:11
+Evaluating: f1: 0.875, eval_loss: 0.6359, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2811, expected_sparsity: 0.2766, expected_sequence_sparsity: 0.704, target_sparsity: 0.266, step: 6850
+lambda_1: 0.2149, lambda_2: 22.8189 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.88 0.91 0.61 0.41 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.56, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.41, 0.16, 0.07]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000110
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.002707, lagrangian_loss: -0.000101, attention_score_distillation_loss: 0.001194
+loss: 0.000903, lagrangian_loss: -0.000170, attention_score_distillation_loss: 0.001190
+----------------------------------------------------------------------
+time: 2023-07-19 14:57:26
+Evaluating: f1: 0.8688, eval_loss: 0.7577, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2811, expected_sparsity: 0.2766, expected_sequence_sparsity: 0.704, target_sparsity: 0.2679, step: 6900
+lambda_1: -0.1501, lambda_2: 22.9911 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.88 0.91 0.61 0.41 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.56, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.41, 0.16, 0.07]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000110
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+ETA: 1:18:08 | Epoch 59 finished. Took 35.07 seconds.
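"layerwise remain" is the running product of the per-layer "infer remain" ratios, with leading 1.0 entries for the never-pruned layers 0-1: a token dropped at layer l is unavailable to every later layer, so the surviving fraction compounds. At step 6800 above: 0.86, then 0.86·0.86 ≈ 0.74, ·0.54 ≈ 0.40, ·0.38 ≈ 0.15, ·0.42 ≈ 0.06. A sketch of that arithmetic (an illustrative reconstruction, not the repo's code):

    import numpy as np

    # "infer remain" for the 10 prune locations (layers 2-11), step 6800
    infer_remain = [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.86, 0.54, 0.38, 0.42]
    # layers 0-1 are never pruned; each later layer only sees the tokens
    # all earlier layers kept, hence the cumulative product
    layerwise = [1.0, 1.0] + list(np.round(np.cumprod(infer_remain), 2))
    print(layerwise)  # [1.0, ..., 0.86, 0.74, 0.4, 0.15, 0.06]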
+loss: 0.002016, lagrangian_loss: 0.000735, attention_score_distillation_loss: 0.001188
+loss: 0.001338, lagrangian_loss: -0.000265, attention_score_distillation_loss: 0.001186
+----------------------------------------------------------------------
+time: 2023-07-19 14:57:40
+Evaluating: f1: 0.8784, eval_loss: 0.614, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2874, expected_sparsity: 0.2829, expected_sequence_sparsity: 0.7066, target_sparsity: 0.2699, step: 6950
+lambda_1: -0.2297, lambda_2: 23.0953 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.86 0.9 0.59 0.4 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.54, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.39, 0.15, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001778, lagrangian_loss: -0.000544, attention_score_distillation_loss: 0.001182
+loss: 0.002700, lagrangian_loss: 0.000403, attention_score_distillation_loss: 0.001179
+----------------------------------------------------------------------
+time: 2023-07-19 14:57:55
+Evaluating: f1: 0.8854, eval_loss: 0.6061, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2874, expected_sparsity: 0.2829, expected_sequence_sparsity: 0.7066, target_sparsity: 0.2718, step: 7000
+lambda_1: 0.2091, lambda_2: 23.2701 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.87 0.9 0.6 0.4 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.54, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.39, 0.15, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.000911, lagrangian_loss: -0.000106, attention_score_distillation_loss: 0.001176
+ETA: 1:17:32 | Epoch 60 finished. Took 32.76 seconds.
+loss: 0.001643, lagrangian_loss: 0.000038, attention_score_distillation_loss: 0.001174
+----------------------------------------------------------------------
+time: 2023-07-19 14:58:09
+Evaluating: f1: 0.8858, eval_loss: 0.6365, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2874, expected_sparsity: 0.2829, expected_sequence_sparsity: 0.7066, target_sparsity: 0.2737, step: 7050
+lambda_1: -0.1898, lambda_2: 23.4589 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.99 0.87 0.9 0.6 0.4 0.45]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.54, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.39, 0.15, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011110010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001754, lagrangian_loss: 0.000846, attention_score_distillation_loss: 0.001171
+loss: 0.003400, lagrangian_loss: -0.000338, attention_score_distillation_loss: 0.001167
+----------------------------------------------------------------------
+time: 2023-07-19 14:58:24
+Evaluating: f1: 0.8832, eval_loss: 0.6641, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2889, expected_sparsity: 0.2848, expected_sequence_sparsity: 0.7073, target_sparsity: 0.2757, step: 7100
+lambda_1: -0.2418, lambda_2: 23.5646 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.85 0.89 0.57 0.39 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.52, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.38, 0.14, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011010010
+11111111111111111111111011111111010111011011110110
+11111111111011000111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.000840, lagrangian_loss: -0.000599, attention_score_distillation_loss: 0.001165
+loss: 0.001796, lagrangian_loss: 0.000224, attention_score_distillation_loss: 0.001162
+ETA: 1:16:58 | Epoch 61 finished. Took 32.83 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 14:58:38
+Evaluating: f1: 0.8804, eval_loss: 0.6649, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2874, expected_sparsity: 0.2829, expected_sequence_sparsity: 0.7066, target_sparsity: 0.2776, step: 7150
+lambda_1: 0.1721, lambda_2: 23.7344 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.86 0.9 0.58 0.4 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.54, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.39, 0.15, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011010010
+11111111111111111111111011111111010111011011110110
+11111111111011010111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.002218, lagrangian_loss: -0.000113, attention_score_distillation_loss: 0.001159
+loss: 0.001614, lagrangian_loss: 0.000072, attention_score_distillation_loss: 0.001156
+----------------------------------------------------------------------
+time: 2023-07-19 14:58:53
+Evaluating: f1: 0.8612, eval_loss: 0.695, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2889, expected_sparsity: 0.2848, expected_sequence_sparsity: 0.7073, target_sparsity: 0.2796, step: 7200
+lambda_1: -0.1783, lambda_2: 23.8799 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.86 0.9 0.58 0.4 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.86, 0.52, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.38, 0.14, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011011010010
+11111111111111111111111011111111010111011011110110
+11111111111011000111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001262, lagrangian_loss: 0.000504, attention_score_distillation_loss: 0.001153
+loss: 0.002022, lagrangian_loss: -0.000058, attention_score_distillation_loss: 0.001150
+ETA: 1:16:23 | Epoch 62 finished. Took 32.89 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 14:59:08
+Evaluating: f1: 0.8668, eval_loss: 0.6813, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2815, step: 7250
+lambda_1: -0.3009, lambda_2: 23.9464 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.85 0.89 0.55 0.39 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.002031, lagrangian_loss: -0.000645, attention_score_distillation_loss: 0.001148
+loss: 0.001770, lagrangian_loss: 0.000020, attention_score_distillation_loss: 0.001144
+----------------------------------------------------------------------
+time: 2023-07-19 14:59:22
+Evaluating: f1: 0.8817, eval_loss: 0.5893, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2835, step: 7300
+lambda_1: 0.1516, lambda_2: 24.1251 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.85 0.89 0.55 0.39 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001972, lagrangian_loss: 0.000150, attention_score_distillation_loss: 0.001141
+loss: 0.001034, lagrangian_loss: -0.000322, attention_score_distillation_loss: 0.001138
+----------------------------------------------------------------------
+time: 2023-07-19 14:59:37
+Evaluating: f1: 0.8759, eval_loss: 0.6431, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.2952, expected_sparsity: 0.2916, expected_sequence_sparsity: 0.7101, target_sparsity: 0.2854, step: 7350
+lambda_1: -0.0369, lambda_2: 24.2518 lambda_3: 0.0000
+train remain: [1. 1. 1. 1. 0.98 0.85 0.9 0.56 0.39 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.52, 0.38, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.36, 0.14, 0.06]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101100000000000100
+11111111111111010110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.002440, lagrangian_loss: 0.000538, attention_score_distillation_loss: 0.001135
+ETA: 1:15:53 | Epoch 63 finished. Took 35.1 seconds.
+loss: 0.001492, lagrangian_loss: 0.000477, attention_score_distillation_loss: 0.001134
+----------------------------------------------------------------------
+time: 2023-07-19 14:59:51
+Evaluating: f1: 0.8858, eval_loss: 0.6231, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2873, step: 7400
+lambda_1: -0.3572, lambda_2: 24.4122 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.98 0.84 0.89 0.54 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001500, lagrangian_loss: -0.000750, attention_score_distillation_loss: 0.001130
+loss: 0.006141, lagrangian_loss: -0.000199, attention_score_distillation_loss: 0.001128
+----------------------------------------------------------------------
+time: 2023-07-19 15:00:06
+Evaluating: f1: 0.8736, eval_loss: 0.6062, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2893, step: 7450
+lambda_1: 0.1396, lambda_2: 24.6479 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.84 0.89 0.53 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010011010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001895, lagrangian_loss: 0.000383, attention_score_distillation_loss: 0.001125
+ETA: 1:15:18 | Epoch 64 finished. Took 32.89 seconds.
+loss: 0.004323, lagrangian_loss: -0.000458, attention_score_distillation_loss: 0.001122
+----------------------------------------------------------------------
+time: 2023-07-19 15:00:21
+Evaluating: f1: 0.8797, eval_loss: 0.594, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2912, step: 7500
+lambda_1: 0.0277, lambda_2: 24.7992 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.98 0.84 0.89 0.54 0.39 0.44]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.005531, lagrangian_loss: 0.000449, attention_score_distillation_loss: 0.001118
+loss: 0.001741, lagrangian_loss: 0.000555, attention_score_distillation_loss: 0.001116
+----------------------------------------------------------------------
+time: 2023-07-19 15:00:35
+Evaluating: f1: 0.8938, eval_loss: 0.6211, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.3057, expected_sparsity: 0.3003, expected_sequence_sparsity: 0.7137, target_sparsity: 0.2932, step: 7550
+lambda_1: -0.3552, lambda_2: 25.0145 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.83 0.89 0.52 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.84, 0.48, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.32, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.002066, lagrangian_loss: -0.000615, attention_score_distillation_loss: 0.001112
+loss: 0.004104, lagrangian_loss: -0.000296, attention_score_distillation_loss: 0.001110
+ETA: 1:14:44 | Epoch 65 finished. Took 33.04 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 15:00:50
+Evaluating: f1: 0.8739, eval_loss: 0.6106, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.3057, expected_sparsity: 0.3003, expected_sequence_sparsity: 0.7137, target_sparsity: 0.2951, step: 7600
+lambda_1: 0.0927, lambda_2: 25.2199 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.82 0.88 0.52 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.84, 0.48, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.32, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001534, lagrangian_loss: 0.000301, attention_score_distillation_loss: 0.001107
+loss: 0.002766, lagrangian_loss: -0.000288, attention_score_distillation_loss: 0.001105
+----------------------------------------------------------------------
+time: 2023-07-19 15:01:05
+Evaluating: f1: 0.8923, eval_loss: 0.6507, token_prune_loc: [False, False, False, False, False, True, True, True, True, True], macs_sparsity: 0.301, expected_sparsity: 0.2944, expected_sequence_sparsity: 0.7113, target_sparsity: 0.2971, step: 7650
+lambda_1: 0.0657, lambda_2: 25.3355 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.83 0.89 0.53 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.84, 0.5, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.69, 0.34, 0.12, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111011111111110011001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101101000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001663, lagrangian_loss: 0.000195, attention_score_distillation_loss: 0.001104
+loss: 0.001463, lagrangian_loss: 0.000902, attention_score_distillation_loss: 0.001099
+----------------------------------------------------------------------
+time: 2023-07-19 15:01:19
+Evaluating: f1: 0.8912, eval_loss: 0.6398, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3215, expected_sparsity: 0.3153, expected_sequence_sparsity: 0.7198, target_sparsity: 0.299, step: 7700
+lambda_1: -0.3977, lambda_2: 25.5696 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.82 0.88 0.51 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.84, 0.48, 0.36, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.63, 0.3, 0.11, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001730, lagrangian_loss: -0.000338, attention_score_distillation_loss: 0.001096
+ETA: 1:14:14 | Epoch 66 finished. Took 35.26 seconds.
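The sign-flipping lagrangian_loss, lambda_1 hovering near zero, and lambda_2 climbing steadily are what the two-multiplier Lagrangian sparsity penalty used in L0-regularization pruning (CoFi-style) produces: the penalty lambda_1*(s - t) + lambda_2*(s - t)^2 is minimized w.r.t. the model and maximized w.r.t. the multipliers, so it oscillates around zero once the expected sparsity s tracks the target t. The exact form in this codebase is an assumption; a minimal sketch:

    import torch

    def lagrangian_regularization(expected_sparsity: torch.Tensor,
                                  target_sparsity: float,
                                  lambda_1: torch.Tensor,
                                  lambda_2: torch.Tensor) -> torch.Tensor:
        # zero exactly when the expected sparsity hits the target; the
        # multipliers are trained by gradient ascent (reg_learning_rate=0.01
        # above), which is why the logged value flips sign around zero
        gap = expected_sparsity - target_sparsity
        return lambda_1 * gap + lambda_2 * gap.pow(2)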
+loss: 0.035291, lagrangian_loss: -0.000709, attention_score_distillation_loss: 0.001093
+----------------------------------------------------------------------
+time: 2023-07-19 15:01:34
+Evaluating: f1: 0.8912, eval_loss: 0.6231, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3215, expected_sparsity: 0.3153, expected_sequence_sparsity: 0.7198, target_sparsity: 0.3009, step: 7750
+lambda_1: -0.0120, lambda_2: 25.7761 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.96 0.82 0.88 0.5 0.37 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.84, 0.48, 0.36, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.63, 0.3, 0.11, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001078, lagrangian_loss: 0.000373, attention_score_distillation_loss: 0.001091
+loss: 0.001250, lagrangian_loss: -0.000127, attention_score_distillation_loss: 0.001087
+----------------------------------------------------------------------
+time: 2023-07-19 15:01:49
+Evaluating: f1: 0.8874, eval_loss: 0.6631, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3215, expected_sparsity: 0.315, expected_sequence_sparsity: 0.7197, target_sparsity: 0.3029, step: 7800
+lambda_1: 0.1748, lambda_2: 25.9357 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.97 0.82 0.88 0.51 0.38 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.84, 0.48, 0.36, 0.42]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.63, 0.3, 0.11, 0.05]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011101100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100001000
+loss: 0.001116, lagrangian_loss: -0.000294, attention_score_distillation_loss: 0.001086
+ETA: 1:13:40 | Epoch 67 finished. Took 33.12 seconds.
+loss: 0.002814, lagrangian_loss: 0.000754, attention_score_distillation_loss: 0.001082
+----------------------------------------------------------------------
+time: 2023-07-19 15:02:03
+Evaluating: f1: 0.8885, eval_loss: 0.6123, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3215, expected_sparsity: 0.3169, expected_sequence_sparsity: 0.7205, target_sparsity: 0.3048, step: 7850
+lambda_1: -0.3574, lambda_2: 26.2299 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.96 0.82 0.88 0.5 0.37 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.84, 0.46, 0.36, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.63, 0.29, 0.1, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011001100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001599, lagrangian_loss: -0.000116, attention_score_distillation_loss: 0.001079
+loss: 0.001467, lagrangian_loss: -0.000779, attention_score_distillation_loss: 0.001077
+----------------------------------------------------------------------
+time: 2023-07-19 15:02:18
+Evaluating: f1: 0.89, eval_loss: 0.6377, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3231, expected_sparsity: 0.3198, expected_sequence_sparsity: 0.7217, target_sparsity: 0.3068, step: 7900
+lambda_1: -0.1190, lambda_2: 26.3841 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.95 0.81 0.87 0.49 0.37 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.82, 0.46, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.62, 0.28, 0.1, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111100001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001852, lagrangian_loss: -0.000098, attention_score_distillation_loss: 0.001074
+loss: 0.111494, lagrangian_loss: 0.000177, attention_score_distillation_loss: 0.001071
+ETA: 1:13:06 | Epoch 68 finished. Took 33.3 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 15:02:33
+Evaluating: f1: 0.8889, eval_loss: 0.627, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3215, expected_sparsity: 0.3169, expected_sequence_sparsity: 0.7205, target_sparsity: 0.3087, step: 7950
+lambda_1: 0.2008, lambda_2: 26.5781 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.96 0.81 0.87 0.5 0.37 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.84, 0.46, 0.36, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.63, 0.29, 0.1, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011111111010111011011110110
+11111111111011000111100001011001100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001926, lagrangian_loss: -0.000359, attention_score_distillation_loss: 0.001068
+loss: 0.003467, lagrangian_loss: 0.001028, attention_score_distillation_loss: 0.001065
+----------------------------------------------------------------------
+time: 2023-07-19 15:02:48
+Evaluating: f1: 0.892, eval_loss: 0.6134, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3231, expected_sparsity: 0.3198, expected_sequence_sparsity: 0.7217, target_sparsity: 0.3106, step: 8000
+lambda_1: -0.4614, lambda_2: 27.0802 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.96 0.81 0.88 0.49 0.37 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.94, 0.8, 0.82, 0.46, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.94, 0.75, 0.62, 0.28, 0.1, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101111110
+11111111111111111111111111011111111110010001010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111100001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.002481, lagrangian_loss: 0.002122, attention_score_distillation_loss: 0.001061
+loss: 0.004067, lagrangian_loss: -0.001009, attention_score_distillation_loss: 0.001059
+----------------------------------------------------------------------
+time: 2023-07-19 15:03:02
+Evaluating: f1: 0.8881, eval_loss: 0.6202, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3126, step: 8050
+lambda_1: -0.5032, lambda_2: 27.3536 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.98 0.95 0.8 0.86 0.48 0.36 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+ETA: 1:12:36 | Epoch 69 finished. Took 35.39 seconds.
+loss: 0.002666, lagrangian_loss: -0.002139, attention_score_distillation_loss: 0.001056
+loss: 0.001414, lagrangian_loss: 0.000775, attention_score_distillation_loss: 0.001053
+----------------------------------------------------------------------
+time: 2023-07-19 15:03:17
+Evaluating: f1: 0.8968, eval_loss: 0.6087, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3325, expected_sparsity: 0.3282, expected_sequence_sparsity: 0.7251, target_sparsity: 0.3145, step: 8100
+lambda_1: 0.3582, lambda_2: 28.1973 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.98 0.95 0.8 0.87 0.48 0.36 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.46, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.27, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111000001011001100000000000000110
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.002814, lagrangian_loss: 0.000167, attention_score_distillation_loss: 0.001049
+loss: 0.002013, lagrangian_loss: -0.001005, attention_score_distillation_loss: 0.001046
+----------------------------------------------------------------------
+time: 2023-07-19 15:03:32
+Evaluating: f1: 0.8957, eval_loss: 0.5946, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.331, expected_sparsity: 0.3275, expected_sequence_sparsity: 0.7248, target_sparsity: 0.3165, step: 8150
+lambda_1: -0.0308, lambda_2: 28.6338 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.99 0.96 0.81 0.87 0.49 0.37 0.43]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.46, 0.36, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.27, 0.1, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111100001011001100000000000000100
+11111111111111000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001538, lagrangian_loss: 0.001166, attention_score_distillation_loss: 0.001044
+ETA: 1:12:02 | Epoch 70 finished. Took 33.05 seconds.
+loss: 0.001991, lagrangian_loss: 0.000849, attention_score_distillation_loss: 0.001042
+----------------------------------------------------------------------
+time: 2023-07-19 15:03:46
+Evaluating: f1: 0.8969, eval_loss: 0.6016, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3184, step: 8200
+lambda_1: -0.4770, lambda_2: 29.0220 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.98 0.95 0.8 0.86 0.47 0.36 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001155, lagrangian_loss: -0.001089, attention_score_distillation_loss: 0.001038
+loss: 0.001028, lagrangian_loss: -0.000495, attention_score_distillation_loss: 0.001036
+----------------------------------------------------------------------
+time: 2023-07-19 15:04:01
+Evaluating: f1: 0.8919, eval_loss: 0.6308, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3204, step: 8250
+lambda_1: 0.0840, lambda_2: 29.4391 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.97 0.95 0.79 0.86 0.47 0.36 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011110111010111011011110110
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.000888, lagrangian_loss: 0.000396, attention_score_distillation_loss: 0.001032
+loss: 0.000841, lagrangian_loss: -0.000264, attention_score_distillation_loss: 0.001029
+ETA: 1:11:27 | Epoch 71 finished. Took 33.05 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 15:04:16
+Evaluating: f1: 0.8801, eval_loss: 0.6063, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3223, step: 8300
+lambda_1: 0.0671, lambda_2: 29.6244 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.98 0.96 0.8 0.87 0.48 0.36 0.42]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011111111010111011011110100
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001465, lagrangian_loss: 0.000267, attention_score_distillation_loss: 0.001027
+loss: 0.001035, lagrangian_loss: 0.001036, attention_score_distillation_loss: 0.001024
+----------------------------------------------------------------------
+time: 2023-07-19 15:04:30
+Evaluating: f1: 0.8889, eval_loss: 0.5884, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3242, step: 8350
+lambda_1: -0.3564, lambda_2: 29.9035 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.97 0.95 0.79 0.86 0.47 0.36 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011111111010111011011110100
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.004048, lagrangian_loss: -0.000350, attention_score_distillation_loss: 0.001022
+loss: 0.002446, lagrangian_loss: -0.000377, attention_score_distillation_loss: 0.001018
+ETA: 1:10:53 | Epoch 72 finished. Took 33.08 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 15:04:45
+Evaluating: f1: 0.8893, eval_loss: 0.576, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3398, expected_sparsity: 0.3333, expected_sequence_sparsity: 0.7272, target_sparsity: 0.3262, step: 8400
+lambda_1: 0.0519, lambda_2: 30.1600 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.96 0.95 0.79 0.86 0.47 0.35 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.76, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.7, 0.57, 0.25, 0.09, 0.03]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111100010000010010
+11111111011111111111111011111111010111011011110100
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.002517, lagrangian_loss: 0.000296, attention_score_distillation_loss: 0.001015
+loss: 0.001100, lagrangian_loss: -0.000196, attention_score_distillation_loss: 0.001012
+----------------------------------------------------------------------
+time: 2023-07-19 15:05:00
+Evaluating: f1: 0.8866, eval_loss: 0.6362, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3341, expected_sparsity: 0.3297, expected_sequence_sparsity: 0.7258, target_sparsity: 0.3281, step: 8450
+lambda_1: 0.0082, lambda_2: 30.2973 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.97 0.95 0.79 0.86 0.47 0.36 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.78, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.72, 0.59, 0.26, 0.09, 0.04]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111110010000010010
+11111111011111111111111011111111010111011011110100
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.000964, lagrangian_loss: 0.000257, attention_score_distillation_loss: 0.001009
+loss: 0.002089, lagrangian_loss: 0.000265, attention_score_distillation_loss: 0.001007
+----------------------------------------------------------------------
+time: 2023-07-19 15:05:14
+Evaluating: f1: 0.885, eval_loss: 0.6058, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3398, expected_sparsity: 0.3333, expected_sequence_sparsity: 0.7272, target_sparsity: 0.3301, step: 8500
+lambda_1: -0.2140, lambda_2: 30.4207 lambda_3: 0.0000
+train remain: [1. 1. 1. 0.96 0.95 0.78 0.86 0.46 0.35 0.41]
+infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.76, 0.82, 0.44, 0.34, 0.4]
+layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.7, 0.57, 0.25, 0.09, 0.03]
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111111111111111
+11111111111111111111111111111111111111011101011110
+11111111111111111111111111011111111100010000010010
+11111111011111111111111011111111010111011011110100
+11111111111011000111000001011001100000000000000100
+11111111111011000110101000000000000000000000000000
+11010000111101010101110101010001010000100100000000
+loss: 0.001058, lagrangian_loss: -0.000313, attention_score_distillation_loss: 0.001003
+ETA: 1:10:23 | Epoch 73 finished. Took 35.37 seconds.
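target_sparsity advances by roughly 0.0019 every 50 steps throughout this stretch (0.2427 at step 6250, 0.3301 at step 8500), i.e. a linear ramp from 0 toward the configured target_sparsity=0.67 over lagrangian_warmup_epochs=150. With MRPC's 3,668 training pairs at batch size 32 (about 115 optimizer steps per epoch, so about 17,250 warmup steps) the schedule reproduces the logged values; a sketch under those inferred assumptions:

    def scheduled_target_sparsity(step: int,
                                  final_sparsity: float = 0.67,
                                  warmup_steps: int = 150 * 115) -> float:
        # linear warmup of the sparsity target, capped at the final value
        return min(final_sparsity, final_sparsity * step / warmup_steps)

    print(round(scheduled_target_sparsity(6250), 4))
    # 0.2428 vs. the logged 0.2427 (difference is rounding of steps/epoch)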
+loss: 0.001282, lagrangian_loss: -0.000044, attention_score_distillation_loss: 0.001001 +---------------------------------------------------------------------- +time: 2023-07-19 15:05:29 +Evaluating: f1: 0.895, eval_loss: 0.6067, token_prune_loc: [False, False, False, False, True, True, True, True, True, True], macs_sparsity: 0.3398, expected_sparsity: 0.3333, expected_sequence_sparsity: 0.7272, target_sparsity: 0.332, step: 8550 +lambda_1: 0.0479, lambda_2: 30.5160 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.96 0.95 0.78 0.86 0.46 0.35 0.41] +infer remain: [1.0, 1.0, 1.0, 1.0, 0.92, 0.76, 0.82, 0.44, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 0.7, 0.57, 0.25, 0.09, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111011101011110 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011111111010111011011110100 +11111111111011000111000001011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.001398, lagrangian_loss: -0.000014, attention_score_distillation_loss: 0.000998 +loss: 0.001857, lagrangian_loss: 0.000097, attention_score_distillation_loss: 0.000995 +---------------------------------------------------------------------- +time: 2023-07-19 15:05:44 +Evaluating: f1: 0.8855, eval_loss: 0.6537, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3683, expected_sparsity: 0.3628, expected_sequence_sparsity: 0.7393, target_sparsity: 0.334, step: 8600 +lambda_1: -0.1310, lambda_2: 30.5849 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.95 0.95 0.78 0.86 0.46 0.35 0.41] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.92, 0.76, 0.82, 0.44, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.83, 0.63, 0.52, 0.23, 0.08, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111111110101100 +11111111111111111111111111111111111111011101011110 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011111111010111011011110100 +11111111111011000111000001011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.001748, lagrangian_loss: 0.000022, attention_score_distillation_loss: 0.000993 +ETA: 1:09:49 | Epoch 74 finished. Took 33.11 seconds. +loss: 0.009009, lagrangian_loss: -0.000154, attention_score_distillation_loss: 0.000990 +---------------------------------------------------------------------- +time: 2023-07-19 15:05:58 +Evaluating: f1: 0.897, eval_loss: 0.6269, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3698, expected_sparsity: 0.3641, expected_sequence_sparsity: 0.7398, target_sparsity: 0.3359, step: 8650 +lambda_1: -0.0617, lambda_2: 30.6211 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.94 0.95 0.78 0.86 0.46 0.36 0.41] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.92, 0.76, 0.82, 0.42, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.83, 0.63, 0.52, 0.22, 0.07, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111111110101100 +11111111111111111111111111111111111111011101111100 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011111111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.003335, lagrangian_loss: -0.000025, attention_score_distillation_loss: 0.000987 +loss: 0.003555, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.000984 +---------------------------------------------------------------------- +time: 2023-07-19 15:06:13 +Evaluating: f1: 0.8843, eval_loss: 0.5985, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3698, expected_sparsity: 0.3641, expected_sequence_sparsity: 0.7398, target_sparsity: 0.3378, step: 8700 +lambda_1: -0.0473, lambda_2: 30.6368 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.94 0.95 0.78 0.86 0.46 0.36 0.41] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.92, 0.76, 0.82, 0.42, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.83, 0.63, 0.52, 0.22, 0.07, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111111110101100 +11111111111111111111111111111111111111011101101110 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011111111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.002702, lagrangian_loss: 0.000083, attention_score_distillation_loss: 0.000981 +loss: 0.003429, lagrangian_loss: -0.000000, attention_score_distillation_loss: 0.000977 +ETA: 1:09:15 | Epoch 75 finished. Took 33.21 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:06:28 +Evaluating: f1: 0.8904, eval_loss: 0.5906, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.373, expected_sparsity: 0.3658, expected_sequence_sparsity: 0.7405, target_sparsity: 0.3398, step: 8750 +lambda_1: -0.1219, lambda_2: 30.6473 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.94 0.95 0.78 0.85 0.45 0.35 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.92, 0.76, 0.8, 0.42, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.83, 0.63, 0.5, 0.21, 0.07, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101110 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.001424, lagrangian_loss: 0.000006, attention_score_distillation_loss: 0.000975 +loss: 0.000993, lagrangian_loss: 0.000137, attention_score_distillation_loss: 0.000971 +---------------------------------------------------------------------- +time: 2023-07-19 15:06:43 +Evaluating: f1: 0.8818, eval_loss: 0.6758, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.373, expected_sparsity: 0.3658, expected_sequence_sparsity: 0.7405, target_sparsity: 0.3417, step: 8800 +lambda_1: -0.2182, lambda_2: 30.6676 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.93 0.95 0.77 0.85 0.45 0.35 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.92, 0.76, 0.8, 0.42, 0.34, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.83, 0.63, 0.5, 0.21, 0.07, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101110 +11111111111111111111111111011111111100010000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110101000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.113589, lagrangian_loss: -0.000035, attention_score_distillation_loss: 0.000970 +loss: 0.002136, lagrangian_loss: -0.000039, attention_score_distillation_loss: 0.000967 +---------------------------------------------------------------------- +time: 2023-07-19 15:06:57 +Evaluating: f1: 0.886, eval_loss: 0.7401, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3793, expected_sparsity: 0.3735, expected_sequence_sparsity: 0.7437, target_sparsity: 0.3437, step: 8850 +lambda_1: -0.2328, lambda_2: 30.6831 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.93 0.94 0.77 0.84 0.45 0.35 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.9, 0.74, 0.8, 0.42, 0.32, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.6, 0.48, 0.2, 0.06, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.003010, lagrangian_loss: -0.000343, attention_score_distillation_loss: 0.000964 +ETA: 1:08:44 | Epoch 76 finished. Took 35.46 seconds. 
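+Note how "target_sparsity" climbs by roughly 0.002 every 50 steps: that is exactly a linear ramp from 0 to the final 0.67 target over the 150 lagrangian warmup epochs configured for this run. A sketch of that schedule, assuming 115 optimizer steps per epoch (3,668 MRPC training pairs at batch size 32); the per-epoch step count is an inference, not something the log states:
+
+# Linear warmup of the sparsity target (final target 0.67, 150 warmup
+# epochs, ~115 steps/epoch assumed).
+def target_sparsity(step, final=0.67, warmup_steps=150 * 115):
+    return final * min(step / warmup_steps, 1.0)
+
+for step in (8500, 8850):
+    print(step, round(target_sparsity(step), 4))
+# 8500 0.3301, 8850 0.3437 -- matching the targets logged at those steps.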
+loss: 0.001699, lagrangian_loss: -0.000095, attention_score_distillation_loss: 0.000961 +---------------------------------------------------------------------- +time: 2023-07-19 15:07:12 +Evaluating: f1: 0.8819, eval_loss: 0.6201, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3793, expected_sparsity: 0.3735, expected_sequence_sparsity: 0.7437, target_sparsity: 0.3456, step: 8900 +lambda_1: 0.0290, lambda_2: 30.7607 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.93 0.94 0.77 0.85 0.45 0.35 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.9, 0.74, 0.8, 0.42, 0.32, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.6, 0.48, 0.2, 0.06, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.002833, lagrangian_loss: 0.000020, attention_score_distillation_loss: 0.000958 +loss: 0.001636, lagrangian_loss: 0.000046, attention_score_distillation_loss: 0.000956 +---------------------------------------------------------------------- +time: 2023-07-19 15:07:27 +Evaluating: f1: 0.8818, eval_loss: 0.6105, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3793, expected_sparsity: 0.3735, expected_sequence_sparsity: 0.7437, target_sparsity: 0.3475, step: 8950 +lambda_1: -0.1522, lambda_2: 30.8416 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.93 0.95 0.77 0.85 0.44 0.35 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.9, 0.74, 0.8, 0.42, 0.32, 0.4] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.6, 0.48, 0.2, 0.06, 0.03] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001010000100100000000 +loss: 0.001537, lagrangian_loss: 0.000503, attention_score_distillation_loss: 0.000953 +ETA: 1:08:10 | Epoch 77 finished. Took 32.96 seconds. +loss: 0.001551, lagrangian_loss: 0.000435, attention_score_distillation_loss: 0.000949 +---------------------------------------------------------------------- +time: 2023-07-19 15:07:41 +Evaluating: f1: 0.8939, eval_loss: 0.5979, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3793, expected_sparsity: 0.3737, expected_sequence_sparsity: 0.7438, target_sparsity: 0.3495, step: 9000 +lambda_1: -0.3666, lambda_2: 30.9319 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.92 0.94 0.76 0.85 0.44 0.34 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.9, 0.74, 0.8, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.6, 0.48, 0.2, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.007258, lagrangian_loss: -0.000671, attention_score_distillation_loss: 0.000947 +loss: 0.001253, lagrangian_loss: -0.000317, attention_score_distillation_loss: 0.000943 +---------------------------------------------------------------------- +time: 2023-07-19 15:07:56 +Evaluating: f1: 0.8933, eval_loss: 0.6784, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.384, expected_sparsity: 0.3793, expected_sequence_sparsity: 0.7461, target_sparsity: 0.3514, step: 9050 +lambda_1: -0.0297, lambda_2: 31.0627 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.92 0.94 0.76 0.84 0.44 0.34 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.74, 0.8, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.59, 0.47, 0.2, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001523, lagrangian_loss: 0.000062, attention_score_distillation_loss: 0.000940 +loss: 0.001718, lagrangian_loss: -0.000075, attention_score_distillation_loss: 0.000938 +ETA: 1:07:36 | Epoch 78 finished. Took 33.18 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:08:11 +Evaluating: f1: 0.8854, eval_loss: 0.6159, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3793, expected_sparsity: 0.3737, expected_sequence_sparsity: 0.7438, target_sparsity: 0.3534, step: 9100 +lambda_1: 0.0029, lambda_2: 31.1472 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.92 0.94 0.76 0.84 0.44 0.34 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.9, 0.9, 0.74, 0.8, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.81, 0.6, 0.48, 0.2, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110111100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001631, lagrangian_loss: 0.000210, attention_score_distillation_loss: 0.000936 +loss: 0.000889, lagrangian_loss: 0.000437, attention_score_distillation_loss: 0.000933 +---------------------------------------------------------------------- +time: 2023-07-19 15:08:25 +Evaluating: f1: 0.9003, eval_loss: 0.6012, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.384, expected_sparsity: 0.3793, expected_sequence_sparsity: 0.7461, target_sparsity: 0.3553, step: 9150 +lambda_1: -0.3512, lambda_2: 31.2920 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.92 0.94 0.75 0.84 0.43 0.34 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.74, 0.8, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.59, 0.47, 0.2, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111111011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001385, lagrangian_loss: -0.000063, attention_score_distillation_loss: 0.000930 +loss: 0.100318, lagrangian_loss: -0.000155, attention_score_distillation_loss: 0.000927 +---------------------------------------------------------------------- +time: 2023-07-19 15:08:40 +Evaluating: f1: 0.8955, eval_loss: 0.5899, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3872, expected_sparsity: 0.3809, expected_sequence_sparsity: 0.7467, target_sparsity: 0.3573, step: 9200 +lambda_1: -0.2719, lambda_2: 31.3169 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.92 0.94 0.75 0.83 0.43 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.74, 0.78, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.59, 0.46, 0.19, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111101011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +ETA: 1:07:04 | Epoch 79 finished. Took 35.09 seconds. 
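+The tiny, sign-flipping "lagrangian_loss" values, together with lambda_1 drifting around zero while lambda_2 grows slowly, are what a CoFi-style Lagrangian sparsity penalty looks like: lambda_1 multiplies the gap between expected and target sparsity, lambda_2 multiplies its square, and both multipliers are themselves trained adversarially, so lambda_1 changes sign as the gap does. A hedged sketch of that objective; the sign convention and details in the actual training code may differ:
+
+# Lagrangian penalty on the sparsity gap (linear + quadratic terms).
+# The lambdas are trained to maximize this quantity while the model
+# minimizes it, which keeps expected sparsity tracking the target.
+def lagrangian_penalty(expected_sparsity, target_sparsity, lambda_1, lambda_2):
+    gap = expected_sparsity - target_sparsity  # negative while under-pruned
+    return lambda_1 * gap + lambda_2 * gap ** 2
+
+# Illustrative values in the range logged above, not an exact replay:
+print(lagrangian_penalty(0.3887, 0.3922, lambda_1=-0.2719, lambda_2=31.3169))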
+loss: 0.001115, lagrangian_loss: -0.000307, attention_score_distillation_loss: 0.000924 +loss: 0.026647, lagrangian_loss: -0.000116, attention_score_distillation_loss: 0.000921 +---------------------------------------------------------------------- +time: 2023-07-19 15:08:54 +Evaluating: f1: 0.895, eval_loss: 0.6212, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3872, expected_sparsity: 0.3809, expected_sequence_sparsity: 0.7467, target_sparsity: 0.3592, step: 9250 +lambda_1: 0.0413, lambda_2: 31.4195 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.91 0.94 0.74 0.83 0.43 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.74, 0.78, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.59, 0.46, 0.19, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111101011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.002184, lagrangian_loss: 0.000073, attention_score_distillation_loss: 0.000919 +loss: 0.001202, lagrangian_loss: 0.000088, attention_score_distillation_loss: 0.000919 +---------------------------------------------------------------------- +time: 2023-07-19 15:09:09 +Evaluating: f1: 0.8716, eval_loss: 0.6154, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3872, expected_sparsity: 0.3809, expected_sequence_sparsity: 0.7467, target_sparsity: 0.3611, step: 9300 +lambda_1: -0.1893, lambda_2: 31.5500 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.92 0.94 0.74 0.83 0.43 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.74, 0.78, 0.42, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.59, 0.46, 0.19, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111111100000000010010 +11111111011111111111101011110111010111011011110100 +11111111111011000111000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.003703, lagrangian_loss: 0.000699, attention_score_distillation_loss: 0.000913 +ETA: 1:06:30 | Epoch 80 finished. Took 32.97 seconds. +loss: 0.003162, lagrangian_loss: 0.001055, attention_score_distillation_loss: 0.000909 +---------------------------------------------------------------------- +time: 2023-07-19 15:09:24 +Evaluating: f1: 0.8845, eval_loss: 0.5905, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3903, expected_sparsity: 0.3849, expected_sequence_sparsity: 0.7484, target_sparsity: 0.3631, step: 9350 +lambda_1: -0.5967, lambda_2: 31.7529 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.91 0.93 0.74 0.82 0.42 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.9, 0.72, 0.78, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.79, 0.57, 0.44, 0.18, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101101100 +11111111111111111111111111011111110100000000010010 +11111111011111111111111011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.004339, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.000907 +loss: 0.002785, lagrangian_loss: -0.000850, attention_score_distillation_loss: 0.000904 +---------------------------------------------------------------------- +time: 2023-07-19 15:09:38 +Evaluating: f1: 0.887, eval_loss: 0.6316, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3935, expected_sparsity: 0.3887, expected_sequence_sparsity: 0.7499, target_sparsity: 0.365, step: 9400 +lambda_1: -0.1971, lambda_2: 31.9887 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.9 0.93 0.73 0.82 0.42 0.33 0.39] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.88, 0.72, 0.78, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.56, 0.43, 0.17, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101001100 +11111111111111111111111111011111110100000000010010 +11111111011111111111111011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.000602, lagrangian_loss: -0.000300, attention_score_distillation_loss: 0.000900 +loss: 0.001700, lagrangian_loss: 0.000371, attention_score_distillation_loss: 0.000899 +ETA: 1:05:56 | Epoch 81 finished. Took 32.93 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:09:53 +Evaluating: f1: 0.8889, eval_loss: 0.5995, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3935, expected_sparsity: 0.3887, expected_sequence_sparsity: 0.7499, target_sparsity: 0.367, step: 9450 +lambda_1: 0.2575, lambda_2: 32.2649 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.91 0.93 0.73 0.82 0.42 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.88, 0.72, 0.78, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.56, 0.43, 0.17, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101001100 +11111111111111111111111111011111110100000000010010 +11111111011111111111111011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001647, lagrangian_loss: -0.000281, attention_score_distillation_loss: 0.000895 +loss: 0.002136, lagrangian_loss: -0.000017, attention_score_distillation_loss: 0.000892 +---------------------------------------------------------------------- +time: 2023-07-19 15:10:08 +Evaluating: f1: 0.885, eval_loss: 0.5922, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.3935, expected_sparsity: 0.3887, expected_sequence_sparsity: 0.7499, target_sparsity: 0.3689, step: 9500 +lambda_1: -0.1954, lambda_2: 32.5506 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.91 0.93 0.74 0.82 0.42 0.33 0.4 ] +infer remain: [1.0, 1.0, 1.0, 0.88, 0.88, 0.72, 0.78, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.88, 0.77, 0.56, 0.43, 0.17, 0.06, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111011111101110101100 +11111111111111111111111111111111111111011101001100 +11111111111111111111111111011111110100000000010010 +11111111011111111111111011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001962, lagrangian_loss: 0.000709, attention_score_distillation_loss: 0.000890 +loss: 0.001367, lagrangian_loss: 0.000270, attention_score_distillation_loss: 0.000887 +ETA: 1:05:22 | Epoch 82 finished. Took 33.15 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:10:22 +Evaluating: f1: 0.8837, eval_loss: 0.5921, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4013, expected_sparsity: 0.3956, expected_sequence_sparsity: 0.7527, target_sparsity: 0.3708, step: 9550 +lambda_1: -0.5372, lambda_2: 32.7514 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.9 0.93 0.73 0.81 0.42 0.33 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.72, 0.76, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.54, 0.41, 0.17, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011111110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.001354, lagrangian_loss: -0.000770, attention_score_distillation_loss: 0.000884 +loss: 0.001605, lagrangian_loss: -0.000805, attention_score_distillation_loss: 0.000882 +---------------------------------------------------------------------- +time: 2023-07-19 15:10:37 +Evaluating: f1: 0.878, eval_loss: 0.6855, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4013, expected_sparsity: 0.3957, expected_sequence_sparsity: 0.7528, target_sparsity: 0.3728, step: 9600 +lambda_1: -0.1074, lambda_2: 32.9952 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.89 0.92 0.72 0.8 0.41 0.32 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.72, 0.76, 0.4, 0.32, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.54, 0.41, 0.17, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011111110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.001776, lagrangian_loss: -0.000079, attention_score_distillation_loss: 0.000878 +loss: 0.001615, lagrangian_loss: 0.000074, attention_score_distillation_loss: 0.000876 +---------------------------------------------------------------------- +time: 2023-07-19 15:10:52 +Evaluating: f1: 0.8822, eval_loss: 0.624, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4013, expected_sparsity: 0.3956, expected_sequence_sparsity: 0.7527, target_sparsity: 0.3747, step: 9650 +lambda_1: 0.1692, lambda_2: 33.1444 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.9 0.92 0.73 0.81 0.42 0.33 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.72, 0.76, 0.4, 0.32, 0.38] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.54, 0.41, 0.17, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011111110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010001000000100100000000 +loss: 0.002328, lagrangian_loss: -0.000204, attention_score_distillation_loss: 0.000873 +ETA: 1:04:50 | Epoch 83 finished. Took 35.23 seconds. 
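+Each 50-character row of 0s and 1s under an evaluation is one prune location's token mask over the 50 sequence bins (the header reports token_loga shape [10, 50]), printed top to bottom for layers 2 through 11, and the per-layer "infer remain" entry is simply the fraction of 1s in its row. A quick check against two rows from the step 9750 evaluation above; the decoding here is illustrative:
+
+# Keep ratio per printed mask row (rows copied verbatim from step 9750:
+# the layer-5 row and the layer-11 row).
+rows = [
+    "11111111111111111111111111011111011111101110101100",
+    "11010000111101010101110101010000000000100100000000",
+]
+for row in rows:
+    print(len(row), row.count("1") / len(row))
+# -> 50 0.86 and 50 0.36, the matching "infer remain" entries; with 50 bins,
+# every logged keep ratio lands on a multiple of 1/50 = 0.02.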
+loss: 0.001011, lagrangian_loss: 0.000402, attention_score_distillation_loss: 0.000872 +---------------------------------------------------------------------- +time: 2023-07-19 15:11:06 +Evaluating: f1: 0.8846, eval_loss: 0.6382, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4013, expected_sparsity: 0.3957, expected_sequence_sparsity: 0.7528, target_sparsity: 0.3767, step: 9700 +lambda_1: -0.2526, lambda_2: 33.3621 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.9 0.92 0.72 0.8 0.41 0.33 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.72, 0.76, 0.4, 0.32, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.54, 0.41, 0.17, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011111110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.001388, lagrangian_loss: 0.000628, attention_score_distillation_loss: 0.000867 +loss: 0.001377, lagrangian_loss: 0.000143, attention_score_distillation_loss: 0.000863 +---------------------------------------------------------------------- +time: 2023-07-19 15:11:21 +Evaluating: f1: 0.8566, eval_loss: 0.6314, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4045, expected_sparsity: 0.3988, expected_sequence_sparsity: 0.7541, target_sparsity: 0.3786, step: 9750 +lambda_1: -0.4443, lambda_2: 33.4636 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.89 0.92 0.72 0.79 0.41 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.7, 0.76, 0.4, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.53, 0.4, 0.16, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.000685, lagrangian_loss: -0.000913, attention_score_distillation_loss: 0.000861 +ETA: 1:04:16 | Epoch 84 finished. Took 33.02 seconds. +loss: 0.000799, lagrangian_loss: -0.000288, attention_score_distillation_loss: 0.000858 +---------------------------------------------------------------------- +time: 2023-07-19 15:11:36 +Evaluating: f1: 0.884, eval_loss: 0.6665, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4045, expected_sparsity: 0.3988, expected_sequence_sparsity: 0.7541, target_sparsity: 0.3806, step: 9800 +lambda_1: 0.1219, lambda_2: 33.8516 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.88 0.92 0.72 0.79 0.41 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.7, 0.76, 0.4, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.53, 0.4, 0.16, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.001067, lagrangian_loss: 0.000373, attention_score_distillation_loss: 0.000855 +loss: 0.001140, lagrangian_loss: -0.000270, attention_score_distillation_loss: 0.000852 +---------------------------------------------------------------------- +time: 2023-07-19 15:11:50 +Evaluating: f1: 0.8787, eval_loss: 0.6405, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4045, expected_sparsity: 0.3984, expected_sequence_sparsity: 0.7539, target_sparsity: 0.3825, step: 9850 +lambda_1: 0.1524, lambda_2: 34.0114 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.89 0.92 0.72 0.8 0.41 0.33 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.7, 0.76, 0.4, 0.32, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.53, 0.4, 0.16, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111111011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.001417, lagrangian_loss: -0.000156, attention_score_distillation_loss: 0.000850 +loss: 0.002073, lagrangian_loss: 0.001148, attention_score_distillation_loss: 0.000847 +ETA: 1:03:42 | Epoch 85 finished. Took 33.22 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:12:05 +Evaluating: f1: 0.8934, eval_loss: 0.6452, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4045, expected_sparsity: 0.3988, expected_sequence_sparsity: 0.7541, target_sparsity: 0.3844, step: 9900 +lambda_1: -0.5018, lambda_2: 34.5168 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.88 0.91 0.71 0.79 0.41 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.7, 0.76, 0.4, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.53, 0.4, 0.16, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010111011010110100 +11111111111011000101000000011001100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110101010000000000100100000000 +loss: 0.001403, lagrangian_loss: 0.001232, attention_score_distillation_loss: 0.000844 +loss: 0.002045, lagrangian_loss: -0.000561, attention_score_distillation_loss: 0.000841 +---------------------------------------------------------------------- +time: 2023-07-19 15:12:20 +Evaluating: f1: 0.8981, eval_loss: 0.5781, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4092, expected_sparsity: 0.4047, expected_sequence_sparsity: 0.7565, target_sparsity: 0.3864, step: 9950 +lambda_1: -0.5082, lambda_2: 34.6281 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.88 0.91 0.7 0.77 0.4 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.86, 0.7, 0.74, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.52, 0.38, 0.15, 0.04, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001001100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010101011010110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.000531, lagrangian_loss: -0.001083, attention_score_distillation_loss: 0.000838 +loss: 0.001166, lagrangian_loss: -0.000396, attention_score_distillation_loss: 0.000836 +---------------------------------------------------------------------- +time: 2023-07-19 15:12:34 +Evaluating: f1: 0.8924, eval_loss: 0.6115, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4092, expected_sparsity: 0.4047, expected_sequence_sparsity: 0.7565, target_sparsity: 0.3883, step: 10000 +lambda_1: 0.0744, lambda_2: 35.0273 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.88 0.9 0.7 0.77 0.4 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.86, 0.7, 0.74, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.52, 0.38, 0.15, 0.04, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001001100 +11111111111111111111111111011101110100000000010010 +11111111011111111111111011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.000989, lagrangian_loss: 0.000306, attention_score_distillation_loss: 0.000833 +ETA: 1:03:11 | Epoch 86 finished. Took 35.3 seconds. 
+loss: 0.101004, lagrangian_loss: -0.000133, attention_score_distillation_loss: 0.000830 +---------------------------------------------------------------------- +time: 2023-07-19 15:12:49 +Evaluating: f1: 0.8904, eval_loss: 0.6501, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4045, expected_sparsity: 0.4002, expected_sequence_sparsity: 0.7546, target_sparsity: 0.3903, step: 10050 +lambda_1: 0.1706, lambda_2: 35.1572 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.88 0.91 0.71 0.78 0.4 0.32 0.38] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.88, 0.7, 0.74, 0.4, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.76, 0.53, 0.39, 0.16, 0.05, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001101100 +11111111111111111111111111011101110100000000010010 +11111111011111111111101011110111010101011010110100 +11111111111011000101000000011001100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.002272, lagrangian_loss: -0.000206, attention_score_distillation_loss: 0.000827 +loss: 0.001113, lagrangian_loss: 0.000609, attention_score_distillation_loss: 0.000824 +---------------------------------------------------------------------- +time: 2023-07-19 15:13:04 +Evaluating: f1: 0.8866, eval_loss: 0.6166, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4092, expected_sparsity: 0.4047, expected_sequence_sparsity: 0.7565, target_sparsity: 0.3922, step: 10100 +lambda_1: -0.3939, lambda_2: 35.5494 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.88 0.9 0.7 0.77 0.4 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.86, 0.7, 0.74, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.52, 0.38, 0.15, 0.04, 0.02] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111111011001001100 +11111111111111111111111111011101110100000000010010 +11111111011111111111111011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.001330, lagrangian_loss: 0.000743, attention_score_distillation_loss: 0.000821 +ETA: 1:02:37 | Epoch 87 finished. Took 33.29 seconds. +loss: 0.000873, lagrangian_loss: -0.000059, attention_score_distillation_loss: 0.000818 +---------------------------------------------------------------------- +time: 2023-07-19 15:13:19 +Evaluating: f1: 0.8877, eval_loss: 0.6482, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4149, expected_sparsity: 0.4086, expected_sequence_sparsity: 0.7581, target_sparsity: 0.3942, step: 10150 +lambda_1: -0.5402, lambda_2: 35.6560 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.87 0.9 0.69 0.76 0.39 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.86, 0.86, 0.68, 0.72, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.86, 0.74, 0.5, 0.36, 0.14, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110101100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000010010 +11111111011111111111101011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.001726, lagrangian_loss: -0.000718, attention_score_distillation_loss: 0.000815 +loss: 0.001420, lagrangian_loss: -0.000633, attention_score_distillation_loss: 0.000813 +---------------------------------------------------------------------- +time: 2023-07-19 15:13:33 +Evaluating: f1: 0.89, eval_loss: 0.5937, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4212, expected_sparsity: 0.4136, expected_sequence_sparsity: 0.7601, target_sparsity: 0.3961, step: 10200 +lambda_1: -0.0914, lambda_2: 35.9200 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.87 0.9 0.69 0.75 0.39 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.72, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.35, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111101110101100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.002917, lagrangian_loss: -0.000016, attention_score_distillation_loss: 0.000810 +loss: 0.001198, lagrangian_loss: 0.000121, attention_score_distillation_loss: 0.000806 +ETA: 1:02:03 | Epoch 88 finished. Took 33.13 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:13:48 +Evaluating: f1: 0.8732, eval_loss: 0.6378, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4212, expected_sparsity: 0.4136, expected_sequence_sparsity: 0.7601, target_sparsity: 0.398, step: 10250 +lambda_1: 0.1648, lambda_2: 36.0695 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.87 0.9 0.69 0.75 0.39 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.72, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.35, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111101110101100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.001004, lagrangian_loss: -0.000165, attention_score_distillation_loss: 0.000803 +loss: 0.002251, lagrangian_loss: 0.000266, attention_score_distillation_loss: 0.000801 +---------------------------------------------------------------------- +time: 2023-07-19 15:14:03 +Evaluating: f1: 0.8764, eval_loss: 0.6038, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4212, expected_sparsity: 0.4136, expected_sequence_sparsity: 0.7601, target_sparsity: 0.4, step: 10300 +lambda_1: -0.2512, lambda_2: 36.3107 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.87 0.9 0.69 0.75 0.39 0.32 0.37] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.72, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.35, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111101110101100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101011110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.041986, lagrangian_loss: 0.000612, attention_score_distillation_loss: 0.000798 +loss: 0.001651, lagrangian_loss: 0.000348, attention_score_distillation_loss: 0.000795 +---------------------------------------------------------------------- +time: 2023-07-19 15:14:17 +Evaluating: f1: 0.8859, eval_loss: 0.6629, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4228, expected_sparsity: 0.4149, expected_sequence_sparsity: 0.7607, target_sparsity: 0.4019, step: 10350 +lambda_1: -0.5760, lambda_2: 36.4842 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.86 0.89 0.68 0.73 0.38 0.31 0.36] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.7, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.34, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101010110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +ETA: 1:01:32 | Epoch 89 finished. Took 35.46 seconds. 
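+The ETA printed at each epoch boundary behaves like (epochs still to run) x (recent seconds per epoch), with 200 training epochs configured in total. A rough reconstruction; the trainer's actual averaging is not visible in the log, so treat this purely as an approximation:
+
+# Remaining-time estimate: epochs left x seconds/epoch. Epoch numbering in
+# the log is zero-based, so "Epoch 89 finished" means 90 epochs are done.
+def eta(epochs_done, secs_per_epoch, total_epochs=200):
+    left = (total_epochs - epochs_done) * secs_per_epoch
+    h, rem = divmod(int(left), 3600)
+    m, s = divmod(rem, 60)
+    return f"{h}:{m:02d}:{s:02d}"
+
+print(eta(90, 33.6))  # -> 1:01:36, close to the "ETA: 1:01:32" logged above.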
+loss: 0.001448, lagrangian_loss: -0.000549, attention_score_distillation_loss: 0.000792 +loss: 0.000836, lagrangian_loss: -0.000915, attention_score_distillation_loss: 0.000789 +---------------------------------------------------------------------- +time: 2023-07-19 15:14:32 +Evaluating: f1: 0.8801, eval_loss: 0.6773, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4243, expected_sparsity: 0.4182, expected_sequence_sparsity: 0.762, target_sparsity: 0.4039, step: 10400 +lambda_1: -0.1405, lambda_2: 36.7887 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.86 0.89 0.68 0.72 0.38 0.31 0.36] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.66, 0.7, 0.36, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.48, 0.33, 0.12, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111101010110111010101011000110100 +11111111111010000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.002441, lagrangian_loss: -0.000085, attention_score_distillation_loss: 0.000786 +loss: 0.002913, lagrangian_loss: 0.000192, attention_score_distillation_loss: 0.000783 +---------------------------------------------------------------------- +time: 2023-07-19 15:14:47 +Evaluating: f1: 0.8689, eval_loss: 0.6501, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4228, expected_sparsity: 0.4149, expected_sequence_sparsity: 0.7607, target_sparsity: 0.4058, step: 10450 +lambda_1: 0.2414, lambda_2: 37.0494 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.86 0.89 0.68 0.73 0.38 0.31 0.36] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.7, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.34, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101010110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.003924, lagrangian_loss: -0.000264, attention_score_distillation_loss: 0.000781 +ETA: 1:00:58 | Epoch 90 finished. Took 33.18 seconds. +loss: 0.000706, lagrangian_loss: 0.000021, attention_score_distillation_loss: 0.000777 +---------------------------------------------------------------------- +time: 2023-07-19 15:15:02 +Evaluating: f1: 0.8764, eval_loss: 0.7872, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4228, expected_sparsity: 0.4149, expected_sequence_sparsity: 0.7607, target_sparsity: 0.4077, step: 10500 +lambda_1: -0.2156, lambda_2: 37.3795 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.86 0.89 0.68 0.73 0.38 0.31 0.36] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.68, 0.7, 0.38, 0.3, 0.36] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.49, 0.34, 0.13, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110100000000000010 +11111111011111111111101010110111010101011000110100 +11111111111011000101000000011000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110110010000000000100100000000 +loss: 0.001868, lagrangian_loss: 0.000943, attention_score_distillation_loss: 0.000775 +loss: 0.001224, lagrangian_loss: 0.000619, attention_score_distillation_loss: 0.000771 +---------------------------------------------------------------------- +time: 2023-07-19 15:15:16 +Evaluating: f1: 0.8756, eval_loss: 0.7088, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4259, expected_sparsity: 0.4194, expected_sequence_sparsity: 0.7625, target_sparsity: 0.4097, step: 10550 +lambda_1: -0.5559, lambda_2: 37.5780 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.85 0.88 0.67 0.71 0.38 0.31 0.36] +infer remain: [1.0, 1.0, 1.0, 0.84, 0.86, 0.66, 0.68, 0.36, 0.3, 0.34] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.84, 0.72, 0.48, 0.32, 0.12, 0.04, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111101110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111111010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110100010000000000100100000000 +loss: 0.001128, lagrangian_loss: 0.000117, attention_score_distillation_loss: 0.000769 +loss: 0.002719, lagrangian_loss: -0.000931, attention_score_distillation_loss: 0.000766 +ETA: 1:00:24 | Epoch 91 finished. Took 33.18 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:15:31 +Evaluating: f1: 0.8789, eval_loss: 0.6869, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4322, expected_sparsity: 0.4244, expected_sequence_sparsity: 0.7645, target_sparsity: 0.4116, step: 10600 +lambda_1: -0.2976, lambda_2: 37.7154 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.84 0.88 0.66 0.7 0.37 0.31 0.35] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.66, 0.68, 0.36, 0.3, 0.34] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.47, 0.32, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111111010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110100010000000000100100000000 +loss: 0.001108, lagrangian_loss: -0.000399, attention_score_distillation_loss: 0.000763 +loss: 0.003640, lagrangian_loss: 0.000049, attention_score_distillation_loss: 0.000760 +---------------------------------------------------------------------- +time: 2023-07-19 15:15:46 +Evaluating: f1: 0.8697, eval_loss: 0.7075, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4322, expected_sparsity: 0.4244, expected_sequence_sparsity: 0.7645, target_sparsity: 0.4136, step: 10650 +lambda_1: 0.1128, lambda_2: 37.9407 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.84 0.88 0.66 0.71 0.37 0.31 0.35] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.66, 0.68, 0.36, 0.3, 0.34] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.47, 0.32, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111111010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110100010000000000100100000000 +loss: 0.001528, lagrangian_loss: -0.000006, attention_score_distillation_loss: 0.000757 +loss: 0.000865, lagrangian_loss: -0.000078, attention_score_distillation_loss: 0.000754 +ETA: 0:59:50 | Epoch 92 finished. Took 33.08 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:16:00 +Evaluating: f1: 0.8722, eval_loss: 0.6903, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4322, expected_sparsity: 0.4244, expected_sequence_sparsity: 0.7645, target_sparsity: 0.4155, step: 10700 +lambda_1: -0.0601, lambda_2: 38.0436 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.84 0.88 0.66 0.71 0.38 0.31 0.35] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.66, 0.68, 0.36, 0.3, 0.34] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.47, 0.32, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111111010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110100010000000000100100000000 +loss: 0.001398, lagrangian_loss: 0.000257, attention_score_distillation_loss: 0.000752 +loss: 0.001177, lagrangian_loss: 0.000829, attention_score_distillation_loss: 0.000749 +---------------------------------------------------------------------- +time: 2023-07-19 15:16:15 +Evaluating: f1: 0.8744, eval_loss: 0.6773, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4322, expected_sparsity: 0.4244, expected_sequence_sparsity: 0.7645, target_sparsity: 0.4175, step: 10750 +lambda_1: -0.4647, lambda_2: 38.2545 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.84 0.88 0.66 0.71 0.37 0.31 0.34] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.66, 0.68, 0.36, 0.3, 0.34] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.47, 0.32, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111011101110000000000000010 +11111111011111111111111010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110100010000000000100100000000 +loss: 0.002155, lagrangian_loss: 0.000301, attention_score_distillation_loss: 0.000746 +loss: 0.136131, lagrangian_loss: -0.000324, attention_score_distillation_loss: 0.000743 +---------------------------------------------------------------------- +time: 2023-07-19 15:16:30 +Evaluating: f1: 0.8908, eval_loss: 0.6563, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4353, expected_sparsity: 0.4279, expected_sequence_sparsity: 0.766, target_sparsity: 0.4194, step: 10800 +lambda_1: -0.4953, lambda_2: 38.3168 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.83 0.88 0.65 0.69 0.37 0.31 0.33] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.64, 0.66, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.45, 0.3, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110100010000000000100100000000 +loss: 0.001122, lagrangian_loss: -0.000693, attention_score_distillation_loss: 0.000740 +ETA: 0:59:18 | Epoch 93 finished. Took 35.43 seconds. 
+loss: 0.001777, lagrangian_loss: -0.000357, attention_score_distillation_loss: 0.000738 +---------------------------------------------------------------------- +time: 2023-07-19 15:16:45 +Evaluating: f1: 0.8628, eval_loss: 0.6591, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4353, expected_sparsity: 0.4279, expected_sequence_sparsity: 0.766, target_sparsity: 0.4213, step: 10850 +lambda_1: -0.0697, lambda_2: 38.5546 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.83 0.87 0.65 0.69 0.37 0.31 0.33] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.64, 0.66, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.45, 0.3, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110100010000000000100100000000 +loss: 0.006645, lagrangian_loss: -0.000031, attention_score_distillation_loss: 0.000734 +loss: 0.002023, lagrangian_loss: 0.000023, attention_score_distillation_loss: 0.000732 +---------------------------------------------------------------------- +time: 2023-07-19 15:16:59 +Evaluating: f1: 0.8793, eval_loss: 0.6571, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4353, expected_sparsity: 0.4279, expected_sequence_sparsity: 0.766, target_sparsity: 0.4233, step: 10900 +lambda_1: 0.1118, lambda_2: 38.6574 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.83 0.87 0.65 0.69 0.37 0.31 0.33] +infer remain: [1.0, 1.0, 1.0, 0.82, 0.86, 0.64, 0.66, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.82, 0.71, 0.45, 0.3, 0.11, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110100010000000000100100000000 +loss: 0.001467, lagrangian_loss: -0.000080, attention_score_distillation_loss: 0.000729 +ETA: 0:58:44 | Epoch 94 finished. Took 33.29 seconds. +loss: 0.000673, lagrangian_loss: 0.000275, attention_score_distillation_loss: 0.000727 +---------------------------------------------------------------------- +time: 2023-07-19 15:17:14 +Evaluating: f1: 0.8789, eval_loss: 0.6823, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4401, expected_sparsity: 0.4328, expected_sequence_sparsity: 0.768, target_sparsity: 0.4252, step: 10950 +lambda_1: -0.3104, lambda_2: 38.9081 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.83 0.87 0.65 0.69 0.37 0.31 0.33] +infer remain: [1.0, 1.0, 1.0, 0.8, 0.86, 0.64, 0.66, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.69, 0.44, 0.29, 0.1, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001110100100 +11111111111111111111111111111111111110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110100010000000000100100000000 +loss: 0.001499, lagrangian_loss: 0.000695, attention_score_distillation_loss: 0.000723 +loss: 0.001892, lagrangian_loss: 0.000085, attention_score_distillation_loss: 0.000720 +---------------------------------------------------------------------- +time: 2023-07-19 15:17:29 +Evaluating: f1: 0.8831, eval_loss: 0.6254, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4432, expected_sparsity: 0.4358, expected_sequence_sparsity: 0.7692, target_sparsity: 0.4272, step: 11000 +lambda_1: -0.5258, lambda_2: 39.0194 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.82 0.87 0.64 0.68 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.8, 0.84, 0.64, 0.66, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.43, 0.28, 0.1, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001100100100 +11111111111111111111111111111111110110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110111010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110100010000000000100100000000 +loss: 0.003420, lagrangian_loss: -0.000464, attention_score_distillation_loss: 0.000718 +loss: 0.001126, lagrangian_loss: -0.000687, attention_score_distillation_loss: 0.000714 +ETA: 0:58:10 | Epoch 95 finished. Took 33.18 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:17:43 +Evaluating: f1: 0.8657, eval_loss: 0.6608, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4448, expected_sparsity: 0.439, expected_sequence_sparsity: 0.7705, target_sparsity: 0.4291, step: 11050 +lambda_1: -0.1267, lambda_2: 39.2533 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.81 0.86 0.64 0.68 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.8, 0.84, 0.62, 0.64, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.42, 0.27, 0.1, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001100100100 +11111111111111111111111111111111110110011001101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110000010000000000100100000000 +loss: 0.004817, lagrangian_loss: -0.000101, attention_score_distillation_loss: 0.000712 +loss: 0.000982, lagrangian_loss: 0.000050, attention_score_distillation_loss: 0.000709 +---------------------------------------------------------------------- +time: 2023-07-19 15:17:58 +Evaluating: f1: 0.8836, eval_loss: 0.6738, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4432, expected_sparsity: 0.4368, expected_sequence_sparsity: 0.7696, target_sparsity: 0.4311, step: 11100 +lambda_1: 0.1305, lambda_2: 39.4028 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.81 0.87 0.64 0.68 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.8, 0.84, 0.64, 0.64, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.43, 0.28, 0.1, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001100100100 +11111111111111111111111111111111110110011001101100 +11111111111111111111111111010101110000000000000010 +11111111011111111111101010110110010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110000010000000000100100000000 +loss: 0.000941, lagrangian_loss: -0.000067, attention_score_distillation_loss: 0.000706 +loss: 0.001463, lagrangian_loss: 0.000220, attention_score_distillation_loss: 0.000706 +---------------------------------------------------------------------- +time: 2023-07-19 15:18:13 +Evaluating: f1: 0.8816, eval_loss: 0.6431, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4448, expected_sparsity: 0.439, expected_sequence_sparsity: 0.7705, target_sparsity: 0.433, step: 11150 +lambda_1: -0.2664, lambda_2: 39.6299 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.81 0.86 0.64 0.68 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.8, 0.84, 0.62, 0.64, 0.36, 0.3, 0.32] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.67, 0.42, 0.27, 0.1, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011111011111001100100100 +11111111111111111111111111111111110110011001101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111011000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010101110000010000000000100100000000 +loss: 0.000705, lagrangian_loss: 0.000238, attention_score_distillation_loss: 0.000700 +ETA: 0:57:39 | Epoch 96 finished. Took 35.46 seconds. 
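+The lagrangian_loss column oscillating around zero while lambda_1 drifts and lambda_2 grows is the behavior a CoFi-style Lagrangian sparsity constraint produces once expected_sparsity tracks target_sparsity. The exact form below is an assumption based on that family of L0-regularization pruners, not something the log itself confirms:
+
+    # Assumed penalty: L_lag = lambda_1 * (s - t) + lambda_2 * (s - t)^2, with
+    # s the expected sparsity of the current gates and t the scheduled target.
+    # lambda_1 and lambda_2 are updated adversarially (maximized), so the sign
+    # of the gap flips the sign of the penalty.
+    def lagrangian_penalty(s, t, lambda_1, lambda_2):
+        gap = s - t
+        return lambda_1 * gap + lambda_2 * gap * gap
+
+    # Illustrative values from the step-11100 block; the logged loss uses the
+    # per-batch expected sparsity, so this does not reproduce it exactly.
+    print(lagrangian_penalty(0.4368, 0.4311, 0.1305, 39.4028))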
+loss: 0.022674, lagrangian_loss: 0.000781, attention_score_distillation_loss: 0.000701 +---------------------------------------------------------------------- +time: 2023-07-19 15:18:28 +Evaluating: f1: 0.8752, eval_loss: 0.6706, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4495, expected_sparsity: 0.4444, expected_sequence_sparsity: 0.7727, target_sparsity: 0.4349, step: 11200 +lambda_1: -0.5323, lambda_2: 39.7671 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.8 0.86 0.63 0.67 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.84, 0.62, 0.64, 0.34, 0.3, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.66, 0.41, 0.26, 0.09, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111110110011001101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.000801, lagrangian_loss: -0.000451, attention_score_distillation_loss: 0.000696 +loss: 0.000891, lagrangian_loss: -0.000658, attention_score_distillation_loss: 0.000692 +---------------------------------------------------------------------- +time: 2023-07-19 15:18:42 +Evaluating: f1: 0.8673, eval_loss: 0.7001, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4495, expected_sparsity: 0.4446, expected_sequence_sparsity: 0.7728, target_sparsity: 0.4369, step: 11250 +lambda_1: -0.1866, lambda_2: 39.9660 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.8 0.85 0.62 0.66 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.84, 0.62, 0.64, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.66, 0.41, 0.26, 0.09, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111111110011000101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001639, lagrangian_loss: -0.000217, attention_score_distillation_loss: 0.000689 +ETA: 0:57:05 | Epoch 97 finished. Took 33.13 seconds. +loss: 0.001900, lagrangian_loss: 0.000100, attention_score_distillation_loss: 0.000686 +---------------------------------------------------------------------- +time: 2023-07-19 15:18:57 +Evaluating: f1: 0.8737, eval_loss: 0.6685, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4495, expected_sparsity: 0.4444, expected_sequence_sparsity: 0.7727, target_sparsity: 0.4388, step: 11300 +lambda_1: 0.1611, lambda_2: 40.1671 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.8 0.85 0.63 0.66 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.84, 0.62, 0.64, 0.34, 0.3, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.66, 0.41, 0.26, 0.09, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111111110011000101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001714, lagrangian_loss: -0.000072, attention_score_distillation_loss: 0.000683 +loss: 0.001359, lagrangian_loss: 0.000100, attention_score_distillation_loss: 0.000681 +---------------------------------------------------------------------- +time: 2023-07-19 15:19:12 +Evaluating: f1: 0.8859, eval_loss: 0.6657, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4495, expected_sparsity: 0.4444, expected_sequence_sparsity: 0.7727, target_sparsity: 0.4408, step: 11350 +lambda_1: -0.2237, lambda_2: 40.4317 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.8 0.85 0.63 0.66 0.36 0.3 0.32] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.84, 0.62, 0.64, 0.34, 0.3, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.66, 0.41, 0.26, 0.09, 0.03, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111111110011000101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110110010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111101011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001111, lagrangian_loss: 0.000764, attention_score_distillation_loss: 0.000678 +loss: 0.001153, lagrangian_loss: 0.000204, attention_score_distillation_loss: 0.000675 +ETA: 0:56:31 | Epoch 98 finished. Took 33.02 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:19:26 +Evaluating: f1: 0.8679, eval_loss: 0.6468, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4526, expected_sparsity: 0.4484, expected_sequence_sparsity: 0.7744, target_sparsity: 0.4427, step: 11400 +lambda_1: -0.5786, lambda_2: 40.6513 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.79 0.84 0.62 0.66 0.35 0.3 0.31] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.82, 0.62, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.64, 0.4, 0.25, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111110110011000101100 +11111111111111111111111111010101110000000000000000 +11111111011111111111101010110100010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001013, lagrangian_loss: -0.000463, attention_score_distillation_loss: 0.000672 +loss: 0.001756, lagrangian_loss: -0.000699, attention_score_distillation_loss: 0.000669 +---------------------------------------------------------------------- +time: 2023-07-19 15:19:41 +Evaluating: f1: 0.8675, eval_loss: 0.7256, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4605, expected_sparsity: 0.4549, expected_sequence_sparsity: 0.777, target_sparsity: 0.4446, step: 11450 +lambda_1: -0.3147, lambda_2: 40.8000 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.78 0.83 0.62 0.65 0.35 0.29 0.31] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.82, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.62, 0.37, 0.23, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111111110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110110010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001003, lagrangian_loss: -0.000469, attention_score_distillation_loss: 0.000667 +loss: 0.000877, lagrangian_loss: 0.000025, attention_score_distillation_loss: 0.000663 +---------------------------------------------------------------------- +time: 2023-07-19 15:19:56 +Evaluating: f1: 0.8816, eval_loss: 0.6792, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4557, expected_sparsity: 0.4503, expected_sequence_sparsity: 0.7752, target_sparsity: 0.4466, step: 11500 +lambda_1: 0.1557, lambda_2: 41.1232 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.78 0.83 0.62 0.65 0.35 0.3 0.31] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.82, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.64, 0.38, 0.24, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111111110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110100010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +ETA: 0:55:59 | Epoch 99 finished. Took 35.33 seconds. 
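+Each evaluation block also dumps ten 50-character bitmaps, apparently one per prune location in order (token_loga has shape [10, 50] with bin_num=50). A layer's "infer remain" matches the fraction of 1-bits in its row, so every inference-time value lands on the 1/50 grid:
+
+    # Fourth bitmap from the step-11500 block (prune location 5):
+    row = "11111111111111111111111111011011011111001100100100"
+    assert len(row) == 50                 # bin_num = 50
+    print(row.count("1") / len(row))      # 0.78, the layer's "infer remain"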
+loss: 0.001461, lagrangian_loss: 0.000073, attention_score_distillation_loss: 0.000660 +loss: 0.001411, lagrangian_loss: -0.000033, attention_score_distillation_loss: 0.000657 +---------------------------------------------------------------------- +time: 2023-07-19 15:20:11 +Evaluating: f1: 0.8789, eval_loss: 0.6592, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4557, expected_sparsity: 0.4503, expected_sequence_sparsity: 0.7752, target_sparsity: 0.4485, step: 11550 +lambda_1: -0.1426, lambda_2: 41.3372 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.78 0.83 0.62 0.65 0.35 0.3 0.31] +infer remain: [1.0, 1.0, 1.0, 0.78, 0.82, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.78, 0.64, 0.38, 0.24, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111001100100100 +11111111111111111111111111111111111110011000100100 +11111111111111111111110111010101110000000000000000 +11111111011111111111101010110100010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001371, lagrangian_loss: 0.000497, attention_score_distillation_loss: 0.000654 +loss: 0.100418, lagrangian_loss: 0.000666, attention_score_distillation_loss: 0.000652 +---------------------------------------------------------------------- +time: 2023-07-19 15:20:25 +Evaluating: f1: 0.8741, eval_loss: 0.7338, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4636, expected_sparsity: 0.4575, expected_sequence_sparsity: 0.7781, target_sparsity: 0.4505, step: 11600 +lambda_1: -0.5613, lambda_2: 41.5886 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.78 0.82 0.61 0.64 0.35 0.29 0.31] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.8, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.61, 0.36, 0.23, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111101110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110110010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001115, lagrangian_loss: 0.000216, attention_score_distillation_loss: 0.000649 +ETA: 0:55:25 | Epoch 100 finished. Took 33.17 seconds. +loss: 0.001358, lagrangian_loss: -0.000629, attention_score_distillation_loss: 0.000646 +---------------------------------------------------------------------- +time: 2023-07-19 15:20:40 +Evaluating: f1: 0.8666, eval_loss: 0.7149, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4636, expected_sparsity: 0.4575, expected_sequence_sparsity: 0.7781, target_sparsity: 0.4524, step: 11650 +lambda_1: -0.3741, lambda_2: 41.7162 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.77 0.81 0.61 0.64 0.34 0.29 0.31] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.8, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.61, 0.36, 0.23, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111101110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110110010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001786, lagrangian_loss: -0.000686, attention_score_distillation_loss: 0.000644 +loss: 0.001322, lagrangian_loss: -0.000060, attention_score_distillation_loss: 0.000643 +---------------------------------------------------------------------- +time: 2023-07-19 15:20:54 +Evaluating: f1: 0.8702, eval_loss: 0.6964, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4636, expected_sparsity: 0.4575, expected_sequence_sparsity: 0.7781, target_sparsity: 0.4544, step: 11700 +lambda_1: 0.0647, lambda_2: 41.9948 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.77 0.81 0.61 0.63 0.34 0.29 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.8, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.61, 0.36, 0.23, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111101110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110100010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.000887, lagrangian_loss: 0.000150, attention_score_distillation_loss: 0.000638 +loss: 0.003402, lagrangian_loss: -0.000064, attention_score_distillation_loss: 0.000634 +ETA: 0:54:51 | Epoch 101 finished. Took 32.92 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:21:09 +Evaluating: f1: 0.8778, eval_loss: 0.6802, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4636, expected_sparsity: 0.4575, expected_sequence_sparsity: 0.7781, target_sparsity: 0.4563, step: 11750 +lambda_1: -0.0182, lambda_2: 42.0979 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.77 0.81 0.61 0.64 0.35 0.29 0.31] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.8, 0.6, 0.62, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.61, 0.36, 0.23, 0.08, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111101110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110100010001011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001729, lagrangian_loss: 0.000452, attention_score_distillation_loss: 0.000632 +loss: 0.001485, lagrangian_loss: 0.000588, attention_score_distillation_loss: 0.000629 +---------------------------------------------------------------------- +time: 2023-07-19 15:21:24 +Evaluating: f1: 0.8697, eval_loss: 0.6826, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4652, expected_sparsity: 0.4584, expected_sequence_sparsity: 0.7785, target_sparsity: 0.4582, step: 11800 +lambda_1: -0.5298, lambda_2: 42.4544 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.77 0.81 0.6 0.63 0.34 0.29 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.76, 0.8, 0.6, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.61, 0.36, 0.22, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000100100100 +11111111111111111111111111111111101110011000100100 +11111111111111111111111111010101010000000000000000 +11111111011111111111101010110100010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.000897, lagrangian_loss: 0.000023, attention_score_distillation_loss: 0.000626 +loss: 0.001528, lagrangian_loss: -0.000732, attention_score_distillation_loss: 0.000623 +ETA: 0:54:17 | Epoch 102 finished. Took 33.25 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:21:38 +Evaluating: f1: 0.8648, eval_loss: 0.6836, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4715, expected_sparsity: 0.4672, expected_sequence_sparsity: 0.7821, target_sparsity: 0.4602, step: 11850 +lambda_1: -0.4196, lambda_2: 42.5987 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.75 0.8 0.6 0.62 0.34 0.29 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.74, 0.78, 0.58, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.58, 0.33, 0.2, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000100100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111110111010101010000000000000000 +11111111011111111111101010110100010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.023355, lagrangian_loss: -0.000330, attention_score_distillation_loss: 0.000620 +loss: 0.003435, lagrangian_loss: 0.000057, attention_score_distillation_loss: 0.000617 +---------------------------------------------------------------------- +time: 2023-07-19 15:21:53 +Evaluating: f1: 0.8688, eval_loss: 0.6765, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4715, expected_sparsity: 0.4672, expected_sequence_sparsity: 0.7821, target_sparsity: 0.4621, step: 11900 +lambda_1: 0.1809, lambda_2: 43.0957 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.75 0.8 0.6 0.62 0.34 0.29 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.74, 0.78, 0.58, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.58, 0.33, 0.2, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000100100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111110111010101010000000000000000 +11111111011111111111101010110100010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.002518, lagrangian_loss: 0.000055, attention_score_distillation_loss: 0.000614 +loss: 0.007086, lagrangian_loss: -0.000189, attention_score_distillation_loss: 0.000612 +---------------------------------------------------------------------- +time: 2023-07-19 15:22:08 +Evaluating: f1: 0.8878, eval_loss: 0.7009, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4715, expected_sparsity: 0.4672, expected_sequence_sparsity: 0.7821, target_sparsity: 0.4641, step: 11950 +lambda_1: -0.0647, lambda_2: 43.3523 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.75 0.8 0.6 0.62 0.34 0.29 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.74, 0.78, 0.58, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.58, 0.33, 0.2, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000000100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111110111010101010000000000000000 +11111111011111111111101010110100010000011000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001352, lagrangian_loss: 0.000418, attention_score_distillation_loss: 0.000609 +ETA: 0:53:45 | Epoch 103 finished. Took 35.39 seconds. 
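+target_sparsity in these lines climbs by roughly 0.002 per 50-step evaluation, which matches a linear ramp to the final target_sparsity=0.67 over lagrangian_warmup_epochs=150. A hedged reconstruction, assuming ceil(3668 / 32) = 115 optimizer steps per MRPC epoch:
+
+    steps_per_epoch = -(-3668 // 32)         # 115 (3,668 MRPC train pairs, batch 32)
+    warmup_steps = 150 * steps_per_epoch     # lagrangian_warmup_epochs = 150
+
+    def target_sparsity(step, final=0.67):
+        return min(final, final * step / warmup_steps)
+
+    print(round(target_sparsity(11900), 4))  # ~0.4622, vs. 0.4621 logged at step 11900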
+loss: 0.001552, lagrangian_loss: 0.000470, attention_score_distillation_loss: 0.000606 +---------------------------------------------------------------------- +time: 2023-07-19 15:22:23 +Evaluating: f1: 0.8705, eval_loss: 0.6578, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4715, expected_sparsity: 0.4672, expected_sequence_sparsity: 0.7821, target_sparsity: 0.466, step: 12000 +lambda_1: -0.5926, lambda_2: 43.7621 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.74 0.79 0.6 0.62 0.34 0.28 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.74, 0.78, 0.58, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.74, 0.58, 0.33, 0.2, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011111000000100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111110111010101010000000000000000 +11111111011111111111101010110110010000010000100100 +11111111111010000101000000010000100000000000000100 +11111111100011000110001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.005579, lagrangian_loss: 0.000882, attention_score_distillation_loss: 0.000605 +loss: 0.001504, lagrangian_loss: -0.000470, attention_score_distillation_loss: 0.000600 +---------------------------------------------------------------------- +time: 2023-07-19 15:22:37 +Evaluating: f1: 0.8804, eval_loss: 0.6577, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4746, expected_sparsity: 0.4715, expected_sequence_sparsity: 0.7839, target_sparsity: 0.468, step: 12050 +lambda_1: -0.5561, lambda_2: 43.8503 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.73 0.79 0.59 0.61 0.34 0.28 0.3 ] +infer remain: [1.0, 1.0, 1.0, 0.72, 0.78, 0.58, 0.6, 0.34, 0.28, 0.3] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.56, 0.33, 0.2, 0.07, 0.02, 0.01] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000000100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111110111010101010000000000000000 +11111111011111111111101010110110010000010000100100 +11111111111010000101000000010000100000000000000100 +11111111101011000010001000000000000000000000000000 +11010000111101010001110000010000000000100100000000 +loss: 0.001853, lagrangian_loss: -0.000464, attention_score_distillation_loss: 0.000598 +ETA: 0:53:11 | Epoch 104 finished. Took 32.89 seconds. +loss: 0.001653, lagrangian_loss: -0.000440, attention_score_distillation_loss: 0.000594 +---------------------------------------------------------------------- +time: 2023-07-19 15:22:52 +Evaluating: f1: 0.8821, eval_loss: 0.682, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4762, expected_sparsity: 0.4728, expected_sequence_sparsity: 0.7844, target_sparsity: 0.4699, step: 12100 +lambda_1: -0.2189, lambda_2: 44.0224 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.73 0.79 0.59 0.61 0.34 0.28 0.29] +infer remain: [1.0, 1.0, 1.0, 0.72, 0.78, 0.58, 0.58, 0.32, 0.28, 0.28] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.56, 0.33, 0.19, 0.06, 0.02, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000000100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111111111010101000000000000000000 +11111111011111111111101010110100010000010000100100 +11111111111010000101000000000000100000000000000100 +11111111101011000010001000000000000000000000000000 +11010000111100010001110000010000000000100100000000 +loss: 0.003494, lagrangian_loss: -0.000232, attention_score_distillation_loss: 0.000592 +loss: 0.000987, lagrangian_loss: 0.000027, attention_score_distillation_loss: 0.000589 +---------------------------------------------------------------------- +time: 2023-07-19 15:23:07 +Evaluating: f1: 0.8679, eval_loss: 0.659, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4762, expected_sparsity: 0.4728, expected_sequence_sparsity: 0.7844, target_sparsity: 0.4718, step: 12150 +lambda_1: -0.1724, lambda_2: 44.0751 lambda_3: 0.0000 +train remain: [1. 1. 1. 0.73 0.78 0.58 0.61 0.34 0.28 0.29] +infer remain: [1.0, 1.0, 1.0, 0.72, 0.78, 0.58, 0.58, 0.32, 0.28, 0.28] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.56, 0.33, 0.19, 0.06, 0.02, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000000100100 +11111111111111111111111111111111100110011000100100 +11111111111111111111111111010101000000000000000000 +11111111011111111111101010110100010000010000100100 +11111111111010000101000000000000100000000000000100 +11111111101011000010001000000000000000000000000000 +11010000111100010001110000010000000000100100000000 +loss: 0.000902, lagrangian_loss: 0.000136, attention_score_distillation_loss: 0.000586 +loss: 0.004816, lagrangian_loss: 0.000259, attention_score_distillation_loss: 0.000583 +ETA: 0:52:36 | Epoch 105 finished. Took 33.06 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:23:21 +Evaluating: f1: 0.8901, eval_loss: 0.6206, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4809, expected_sparsity: 0.477, expected_sequence_sparsity: 0.7861, target_sparsity: 0.4738, step: 12200 +lambda_1: -0.4240, lambda_2: 44.1881 lambda_3: 0.0000 +train remain: [1. 1. 1. 
0.73 0.78 0.58 0.6 0.34 0.28 0.29] +infer remain: [1.0, 1.0, 1.0, 0.72, 0.76, 0.56, 0.58, 0.32, 0.26, 0.28] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.55, 0.31, 0.18, 0.06, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010101000000000000000000 +11111111011111111111101010110100010000010000100100 +11111111111010000101000000000000100000000000000100 +11111111100011000010001000000000000000000000000000 +11010000111100010001110000010000000000100100000000 +loss: 0.001514, lagrangian_loss: 0.000276, attention_score_distillation_loss: 0.000580 +loss: 0.001073, lagrangian_loss: 0.001286, attention_score_distillation_loss: 0.000577 +---------------------------------------------------------------------- +time: 2023-07-19 15:23:36 +Evaluating: f1: 0.873, eval_loss: 0.6777, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4809, expected_sparsity: 0.477, expected_sequence_sparsity: 0.7861, target_sparsity: 0.4757, step: 12250 +lambda_1: -0.7351, lambda_2: 44.3266 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.73 0.78 0.58 0.59 0.33 0.27 0.28] +infer remain: [1.0, 1.0, 1.0, 0.72, 0.76, 0.56, 0.58, 0.32, 0.26, 0.28] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.72, 0.55, 0.31, 0.18, 0.06, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011011000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010101000000000000000000 +11111111011111111111101010110100010000010000100100 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001110000010000000000100100000000 +loss: 0.002913, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.000575 +loss: 0.001052, lagrangian_loss: -0.000446, attention_score_distillation_loss: 0.000571 +---------------------------------------------------------------------- +time: 2023-07-19 15:23:50 +Evaluating: f1: 0.8761, eval_loss: 0.6734, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4871, expected_sparsity: 0.4812, expected_sequence_sparsity: 0.7878, target_sparsity: 0.4777, step: 12300 +lambda_1: -0.8039, lambda_2: 44.3722 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.72 0.77 0.57 0.59 0.33 0.27 0.27] +infer remain: [1.0, 1.0, 1.0, 0.7, 0.76, 0.56, 0.58, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.53, 0.3, 0.17, 0.06, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111111111010001000000000000000000 +11111111011111111111101010110110010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001698, lagrangian_loss: 0.000109, attention_score_distillation_loss: 0.000569 +ETA: 0:52:04 | Epoch 106 finished. Took 35.3 seconds. 
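+The ETA figures are consistent with (epochs left) x (average epoch duration): with num_train_epochs=200 and epochs counted from 0, finishing epoch 106 leaves 93 epochs, and 93 x ~33.6 s is about 0:52:04. A sketch of that reading; the averaging rule is an assumption:
+
+    epochs_done = 106 + 1                 # "Epoch 106 finished", 0-indexed
+    mean_epoch_seconds = 33.6             # rough mean of the "Took ..." lines
+    eta = (200 - epochs_done) * mean_epoch_seconds
+    print(f"{int(eta // 3600)}:{int(eta % 3600 // 60):02d}:{int(eta % 60):02d}")  # 0:52:04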
+loss: 0.000810, lagrangian_loss: -0.000713, attention_score_distillation_loss: 0.000566 +---------------------------------------------------------------------- +time: 2023-07-19 15:24:05 +Evaluating: f1: 0.8679, eval_loss: 0.6668, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4887, expected_sparsity: 0.4818, expected_sequence_sparsity: 0.7881, target_sparsity: 0.4796, step: 12350 +lambda_1: -0.5277, lambda_2: 44.5002 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.71 0.77 0.56 0.58 0.33 0.27 0.27] +infer remain: [1.0, 1.0, 1.0, 0.7, 0.76, 0.56, 0.56, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.53, 0.3, 0.17, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111111111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001101, lagrangian_loss: -0.000205, attention_score_distillation_loss: 0.000563 +loss: 0.000370, lagrangian_loss: -0.000514, attention_score_distillation_loss: 0.000560 +---------------------------------------------------------------------- +time: 2023-07-19 15:24:20 +Evaluating: f1: 0.8719, eval_loss: 0.667, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4887, expected_sparsity: 0.4834, expected_sequence_sparsity: 0.7887, target_sparsity: 0.4815, step: 12400 +lambda_1: -0.1476, lambda_2: 44.7067 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.71 0.77 0.56 0.58 0.33 0.26 0.27] +infer remain: [1.0, 1.0, 1.0, 0.7, 0.76, 0.54, 0.56, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.53, 0.29, 0.16, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001304, lagrangian_loss: -0.000119, attention_score_distillation_loss: 0.000558 +ETA: 0:51:30 | Epoch 107 finished. Took 33.1 seconds. +loss: 0.001187, lagrangian_loss: 0.000011, attention_score_distillation_loss: 0.000554 +---------------------------------------------------------------------- +time: 2023-07-19 15:24:34 +Evaluating: f1: 0.8827, eval_loss: 0.6476, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4887, expected_sparsity: 0.4834, expected_sequence_sparsity: 0.7887, target_sparsity: 0.4835, step: 12450 +lambda_1: -0.0067, lambda_2: 44.8007 lambda_3: 0.0000 +train remain: [0.99 1. 1. 
0.71 0.77 0.56 0.58 0.33 0.26 0.27] +infer remain: [1.0, 1.0, 1.0, 0.7, 0.76, 0.54, 0.56, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.53, 0.29, 0.16, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001666, lagrangian_loss: 0.000056, attention_score_distillation_loss: 0.000552 +loss: 0.001204, lagrangian_loss: 0.000787, attention_score_distillation_loss: 0.000549 +---------------------------------------------------------------------- +time: 2023-07-19 15:24:49 +Evaluating: f1: 0.8846, eval_loss: 0.6177, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4887, expected_sparsity: 0.4834, expected_sequence_sparsity: 0.7887, target_sparsity: 0.4854, step: 12500 +lambda_1: -0.5828, lambda_2: 45.2817 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.71 0.77 0.56 0.58 0.33 0.26 0.27] +infer remain: [1.0, 1.0, 1.0, 0.7, 0.76, 0.54, 0.56, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.53, 0.29, 0.16, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000100100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.000947, lagrangian_loss: 0.001835, attention_score_distillation_loss: 0.000546 +loss: 0.000761, lagrangian_loss: 0.002188, attention_score_distillation_loss: 0.000543 +ETA: 0:50:56 | Epoch 108 finished. Took 33.2 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:25:04 +Evaluating: f1: 0.881, eval_loss: 0.6259, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4919, expected_sparsity: 0.4875, expected_sequence_sparsity: 0.7904, target_sparsity: 0.4874, step: 12550 +lambda_1: -1.1474, lambda_2: 45.7156 lambda_3: 0.0000 +train remain: [0.99 1. 1. 
0.7 0.76 0.55 0.58 0.33 0.26 0.27] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.76, 0.54, 0.56, 0.32, 0.26, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.52, 0.28, 0.16, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101101011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.000684, lagrangian_loss: 0.001118, attention_score_distillation_loss: 0.000540 +loss: 0.001602, lagrangian_loss: 0.000035, attention_score_distillation_loss: 0.000537 +---------------------------------------------------------------------- +time: 2023-07-19 15:25:19 +Evaluating: f1: 0.8808, eval_loss: 0.6777, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4919, expected_sparsity: 0.4876, expected_sequence_sparsity: 0.7905, target_sparsity: 0.4893, step: 12600 +lambda_1: -1.3575, lambda_2: 45.8174 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.69 0.76 0.55 0.57 0.32 0.25 0.26] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.76, 0.54, 0.56, 0.32, 0.24, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.52, 0.28, 0.16, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111111100100011000100100 +11111111111111111111110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101100011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001309, lagrangian_loss: -0.000216, attention_score_distillation_loss: 0.000534 +loss: 0.001416, lagrangian_loss: -0.000388, attention_score_distillation_loss: 0.000531 +---------------------------------------------------------------------- +time: 2023-07-19 15:25:33 +Evaluating: f1: 0.884, eval_loss: 0.6849, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4965, expected_sparsity: 0.4913, expected_sequence_sparsity: 0.792, target_sparsity: 0.4913, step: 12650 +lambda_1: -1.2357, lambda_2: 45.8676 lambda_3: 0.0000 +train remain: [0.99 1. 1. 0.69 0.75 0.54 0.57 0.32 0.24 0.26] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.74, 0.52, 0.56, 0.32, 0.24, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.5, 0.26, 0.15, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111111100100011000000100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101100011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +ETA: 0:50:24 | Epoch 109 finished. Took 35.24 seconds. 
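+"train remain" appears to report the soft expected keep-ratio of each layer's gates during training, while "infer remain" is the deterministic binarized version used for the sparsity numbers; because each mask has 50 bins, the inference values are always multiples of 0.02 while the training values are free. A quick check against the step-12650 block (the tolerance absorbs float rounding):
+
+    train_remain = [0.99, 1.0, 1.0, 0.69, 0.75, 0.54, 0.57, 0.32, 0.24, 0.26]  # soft
+    infer_remain = [1.0, 1.0, 1.0, 0.68, 0.74, 0.52, 0.56, 0.32, 0.24, 0.26]   # binarized
+
+    on_grid = lambda r: abs(r * 50 - round(r * 50)) < 1e-9
+    print(all(on_grid(r) for r in infer_remain))        # True: each layer keeps k/50 bins
+    print([round(i - t, 2) for t, i in zip(train_remain, infer_remain)])
+    # per-layer gap between the soft ratio and its binarized counterpart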
+loss: 0.002827, lagrangian_loss: -0.001490, attention_score_distillation_loss: 0.000529 +loss: 0.002066, lagrangian_loss: -0.000448, attention_score_distillation_loss: 0.000526 +---------------------------------------------------------------------- +time: 2023-07-19 15:25:48 +Evaluating: f1: 0.8582, eval_loss: 0.6807, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4965, expected_sparsity: 0.4913, expected_sequence_sparsity: 0.792, target_sparsity: 0.4932, step: 12700 +lambda_1: -1.0027, lambda_2: 45.9659 lambda_3: 0.0000 +train remain: [0.99 1. 0.99 0.68 0.75 0.53 0.57 0.32 0.24 0.26] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.74, 0.52, 0.56, 0.32, 0.24, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.5, 0.26, 0.15, 0.05, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111111100100011000000100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111111010000101000000000000100000000000000100 +11111101100011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.001403, lagrangian_loss: -0.000424, attention_score_distillation_loss: 0.000523 +loss: 0.002676, lagrangian_loss: -0.000550, attention_score_distillation_loss: 0.000522 +---------------------------------------------------------------------- +time: 2023-07-19 15:26:03 +Evaluating: f1: 0.8655, eval_loss: 0.6174, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4965, expected_sparsity: 0.4917, expected_sequence_sparsity: 0.7921, target_sparsity: 0.4951, step: 12750 +lambda_1: -0.7413, lambda_2: 46.0778 lambda_3: 0.0000 +train remain: [0.99 1. 0.99 0.68 0.74 0.53 0.57 0.31 0.23 0.26] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.74, 0.52, 0.56, 0.3, 0.22, 0.26] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.5, 0.26, 0.15, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111101100100011000100100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11111101000011000010001000000000000000000000000000 +11010000111100010001010000010000000000100100000000 +loss: 0.003527, lagrangian_loss: -0.000618, attention_score_distillation_loss: 0.000517 +ETA: 0:49:50 | Epoch 110 finished. Took 33.14 seconds. +loss: 0.000946, lagrangian_loss: -0.000536, attention_score_distillation_loss: 0.000514 +---------------------------------------------------------------------- +time: 2023-07-19 15:26:17 +Evaluating: f1: 0.8897, eval_loss: 0.6281, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4965, expected_sparsity: 0.4917, expected_sequence_sparsity: 0.7921, target_sparsity: 0.4971, step: 12800 +lambda_1: -0.5699, lambda_2: 46.1503 lambda_3: 0.0000 +train remain: [0.99 1. 
0.99 0.68 0.74 0.53 0.57 0.31 0.23 0.25] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.74, 0.52, 0.56, 0.3, 0.22, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.5, 0.26, 0.15, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111111101100100011000100100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11111101000011000010001000000000000000000000000000 +10010000111100010001010000010000000000100100000000 +loss: 0.000912, lagrangian_loss: -0.000081, attention_score_distillation_loss: 0.000512 +loss: 0.011712, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.000510 +---------------------------------------------------------------------- +time: 2023-07-19 15:26:32 +Evaluating: f1: 0.8847, eval_loss: 0.6734, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4981, expected_sparsity: 0.4938, expected_sequence_sparsity: 0.793, target_sparsity: 0.499, step: 12850 +lambda_1: -0.4970, lambda_2: 46.1914 lambda_3: 0.0000 +train remain: [0.99 1. 0.99 0.68 0.73 0.52 0.57 0.31 0.22 0.25] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.72, 0.52, 0.56, 0.3, 0.22, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.49, 0.25, 0.14, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111101101100100011000100100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11111101000011000010001000000000000000000000000000 +10010000111100010001010000010000000000100100000000 +loss: 0.001055, lagrangian_loss: -0.000098, attention_score_distillation_loss: 0.000506 +loss: 0.001179, lagrangian_loss: 0.000033, attention_score_distillation_loss: 0.000503 +ETA: 0:49:16 | Epoch 111 finished. Took 33.24 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:26:47 +Evaluating: f1: 0.8722, eval_loss: 0.7173, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4981, expected_sparsity: 0.4938, expected_sequence_sparsity: 0.793, target_sparsity: 0.501, step: 12900 +lambda_1: -0.5520, lambda_2: 46.2418 lambda_3: 0.0000 +train remain: [0.99 1. 
0.99 0.68 0.73 0.52 0.56 0.31 0.22 0.24] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.72, 0.52, 0.56, 0.3, 0.22, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.49, 0.25, 0.14, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111101101100100011000100100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11111101000011000010001000000000000000000000000000 +10010000111100010001010000010000000000100100000000 +loss: 0.001254, lagrangian_loss: 0.000358, attention_score_distillation_loss: 0.000502 +loss: 0.000788, lagrangian_loss: 0.000008, attention_score_distillation_loss: 0.000497 +---------------------------------------------------------------------- +time: 2023-07-19 15:27:01 +Evaluating: f1: 0.8676, eval_loss: 0.6349, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.4981, expected_sparsity: 0.4939, expected_sequence_sparsity: 0.7931, target_sparsity: 0.5029, step: 12950 +lambda_1: -0.6663, lambda_2: 46.2947 lambda_3: 0.0000 +train remain: [0.99 1. 0.99 0.68 0.72 0.52 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.68, 0.72, 0.52, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.68, 0.49, 0.25, 0.14, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111011011011010000000000100 +11111111111111111111111111101101100100011000100100 +11111111111111111101110111010001000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +10010000111100010001010000010000000000100100000000 +loss: 0.139431, lagrangian_loss: 0.000308, attention_score_distillation_loss: 0.000495 +loss: 0.001922, lagrangian_loss: -0.000667, attention_score_distillation_loss: 0.000491 +ETA: 0:48:42 | Epoch 112 finished. Took 33.2 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:27:16 +Evaluating: f1: 0.8746, eval_loss: 0.6465, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5048, step: 13000 +lambda_1: -0.6079, lambda_2: 46.3424 lambda_3: 0.0000 +train remain: [0.99 1. 
0.98 0.67 0.71 0.51 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011011011010000000000100 +11111111111111111111111111101101100100011000000100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +10010000111100010001010000010000000000100100000000 +loss: 0.001217, lagrangian_loss: -0.000257, attention_score_distillation_loss: 0.000489 +loss: 0.002290, lagrangian_loss: -0.000501, attention_score_distillation_loss: 0.000486 +---------------------------------------------------------------------- +time: 2023-07-19 15:27:31 +Evaluating: f1: 0.872, eval_loss: 0.6937, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5068, step: 13050 +lambda_1: -0.4029, lambda_2: 46.4198 lambda_3: 0.0000 +train remain: [0.99 1. 0.98 0.67 0.71 0.51 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.001181, lagrangian_loss: -0.000264, attention_score_distillation_loss: 0.000483 +loss: 0.009199, lagrangian_loss: 0.000112, attention_score_distillation_loss: 0.000481 +---------------------------------------------------------------------- +time: 2023-07-19 15:27:46 +Evaluating: f1: 0.8778, eval_loss: 0.6688, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5087, step: 13100 +lambda_1: -0.2581, lambda_2: 46.4875 lambda_3: 0.0000 +train remain: [0.99 1. 0.97 0.67 0.71 0.51 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.001172, lagrangian_loss: -0.000077, attention_score_distillation_loss: 0.000477 +ETA: 0:48:10 | Epoch 113 finished. Took 35.43 seconds. 
+loss: 0.000773, lagrangian_loss: -0.000112, attention_score_distillation_loss: 0.000474 +---------------------------------------------------------------------- +time: 2023-07-19 15:28:00 +Evaluating: f1: 0.8545, eval_loss: 0.6922, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5107, step: 13150 +lambda_1: -0.3089, lambda_2: 46.5277 lambda_3: 0.0000 +train remain: [0.99 1. 0.97 0.67 0.7 0.51 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.003223, lagrangian_loss: -0.000147, attention_score_distillation_loss: 0.000474 +loss: 0.001386, lagrangian_loss: -0.000045, attention_score_distillation_loss: 0.000470 +---------------------------------------------------------------------- +time: 2023-07-19 15:28:15 +Evaluating: f1: 0.8789, eval_loss: 0.7521, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5126, step: 13200 +lambda_1: -0.3590, lambda_2: 46.5802 lambda_3: 0.0000 +train remain: [0.99 1. 0.96 0.67 0.7 0.5 0.56 0.31 0.21 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.005307, lagrangian_loss: -0.000204, attention_score_distillation_loss: 0.000467 +ETA: 0:47:36 | Epoch 114 finished. Took 33.34 seconds. +loss: 0.001746, lagrangian_loss: -0.000434, attention_score_distillation_loss: 0.000463 +---------------------------------------------------------------------- +time: 2023-07-19 15:28:30 +Evaluating: f1: 0.8779, eval_loss: 0.6287, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5146, step: 13250 +lambda_1: -0.1232, lambda_2: 46.7281 lambda_3: 0.0000 +train remain: [0.99 1. 
0.95 0.67 0.7 0.5 0.56 0.31 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.001708, lagrangian_loss: -0.000071, attention_score_distillation_loss: 0.000460 +loss: 0.001806, lagrangian_loss: 0.000028, attention_score_distillation_loss: 0.000457 +---------------------------------------------------------------------- +time: 2023-07-19 15:28:45 +Evaluating: f1: 0.8545, eval_loss: 0.6745, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5012, expected_sequence_sparsity: 0.796, target_sparsity: 0.5165, step: 13300 +lambda_1: -0.0444, lambda_2: 46.8289 lambda_3: 0.0000 +train remain: [0.99 1. 0.95 0.67 0.7 0.5 0.56 0.31 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.7, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.46, 0.23, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100001000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.003084, lagrangian_loss: 0.000052, attention_score_distillation_loss: 0.000454 +loss: 0.003181, lagrangian_loss: 0.000640, attention_score_distillation_loss: 0.000451 +ETA: 0:47:02 | Epoch 115 finished. Took 33.23 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:29:00 +Evaluating: f1: 0.8657, eval_loss: 0.6823, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5032, expected_sequence_sparsity: 0.7969, target_sparsity: 0.5184, step: 13350 +lambda_1: -0.7694, lambda_2: 47.5216 lambda_3: 0.0000 +train remain: [0.99 1. 
0.95 0.67 0.69 0.5 0.56 0.31 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.68, 0.5, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.45, 0.22, 0.13, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000100100 +11111111111111111101110111010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.007021, lagrangian_loss: 0.002050, attention_score_distillation_loss: 0.000448 +loss: 0.001424, lagrangian_loss: 0.000735, attention_score_distillation_loss: 0.000447 +---------------------------------------------------------------------- +time: 2023-07-19 15:29:14 +Evaluating: f1: 0.8545, eval_loss: 0.6907, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5045, expected_sequence_sparsity: 0.7974, target_sparsity: 0.5204, step: 13400 +lambda_1: -1.0465, lambda_2: 47.9125 lambda_3: 0.0000 +train remain: [0.99 1. 0.93 0.67 0.68 0.49 0.56 0.31 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.68, 0.48, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.45, 0.22, 0.12, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.002116, lagrangian_loss: -0.002563, attention_score_distillation_loss: 0.000443 +loss: 0.142616, lagrangian_loss: -0.001278, attention_score_distillation_loss: 0.000440 +---------------------------------------------------------------------- +time: 2023-07-19 15:29:29 +Evaluating: f1: 0.866, eval_loss: 0.7454, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5045, expected_sequence_sparsity: 0.7974, target_sparsity: 0.5223, step: 13450 +lambda_1: -0.0796, lambda_2: 49.0226 lambda_3: 0.0000 +train remain: [0.99 1. 0.92 0.67 0.68 0.49 0.56 0.3 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.68, 0.48, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.45, 0.22, 0.12, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.052311, lagrangian_loss: 0.000184, attention_score_distillation_loss: 0.000437 +ETA: 0:46:30 | Epoch 116 finished. Took 35.46 seconds. 
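----------------------------------------------------------------------
Note: each 50-character row of 1s and 0s printed under an evaluation looks like the hard token-bin mask for one pruned layer (one row per prune location, 50 bins per row): '1' keeps a bin, '0' drops it, and the fraction of ones per row matches that layer's "infer remain" entry, which is why those values always land on multiples of 0.02. A small check against the fourth row of the step-13400 block above:

mask = "11111111111111111101111111011011011010000000000100"
print(mask.count("1") / len(mask))  # 0.66, the fourth "infer remain" entry
----------------------------------------------------------------------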
+loss: 0.001285, lagrangian_loss: 0.000157, attention_score_distillation_loss: 0.000434 +---------------------------------------------------------------------- +time: 2023-07-19 15:29:44 +Evaluating: f1: 0.8688, eval_loss: 0.7332, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5045, expected_sequence_sparsity: 0.7974, target_sparsity: 0.5243, step: 13500 +lambda_1: 0.2251, lambda_2: 49.3891 lambda_3: 0.0000 +train remain: [0.99 1. 0.93 0.67 0.68 0.49 0.56 0.31 0.2 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.68, 0.48, 0.56, 0.3, 0.2, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.45, 0.22, 0.12, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000010000100000 +11111111110010000101000000000000100000000000000100 +11011101000011000010001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.022130, lagrangian_loss: -0.000210, attention_score_distillation_loss: 0.000432 +loss: 0.007977, lagrangian_loss: 0.001646, attention_score_distillation_loss: 0.000428 +---------------------------------------------------------------------- +time: 2023-07-19 15:29:58 +Evaluating: f1: 0.8662, eval_loss: 0.6472, token_prune_loc: [False, False, False, True, True, True, True, True, True, True], macs_sparsity: 0.5075, expected_sparsity: 0.5046, expected_sequence_sparsity: 0.7974, target_sparsity: 0.5262, step: 13550 +lambda_1: -0.7812, lambda_2: 50.6659 lambda_3: 0.0000 +train remain: [0.99 1. 0.93 0.67 0.68 0.49 0.56 0.3 0.19 0.24] +infer remain: [1.0, 1.0, 1.0, 0.66, 0.68, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.45, 0.22, 0.12, 0.04, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000001100000 +11111111110010000101000000000000100000000000000100 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.005489, lagrangian_loss: 0.003386, attention_score_distillation_loss: 0.000425 +ETA: 0:45:56 | Epoch 117 finished. Took 33.19 seconds. +loss: 0.001774, lagrangian_loss: 0.004282, attention_score_distillation_loss: 0.000423 +---------------------------------------------------------------------- +time: 2023-07-19 15:30:13 +Evaluating: f1: 0.8617, eval_loss: 0.7033, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5485, expected_sparsity: 0.543, expected_sequence_sparsity: 0.8132, target_sparsity: 0.5282, step: 13600 +lambda_1: -1.6538, lambda_2: 51.5892 lambda_3: 0.0000 +train remain: [0.99 1. 
0.91 0.67 0.67 0.49 0.56 0.3 0.19 0.24] +infer remain: [1.0, 1.0, 0.82, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.82, 0.54, 0.36, 0.17, 0.1, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011111101110101010011110 +11111111111111111101111111011011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110101010000000000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.005384, lagrangian_loss: -0.001517, attention_score_distillation_loss: 0.000420 +loss: 0.003460, lagrangian_loss: -0.005222, attention_score_distillation_loss: 0.000417 +---------------------------------------------------------------------- +time: 2023-07-19 15:30:28 +Evaluating: f1: 0.8714, eval_loss: 0.7153, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5485, expected_sparsity: 0.543, expected_sequence_sparsity: 0.8132, target_sparsity: 0.5301, step: 13650 +lambda_1: -1.1333, lambda_2: 52.2577 lambda_3: 0.0000 +train remain: [0.99 1. 0.88 0.66 0.67 0.48 0.56 0.3 0.18 0.24] +infer remain: [1.0, 1.0, 0.82, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.82, 0.54, 0.36, 0.17, 0.1, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110101010000000000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.038430, lagrangian_loss: -0.004524, attention_score_distillation_loss: 0.000415 +loss: 0.006729, lagrangian_loss: -0.000966, attention_score_distillation_loss: 0.000411 +ETA: 0:45:22 | Epoch 118 finished. Took 33.31 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:30:43 +Evaluating: f1: 0.8696, eval_loss: 0.6876, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5532, expected_sparsity: 0.547, expected_sequence_sparsity: 0.8148, target_sparsity: 0.532, step: 13700 +lambda_1: 0.1499, lambda_2: 54.0245 lambda_3: 0.0000 +train remain: [0.99 1. 
0.88 0.66 0.67 0.48 0.56 0.3 0.18 0.24] +infer remain: [1.0, 1.0, 0.8, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.53, 0.35, 0.17, 0.09, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110101010000000000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.077044, lagrangian_loss: 0.001325, attention_score_distillation_loss: 0.000408 +loss: 0.001636, lagrangian_loss: 0.000611, attention_score_distillation_loss: 0.000405 +---------------------------------------------------------------------- +time: 2023-07-19 15:30:57 +Evaluating: f1: 0.8656, eval_loss: 0.6852, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5485, expected_sparsity: 0.543, expected_sequence_sparsity: 0.8132, target_sparsity: 0.534, step: 13750 +lambda_1: 0.5451, lambda_2: 54.4503 lambda_3: 0.0000 +train remain: [0.99 1. 0.89 0.67 0.67 0.49 0.56 0.31 0.19 0.24] +infer remain: [1.0, 1.0, 0.82, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.82, 0.54, 0.36, 0.17, 0.1, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100110000000000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.001351, lagrangian_loss: -0.000863, attention_score_distillation_loss: 0.000403 +loss: 0.001897, lagrangian_loss: -0.000047, attention_score_distillation_loss: 0.000399 +---------------------------------------------------------------------- +time: 2023-07-19 15:31:12 +Evaluating: f1: 0.8776, eval_loss: 0.643, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5485, expected_sparsity: 0.543, expected_sequence_sparsity: 0.8132, target_sparsity: 0.5359, step: 13800 +lambda_1: -0.2574, lambda_2: 55.2623 lambda_3: 0.0000 +train remain: [0.99 1. 0.89 0.67 0.67 0.49 0.56 0.31 0.19 0.24] +infer remain: [1.0, 1.0, 0.82, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.82, 0.54, 0.36, 0.17, 0.1, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111111111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000100000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +ETA: 0:44:50 | Epoch 119 finished. Took 35.4 seconds. 
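----------------------------------------------------------------------
Note: the lagrangian_loss trace, together with the evolving lambda_1/lambda_2 values, is consistent with the usual Lagrangian relaxation for sparsity control: a linear plus quadratic penalty on the gap between the model's expected sparsity and the scheduled target, with the multipliers trained adversarially, which is why the penalty regularly swings negative when sparsity overshoots. A hedged sketch of that objective; the names are illustrative and it will not reproduce the logged values exactly, since those are computed from the train-time expected sparsity rather than the eval-time one shown here.

def lagrangian_penalty(expected_sparsity, target_sparsity, lambda_1, lambda_2):
    # The multipliers are updated by gradient ascent on this same quantity,
    # driving expected_sparsity toward target_sparsity over training.
    gap = expected_sparsity - target_sparsity
    return lambda_1 * gap + lambda_2 * gap ** 2

# Eval-time values from the step-13750 block above:
print(lagrangian_penalty(0.543, 0.534, 0.5451, 54.4503))  # small positive penalty
----------------------------------------------------------------------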
+loss: 0.001656, lagrangian_loss: 0.002132, attention_score_distillation_loss: 0.000397 +loss: 0.002094, lagrangian_loss: 0.002678, attention_score_distillation_loss: 0.000394 +---------------------------------------------------------------------- +time: 2023-07-19 15:31:27 +Evaluating: f1: 0.8714, eval_loss: 0.6674, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5532, expected_sparsity: 0.547, expected_sequence_sparsity: 0.8148, target_sparsity: 0.5379, step: 13850 +lambda_1: -0.9896, lambda_2: 55.9355 lambda_3: 0.0000 +train remain: [0.99 1. 0.88 0.66 0.67 0.48 0.56 0.3 0.18 0.24] +infer remain: [1.0, 1.0, 0.8, 0.66, 0.66, 0.48, 0.56, 0.3, 0.18, 0.24] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.53, 0.35, 0.17, 0.09, 0.03, 0.01, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000100000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000001000000000000000000000000000 +11010000111100010001000000010000000000100100000000 +loss: 0.203099, lagrangian_loss: 0.000998, attention_score_distillation_loss: 0.000392 +loss: 0.003027, lagrangian_loss: -0.000615, attention_score_distillation_loss: 0.000388 +---------------------------------------------------------------------- +time: 2023-07-19 15:31:42 +Evaluating: f1: 0.878, eval_loss: 0.6677, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5532, expected_sparsity: 0.5471, expected_sequence_sparsity: 0.8148, target_sparsity: 0.5398, step: 13900 +lambda_1: -1.0186, lambda_2: 56.0066 lambda_3: 0.0000 +train remain: [0.98 1. 0.86 0.66 0.66 0.48 0.56 0.3 0.17 0.23] +infer remain: [1.0, 1.0, 0.8, 0.66, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.53, 0.35, 0.17, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111111111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000100000100000 +11111111111010000101000000000000100000000000000000 +11011101000011000000000000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.001157, lagrangian_loss: -0.000666, attention_score_distillation_loss: 0.000385 +ETA: 0:44:16 | Epoch 120 finished. Took 33.18 seconds. +loss: 0.003141, lagrangian_loss: -0.001340, attention_score_distillation_loss: 0.000383 +---------------------------------------------------------------------- +time: 2023-07-19 15:31:56 +Evaluating: f1: 0.867, eval_loss: 0.6312, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5564, expected_sparsity: 0.55, expected_sequence_sparsity: 0.816, target_sparsity: 0.5417, step: 13950 +lambda_1: -0.5718, lambda_2: 56.2652 lambda_3: 0.0000 +train remain: [0.98 1. 
0.85 0.66 0.66 0.48 0.56 0.3 0.17 0.23] +infer remain: [1.0, 1.0, 0.8, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.51, 0.34, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000000100001 +11111111111010000101000000000000100000000000000000 +11011101000011000000000000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.002652, lagrangian_loss: -0.000603, attention_score_distillation_loss: 0.000381 +loss: 0.001597, lagrangian_loss: -0.000587, attention_score_distillation_loss: 0.000377 +---------------------------------------------------------------------- +time: 2023-07-19 15:32:11 +Evaluating: f1: 0.8691, eval_loss: 0.694, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5564, expected_sparsity: 0.55, expected_sequence_sparsity: 0.816, target_sparsity: 0.5437, step: 14000 +lambda_1: -0.2326, lambda_2: 56.4160 lambda_3: 0.0000 +train remain: [0.98 1. 0.85 0.66 0.66 0.48 0.56 0.3 0.17 0.23] +infer remain: [1.0, 1.0, 0.8, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.51, 0.34, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111111010110100010000000000100000 +11111111111010000101000000000000100000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.002427, lagrangian_loss: -0.000224, attention_score_distillation_loss: 0.000374 +loss: 0.002004, lagrangian_loss: 0.000016, attention_score_distillation_loss: 0.000371 +ETA: 0:43:42 | Epoch 121 finished. Took 33.1 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:32:26 +Evaluating: f1: 0.8866, eval_loss: 0.6218, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5564, expected_sparsity: 0.55, expected_sequence_sparsity: 0.816, target_sparsity: 0.5456, step: 14050 +lambda_1: -0.1715, lambda_2: 56.4574 lambda_3: 0.0000 +train remain: [0.98 1. 
0.85 0.65 0.66 0.48 0.56 0.3 0.17 0.23] +infer remain: [1.0, 1.0, 0.8, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.51, 0.34, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000000100001 +11111111111010000101000000000000100000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.003676, lagrangian_loss: 0.000778, attention_score_distillation_loss: 0.000369 +loss: 0.005993, lagrangian_loss: 0.000401, attention_score_distillation_loss: 0.000365 +---------------------------------------------------------------------- +time: 2023-07-19 15:32:41 +Evaluating: f1: 0.8805, eval_loss: 0.6608, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5564, expected_sparsity: 0.55, expected_sequence_sparsity: 0.816, target_sparsity: 0.5476, step: 14100 +lambda_1: -0.4956, lambda_2: 56.6003 lambda_3: 0.0000 +train remain: [0.98 1. 0.84 0.65 0.66 0.48 0.56 0.3 0.17 0.23] +infer remain: [1.0, 1.0, 0.8, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.8, 0.51, 0.34, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011111101110101010011110 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100100000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000000100001 +11111111111010000101000000000000100000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.001626, lagrangian_loss: 0.001307, attention_score_distillation_loss: 0.000362 +loss: 0.003204, lagrangian_loss: 0.001536, attention_score_distillation_loss: 0.000359 +ETA: 0:43:08 | Epoch 122 finished. Took 33.16 seconds. +---------------------------------------------------------------------- +time: 2023-07-19 15:32:55 +Evaluating: f1: 0.8728, eval_loss: 0.6651, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5579, expected_sparsity: 0.5539, expected_sequence_sparsity: 0.8176, target_sparsity: 0.5495, step: 14150 +lambda_1: -0.8890, lambda_2: 56.7932 lambda_3: 0.0000 +train remain: [0.98 1. 
0.83 0.65 0.66 0.48 0.56 0.3 0.16 0.22] +infer remain: [1.0, 1.0, 0.78, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.78, 0.5, 0.33, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010100000000100000 +11111111111010000101000000000000100000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.002449, lagrangian_loss: 0.000538, attention_score_distillation_loss: 0.000357 +loss: 0.001125, lagrangian_loss: 0.000175, attention_score_distillation_loss: 0.000354 +---------------------------------------------------------------------- +time: 2023-07-19 15:33:10 +Evaluating: f1: 0.8779, eval_loss: 0.6466, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5579, expected_sparsity: 0.5539, expected_sequence_sparsity: 0.8176, target_sparsity: 0.5515, step: 14200 +lambda_1: -0.9468, lambda_2: 56.8313 lambda_3: 0.0000 +train remain: [0.98 1. 0.82 0.65 0.66 0.48 0.56 0.3 0.16 0.22] +infer remain: [1.0, 1.0, 0.78, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.78, 0.5, 0.33, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010100000000100000 +11111111110010000101000000000000110000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.151185, lagrangian_loss: -0.001362, attention_score_distillation_loss: 0.000352 +loss: 0.002223, lagrangian_loss: -0.000577, attention_score_distillation_loss: 0.000348 +---------------------------------------------------------------------- +time: 2023-07-19 15:33:25 +Evaluating: f1: 0.8632, eval_loss: 0.703, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5579, expected_sparsity: 0.5539, expected_sequence_sparsity: 0.8176, target_sparsity: 0.5534, step: 14250 +lambda_1: -0.6840, lambda_2: 56.9351 lambda_3: 0.0000 +train remain: [0.98 1. 0.81 0.65 0.66 0.48 0.56 0.3 0.16 0.22] +infer remain: [1.0, 1.0, 0.78, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.78, 0.5, 0.33, 0.16, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111011110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010100000000100000 +11111111110010000101000000000000110000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.001710, lagrangian_loss: -0.000815, attention_score_distillation_loss: 0.000345 +ETA: 0:42:36 | Epoch 123 finished. Took 35.46 seconds. 
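----------------------------------------------------------------------
Note: target_sparsity rises by roughly 0.002 every 50 steps throughout this section, which matches a linear warmup of the final 0.67 target over the run's 150 lagrangian warmup epochs, assuming about 115 optimizer steps per MRPC epoch (roughly 3,668 training pairs at batch size 32). A sketch of that schedule; the function name and the exact step count are assumptions, but the result lines up with the logged pairs.

STEPS_PER_EPOCH = 115        # assumed: ~3668 MRPC train pairs / batch size 32
WARMUP_EPOCHS = 150          # lagrangian warmup epochs from the run config
FINAL_TARGET = 0.67          # target sparsity from the run config

def scheduled_target(step):
    # Linear ramp from 0 to the final target across the warmup window.
    warmup_steps = WARMUP_EPOCHS * STEPS_PER_EPOCH
    return FINAL_TARGET * min(step, warmup_steps) / warmup_steps

print(round(scheduled_target(14200), 4))  # 0.5515, matching the step-14200 block
----------------------------------------------------------------------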
+loss: 0.002408, lagrangian_loss: -0.000444, attention_score_distillation_loss: 0.000342 +---------------------------------------------------------------------- +time: 2023-07-19 15:33:39 +Evaluating: f1: 0.8759, eval_loss: 0.6738, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5642, expected_sparsity: 0.5579, expected_sequence_sparsity: 0.8193, target_sparsity: 0.5553, step: 14300 +lambda_1: -0.3638, lambda_2: 57.0628 lambda_3: 0.0000 +train remain: [0.98 1. 0.8 0.65 0.65 0.48 0.56 0.3 0.16 0.22] +infer remain: [1.0, 1.0, 0.76, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.76, 0.49, 0.32, 0.15, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000010100000 +11111111110010000101000000000000110000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.001342, lagrangian_loss: -0.000423, attention_score_distillation_loss: 0.000339 +loss: 0.135495, lagrangian_loss: -0.000284, attention_score_distillation_loss: 0.000337 +---------------------------------------------------------------------- +time: 2023-07-19 15:33:54 +Evaluating: f1: 0.8832, eval_loss: 0.6172, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5642, expected_sparsity: 0.5579, expected_sequence_sparsity: 0.8193, target_sparsity: 0.5573, step: 14350 +lambda_1: -0.2535, lambda_2: 57.0997 lambda_3: 0.0000 +train remain: [0.98 1. 0.8 0.65 0.65 0.48 0.56 0.29 0.16 0.22] +infer remain: [1.0, 1.0, 0.76, 0.64, 0.66, 0.48, 0.56, 0.3, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.76, 0.49, 0.32, 0.15, 0.09, 0.03, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000100100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000010100000 +11111111110010000101000000000000110000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.129177, lagrangian_loss: -0.000090, attention_score_distillation_loss: 0.000334 +ETA: 0:42:02 | Epoch 124 finished. Took 33.17 seconds. +loss: 0.002739, lagrangian_loss: 0.000038, attention_score_distillation_loss: 0.000331 +---------------------------------------------------------------------- +time: 2023-07-19 15:34:09 +Evaluating: f1: 0.875, eval_loss: 0.6671, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5642, expected_sparsity: 0.5595, expected_sequence_sparsity: 0.8199, target_sparsity: 0.5592, step: 14400 +lambda_1: -0.4606, lambda_2: 57.1886 lambda_3: 0.0000 +train remain: [0.98 1. 
0.8 0.65 0.65 0.48 0.56 0.29 0.15 0.22] +infer remain: [1.0, 1.0, 0.76, 0.64, 0.64, 0.48, 0.56, 0.28, 0.16, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.76, 0.49, 0.31, 0.15, 0.08, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010100000000100000 +11111111110010000101000000000000100000000000000000 +11010101000011000000001000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.002697, lagrangian_loss: 0.000455, attention_score_distillation_loss: 0.000328 +loss: 0.001896, lagrangian_loss: 0.001704, attention_score_distillation_loss: 0.000325 +---------------------------------------------------------------------- +time: 2023-07-19 15:34:23 +Evaluating: f1: 0.8628, eval_loss: 0.66, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5642, expected_sparsity: 0.5595, expected_sequence_sparsity: 0.8199, target_sparsity: 0.5612, step: 14450 +lambda_1: -0.8841, lambda_2: 57.4118 lambda_3: 0.0000 +train remain: [0.98 1. 0.79 0.65 0.65 0.47 0.56 0.29 0.15 0.22] +infer remain: [1.0, 1.0, 0.76, 0.64, 0.64, 0.48, 0.56, 0.28, 0.14, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.76, 0.49, 0.31, 0.15, 0.08, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011100 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111101110110010000000000000000000000 +11111111011111111111101010110100010000000000100001 +11111111110010000101000000000000100000000000000000 +11010101000011000000000000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.002422, lagrangian_loss: 0.000709, attention_score_distillation_loss: 0.000322 +loss: 0.001248, lagrangian_loss: 0.004348, attention_score_distillation_loss: 0.000319 +ETA: 0:41:28 | Epoch 125 finished. Took 33.01 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:34:38 +Evaluating: f1: 0.8722, eval_loss: 0.6585, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5689, expected_sparsity: 0.5646, expected_sequence_sparsity: 0.822, target_sparsity: 0.5631, step: 14500 +lambda_1: -1.4127, lambda_2: 57.8195 lambda_3: 0.0000 +train remain: [0.99 0.99 0.78 0.64 0.65 0.47 0.56 0.29 0.15 0.21] +infer remain: [1.0, 1.0, 0.74, 0.64, 0.64, 0.46, 0.54, 0.28, 0.14, 0.22] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.74, 0.47, 0.3, 0.14, 0.08, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000100000000000000000 +11010101000011000000000000000000000000000000000000 +10010000111100010001000000010000000000100100000000 +loss: 0.003709, lagrangian_loss: 0.004897, attention_score_distillation_loss: 0.000316 +loss: 0.204541, lagrangian_loss: 0.002620, attention_score_distillation_loss: 0.000313 +---------------------------------------------------------------------- +time: 2023-07-19 15:34:53 +Evaluating: f1: 0.8734, eval_loss: 0.6232, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5689, expected_sparsity: 0.5646, expected_sequence_sparsity: 0.822, target_sparsity: 0.5651, step: 14550 +lambda_1: -1.9687, lambda_2: 58.2209 lambda_3: 0.0000 +train remain: [0.98 0.99 0.76 0.64 0.65 0.47 0.55 0.28 0.15 0.2 ] +infer remain: [1.0, 1.0, 0.74, 0.64, 0.64, 0.46, 0.54, 0.28, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.74, 0.47, 0.3, 0.14, 0.08, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010011000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000100000000000000000 +11010101000011000000000000000000000000000000000000 +10010000011100010001000000010000000000100100000000 +loss: 0.002217, lagrangian_loss: 0.001590, attention_score_distillation_loss: 0.000312 +loss: 0.001671, lagrangian_loss: -0.001356, attention_score_distillation_loss: 0.000308 +---------------------------------------------------------------------- +time: 2023-07-19 15:35:07 +Evaluating: f1: 0.8717, eval_loss: 0.6467, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5736, expected_sparsity: 0.5685, expected_sequence_sparsity: 0.8236, target_sparsity: 0.567, step: 14600 +lambda_1: -1.8426, lambda_2: 58.3207 lambda_3: 0.0000 +train remain: [0.98 0.99 0.74 0.64 0.64 0.47 0.55 0.28 0.14 0.2 ] +infer remain: [1.0, 1.0, 0.72, 0.64, 0.64, 0.46, 0.54, 0.28, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.72, 0.46, 0.29, 0.14, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111001110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 
+11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000100000000000000000 +11010101000011000000000000000000000000000000000000 +10010000011100010001000000010000000000100100000000 +loss: 0.003029, lagrangian_loss: -0.004543, attention_score_distillation_loss: 0.000305 +ETA: 0:40:55 | Epoch 126 finished. Took 35.4 seconds. +loss: 0.001292, lagrangian_loss: -0.004322, attention_score_distillation_loss: 0.000302 +---------------------------------------------------------------------- +time: 2023-07-19 15:35:22 +Evaluating: f1: 0.8743, eval_loss: 0.6541, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5768, expected_sparsity: 0.5725, expected_sequence_sparsity: 0.8253, target_sparsity: 0.5689, step: 14650 +lambda_1: -1.0757, lambda_2: 59.0377 lambda_3: 0.0000 +train remain: [0.98 0.99 0.73 0.64 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.64, 0.64, 0.46, 0.54, 0.26, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.45, 0.29, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111000110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000001 +loss: 0.001743, lagrangian_loss: -0.003369, attention_score_distillation_loss: 0.000299 +loss: 0.002059, lagrangian_loss: -0.001476, attention_score_distillation_loss: 0.000296 +---------------------------------------------------------------------- +time: 2023-07-19 15:35:37 +Evaluating: f1: 0.8848, eval_loss: 0.7084, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5768, expected_sparsity: 0.5725, expected_sequence_sparsity: 0.8253, target_sparsity: 0.5709, step: 14700 +lambda_1: -0.2317, lambda_2: 59.8998 lambda_3: 0.0000 +train remain: [0.98 0.99 0.73 0.64 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.64, 0.64, 0.46, 0.54, 0.26, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.45, 0.29, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011111111011111000110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000001 +loss: 0.004131, lagrangian_loss: -0.000114, attention_score_distillation_loss: 0.000293 +ETA: 0:40:21 | Epoch 127 finished. Took 33.22 seconds. 
+loss: 0.007172, lagrangian_loss: 0.000010, attention_score_distillation_loss: 0.000291 +---------------------------------------------------------------------- +time: 2023-07-19 15:35:52 +Evaluating: f1: 0.8866, eval_loss: 0.6446, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5768, expected_sparsity: 0.5725, expected_sequence_sparsity: 0.8253, target_sparsity: 0.5728, step: 14750 +lambda_1: 0.1480, lambda_2: 60.1371 lambda_3: 0.0000 +train remain: [0.98 0.99 0.73 0.64 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.64, 0.64, 0.46, 0.54, 0.26, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.45, 0.29, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111001110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000001 +loss: 0.003271, lagrangian_loss: 0.000047, attention_score_distillation_loss: 0.000288 +loss: 0.001387, lagrangian_loss: 0.000064, attention_score_distillation_loss: 0.000285 +---------------------------------------------------------------------- +time: 2023-07-19 15:36:06 +Evaluating: f1: 0.8429, eval_loss: 0.6964, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5768, expected_sparsity: 0.5725, expected_sequence_sparsity: 0.8253, target_sparsity: 0.5748, step: 14800 +lambda_1: -0.1268, lambda_2: 60.3048 lambda_3: 0.0000 +train remain: [0.98 0.99 0.73 0.64 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.64, 0.64, 0.46, 0.54, 0.26, 0.14, 0.2] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.45, 0.29, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111001110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000001 +loss: 0.001661, lagrangian_loss: 0.000699, attention_score_distillation_loss: 0.000282 +loss: 0.001800, lagrangian_loss: 0.003002, attention_score_distillation_loss: 0.000279 +ETA: 0:39:47 | Epoch 128 finished. Took 33.18 seconds. 
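----------------------------------------------------------------------
Note: "train remain" is fractional while "infer remain" snaps to multiples of 1/50, suggesting the former averages the open probabilities of stochastic L0 gates over each layer's 50 bins while the latter is the deterministic eval-time binarization. The run config's droprate_init/temperature settings point at the standard hard-concrete parameterization of Louizos et al.; the sketch below uses the conventional stretch constants, and the thresholding rule is an assumption, so treat it as illustrative rather than this codebase's exact logic.

import math

GAMMA, ZETA = -0.1, 1.1      # conventional hard-concrete stretch interval
TEMPERATURE = 2.0 / 3.0      # temperature from the run config

def prob_gate_open(loga):
    # Probability that a stretched hard-concrete gate is non-zero; averaging
    # this across a layer's 50 bins would give its "train remain" entry.
    shift = TEMPERATURE * math.log(-GAMMA / ZETA)
    return 1.0 / (1.0 + math.exp(-(loga - shift)))

def keep_bin(loga):
    # Assumed eval-time rule: keep the bin when the gate is more open than not.
    return prob_gate_open(loga) > 0.5

print(prob_gate_open(0.0), keep_bin(0.0))  # ~0.83, True
----------------------------------------------------------------------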
+---------------------------------------------------------------------- +time: 2023-07-19 15:36:21 +Evaluating: f1: 0.8722, eval_loss: 0.7274, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5768, expected_sparsity: 0.5725, expected_sequence_sparsity: 0.8253, target_sparsity: 0.5767, step: 14850 +lambda_1: -0.7486, lambda_2: 60.8002 lambda_3: 0.0000 +train remain: [0.98 0.99 0.73 0.63 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.64, 0.64, 0.46, 0.54, 0.26, 0.14, 0.18] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.45, 0.29, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111001110111110101010010000 +11111111111111111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000000 +loss: 0.004895, lagrangian_loss: 0.002281, attention_score_distillation_loss: 0.000276 +loss: 0.002205, lagrangian_loss: 0.004023, attention_score_distillation_loss: 0.000273 +---------------------------------------------------------------------- +time: 2023-07-19 15:36:36 +Evaluating: f1: 0.8702, eval_loss: 0.66, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5799, expected_sparsity: 0.575, expected_sequence_sparsity: 0.8263, target_sparsity: 0.5786, step: 14900 +lambda_1: -1.4409, lambda_2: 61.4185 lambda_3: 0.0000 +train remain: [0.99 0.99 0.72 0.63 0.64 0.47 0.54 0.27 0.14 0.19] +infer remain: [1.0, 1.0, 0.7, 0.62, 0.64, 0.46, 0.54, 0.26, 0.14, 0.18] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.7, 0.43, 0.28, 0.13, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111001110111110101010010000 +11111111111011111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11010101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000100100000000 +loss: 0.001793, lagrangian_loss: 0.005030, attention_score_distillation_loss: 0.000271 +loss: 0.001506, lagrangian_loss: 0.004772, attention_score_distillation_loss: 0.000268 +---------------------------------------------------------------------- +time: 2023-07-19 15:36:51 +Evaluating: f1: 0.8763, eval_loss: 0.6732, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5831, expected_sparsity: 0.5788, expected_sequence_sparsity: 0.8278, target_sparsity: 0.5806, step: 14950 +lambda_1: -2.0840, lambda_2: 61.9884 lambda_3: 0.0000 +train remain: [0.98 0.99 0.71 0.63 0.64 0.46 0.53 0.26 0.13 0.18] +infer remain: [1.0, 1.0, 0.68, 0.62, 0.64, 0.46, 0.54, 0.26, 0.12, 0.18] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.68, 0.42, 0.27, 0.12, 0.07, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111000110111110101010010000 +11111111111011111101111111010011011010000000000100 +11111111111111111111111111101101100000000000000100 
+11111111111111111100110110010000000000000000000000 +11111111011111111111101010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11000101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000001 +ETA: 0:39:15 | Epoch 129 finished. Took 35.46 seconds. +loss: 0.002328, lagrangian_loss: 0.005116, attention_score_distillation_loss: 0.000265 +loss: 0.011365, lagrangian_loss: 0.000100, attention_score_distillation_loss: 0.000262 +---------------------------------------------------------------------- +time: 2023-07-19 15:37:05 +Evaluating: f1: 0.8741, eval_loss: 0.6584, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5831, expected_sparsity: 0.5791, expected_sequence_sparsity: 0.8279, target_sparsity: 0.5825, step: 15000 +lambda_1: -2.3410, lambda_2: 62.1159 lambda_3: 0.0000 +train remain: [0.98 0.99 0.69 0.63 0.64 0.46 0.53 0.26 0.12 0.18] +infer remain: [1.0, 1.0, 0.68, 0.62, 0.64, 0.46, 0.52, 0.26, 0.12, 0.18] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.68, 0.42, 0.27, 0.12, 0.06, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111000110111110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101101100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11000101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000001 +loss: 0.181324, lagrangian_loss: -0.000174, attention_score_distillation_loss: 0.000259 +loss: 0.001722, lagrangian_loss: -0.003707, attention_score_distillation_loss: 0.000256 +---------------------------------------------------------------------- +time: 2023-07-19 15:37:20 +Evaluating: f1: 0.8667, eval_loss: 0.6744, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5862, expected_sparsity: 0.584, expected_sequence_sparsity: 0.83, target_sparsity: 0.5845, step: 15050 +lambda_1: -2.1984, lambda_2: 62.1787 lambda_3: 0.0000 +train remain: [0.98 0.99 0.68 0.62 0.63 0.46 0.53 0.26 0.12 0.17] +infer remain: [1.0, 1.0, 0.66, 0.62, 0.62, 0.46, 0.52, 0.26, 0.12, 0.16] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.66, 0.41, 0.25, 0.12, 0.06, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +11000101000011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000000 +loss: 0.002957, lagrangian_loss: -0.003767, attention_score_distillation_loss: 0.000253 +ETA: 0:38:41 | Epoch 130 finished. Took 33.06 seconds. 
+loss: 0.000799, lagrangian_loss: -0.002619, attention_score_distillation_loss: 0.000250 +---------------------------------------------------------------------- +time: 2023-07-19 15:37:35 +Evaluating: f1: 0.88, eval_loss: 0.6488, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5862, expected_sparsity: 0.584, expected_sequence_sparsity: 0.83, target_sparsity: 0.5864, step: 15100 +lambda_1: -1.8176, lambda_2: 62.3793 lambda_3: 0.0000 +train remain: [0.98 0.99 0.67 0.62 0.63 0.46 0.53 0.25 0.12 0.16] +infer remain: [1.0, 1.0, 0.66, 0.62, 0.62, 0.46, 0.52, 0.26, 0.12, 0.16] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.66, 0.41, 0.25, 0.12, 0.06, 0.02, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000101000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000000 +loss: 0.004209, lagrangian_loss: -0.004118, attention_score_distillation_loss: 0.000247 +loss: 0.001818, lagrangian_loss: -0.003735, attention_score_distillation_loss: 0.000245 +---------------------------------------------------------------------- +time: 2023-07-19 15:37:49 +Evaluating: f1: 0.8702, eval_loss: 0.718, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5878, expected_sparsity: 0.5841, expected_sequence_sparsity: 0.83, target_sparsity: 0.5884, step: 15150 +lambda_1: -1.3515, lambda_2: 62.6702 lambda_3: 0.0000 +train remain: [0.98 0.99 0.67 0.62 0.63 0.46 0.52 0.25 0.12 0.16] +infer remain: [1.0, 1.0, 0.66, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.16] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.66, 0.41, 0.25, 0.12, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111111011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000000 +loss: 0.005516, lagrangian_loss: -0.003339, attention_score_distillation_loss: 0.000242 +loss: 0.001899, lagrangian_loss: -0.002277, attention_score_distillation_loss: 0.000239 +ETA: 0:38:07 | Epoch 131 finished. Took 33.21 seconds. 
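----------------------------------------------------------------------
Note: with an evaluation every 50 steps, blocks like the ones above accumulate by the hundreds, but the one-line "Evaluating:" format is regular enough to scrape for a sparsity-versus-F1 curve. A minimal parser keyed to that format; the pattern and field choice are mine, not part of the training tooling.

import re

EVAL_RE = re.compile(
    r"Evaluating: f1: ([\d.]+), eval_loss: ([\d.]+).*?"
    r"expected_sparsity: ([\d.]+).*?step: (\d+)"
)

def parse_evals(log_text):
    # Yields (step, f1, eval_loss, expected_sparsity) per evaluation line.
    for m in EVAL_RE.finditer(log_text):
        f1, loss, sparsity, step = m.groups()
        yield int(step), float(f1), float(loss), float(sparsity)

sample = ("Evaluating: f1: 0.8824, eval_loss: 0.6687, token_prune_loc: "
          "[False, False, True, True, True, True, True, True, True, True], "
          "macs_sparsity: 0.5925, expected_sparsity: 0.5879, "
          "expected_sequence_sparsity: 0.8316, target_sparsity: 0.5922, "
          "step: 15250")
print(list(parse_evals(sample)))  # [(15250, 0.8824, 0.6687, 0.5879)]
----------------------------------------------------------------------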
+---------------------------------------------------------------------- +time: 2023-07-19 15:38:04 +Evaluating: f1: 0.8652, eval_loss: 0.6903, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5878, expected_sparsity: 0.5841, expected_sequence_sparsity: 0.83, target_sparsity: 0.5903, step: 15200 +lambda_1: -0.9081, lambda_2: 62.9441 lambda_3: 0.0000 +train remain: [0.98 0.98 0.67 0.62 0.62 0.46 0.52 0.25 0.12 0.16] +infer remain: [1.0, 1.0, 0.66, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.16] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.66, 0.41, 0.25, 0.12, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110111110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000000 +loss: 0.004155, lagrangian_loss: -0.000606, attention_score_distillation_loss: 0.000236 +loss: 0.002629, lagrangian_loss: -0.000497, attention_score_distillation_loss: 0.000233 +---------------------------------------------------------------------- +time: 2023-07-19 15:38:19 +Evaluating: f1: 0.8824, eval_loss: 0.6687, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5925, expected_sparsity: 0.5879, expected_sequence_sparsity: 0.8316, target_sparsity: 0.5922, step: 15250 +lambda_1: -0.6723, lambda_2: 63.0385 lambda_3: 0.0000 +train remain: [0.98 0.98 0.66 0.62 0.62 0.46 0.52 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.16] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.4, 0.25, 0.11, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10010000010100010001000000010000000000000100000000 +loss: 0.110417, lagrangian_loss: -0.000627, attention_score_distillation_loss: 0.000230 +loss: 0.002001, lagrangian_loss: 0.000122, attention_score_distillation_loss: 0.000228 +ETA: 0:37:33 | Epoch 132 finished. Took 33.24 seconds. 
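+
+The lambda_1/lambda_2 values and the sign flips of lagrangian_loss fit a
+CoFi-style constrained objective: a penalty that is linear plus quadratic in
+the gap between the batch's expected sparsity s and the current target t,
+with the multipliers trained adversarially against the model. A minimal
+sketch of the assumed form (not dumped verbatim by this log):
+
+    def lagrangian_penalty(s, t, lambda_1, lambda_2):
+        gap = s - t                    # > 0 once sparsity overshoots the target
+        return lambda_1 * gap + lambda_2 * gap ** 2
+
+Whenever lambda_1 and the gap have opposite signs, the linear term can pull
+the penalty below zero, which is why lagrangian_loss oscillates around 0 in
+the surrounding steps.
+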
+---------------------------------------------------------------------- +time: 2023-07-19 15:38:33 +Evaluating: f1: 0.8744, eval_loss: 0.7117, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5925, expected_sparsity: 0.5879, expected_sequence_sparsity: 0.8316, target_sparsity: 0.5942, step: 15300 +lambda_1: -0.6379, lambda_2: 63.0638 lambda_3: 0.0000 +train remain: [0.98 0.98 0.66 0.62 0.62 0.46 0.52 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.4, 0.25, 0.11, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.002437, lagrangian_loss: 0.000388, attention_score_distillation_loss: 0.000225 +loss: 0.003287, lagrangian_loss: 0.000607, attention_score_distillation_loss: 0.000222 +---------------------------------------------------------------------- +time: 2023-07-19 15:38:48 +Evaluating: f1: 0.8723, eval_loss: 0.6464, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5925, expected_sparsity: 0.5879, expected_sequence_sparsity: 0.8316, target_sparsity: 0.5961, step: 15350 +lambda_1: -0.7981, lambda_2: 63.1259 lambda_3: 0.0000 +train remain: [0.98 0.98 0.66 0.62 0.62 0.46 0.52 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.4, 0.25, 0.11, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.002151, lagrangian_loss: 0.001998, attention_score_distillation_loss: 0.000219 +loss: 0.177561, lagrangian_loss: 0.001195, attention_score_distillation_loss: 0.000216 +---------------------------------------------------------------------- +time: 2023-07-19 15:39:03 +Evaluating: f1: 0.8737, eval_loss: 0.6766, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5925, expected_sparsity: 0.5879, expected_sequence_sparsity: 0.8316, target_sparsity: 0.5981, step: 15400 +lambda_1: -1.0835, lambda_2: 63.2559 lambda_3: 0.0000 +train remain: [0.98 0.98 0.66 0.61 0.62 0.45 0.51 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.62, 0.62, 0.46, 0.52, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.4, 0.25, 0.11, 0.06, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111111111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000100 
+11111111111111111100110110010000000000000000000000 +11111111011111111111001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.001902, lagrangian_loss: 0.002498, attention_score_distillation_loss: 0.000213 +ETA: 0:37:00 | Epoch 133 finished. Took 35.29 seconds. +loss: 0.001281, lagrangian_loss: 0.001589, attention_score_distillation_loss: 0.000211 +---------------------------------------------------------------------- +time: 2023-07-19 15:39:18 +Evaluating: f1: 0.8646, eval_loss: 0.6855, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5987, expected_sparsity: 0.5914, expected_sequence_sparsity: 0.833, target_sparsity: 0.6, step: 15450 +lambda_1: -1.3411, lambda_2: 63.3609 lambda_3: 0.0000 +train remain: [0.98 0.97 0.65 0.61 0.61 0.45 0.51 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.6, 0.6, 0.46, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.38, 0.23, 0.11, 0.05, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000100 +11111111111111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101001011000000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.002222, lagrangian_loss: -0.000537, attention_score_distillation_loss: 0.000207 +loss: 0.138751, lagrangian_loss: 0.001609, attention_score_distillation_loss: 0.000205 +---------------------------------------------------------------------- +time: 2023-07-19 15:39:32 +Evaluating: f1: 0.8714, eval_loss: 0.6712, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5987, expected_sparsity: 0.5914, expected_sequence_sparsity: 0.833, target_sparsity: 0.602, step: 15500 +lambda_1: -1.5837, lambda_2: 63.4781 lambda_3: 0.0000 +train remain: [0.98 0.97 0.65 0.61 0.6 0.45 0.51 0.25 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.6, 0.6, 0.46, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.38, 0.23, 0.11, 0.05, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000000 +11111111111111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.005439, lagrangian_loss: 0.003906, attention_score_distillation_loss: 0.000202 +ETA: 0:36:26 | Epoch 134 finished. Took 33.19 seconds. 
+loss: 0.002967, lagrangian_loss: 0.003280, attention_score_distillation_loss: 0.000199 +---------------------------------------------------------------------- +time: 2023-07-19 15:39:47 +Evaluating: f1: 0.8705, eval_loss: 0.6847, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.5987, expected_sparsity: 0.592, expected_sequence_sparsity: 0.8333, target_sparsity: 0.6039, step: 15550 +lambda_1: -2.1133, lambda_2: 63.8635 lambda_3: 0.0000 +train remain: [0.98 0.96 0.65 0.61 0.6 0.45 0.51 0.24 0.12 0.15] +infer remain: [1.0, 1.0, 0.64, 0.6, 0.6, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.64, 0.38, 0.23, 0.1, 0.05, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111111101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000000000000000000000000000001 +10000000010100010001000000010000000000000100000000 +loss: 0.004646, lagrangian_loss: 0.003113, attention_score_distillation_loss: 0.000196 +loss: 0.000978, lagrangian_loss: -0.002311, attention_score_distillation_loss: 0.000193 +---------------------------------------------------------------------- +time: 2023-07-19 15:40:02 +Evaluating: f1: 0.8724, eval_loss: 0.6839, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.6019, expected_sparsity: 0.5967, expected_sequence_sparsity: 0.8352, target_sparsity: 0.6058, step: 15600 +lambda_1: -2.1185, lambda_2: 63.9727 lambda_3: 0.0000 +train remain: [0.98 0.95 0.64 0.61 0.59 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 1.0, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.62, 0.37, 0.22, 0.09, 0.05, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101010011000000000000000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.002437, lagrangian_loss: -0.003967, attention_score_distillation_loss: 0.000190 +loss: 0.086966, lagrangian_loss: -0.005129, attention_score_distillation_loss: 0.000187 +ETA: 0:35:53 | Epoch 135 finished. Took 33.21 seconds. 
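+
+target_sparsity ramps linearly toward the final 0.67 over the Lagrangian
+warmup. Assuming ~115 optimizer steps per epoch (3,668 MRPC train pairs at
+batch size 32; the log does not print this), 150 warmup epochs give ~17,250
+steps, which matches the printed targets to within one unit in the last
+digit (the script appears to floor rather than round):
+
+    WARMUP_STEPS = 150 * 115           # lagrangian_warmup_epochs * steps/epoch
+    def target_sparsity(step, final=0.67):
+        return min(step / WARMUP_STEPS, 1.0) * final
+    print(round(target_sparsity(15600), 4))  # ~0.6059 (logged: 0.6058)
+    # capped at 0.67 from ~step 17250, i.e. epoch ~150, as seen further down
+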
+---------------------------------------------------------------------- +time: 2023-07-19 15:40:16 +Evaluating: f1: 0.8858, eval_loss: 0.6451, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6078, step: 15650 +lambda_1: -1.3891, lambda_2: 64.7355 lambda_3: 0.0000 +train remain: [0.98 0.93 0.64 0.6 0.59 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111111111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.003580, lagrangian_loss: -0.004472, attention_score_distillation_loss: 0.000184 +loss: 0.143183, lagrangian_loss: -0.002385, attention_score_distillation_loss: 0.000181 +---------------------------------------------------------------------- +time: 2023-07-19 15:40:31 +Evaluating: f1: 0.8691, eval_loss: 0.7763, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6097, step: 15700 +lambda_1: -0.2457, lambda_2: 66.3016 lambda_3: 0.0000 +train remain: [0.98 0.92 0.64 0.6 0.58 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111111111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000000000000000000000000000001 +10000000010100010001000000010000000000000100000000 +loss: 0.158118, lagrangian_loss: 0.000288, attention_score_distillation_loss: 0.000179 +loss: 0.001747, lagrangian_loss: 0.001045, attention_score_distillation_loss: 0.000176 +---------------------------------------------------------------------- +time: 2023-07-19 15:40:46 +Evaluating: f1: 0.8468, eval_loss: 0.7195, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6117, step: 15750 +lambda_1: 0.3923, lambda_2: 66.8617 lambda_3: 0.0000 +train remain: [0.98 0.93 0.64 0.61 0.59 0.45 0.5 0.24 0.12 0.15] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111111111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 
+11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000000000000000000000000000001 +10000000010100010001000000010000000000000100000000 +loss: 0.174319, lagrangian_loss: 0.000165, attention_score_distillation_loss: 0.000173 +ETA: 0:35:20 | Epoch 136 finished. Took 35.36 seconds. +loss: 0.003573, lagrangian_loss: -0.000495, attention_score_distillation_loss: 0.000170 +---------------------------------------------------------------------- +time: 2023-07-19 15:41:01 +Evaluating: f1: 0.8387, eval_loss: 0.77, token_prune_loc: [False, False, True, True, True, True, True, True, True, True], macs_sparsity: 0.6003, expected_sparsity: 0.5961, expected_sequence_sparsity: 0.8349, target_sparsity: 0.6136, step: 15800 +lambda_1: 0.2059, lambda_2: 67.0296 lambda_3: 0.0000 +train remain: [0.98 0.93 0.64 0.61 0.59 0.45 0.5 0.24 0.12 0.15] +infer remain: [1.0, 1.0, 0.62, 0.6, 0.58, 0.46, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 1.0, 0.62, 0.37, 0.22, 0.1, 0.05, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111111111111111111111111111111111111111111111111 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111111111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000000000000000000000000000001 +10000000010100010001000000010000000000000100000000 +loss: 0.002947, lagrangian_loss: -0.000102, attention_score_distillation_loss: 0.000167 +loss: 0.001858, lagrangian_loss: 0.000816, attention_score_distillation_loss: 0.000164 +---------------------------------------------------------------------- +time: 2023-07-19 15:41:15 +Evaluating: f1: 0.8612, eval_loss: 0.7308, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6155, step: 15850 +lambda_1: -0.5346, lambda_2: 67.7248 lambda_3: 0.0000 +train remain: [0.98 0.93 0.64 0.61 0.59 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111011111111101111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.000858, lagrangian_loss: 0.001974, attention_score_distillation_loss: 0.000161 +ETA: 0:34:46 | Epoch 137 finished. Took 33.15 seconds. 
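+
+token_prune_loc flips to True for a location once its deterministic mask
+drops at least one bin; note how location 1 toggles True -> False -> True
+between steps 15650 and 15850 (macs_sparsity jumping 0.6303 -> 0.6003 ->
+0.6303 with it) while its gates hover at the keep threshold. At test time
+the stochastic L0 gates are collapsed deterministically; a sketch of the
+standard hard-concrete test-time gate (Louizos et al., 2018; gamma/zeta are
+the usual stretch limits, assumed rather than logged):
+
+    import torch
+    def deterministic_gate(log_alpha, gamma=-0.1, zeta=1.1):
+        s = torch.sigmoid(log_alpha)            # mean of the concrete dist.
+        return (s * (zeta - gamma) + gamma).clamp(0.0, 1.0)
+
+A bin is kept when its clamped gate is nonzero (the 0/1 rows above); "train
+remain" instead averages the soft gates, which is why it is fractional
+(e.g. 0.98) while "infer remain" moves in steps of 0.02.
+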
+loss: 0.001490, lagrangian_loss: 0.001547, attention_score_distillation_loss: 0.000158 +---------------------------------------------------------------------- +time: 2023-07-19 15:41:30 +Evaluating: f1: 0.862, eval_loss: 0.7697, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6175, step: 15900 +lambda_1: -1.1512, lambda_2: 68.2349 lambda_3: 0.0000 +train remain: [0.98 0.92 0.64 0.6 0.58 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111011111111101111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000100000000 +loss: 0.001991, lagrangian_loss: 0.003390, attention_score_distillation_loss: 0.000156 +loss: 0.008554, lagrangian_loss: 0.001382, attention_score_distillation_loss: 0.000153 +---------------------------------------------------------------------- +time: 2023-07-19 15:41:45 +Evaluating: f1: 0.8673, eval_loss: 0.7456, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6303, expected_sparsity: 0.6238, expected_sequence_sparsity: 0.8463, target_sparsity: 0.6194, step: 15950 +lambda_1: -1.4117, lambda_2: 68.3691 lambda_3: 0.0000 +train remain: [0.97 0.9 0.63 0.6 0.58 0.45 0.5 0.24 0.12 0.14] +infer remain: [1.0, 0.86, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.86, 0.53, 0.32, 0.19, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +11111011111111101111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.009650, lagrangian_loss: -0.000511, attention_score_distillation_loss: 0.000150 +loss: 0.008759, lagrangian_loss: -0.001755, attention_score_distillation_loss: 0.000147 +ETA: 0:34:12 | Epoch 138 finished. Took 33.1 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:41:59 +Evaluating: f1: 0.8676, eval_loss: 0.7333, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6318, expected_sparsity: 0.6276, expected_sequence_sparsity: 0.8479, target_sparsity: 0.6214, step: 16000 +lambda_1: -1.2698, lambda_2: 68.4294 lambda_3: 0.0000 +train remain: [0.97 0.89 0.63 0.6 0.58 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.62, 0.6, 0.58, 0.44, 0.5, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.52, 0.31, 0.18, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111011111000110100110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111011111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.001650, lagrangian_loss: -0.000942, attention_score_distillation_loss: 0.000144 +loss: 0.003177, lagrangian_loss: -0.001426, attention_score_distillation_loss: 0.000141 +---------------------------------------------------------------------- +time: 2023-07-19 15:42:14 +Evaluating: f1: 0.8731, eval_loss: 0.7526, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6318, expected_sparsity: 0.6278, expected_sequence_sparsity: 0.8479, target_sparsity: 0.6233, step: 16050 +lambda_1: -0.8697, lambda_2: 68.6640 lambda_3: 0.0000 +train remain: [0.97 0.88 0.63 0.6 0.57 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.62, 0.6, 0.58, 0.44, 0.48, 0.24, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.52, 0.31, 0.18, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11111111110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.006447, lagrangian_loss: -0.001072, attention_score_distillation_loss: 0.000139 +loss: 0.005200, lagrangian_loss: -0.000518, attention_score_distillation_loss: 0.000136 +---------------------------------------------------------------------- +time: 2023-07-19 15:42:29 +Evaluating: f1: 0.871, eval_loss: 0.7727, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6318, expected_sparsity: 0.6279, expected_sequence_sparsity: 0.848, target_sparsity: 0.6253, step: 16100 +lambda_1: -0.5283, lambda_2: 68.8386 lambda_3: 0.0000 +train remain: [0.97 0.88 0.63 0.6 0.57 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.62, 0.6, 0.58, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.52, 0.31, 0.18, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111111111111111110101100100000000000000000 
+11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +ETA: 0:33:39 | Epoch 139 finished. Took 35.42 seconds. +loss: 0.004265, lagrangian_loss: -0.000541, attention_score_distillation_loss: 0.000133 +loss: 0.006619, lagrangian_loss: -0.000205, attention_score_distillation_loss: 0.000130 +---------------------------------------------------------------------- +time: 2023-07-19 15:42:44 +Evaluating: f1: 0.8847, eval_loss: 0.6917, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6334, expected_sparsity: 0.6287, expected_sequence_sparsity: 0.8483, target_sparsity: 0.6272, step: 16150 +lambda_1: -0.4700, lambda_2: 68.8875 lambda_3: 0.0000 +train remain: [0.97 0.88 0.62 0.6 0.57 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.62, 0.6, 0.56, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.52, 0.31, 0.17, 0.08, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110101010010000 +11111111111011111101111101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.003459, lagrangian_loss: 0.000171, attention_score_distillation_loss: 0.000127 +loss: 0.004751, lagrangian_loss: 0.000414, attention_score_distillation_loss: 0.000124 +---------------------------------------------------------------------- +time: 2023-07-19 15:42:58 +Evaluating: f1: 0.866, eval_loss: 0.7306, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6381, expected_sparsity: 0.6317, expected_sequence_sparsity: 0.8495, target_sparsity: 0.6291, step: 16200 +lambda_1: -0.6775, lambda_2: 68.9718 lambda_3: 0.0000 +train remain: [0.97 0.88 0.62 0.6 0.57 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.6, 0.6, 0.56, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.5, 0.3, 0.17, 0.07, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110100010010000 +11111111111111111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.127728, lagrangian_loss: 0.000942, attention_score_distillation_loss: 0.000121 +ETA: 0:33:05 | Epoch 140 finished. Took 33.09 seconds. 
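+
+attention_score_distillation_loss decays almost perfectly linearly here
+(~3e-6 per 25-step logging interval), suggesting a linearly annealed weight
+on the attention-distillation term rather than a shrinking raw loss. The
+term itself presumably matches student attention scores to the frozen MRPC
+teacher over surviving tokens; one plausible form (hypothetical, not
+confirmed by the log):
+
+    import torch.nn.functional as F
+    def attn_score_distill(student_attn, teacher_attn, keep_mask):
+        # attn tensors: [batch, heads, seq, seq]; keep_mask: [batch, seq]
+        m = keep_mask[:, None, None, :]
+        return F.mse_loss(student_attn * m, teacher_attn * m)
+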
+loss: 0.002692, lagrangian_loss: 0.000891, attention_score_distillation_loss: 0.000118 +---------------------------------------------------------------------- +time: 2023-07-19 15:43:13 +Evaluating: f1: 0.8542, eval_loss: 0.7302, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6381, expected_sparsity: 0.6317, expected_sequence_sparsity: 0.8495, target_sparsity: 0.6311, step: 16250 +lambda_1: -1.0198, lambda_2: 69.1458 lambda_3: 0.0000 +train remain: [0.97 0.87 0.62 0.6 0.57 0.45 0.49 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.6, 0.6, 0.56, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.5, 0.3, 0.17, 0.07, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110100010010000 +11111111111111111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.002633, lagrangian_loss: 0.002175, attention_score_distillation_loss: 0.000116 +loss: 0.004739, lagrangian_loss: 0.003634, attention_score_distillation_loss: 0.000113 +---------------------------------------------------------------------- +time: 2023-07-19 15:43:28 +Evaluating: f1: 0.8609, eval_loss: 0.8503, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6381, expected_sparsity: 0.6317, expected_sequence_sparsity: 0.8495, target_sparsity: 0.633, step: 16300 +lambda_1: -1.3202, lambda_2: 69.2947 lambda_3: 0.0000 +train remain: [0.97 0.87 0.62 0.59 0.57 0.45 0.48 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.6, 0.6, 0.56, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.5, 0.3, 0.17, 0.07, 0.04, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110100010010000 +11111111111111111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100010001000000010000000000000000000001 +loss: 0.002589, lagrangian_loss: 0.000621, attention_score_distillation_loss: 0.000110 +loss: 0.134506, lagrangian_loss: 0.001687, attention_score_distillation_loss: 0.000107 +ETA: 0:32:31 | Epoch 141 finished. Took 33.39 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:43:42 +Evaluating: f1: 0.8581, eval_loss: 0.7425, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6397, expected_sparsity: 0.6333, expected_sequence_sparsity: 0.8502, target_sparsity: 0.635, step: 16350 +lambda_1: -1.5862, lambda_2: 69.4057 lambda_3: 0.0000 +train remain: [0.97 0.86 0.61 0.59 0.57 0.44 0.48 0.23 0.12 0.14] +infer remain: [1.0, 0.84, 0.6, 0.58, 0.56, 0.44, 0.48, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.84, 0.5, 0.29, 0.16, 0.07, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111111101110110 +11111111101011110111001111000110101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.008445, lagrangian_loss: 0.001599, attention_score_distillation_loss: 0.000104 +loss: 0.003550, lagrangian_loss: 0.002469, attention_score_distillation_loss: 0.000101 +---------------------------------------------------------------------- +time: 2023-07-19 15:43:57 +Evaluating: f1: 0.8774, eval_loss: 0.7076, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6428, expected_sparsity: 0.6372, expected_sequence_sparsity: 0.8518, target_sparsity: 0.6369, step: 16400 +lambda_1: -1.7893, lambda_2: 69.4857 lambda_3: 0.0000 +train remain: [0.97 0.85 0.61 0.59 0.56 0.44 0.47 0.22 0.12 0.14] +infer remain: [1.0, 0.82, 0.6, 0.58, 0.56, 0.44, 0.46, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.49, 0.29, 0.16, 0.07, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000110101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +11111111101111111100110110010000000000000000000000 +11111111010111111011001010110100010000000000000000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.001532, lagrangian_loss: 0.001442, attention_score_distillation_loss: 0.000098 +loss: 0.002115, lagrangian_loss: 0.001714, attention_score_distillation_loss: 0.000095 +ETA: 0:31:58 | Epoch 142 finished. Took 33.14 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:44:12 +Evaluating: f1: 0.8645, eval_loss: 0.7318, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6475, expected_sparsity: 0.64, expected_sequence_sparsity: 0.8529, target_sparsity: 0.6388, step: 16450 +lambda_1: -1.8637, lambda_2: 69.5243 lambda_3: 0.0000 +train remain: [0.97 0.85 0.6 0.59 0.56 0.43 0.47 0.22 0.12 0.14] +infer remain: [1.0, 0.82, 0.58, 0.58, 0.56, 0.44, 0.46, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.48, 0.28, 0.15, 0.07, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000110100110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111111111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.009147, lagrangian_loss: 0.001093, attention_score_distillation_loss: 0.000092 +loss: 0.002512, lagrangian_loss: -0.000366, attention_score_distillation_loss: 0.000089 +---------------------------------------------------------------------- +time: 2023-07-19 15:44:27 +Evaluating: f1: 0.8788, eval_loss: 0.7173, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6475, expected_sparsity: 0.6404, expected_sequence_sparsity: 0.8531, target_sparsity: 0.6408, step: 16500 +lambda_1: -1.8103, lambda_2: 69.5543 lambda_3: 0.0000 +train remain: [0.97 0.84 0.6 0.59 0.56 0.43 0.46 0.21 0.12 0.14] +infer remain: [1.0, 0.82, 0.58, 0.58, 0.56, 0.42, 0.46, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.48, 0.28, 0.15, 0.06, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000010101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111101111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101111110010000001000000000000000000000000000000 +10000101000011000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.002435, lagrangian_loss: -0.001553, attention_score_distillation_loss: 0.000087 +loss: 0.002631, lagrangian_loss: -0.002602, attention_score_distillation_loss: 0.000084 +---------------------------------------------------------------------- +time: 2023-07-19 15:44:41 +Evaluating: f1: 0.8866, eval_loss: 0.6696, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6475, expected_sparsity: 0.6405, expected_sequence_sparsity: 0.8531, target_sparsity: 0.6427, step: 16550 +lambda_1: -1.5954, lambda_2: 69.6327 lambda_3: 0.0000 +train remain: [0.96 0.84 0.59 0.58 0.56 0.43 0.46 0.21 0.12 0.14] +infer remain: [1.0, 0.82, 0.58, 0.58, 0.56, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.48, 0.28, 0.15, 0.06, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000010101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 
+10111111101111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101011110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.002116, lagrangian_loss: -0.001460, attention_score_distillation_loss: 0.000081 +ETA: 0:31:25 | Epoch 143 finished. Took 35.46 seconds. +loss: 0.145425, lagrangian_loss: -0.001636, attention_score_distillation_loss: 0.000078 +---------------------------------------------------------------------- +time: 2023-07-19 15:44:56 +Evaluating: f1: 0.8773, eval_loss: 0.7223, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6475, expected_sparsity: 0.6405, expected_sequence_sparsity: 0.8531, target_sparsity: 0.6447, step: 16600 +lambda_1: -1.3609, lambda_2: 69.7219 lambda_3: 0.0000 +train remain: [0.97 0.83 0.59 0.58 0.56 0.43 0.46 0.21 0.12 0.14] +infer remain: [1.0, 0.82, 0.58, 0.58, 0.56, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.48, 0.28, 0.15, 0.06, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000010101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111101111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101011110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.007107, lagrangian_loss: -0.000407, attention_score_distillation_loss: 0.000075 +loss: 0.009919, lagrangian_loss: -0.000167, attention_score_distillation_loss: 0.000072 +---------------------------------------------------------------------- +time: 2023-07-19 15:45:11 +Evaluating: f1: 0.8741, eval_loss: 0.7097, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6475, expected_sparsity: 0.6405, expected_sequence_sparsity: 0.8531, target_sparsity: 0.6466, step: 16650 +lambda_1: -1.3337, lambda_2: 69.7619 lambda_3: 0.0000 +train remain: [0.96 0.83 0.58 0.58 0.56 0.42 0.46 0.21 0.12 0.14] +infer remain: [1.0, 0.82, 0.58, 0.58, 0.56, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.82, 0.48, 0.28, 0.15, 0.06, 0.03, 0.01, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101111111111101111110111101101110110 +11111111101011110111001111000010101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111101111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101011110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.002916, lagrangian_loss: 0.001448, attention_score_distillation_loss: 0.000069 +ETA: 0:30:51 | Epoch 144 finished. Took 33.3 seconds. 
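+
+The f1 printed by each "Evaluating:" line is the standard GLUE/MRPC dev
+metric: F1 of the positive (paraphrase) class. For reference, assuming
+sklearn and hypothetical label/prediction arrays:
+
+    from sklearn.metrics import f1_score
+    labels = [1, 0, 1, 1]
+    preds  = [1, 0, 0, 1]
+    print(f1_score(labels, preds))   # ~0.8, positive-class F1
+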
+loss: 0.002908, lagrangian_loss: 0.002041, attention_score_distillation_loss: 0.000066 +---------------------------------------------------------------------- +time: 2023-07-19 15:45:26 +Evaluating: f1: 0.8605, eval_loss: 0.8001, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6665, expected_sparsity: 0.6616, expected_sequence_sparsity: 0.8618, target_sparsity: 0.6486, step: 16700 +lambda_1: -1.6311, lambda_2: 69.9012 lambda_3: 0.0000 +train remain: [0.96 0.83 0.58 0.58 0.55 0.42 0.46 0.21 0.12 0.14] +infer remain: [0.92, 0.82, 0.56, 0.58, 0.56, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.75, 0.42, 0.25, 0.14, 0.06, 0.03, 0.01, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101111111111101111110111101101110110 +11111111101011110111001011000010101110100010010000 +11111111111011111101110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111101111111100110110010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101011110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.003135, lagrangian_loss: -0.000663, attention_score_distillation_loss: 0.000064 +loss: 0.006193, lagrangian_loss: 0.002916, attention_score_distillation_loss: 0.000061 +---------------------------------------------------------------------- +time: 2023-07-19 15:45:41 +Evaluating: f1: 0.8735, eval_loss: 0.7033, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6696, expected_sparsity: 0.663, expected_sequence_sparsity: 0.8623, target_sparsity: 0.6505, step: 16750 +lambda_1: -2.0119, lambda_2: 70.0971 lambda_3: 0.0000 +train remain: [0.96 0.83 0.58 0.57 0.55 0.42 0.46 0.2 0.12 0.14] +infer remain: [0.92, 0.82, 0.56, 0.56, 0.56, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.75, 0.42, 0.24, 0.13, 0.06, 0.03, 0.01, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101111111111101111110111101101110110 +11111111101011110111001011000010101110100010010000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110101100100000000000000000 +10111111111111111100110010010000000000000000000000 +10111111010111111011001010110100010000000000100000 +11101011110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.002854, lagrangian_loss: 0.005571, attention_score_distillation_loss: 0.000058 +loss: 0.001962, lagrangian_loss: 0.001002, attention_score_distillation_loss: 0.000055 +ETA: 0:30:17 | Epoch 145 finished. Took 33.34 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:45:55 +Evaluating: f1: 0.8744, eval_loss: 0.7185, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6712, expected_sparsity: 0.6636, expected_sequence_sparsity: 0.8626, target_sparsity: 0.6524, step: 16800 +lambda_1: -2.4114, lambda_2: 70.3093 lambda_3: 0.0000 +train remain: [0.96 0.83 0.57 0.57 0.55 0.42 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.82, 0.56, 0.56, 0.54, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.75, 0.42, 0.24, 0.13, 0.05, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101111111111101111110111101101110110 +11111111101011110111001011000010101110100010010000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111111100110010010000000000000000000000 +10111111010111111011001010110100010000000000100000 +10101111110010000001000000000000000000000000000000 +10000101000011001000000000000000000000000000000000 +10000000010100000001000000010000000000000000000011 +loss: 0.004987, lagrangian_loss: -0.000376, attention_score_distillation_loss: 0.000052 +loss: 0.001900, lagrangian_loss: 0.001524, attention_score_distillation_loss: 0.000049 +---------------------------------------------------------------------- +time: 2023-07-19 15:46:10 +Evaluating: f1: 0.8611, eval_loss: 0.7567, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6759, expected_sparsity: 0.6693, expected_sequence_sparsity: 0.8649, target_sparsity: 0.6544, step: 16850 +lambda_1: -2.6790, lambda_2: 70.4488 lambda_3: 0.0000 +train remain: [0.96 0.82 0.56 0.57 0.55 0.42 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.8, 0.54, 0.56, 0.54, 0.42, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.74, 0.4, 0.22, 0.12, 0.05, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101101111111101111110111101101110110 +11111111101011110111001011000010100110100010010000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111111100110010010000000000000000000000 +10111111010111111011001010110100010000000000100000 +10101111110010000001000000000000000000000000000000 +10000101000011000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000011 +loss: 0.168615, lagrangian_loss: 0.002915, attention_score_distillation_loss: 0.000046 +loss: 0.003455, lagrangian_loss: 0.006691, attention_score_distillation_loss: 0.000044 +---------------------------------------------------------------------- +time: 2023-07-19 15:46:25 +Evaluating: f1: 0.8744, eval_loss: 0.6847, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6759, expected_sparsity: 0.6693, expected_sequence_sparsity: 0.8649, target_sparsity: 0.6563, step: 16900 +lambda_1: -3.1740, lambda_2: 70.7482 lambda_3: 0.0000 +train remain: [0.96 0.82 0.55 0.56 0.55 0.41 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.8, 0.54, 0.56, 0.54, 0.42, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.74, 0.4, 0.22, 0.12, 0.05, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101101111111101111110111101101110110 +11111111101011110111001011000010100110100010010000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110001100100000000000000000 
+10111111111111111100110010010000000000000000000000 +10111111010111111011001010110100010000000000100000 +10101111110010000001000000000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +loss: 0.133191, lagrangian_loss: 0.006953, attention_score_distillation_loss: 0.000041 +ETA: 0:29:44 | Epoch 146 finished. Took 35.35 seconds. +loss: 0.003346, lagrangian_loss: 0.006039, attention_score_distillation_loss: 0.000038 +---------------------------------------------------------------------- +time: 2023-07-19 15:46:39 +Evaluating: f1: 0.867, eval_loss: 0.7245, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6759, expected_sparsity: 0.6696, expected_sequence_sparsity: 0.8651, target_sparsity: 0.6583, step: 16950 +lambda_1: -3.6650, lambda_2: 71.0466 lambda_3: 0.0000 +train remain: [0.96 0.82 0.55 0.56 0.54 0.41 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.8, 0.54, 0.56, 0.54, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.74, 0.4, 0.22, 0.12, 0.05, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111111111111110111111111111010 +10111011111111101101111111101111110111101101110110 +11111111101011110111001011000010100110100010010000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101111110010000001000000000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +loss: 0.127968, lagrangian_loss: 0.005726, attention_score_distillation_loss: 0.000035 +loss: 0.001936, lagrangian_loss: 0.009225, attention_score_distillation_loss: 0.000032 +Starting saving the best from epoch 147 and step 17000 +---------------------------------------------------------------------- +time: 2023-07-19 15:46:54 +Evaluating: f1: 0.8714, eval_loss: 0.7322, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.679, expected_sparsity: 0.672, expected_sequence_sparsity: 0.8661, target_sparsity: 0.6602, step: 17000 +lambda_1: -4.1869, lambda_2: 71.3869 lambda_3: 0.0000 +train remain: [0.96 0.82 0.54 0.56 0.54 0.41 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.8, 0.52, 0.56, 0.54, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.74, 0.38, 0.21, 0.12, 0.05, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111101101110110 +11111111101011110111001011000010100110100010000000 +11111111111011111100110101010011011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101111110010000001000000000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Saving the best model so far: [Epoch 147 | Step: 17000 | MACs sparsity: 0.679 | Score: 0.8714 | Loss: 0.7322] +loss: 0.133940, lagrangian_loss: 0.004829, attention_score_distillation_loss: 0.000029 +ETA: 0:29:15 | Epoch 147 finished. Took 47.63 seconds. 
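+
+"Starting saving the best from epoch 147 and step 17000": checkpoint
+selection is gated until the model is essentially at target sparsity, so a
+dense early model with higher f1 can never be picked as "best". A minimal
+sketch of the implied bookkeeping (save_model is a hypothetical stand-in):
+
+    best = {"score": float("-inf"), "step": None}
+    def maybe_save_best(step, score, start_step=17000):
+        if step >= start_step and score > best["score"]:
+            best.update(score=score, step=step)
+            # save_model(...)  # elided; the log then prints "Saving the best..."
+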
+loss: 0.002726, lagrangian_loss: 0.007066, attention_score_distillation_loss: 0.000026 +---------------------------------------------------------------------- +time: 2023-07-19 15:47:23 +Evaluating: f1: 0.8627, eval_loss: 0.6787, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6821, expected_sparsity: 0.6763, expected_sequence_sparsity: 0.8678, target_sparsity: 0.6622, step: 17050 +lambda_1: -4.5306, lambda_2: 71.5690 lambda_3: 0.0000 +train remain: [0.95 0.81 0.53 0.55 0.54 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.78, 0.52, 0.54, 0.54, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.37, 0.2, 0.11, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100010000000 +11111111111011111100110101010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101111110010000001000000000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8714 @ step 17000 epoch 147.83 +loss: 0.014005, lagrangian_loss: -0.000084, attention_score_distillation_loss: 0.000023 +loss: 0.006553, lagrangian_loss: 0.008311, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:47:38 +Evaluating: f1: 0.8735, eval_loss: 0.6959, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6821, expected_sparsity: 0.6763, expected_sequence_sparsity: 0.8678, target_sparsity: 0.6641, step: 17100 +lambda_1: -4.6151, lambda_2: 71.6265 lambda_3: 0.0000 +train remain: [0.95 0.81 0.52 0.55 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.78, 0.52, 0.54, 0.54, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.37, 0.2, 0.11, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100010000000 +11111111111011111100110101010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8714 @ step 17000 epoch 147.83 +Saving the best model so far: [Epoch 148 | Step: 17100 | MACs sparsity: 0.6821 | Score: 0.8735 | Loss: 0.6959] +loss: 0.001852, lagrangian_loss: 0.001555, attention_score_distillation_loss: 0.000020 +loss: 0.007361, lagrangian_loss: -0.006592, attention_score_distillation_loss: 0.000020 +ETA: 0:28:47 | Epoch 148 finished. Took 50.88 seconds. 
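+
+From this point on attention_score_distillation_loss sits at a constant
+0.000020: extrapolating the earlier linear decay (~0.002 at step 0, reaching
+zero near step ~17,250) suggests the annealed weight is clamped to a small
+floor instead of vanishing -- an inference from the numbers, not a logged
+statement:
+
+    def annealed_weight(step, init=2.0e-3, floor=2.0e-5, total=17250):
+        # hypothetical schedule fitted to the printed values, e.g.
+        # step 16550 -> ~8.1e-5 (logged 0.000084), step 17100+ -> floor
+        return max(init * (1.0 - step / total), floor)
+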
+---------------------------------------------------------------------- +time: 2023-07-19 15:48:10 +Evaluating: f1: 0.8616, eval_loss: 0.7365, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6853, expected_sparsity: 0.6791, expected_sequence_sparsity: 0.869, target_sparsity: 0.666, step: 17150 +lambda_1: -4.3411, lambda_2: 71.7736 lambda_3: 0.0000 +train remain: [0.95 0.8 0.52 0.55 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.78, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.36, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.004291, lagrangian_loss: -0.008991, attention_score_distillation_loss: 0.000020 +loss: 0.005514, lagrangian_loss: -0.004422, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:48:25 +Evaluating: f1: 0.8646, eval_loss: 0.7136, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6853, expected_sparsity: 0.6791, expected_sequence_sparsity: 0.869, target_sparsity: 0.668, step: 17200 +lambda_1: -3.8956, lambda_2: 72.0295 lambda_3: 0.0000 +train remain: [0.95 0.8 0.51 0.54 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.78, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.36, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.006193, lagrangian_loss: -0.009504, attention_score_distillation_loss: 0.000020 +loss: 0.008559, lagrangian_loss: -0.007060, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:48:40 +Evaluating: f1: 0.8684, eval_loss: 0.7344, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.6699, step: 17250 +lambda_1: -3.3490, lambda_2: 72.4049 lambda_3: 0.0000 +train remain: [0.94 0.79 0.51 0.54 0.52 0.39 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +ETA: 0:28:14 | Epoch 149 finished. Took 35.62 seconds. +loss: 0.002547, lagrangian_loss: -0.006992, attention_score_distillation_loss: 0.000020 +loss: 0.015886, lagrangian_loss: -0.008724, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:48:55 +Evaluating: f1: 0.8699, eval_loss: 0.7249, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 17300 +lambda_1: -2.6269, lambda_2: 73.0800 lambda_3: 0.0000 +train remain: [0.94 0.78 0.5 0.54 0.52 0.39 0.46 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.002372, lagrangian_loss: -0.006692, attention_score_distillation_loss: 0.000020 +loss: 0.007142, lagrangian_loss: -0.008612, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:49:10 +Evaluating: f1: 0.864, eval_loss: 0.7346, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 17350 +lambda_1: -1.6432, lambda_2: 74.2868 lambda_3: 0.0000 +train remain: [0.94 0.78 0.5 0.54 0.52 0.39 0.46 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.003114, lagrangian_loss: -0.006976, attention_score_distillation_loss: 0.000020 +ETA: 0:27:40 | Epoch 150 finished. Took 33.54 seconds. 
+loss: 0.010309, lagrangian_loss: -0.002986, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:49:24 +Evaluating: f1: 0.865, eval_loss: 0.7636, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6846, expected_sequence_sparsity: 0.8712, target_sparsity: 0.67, step: 17400 +lambda_1: -0.5653, lambda_2: 75.6840 lambda_3: 0.0000 +train remain: [0.94 0.78 0.5 0.54 0.52 0.39 0.46 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000011 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.003043, lagrangian_loss: -0.000798, attention_score_distillation_loss: 0.000020 +loss: 0.003158, lagrangian_loss: 0.001665, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:49:39 +Evaluating: f1: 0.8678, eval_loss: 0.7542, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 17450 +lambda_1: 0.3908, lambda_2: 76.8025 lambda_3: 0.0000 +train remain: [0.94 0.78 0.5 0.54 0.52 0.39 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.022671, lagrangian_loss: 0.002830, attention_score_distillation_loss: 0.000020 +loss: 0.073721, lagrangian_loss: 0.003918, attention_score_distillation_loss: 0.000020 +ETA: 0:27:06 | Epoch 151 finished. Took 33.32 seconds. 
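The "layerwise remain" vector is consistent with a running product of "infer remain": a token alive at layer i must have been kept at every earlier prune location, and layers 0-1 sit outside prune_location=[2, ..., 11], hence the two leading 1.0 entries. A quick reconstruction with the step-17300 values (reading convention assumed):

    infer_remain = [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12]
    layerwise = [1.0, 1.0]  # layers 0 and 1 are never pruned
    acc = 1.0
    for r in infer_remain:
        acc *= r
        layerwise.append(round(acc, 2))
    print(layerwise)
    # [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]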
+---------------------------------------------------------------------- +time: 2023-07-19 15:49:54 +Evaluating: f1: 0.8697, eval_loss: 0.7745, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 17500 +lambda_1: 1.1375, lambda_2: 77.5334 lambda_3: 0.0000 +train remain: [0.94 0.79 0.5 0.54 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.003659, lagrangian_loss: 0.003443, attention_score_distillation_loss: 0.000020 +loss: 0.002455, lagrangian_loss: 0.002364, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:50:09 +Evaluating: f1: 0.8616, eval_loss: 0.7418, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6816, expected_sequence_sparsity: 0.87, target_sparsity: 0.67, step: 17550 +lambda_1: 1.5023, lambda_2: 77.7559 lambda_3: 0.0000 +train remain: [0.95 0.79 0.51 0.55 0.53 0.4 0.47 0.21 0.13 0.14] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.54, 0.4, 0.46, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000010000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.140542, lagrangian_loss: -0.000501, attention_score_distillation_loss: 0.000020 +loss: 0.005515, lagrangian_loss: -0.003270, attention_score_distillation_loss: 0.000020 +ETA: 0:26:32 | Epoch 152 finished. Took 33.34 seconds. 
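Each eval block prints ten 50-character rows: one row per entry of prune_location and, reading the row width against bin_num=50, one character per length bin, with 1 meaning tokens falling in that bin are kept (a reading aid; the bin semantics are inferred, not documented in the log). The fraction of 1s in a row reproduces the matching "infer remain" entry, e.g. for the first row of the step-17500 block:

    row = "01111111111111011111111111111111111111111111111010"
    print(len(row), row.count("1") / len(row))  # 50 0.92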
+---------------------------------------------------------------------- +time: 2023-07-19 15:50:24 +Evaluating: f1: 0.8699, eval_loss: 0.7265, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6853, expected_sparsity: 0.6785, expected_sequence_sparsity: 0.8687, target_sparsity: 0.67, step: 17600 +lambda_1: 1.3356, lambda_2: 77.8588 lambda_3: 0.0000 +train remain: [0.95 0.8 0.52 0.55 0.54 0.41 0.48 0.23 0.14 0.15] +infer remain: [0.92, 0.78, 0.5, 0.54, 0.54, 0.4, 0.48, 0.22, 0.14, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.36, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100110000000000000001 +10001011110010000001000000000000000010000000010001 +10000101000010000000000000000000100000000000000011 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.004088, lagrangian_loss: -0.002679, attention_score_distillation_loss: 0.000020 +loss: 0.003141, lagrangian_loss: -0.002640, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:50:38 +Evaluating: f1: 0.8714, eval_loss: 0.7308, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6821, expected_sparsity: 0.6761, expected_sequence_sparsity: 0.8677, target_sparsity: 0.67, step: 17650 +lambda_1: 0.7073, lambda_2: 78.3977 lambda_3: 0.0000 +train remain: [0.95 0.8 0.52 0.55 0.54 0.41 0.48 0.24 0.15 0.16] +infer remain: [0.92, 0.78, 0.52, 0.54, 0.54, 0.4, 0.48, 0.24, 0.14, 0.16] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.37, 0.2, 0.11, 0.04, 0.02, 0.01, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100010000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100110000000000000001 +10001011110010010001000000000000000010000000010001 +10000101000010000000000000000000100000000000000011 +10000000000100000001000000010000000000000000010111 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +loss: 0.002142, lagrangian_loss: -0.001575, attention_score_distillation_loss: 0.000020 +loss: 0.003520, lagrangian_loss: 0.000979, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:50:53 +Evaluating: f1: 0.8821, eval_loss: 0.6836, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6853, expected_sparsity: 0.6785, expected_sequence_sparsity: 0.8687, target_sparsity: 0.67, step: 17700 +lambda_1: -0.2270, lambda_2: 79.5347 lambda_3: 0.0000 +train remain: [0.95 0.8 0.52 0.55 0.54 0.41 0.48 0.23 0.14 0.15] +infer remain: [0.92, 0.78, 0.5, 0.54, 0.54, 0.4, 0.48, 0.22, 0.14, 0.16] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.36, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100110000000000000001 +10001011110010000001000000000000000010000000010001 +10000101000010000000000000000000100000000000000011 +10000000010100000001000000010000000000000000010011 +Best eval score so far: 0.8735 @ step 17100 epoch 148.70 +Saving the best model so far: [Epoch 153 | Step: 17700 | MACs sparsity: 0.6853 | Score: 0.8821 | Loss: 0.6836] +loss: 0.146984, lagrangian_loss: 0.002908, attention_score_distillation_loss: 0.000020 +ETA: 0:26:02 | Epoch 153 finished. Took 47.88 seconds. +loss: 0.122997, lagrangian_loss: 0.002678, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:51:20 +Evaluating: f1: 0.8744, eval_loss: 0.7094, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6853, expected_sparsity: 0.6786, expected_sequence_sparsity: 0.8688, target_sparsity: 0.67, step: 17750 +lambda_1: -0.9625, lambda_2: 80.2731 lambda_3: 0.0000 +train remain: [0.95 0.79 0.52 0.55 0.53 0.4 0.47 0.22 0.13 0.14] +infer remain: [0.92, 0.78, 0.5, 0.54, 0.54, 0.4, 0.46, 0.22, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.72, 0.36, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101101111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111111110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000010000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.005594, lagrangian_loss: 0.005160, attention_score_distillation_loss: 0.000020 +loss: 0.003260, lagrangian_loss: 0.004361, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:51:35 +Evaluating: f1: 0.867, eval_loss: 0.7239, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 17800 +lambda_1: -1.3775, lambda_2: 80.5314 lambda_3: 0.0000 +train remain: [0.95 0.79 0.51 0.55 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002171, lagrangian_loss: 0.002651, attention_score_distillation_loss: 0.000020 +ETA: 0:25:28 | Epoch 154 finished. Took 33.14 seconds. 
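The "Saving the best model so far" line fires when the eval score beats the running best (0.8735 -> 0.8821 here), and the f1 of 0.8981 at step 0 was never recorded as best, so the tracker apparently only engages once the lagrangian warmup has pushed the model near target_sparsity. A hedged sketch of that bookkeeping; the gating condition is an assumption:

    best_score = float("-inf")

    def maybe_save_best(score, macs_sparsity, target_sparsity, save_fn):
        # Track the best eval score, ignoring pre-target checkpoints (assumed gate).
        global best_score
        if macs_sparsity >= target_sparsity and score > best_score:
            best_score = score
            save_fn()

The longer wall time for the saving epoch (47.88 s against the usual ~33 s) is consistent with checkpoint I/O.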
+loss: 0.002710, lagrangian_loss: 0.001142, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:51:50 +Evaluating: f1: 0.8632, eval_loss: 0.7154, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 17850 +lambda_1: -1.5787, lambda_2: 80.6145 lambda_3: 0.0000 +train remain: [0.95 0.79 0.51 0.54 0.53 0.4 0.46 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.006560, lagrangian_loss: -0.000010, attention_score_distillation_loss: 0.000020 +loss: 0.005817, lagrangian_loss: -0.000090, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:52:04 +Evaluating: f1: 0.8621, eval_loss: 0.7683, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 17900 +lambda_1: -1.5033, lambda_2: 80.6592 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.4 0.46 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.005282, lagrangian_loss: -0.002634, attention_score_distillation_loss: 0.000020 +loss: 0.004886, lagrangian_loss: -0.001457, attention_score_distillation_loss: 0.000020 +ETA: 0:24:54 | Epoch 155 finished. Took 33.15 seconds. 
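The step/epoch bookkeeping in lines like "Best eval score so far: 0.8821 @ step 17700 epoch 153.91" checks out against the dataset size: MRPC has 3,668 training pairs, so at per_device_train_batch_size=32 one epoch is ceil(3668 / 32) = 115 optimizer steps, and 17700 / 115 = 153.91.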
+---------------------------------------------------------------------- +time: 2023-07-19 15:52:19 +Evaluating: f1: 0.87, eval_loss: 0.6913, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6844, expected_sequence_sparsity: 0.8711, target_sparsity: 0.67, step: 17950 +lambda_1: -1.2109, lambda_2: 80.7915 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.4 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101001110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.003436, lagrangian_loss: -0.002359, attention_score_distillation_loss: 0.000020 +loss: 0.004639, lagrangian_loss: -0.002327, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:52:34 +Evaluating: f1: 0.869, eval_loss: 0.7136, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6844, expected_sequence_sparsity: 0.8711, target_sparsity: 0.67, step: 18000 +lambda_1: -0.7878, lambda_2: 81.0432 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101001110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.001982, lagrangian_loss: -0.001456, attention_score_distillation_loss: 0.000020 +loss: 0.003408, lagrangian_loss: -0.000841, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:52:49 +Evaluating: f1: 0.8681, eval_loss: 0.7329, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6844, expected_sequence_sparsity: 0.8711, target_sparsity: 0.67, step: 18050 +lambda_1: -0.2910, lambda_2: 81.3843 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101001110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002550, lagrangian_loss: -0.000235, attention_score_distillation_loss: 0.000020 +ETA: 0:24:20 | Epoch 156 finished. Took 35.5 seconds. +loss: 0.003243, lagrangian_loss: 0.000125, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:53:03 +Evaluating: f1: 0.8694, eval_loss: 0.7256, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6844, expected_sequence_sparsity: 0.8711, target_sparsity: 0.67, step: 18100 +lambda_1: 0.1170, lambda_2: 81.6176 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.4, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101001110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002923, lagrangian_loss: 0.000694, attention_score_distillation_loss: 0.000020 +loss: 0.003525, lagrangian_loss: 0.000997, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:53:18 +Evaluating: f1: 0.8776, eval_loss: 0.7111, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6844, expected_sequence_sparsity: 0.8711, target_sparsity: 0.67, step: 18150 +lambda_1: 0.4285, lambda_2: 81.7666 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101001110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.003168, lagrangian_loss: -0.000075, attention_score_distillation_loss: 0.000020 +ETA: 0:23:46 | Epoch 157 finished. Took 33.38 seconds. 
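"train remain" is fractional (0.95, 0.78, ...) while "infer remain" moves in steps of 0.02 = 1/50: the former reads as the expected value of the stochastic gates during training, the latter as their deterministic binarization over the 50 bins at eval time. A sketch using the standard hard-concrete gate (Louizos et al.) that the droprate_init and temperature arguments point to; the stretch limits and the 0.5 threshold are assumptions, not read from this repo:

    import math

    L_LIMIT, R_LIMIT = -0.1, 1.1  # assumed stretch interval
    BETA = 0.6666666666666666     # temperature from the run arguments

    def train_gate(log_alpha):
        # P(gate > 0) under the hard-concrete distribution -> fractional remain
        return 1.0 / (1.0 + math.exp(-(log_alpha - BETA * math.log(-L_LIMIT / R_LIMIT))))

    def infer_gate(log_alpha):
        # deterministic eval-time binarization -> the 0/1 rows above
        s = 1.0 / (1.0 + math.exp(-log_alpha))
        return 1.0 if s * (R_LIMIT - L_LIMIT) + L_LIMIT > 0.5 else 0.0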
+loss: 0.004004, lagrangian_loss: 0.001173, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:53:33 +Evaluating: f1: 0.8721, eval_loss: 0.7514, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6822, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 18200 +lambda_1: 0.5869, lambda_2: 81.8274 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.53 0.4 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002964, lagrangian_loss: -0.000306, attention_score_distillation_loss: 0.000020 +loss: 0.001471, lagrangian_loss: -0.000497, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:53:48 +Evaluating: f1: 0.8473, eval_loss: 0.7448, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6821, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 18250 +lambda_1: 0.5351, lambda_2: 81.8591 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.4 0.46 0.21 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002930, lagrangian_loss: -0.000823, attention_score_distillation_loss: 0.000020 +loss: 0.002938, lagrangian_loss: -0.000081, attention_score_distillation_loss: 0.000020 +ETA: 0:23:12 | Epoch 158 finished. Took 33.31 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:54:02 +Evaluating: f1: 0.8643, eval_loss: 0.7991, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6821, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 18300 +lambda_1: 0.3330, lambda_2: 81.9338 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.4 0.46 0.21 0.12 0.14] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.001908, lagrangian_loss: -0.000282, attention_score_distillation_loss: 0.000020 +loss: 0.006030, lagrangian_loss: -0.000017, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:54:17 +Evaluating: f1: 0.8586, eval_loss: 0.7456, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6821, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 18350 +lambda_1: 0.0030, lambda_2: 82.0949 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.4 0.46 0.21 0.12 0.14] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002210, lagrangian_loss: 0.000419, attention_score_distillation_loss: 0.000020 +loss: 0.001725, lagrangian_loss: 0.000949, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:54:32 +Evaluating: f1: 0.8723, eval_loss: 0.7942, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6821, expected_sequence_sparsity: 0.8702, target_sparsity: 0.67, step: 18400 +lambda_1: -0.4295, lambda_2: 82.3513 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.4 0.46 0.21 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.4, 0.46, 0.2, 0.12, 0.14] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101100110010010000000000000000000000 +10111111010111111011001010110100010000000000000001 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000011 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +ETA: 0:22:38 | Epoch 159 finished. Took 35.6 seconds. +loss: 0.002749, lagrangian_loss: 0.000648, attention_score_distillation_loss: 0.000020 +loss: 0.004844, lagrangian_loss: 0.000975, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:54:47 +Evaluating: f1: 0.8601, eval_loss: 0.7596, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 18450 +lambda_1: -0.7523, lambda_2: 82.5068 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.52 0.39 0.46 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.001489, lagrangian_loss: 0.000047, attention_score_distillation_loss: 0.000020 +loss: 0.003326, lagrangian_loss: -0.000387, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:55:02 +Evaluating: f1: 0.8596, eval_loss: 0.7291, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8712, target_sparsity: 0.67, step: 18500 +lambda_1: -0.8178, lambda_2: 82.5507 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.46, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.009153, lagrangian_loss: -0.000661, attention_score_distillation_loss: 0.000020 +ETA: 0:22:04 | Epoch 160 finished. Took 33.31 seconds. 
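Over steps 18200-18500 lambda_1 slides from +0.59 to -0.82 while the eval-time expected_sparsity stays pinned near the 0.67 target: the multipliers are trained adversarially, taking gradient-ascent steps on the same penalty the main parameters descend (at reg_learning_rate=0.01), so their drift keeps changing direction as the batch-level sparsity gap crosses zero. A schematic update, with optimizer details and sign conventions assumed:

    def step_multipliers(lambda_1, lambda_2, batch_expected_sparsity,
                         target_sparsity, reg_lr=0.01):
        # Ascent on the penalty the main parameters descend; illustrative only.
        gap = batch_expected_sparsity - target_sparsity
        return lambda_1 + reg_lr * gap, lambda_2 + reg_lr * gap * gap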
+loss: 0.002924, lagrangian_loss: -0.000367, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:55:17 +Evaluating: f1: 0.8693, eval_loss: 0.7611, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18550 +lambda_1: -0.7382, lambda_2: 82.5837 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002700, lagrangian_loss: -0.000655, attention_score_distillation_loss: 0.000020 +loss: 0.051913, lagrangian_loss: -0.000631, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:55:31 +Evaluating: f1: 0.8654, eval_loss: 0.7536, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18600 +lambda_1: -0.5083, lambda_2: 82.6806 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002180, lagrangian_loss: -0.000433, attention_score_distillation_loss: 0.000020 +loss: 0.006298, lagrangian_loss: -0.000154, attention_score_distillation_loss: 0.000020 +ETA: 0:21:30 | Epoch 161 finished. Took 33.22 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:55:46 +Evaluating: f1: 0.8714, eval_loss: 0.7381, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18650 +lambda_1: -0.2051, lambda_2: 82.8126 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.001190, lagrangian_loss: -0.000127, attention_score_distillation_loss: 0.000020 +loss: 0.001193, lagrangian_loss: 0.000084, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:56:00 +Evaluating: f1: 0.8776, eval_loss: 0.724, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18700 +lambda_1: 0.0665, lambda_2: 82.9393 lambda_3: 0.0000 +train remain: [0.95 0.78 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.001821, lagrangian_loss: 0.000202, attention_score_distillation_loss: 0.000020 +loss: 0.002870, lagrangian_loss: 0.000822, attention_score_distillation_loss: 0.000020 +ETA: 0:20:56 | Epoch 162 finished. Took 33.02 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:56:15 +Evaluating: f1: 0.8661, eval_loss: 0.7248, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18750 +lambda_1: 0.2976, lambda_2: 83.0369 lambda_3: 0.0000 +train remain: [0.95 0.79 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002990, lagrangian_loss: -0.000052, attention_score_distillation_loss: 0.000020 +loss: 0.005201, lagrangian_loss: -0.000114, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:56:30 +Evaluating: f1: 0.8582, eval_loss: 0.7291, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 18800 +lambda_1: 0.2959, lambda_2: 83.0881 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.39 0.45 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +loss: 0.002691, lagrangian_loss: -0.000055, attention_score_distillation_loss: 0.000020 +loss: 0.003214, lagrangian_loss: -0.000085, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:56:45 +Evaluating: f1: 0.8908, eval_loss: 0.6478, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 18850 +lambda_1: 0.1293, lambda_2: 83.1625 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.39 0.45 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8821 @ step 17700 epoch 153.91 +Saving the best model so far: [Epoch 163 | Step: 18850 | MACs sparsity: 0.6868 | Score: 0.8908 | Loss: 0.6478] +loss: 0.002464, lagrangian_loss: 0.000052, attention_score_distillation_loss: 0.000020 +ETA: 0:20:24 | Epoch 163 finished. Took 43.92 seconds. +loss: 0.003624, lagrangian_loss: 0.000180, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:57:08 +Evaluating: f1: 0.8606, eval_loss: 0.7187, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6868, expected_sparsity: 0.6824, expected_sequence_sparsity: 0.8703, target_sparsity: 0.67, step: 18900 +lambda_1: -0.1517, lambda_2: 83.2908 lambda_3: 0.0000 +train remain: [0.96 0.79 0.51 0.54 0.53 0.39 0.45 0.2 0.12 0.13] +infer remain: [0.92, 0.76, 0.5, 0.54, 0.52, 0.38, 0.46, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.35, 0.19, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001011000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.004227, lagrangian_loss: 0.000056, attention_score_distillation_loss: 0.000020 +loss: 0.005288, lagrangian_loss: 0.000992, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:57:23 +Evaluating: f1: 0.8816, eval_loss: 0.6763, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 18950 +lambda_1: -0.3686, lambda_2: 83.3826 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000000000000000000000000000001 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.004319, lagrangian_loss: 0.000213, attention_score_distillation_loss: 0.000020 +ETA: 0:19:50 | Epoch 164 finished. Took 33.47 seconds. 
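Outside the pinned attention term, the printed `loss` reflects the distillation setup in the run arguments: do_distill=True against the MRPC teacher, distill_temp=2.0, distill_loss_alpha=0.9, distill_ce_loss_alpha=0.0002. One plausible reading of those weights, as a sketch only; how the repo actually combines the terms is not visible in the log:

    import torch.nn.functional as F

    def distillation_loss(student_logits, teacher_logits, labels,
                          temp=2.0, alpha_kd=0.9, alpha_ce=0.0002):
        # Temperature-scaled KL to the teacher plus a small hard-label CE term;
        # the mapping of the two alphas onto the terms is an assumption.
        kd = F.kl_div(F.log_softmax(student_logits / temp, dim=-1),
                      F.softmax(teacher_logits / temp, dim=-1),
                      reduction="batchmean") * temp * temp
        ce = F.cross_entropy(student_logits, labels)
        return alpha_kd * kd + alpha_ce * ce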
+loss: 0.003914, lagrangian_loss: -0.000035, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:57:38 +Evaluating: f1: 0.8778, eval_loss: 0.7053, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19000 +lambda_1: -0.4478, lambda_2: 83.4331 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111011111111111111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003418, lagrangian_loss: -0.000358, attention_score_distillation_loss: 0.000020 +loss: 0.049512, lagrangian_loss: 0.000256, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:57:52 +Evaluating: f1: 0.8655, eval_loss: 0.7326, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19050 +lambda_1: -0.4498, lambda_2: 83.4764 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.101981, lagrangian_loss: -0.000078, attention_score_distillation_loss: 0.000020 +loss: 0.002131, lagrangian_loss: 0.000408, attention_score_distillation_loss: 0.000020 +ETA: 0:19:16 | Epoch 165 finished. Took 33.38 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:58:07 +Evaluating: f1: 0.8651, eval_loss: 0.7015, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19100 +lambda_1: -0.4295, lambda_2: 83.5153 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10101011110010000001000010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001654, lagrangian_loss: -0.000328, attention_score_distillation_loss: 0.000020 +loss: 0.003756, lagrangian_loss: -0.000298, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:58:22 +Evaluating: f1: 0.8557, eval_loss: 0.7759, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19150 +lambda_1: -0.2891, lambda_2: 83.5810 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000010000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.011739, lagrangian_loss: -0.000212, attention_score_distillation_loss: 0.000020 +loss: 0.003640, lagrangian_loss: -0.000017, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:58:37 +Evaluating: f1: 0.8654, eval_loss: 0.7522, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19200 +lambda_1: -0.0734, lambda_2: 83.6706 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000010000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.118608, lagrangian_loss: 0.000037, attention_score_distillation_loss: 0.000020 +ETA: 0:18:42 | Epoch 166 finished. Took 35.53 seconds. +loss: 0.001860, lagrangian_loss: 0.000055, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:58:51 +Evaluating: f1: 0.8581, eval_loss: 0.7507, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19250 +lambda_1: 0.0298, lambda_2: 83.7407 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000010000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.006087, lagrangian_loss: 0.000041, attention_score_distillation_loss: 0.000020 +loss: 0.003031, lagrangian_loss: 0.000068, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:59:06 +Evaluating: f1: 0.8656, eval_loss: 0.7219, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19300 +lambda_1: 0.1749, lambda_2: 83.8183 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002189, lagrangian_loss: -0.000089, attention_score_distillation_loss: 0.000020 +ETA: 0:18:08 | Epoch 167 finished. Took 33.31 seconds. 
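attention_score_distillation_loss has sat pinned at 0.000020 for this entire stretch, i.e. the student's token-importance ranking matched the teacher's long ago. The startup banner ("NDCG TOPK= 20", topk=20) suggests a ranking-style loss over each layer's top-20 tokens by attention score; a hedged sketch of one such loss, since the repo's actual formulation is not shown in the log:

    import torch
    import torch.nn.functional as F

    def attention_rank_distill(student_scores, teacher_scores, k=20):
        # Match student token-importance scores to the teacher's on the
        # teacher's top-k tokens; scores have shape [batch, seq_len].
        idx = teacher_scores.topk(k, dim=-1).indices
        s = student_scores.gather(-1, idx)
        t = teacher_scores.gather(-1, idx)
        return F.mse_loss(s.softmax(dim=-1), t.softmax(dim=-1))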
+loss: 0.001886, lagrangian_loss: 0.000004, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:59:21 +Evaluating: f1: 0.8611, eval_loss: 0.7705, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19350 +lambda_1: 0.2345, lambda_2: 83.8586 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.004097, lagrangian_loss: -0.000028, attention_score_distillation_loss: 0.000020 +loss: 0.003245, lagrangian_loss: 0.000013, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 15:59:36 +Evaluating: f1: 0.8451, eval_loss: 0.7944, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19400 +lambda_1: 0.1340, lambda_2: 83.9120 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003030, lagrangian_loss: -0.000049, attention_score_distillation_loss: 0.000020 +loss: 0.002807, lagrangian_loss: 0.000085, attention_score_distillation_loss: 0.000020 +ETA: 0:17:34 | Epoch 168 finished. Took 33.11 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 15:59:50 +Evaluating: f1: 0.8357, eval_loss: 0.7928, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19450 +lambda_1: -0.0842, lambda_2: 84.0012 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.45 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.12, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000100000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001248, lagrangian_loss: -0.000016, attention_score_distillation_loss: 0.000020 +loss: 0.002005, lagrangian_loss: 0.000146, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:00:05 +Evaluating: f1: 0.861, eval_loss: 0.7979, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19500 +lambda_1: -0.2886, lambda_2: 84.0951 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002164, lagrangian_loss: 0.000007, attention_score_distillation_loss: 0.000020 +loss: 0.002151, lagrangian_loss: 0.000178, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:00:20 +Evaluating: f1: 0.8557, eval_loss: 0.7901, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19550 +lambda_1: -0.3465, lambda_2: 84.1427 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +ETA: 0:17:00 | Epoch 169 finished. Took 35.25 seconds. +loss: 0.002931, lagrangian_loss: 0.000111, attention_score_distillation_loss: 0.000020 +loss: 0.002651, lagrangian_loss: -0.000178, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:00:34 +Evaluating: f1: 0.8601, eval_loss: 0.7372, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19600 +lambda_1: -0.2344, lambda_2: 84.1923 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002128, lagrangian_loss: -0.000117, attention_score_distillation_loss: 0.000020 +loss: 0.002778, lagrangian_loss: 0.000138, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:00:49 +Evaluating: f1: 0.866, eval_loss: 0.7242, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19650 +lambda_1: -0.0626, lambda_2: 84.2643 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.004632, lagrangian_loss: 0.000105, attention_score_distillation_loss: 0.000020 +ETA: 0:16:26 | Epoch 170 finished. Took 33.1 seconds. 
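+[editor's note] Each block of ten 50-character 0/1 rows above is the hard token-keep mask for the ten pruned layers, one bit per token bin (the rows are 50 bins wide). The per-layer "infer remain" fractions are simply the row means; a short check against the first two rows of the blocks above:
+
+rows = [
+    "01111111111111111111111011111111111111111111111010",  # first pruned layer
+    "10111011111111101100111111101111110111001101110110",  # second pruned layer
+]
+infer_remain = [sum(int(b) for b in r) / len(r) for r in rows]
+print(infer_remain)  # [0.92, 0.76] -- matches "infer remain" logged above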
+loss: 0.001214, lagrangian_loss: 0.000008, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:01:04 +Evaluating: f1: 0.8678, eval_loss: 0.7561, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19700 +lambda_1: 0.0857, lambda_2: 84.3253 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003981, lagrangian_loss: 0.000091, attention_score_distillation_loss: 0.000020 +loss: 0.002792, lagrangian_loss: -0.000035, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:01:18 +Evaluating: f1: 0.8748, eval_loss: 0.735, token_prune_loc: [True, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.69, expected_sparsity: 0.6847, expected_sequence_sparsity: 0.8713, target_sparsity: 0.67, step: 19750 +lambda_1: 0.1134, lambda_2: 84.3761 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [0.92, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 0.92, 0.7, 0.34, 0.18, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0] +01111111111111111111111011111111111111111111111010 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010100001000000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003369, lagrangian_loss: 0.000002, attention_score_distillation_loss: 0.000020 +loss: 0.142612, lagrangian_loss: -0.000025, attention_score_distillation_loss: 0.000020 +ETA: 0:15:52 | Epoch 171 finished. Took 33.19 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 16:01:33 +Evaluating: f1: 0.8707, eval_loss: 0.762, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 19800 +lambda_1: 0.0361, lambda_2: 84.4315 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001810, lagrangian_loss: 0.000016, attention_score_distillation_loss: 0.000020 +loss: 0.001038, lagrangian_loss: 0.000190, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:01:48 +Evaluating: f1: 0.8761, eval_loss: 0.7528, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 19850 +lambda_1: -0.2607, lambda_2: 84.5545 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.009421, lagrangian_loss: -0.000022, attention_score_distillation_loss: 0.000020 +loss: 0.002408, lagrangian_loss: 0.000031, attention_score_distillation_loss: 0.000020 +ETA: 0:15:18 | Epoch 172 finished. Took 33.1 seconds. 
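+[editor's note] "layerwise remain" is, to logging precision, the running product of "infer remain": the fraction of the original sequence still alive entering each block, with leading 1.0 entries for the unpruned front of the network. A sketch under that reading, using the step-19850 values:
+
+import numpy as np
+
+infer_remain = [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+print(np.round(np.cumprod(infer_remain), 2).tolist())
+# [1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+# i.e. the logged "layerwise remain" once its leading 1.0 entries are skipped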
+---------------------------------------------------------------------- +time: 2023-07-19 16:02:02 +Evaluating: f1: 0.8767, eval_loss: 0.7228, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 19900 +lambda_1: -0.3633, lambda_2: 84.6072 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.006374, lagrangian_loss: 0.001063, attention_score_distillation_loss: 0.000020 +loss: 0.002648, lagrangian_loss: -0.000183, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:02:17 +Evaluating: f1: 0.8626, eval_loss: 0.7558, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 19950 +lambda_1: -0.3942, lambda_2: 84.6655 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.008101, lagrangian_loss: 0.000185, attention_score_distillation_loss: 0.000020 +loss: 0.003919, lagrangian_loss: -0.000078, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:02:32 +Evaluating: f1: 0.8832, eval_loss: 0.7133, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20000 +lambda_1: -0.2228, lambda_2: 84.7516 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002097, lagrangian_loss: -0.000058, attention_score_distillation_loss: 0.000020 +ETA: 0:14:44 | Epoch 173 finished. Took 35.46 seconds. +loss: 0.001695, lagrangian_loss: -0.000018, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:02:47 +Evaluating: f1: 0.875, eval_loss: 0.7539, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20050 +lambda_1: -0.0045, lambda_2: 84.8690 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.009993, lagrangian_loss: 0.000005, attention_score_distillation_loss: 0.000020 +loss: 0.001037, lagrangian_loss: 0.000192, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:03:02 +Evaluating: f1: 0.8566, eval_loss: 0.7743, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20100 +lambda_1: 0.2145, lambda_2: 84.9749 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001210, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.000020 +ETA: 0:14:10 | Epoch 174 finished. Took 33.24 seconds. 
+loss: 0.002070, lagrangian_loss: -0.000129, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:03:16 +Evaluating: f1: 0.8456, eval_loss: 0.8236, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20150 +lambda_1: 0.1426, lambda_2: 85.0440 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001194, lagrangian_loss: 0.000153, attention_score_distillation_loss: 0.000020 +loss: 0.001844, lagrangian_loss: -0.000002, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:03:31 +Evaluating: f1: 0.865, eval_loss: 0.7715, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20200 +lambda_1: -0.0640, lambda_2: 85.1440 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002895, lagrangian_loss: 0.000336, attention_score_distillation_loss: 0.000020 +loss: 0.001708, lagrangian_loss: 0.000037, attention_score_distillation_loss: 0.000020 +ETA: 0:13:36 | Epoch 175 finished. Took 33.29 seconds. 
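+[editor's note] "train remain" (soft, e.g. 0.96) and "infer remain" (hard, e.g. 0.92 or 1.0) differ because training scores each token bin with the expected value of a stochastic gate, while inference thresholds a deterministic one. A sketch of the test-time gate, assuming the standard hard-concrete parameterization of L0 regularization (Louizos et al., 2018); the temperature and stretch limits here are assumptions, not values read from the script:
+
+import torch
+
+GAMMA, ZETA, TEMP = -0.1, 1.1, 2 / 3  # assumed stretch limits and temperature
+
+def deterministic_gate(token_loga):
+    # Squash the log-alpha scores, stretch beyond [0, 1], then clip back.
+    s = torch.sigmoid(token_loga / TEMP)
+    return (s * (ZETA - GAMMA) + GAMMA).clamp(0.0, 1.0)
+
+token_loga = torch.randn(10, 50)            # [pruned layers, token bins]
+hard_mask = (deterministic_gate(token_loga) > 0.5).float()
+print(hard_mask.mean(dim=1))                # per-layer "infer remain"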
+---------------------------------------------------------------------- +time: 2023-07-19 16:03:46 +Evaluating: f1: 0.8678, eval_loss: 0.7617, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20250 +lambda_1: -0.3265, lambda_2: 85.2593 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000010000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.004211, lagrangian_loss: 0.000088, attention_score_distillation_loss: 0.000020 +loss: 0.001055, lagrangian_loss: 0.000311, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:04:01 +Evaluating: f1: 0.8727, eval_loss: 0.7556, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20300 +lambda_1: -0.3798, lambda_2: 85.3103 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002133, lagrangian_loss: -0.000364, attention_score_distillation_loss: 0.000020 +loss: 0.195103, lagrangian_loss: 0.000328, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:04:15 +Evaluating: f1: 0.8703, eval_loss: 0.7479, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20350 +lambda_1: -0.2586, lambda_2: 85.3998 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001746, lagrangian_loss: -0.000025, attention_score_distillation_loss: 0.000020 +ETA: 0:13:02 | Epoch 176 finished. Took 35.43 seconds. +loss: 0.006257, lagrangian_loss: -0.000036, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:04:30 +Evaluating: f1: 0.8517, eval_loss: 0.796, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20400 +lambda_1: 0.0371, lambda_2: 85.5357 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010001001000000000000010000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002954, lagrangian_loss: 0.000085, attention_score_distillation_loss: 0.000020 +loss: 0.001349, lagrangian_loss: -0.000006, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:04:45 +Evaluating: f1: 0.8546, eval_loss: 0.762, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20450 +lambda_1: 0.2252, lambda_2: 85.6391 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001010000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002633, lagrangian_loss: -0.000113, attention_score_distillation_loss: 0.000020 +ETA: 0:12:28 | Epoch 177 finished. Took 33.18 seconds. 
+loss: 0.002853, lagrangian_loss: -0.000058, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:05:00 +Evaluating: f1: 0.872, eval_loss: 0.7189, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20500 +lambda_1: 0.1634, lambda_2: 85.7141 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002780, lagrangian_loss: 0.000012, attention_score_distillation_loss: 0.000020 +loss: 0.000718, lagrangian_loss: 0.000004, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:05:14 +Evaluating: f1: 0.8651, eval_loss: 0.7357, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20550 +lambda_1: -0.1070, lambda_2: 85.8416 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001000000000000000000000000010001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002766, lagrangian_loss: 0.000075, attention_score_distillation_loss: 0.000020 +loss: 0.001001, lagrangian_loss: -0.000095, attention_score_distillation_loss: 0.000020 +ETA: 0:11:54 | Epoch 178 finished. Took 33.17 seconds. 
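+[editor's note] attention_score_distillation_loss is essentially constant at 0.000020 through this whole stretch, so it no longer drives the gates. One common form of such a loss is sketched below purely as an assumed illustration; the actual script may use a different functional form (e.g. a ranking objective over teacher attention scores):
+
+import torch
+
+def attention_distill_loss(student_attn, teacher_attn, keep_mask):
+    # MSE between student and teacher attention maps over kept tokens only.
+    # student_attn, teacher_attn: [batch, heads, seq, seq]; keep_mask: [batch, seq]
+    m = keep_mask[:, None, None, :] * keep_mask[:, None, :, None]
+    return ((student_attn - teacher_attn) ** 2 * m).sum() / m.sum().clamp(min=1.0)
+
+s, t = torch.rand(2, 4, 8, 8), torch.rand(2, 4, 8, 8)
+print(attention_distill_loss(s, t, torch.ones(2, 8)))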
+---------------------------------------------------------------------- +time: 2023-07-19 16:05:29 +Evaluating: f1: 0.8581, eval_loss: 0.7623, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20600 +lambda_1: -0.2240, lambda_2: 85.9192 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001010000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001774, lagrangian_loss: -0.000107, attention_score_distillation_loss: 0.000020 +loss: 0.002291, lagrangian_loss: 0.000061, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:05:44 +Evaluating: f1: 0.8616, eval_loss: 0.7231, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20650 +lambda_1: -0.0489, lambda_2: 86.0052 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001654, lagrangian_loss: 0.000014, attention_score_distillation_loss: 0.000020 +loss: 0.004153, lagrangian_loss: 0.000025, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:05:58 +Evaluating: f1: 0.8576, eval_loss: 0.7454, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20700 +lambda_1: 0.1441, lambda_2: 86.1123 lambda_3: 0.0000 +train remain: [0.96 0.78 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +ETA: 0:11:20 | Epoch 179 finished. Took 35.47 seconds. +loss: 0.002002, lagrangian_loss: -0.000043, attention_score_distillation_loss: 0.000020 +loss: 0.001561, lagrangian_loss: -0.000045, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:06:13 +Evaluating: f1: 0.8571, eval_loss: 0.7436, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20750 +lambda_1: 0.1420, lambda_2: 86.1911 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002854, lagrangian_loss: 0.000034, attention_score_distillation_loss: 0.000020 +loss: 0.001173, lagrangian_loss: 0.000132, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:06:28 +Evaluating: f1: 0.87, eval_loss: 0.7184, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20800 +lambda_1: -0.1154, lambda_2: 86.3454 lambda_3: 0.0000 +train remain: [0.96 0.79 0.5 0.54 0.52 0.39 0.44 0.2 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010110100010000000000000000 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.091195, lagrangian_loss: 0.000006, attention_score_distillation_loss: 0.000020 +ETA: 0:10:46 | Epoch 180 finished. Took 33.15 seconds. 
+loss: 0.002847, lagrangian_loss: -0.000003, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:06:42 +Evaluating: f1: 0.8566, eval_loss: 0.7497, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20850 +lambda_1: -0.3265, lambda_2: 86.4612 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.11 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002411, lagrangian_loss: 0.000789, attention_score_distillation_loss: 0.000020 +loss: 0.002621, lagrangian_loss: -0.000074, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:06:57 +Evaluating: f1: 0.8503, eval_loss: 0.8098, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20900 +lambda_1: -0.3795, lambda_2: 86.5651 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001899, lagrangian_loss: -0.000010, attention_score_distillation_loss: 0.000020 +loss: 0.087168, lagrangian_loss: 0.000032, attention_score_distillation_loss: 0.000020 +ETA: 0:10:12 | Epoch 181 finished. Took 33.22 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 16:07:12 +Evaluating: f1: 0.8669, eval_loss: 0.7718, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 20950 +lambda_1: -0.1762, lambda_2: 86.6673 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002364, lagrangian_loss: -0.000024, attention_score_distillation_loss: 0.000020 +loss: 0.001172, lagrangian_loss: -0.000001, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:07:27 +Evaluating: f1: 0.862, eval_loss: 0.7935, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21000 +lambda_1: 0.0363, lambda_2: 86.7993 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000010000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.006221, lagrangian_loss: 0.000035, attention_score_distillation_loss: 0.000020 +loss: 0.001526, lagrangian_loss: -0.000000, attention_score_distillation_loss: 0.000020 +ETA: 0:09:37 | Epoch 182 finished. Took 33.3 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 16:07:41 +Evaluating: f1: 0.8486, eval_loss: 0.7599, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21050 +lambda_1: -0.1781, lambda_2: 86.9663 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000010000000000000000 +10000000010100000001000000010000010000000000000000 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001772, lagrangian_loss: -0.000085, attention_score_distillation_loss: 0.000020 +loss: 0.002683, lagrangian_loss: -0.000009, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:07:56 +Evaluating: f1: 0.8606, eval_loss: 0.7133, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21100 +lambda_1: -0.4115, lambda_2: 87.1168 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000010000000000000000 +10000000010100000001000000010000010000000000000000 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001809, lagrangian_loss: 0.000197, attention_score_distillation_loss: 0.000020 +loss: 0.002033, lagrangian_loss: 0.001337, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:08:11 +Evaluating: f1: 0.8532, eval_loss: 0.7781, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21150 +lambda_1: -0.3903, lambda_2: 87.2219 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000010100000001000000010000010000000000000000 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001898, lagrangian_loss: -0.000286, attention_score_distillation_loss: 0.000020 +ETA: 0:09:04 | Epoch 183 finished. Took 35.31 seconds. +loss: 0.001768, lagrangian_loss: 0.000057, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:08:25 +Evaluating: f1: 0.8487, eval_loss: 0.7804, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 21200 +lambda_1: -0.1513, lambda_2: 87.4056 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +10111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010000001010010000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000010100000001000000010000000000000000000001 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001702, lagrangian_loss: -0.000065, attention_score_distillation_loss: 0.000020 +loss: 0.003401, lagrangian_loss: 0.000128, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:08:40 +Evaluating: f1: 0.8527, eval_loss: 0.768, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21250 +lambda_1: 0.2079, lambda_2: 87.6079 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000000100000001000000010000000000000000000011 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001500, lagrangian_loss: 0.000536, attention_score_distillation_loss: 0.000020 +ETA: 0:08:29 | Epoch 184 finished. Took 33.08 seconds. 
+loss: 0.002227, lagrangian_loss: 0.000407, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:08:55 +Evaluating: f1: 0.8537, eval_loss: 0.7681, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21300 +lambda_1: 0.3653, lambda_2: 87.7662 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.21 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010010100010000000000000001 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000000100000001000000010000000000000000000011 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.002762, lagrangian_loss: -0.000273, attention_score_distillation_loss: 0.000020 +loss: 0.002053, lagrangian_loss: -0.000046, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:09:10 +Evaluating: f1: 0.8566, eval_loss: 0.7426, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21350 +lambda_1: -0.0805, lambda_2: 88.0463 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.21 0.11 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.14] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111010111111011001010010100010000000000000001 +10001011110010100001000000000000000000000000000001 +10000101000010000000000000000000000000000000000001 +10000000000100000001000000010000000000000000010011 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003112, lagrangian_loss: 0.000033, attention_score_distillation_loss: 0.000020 +loss: 0.001553, lagrangian_loss: 0.000171, attention_score_distillation_loss: 0.000020 +ETA: 0:07:55 | Epoch 185 finished. Took 33.12 seconds. 
+---------------------------------------------------------------------- +time: 2023-07-19 16:09:24 +Evaluating: f1: 0.8591, eval_loss: 0.7435, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21400 +lambda_1: -0.4411, lambda_2: 88.2999 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.13] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +11111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010100001010000000000000000000000000000 +10000101000010000000000000000000000000000000000001 +10000000000100000001000000010000000000000000000011 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.001744, lagrangian_loss: -0.000243, attention_score_distillation_loss: 0.000020 +loss: 0.002581, lagrangian_loss: -0.000077, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:09:39 +Evaluating: f1: 0.8531, eval_loss: 0.7636, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 21450 +lambda_1: -0.1828, lambda_2: 88.4583 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.53 0.51 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 +11111111111111111100110001010010011010000000000100 +10111111111101111111110110001100100000000000000000 +10111111111111101000110010010000000000000000000000 +10111111110111111011001010010100010000000000000000 +10001011110010001001010000000000000000000000000000 +10000101000010000000000010000000000000000000000000 +10000000000100000001000000010000000000000000000011 +Best eval score so far: 0.8908 @ step 18850 epoch 163.91 +loss: 0.003321, lagrangian_loss: -0.000007, attention_score_distillation_loss: 0.000020 +loss: 0.001784, lagrangian_loss: 0.000037, attention_score_distillation_loss: 0.000020 +---------------------------------------------------------------------- +time: 2023-07-19 16:09:54 +Evaluating: f1: 0.8537, eval_loss: 0.7538, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21500 +lambda_1: 0.1555, lambda_2: 88.6680 lambda_3: 0.0000 +train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12] +infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12] +layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0] +11111111111111111111111111111111111111111111111111 +10111011111111101100111111101111110111001101110110 +11111111101011110111001001000010100110100000000000 
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001687, lagrangian_loss: 0.000215, attention_score_distillation_loss: 0.000020
+ETA: 0:07:22 | Epoch 186 finished. Took 35.46 seconds.
+loss: 0.001474, lagrangian_loss: 0.000629, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:10:08
+Evaluating: f1: 0.8625, eval_loss: 0.7456, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21550
+lambda_1: 0.3285, lambda_2: 88.8723 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.11 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000001
+10001011110010000001010000000000000000000000000001
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001634, lagrangian_loss: -0.000023, attention_score_distillation_loss: 0.000020
+loss: 0.002179, lagrangian_loss: -0.000101, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:10:23
+Evaluating: f1: 0.8556, eval_loss: 0.741, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21600
+lambda_1: -0.0380, lambda_2: 89.1487 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.21 0.11 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000001
+10001011110010000001000000000000000000000000010001
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.000575, lagrangian_loss: 0.000018, attention_score_distillation_loss: 0.000020
+ETA: 0:06:47 | Epoch 187 finished. Took 33.05 seconds.
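
Two bookkeeping views recur in every block: "train remain" is the expected keep ratio under the stochastic hard-concrete gates, while "infer remain" is the row mean of the deterministic 0/1 masks printed beneath it, which is why the pairs drift slightly apart (0.78 vs 0.76 in the second position here). A sketch of both quantities under the hard-concrete parameterization of Louizos et al.; the stretch limits below are assumed defaults, and the temperature matches the run's arguments:

    import torch

    LIMIT_L, LIMIT_R = -0.1, 1.1   # stretch interval (assumed defaults)
    BETA = 0.6666666666666666      # gate temperature from the run arguments

    def train_remain(token_loga):
        # P(gate != 0) per bin, averaged over the 50 bins of each location.
        p_open = torch.sigmoid(
            token_loga - BETA * torch.log(torch.tensor(-LIMIT_L / LIMIT_R)))
        return p_open.mean(dim=-1)

    def infer_masks(token_loga):
        # Deterministic test-time gate, hard-thresholded into the printed
        # 0/1 rows; the row means give "infer remain".
        s = torch.sigmoid(token_loga) * (LIMIT_R - LIMIT_L) + LIMIT_L
        return (s.clamp(0.0, 1.0) > 0.5).float()

    token_loga = torch.randn(10, 50)   # one row of 50 bin logits per location
    print(train_remain(token_loga))
    print(infer_masks(token_loga).mean(dim=-1))
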
+loss: 0.001805, lagrangian_loss: 0.000483, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:10:38
+Evaluating: f1: 0.8526, eval_loss: 0.7655, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21650
+lambda_1: -0.4347, lambda_2: 89.4338 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010001000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001195, lagrangian_loss: -0.000423, attention_score_distillation_loss: 0.000020
+loss: 0.001808, lagrangian_loss: 0.000390, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:10:52
+Evaluating: f1: 0.8681, eval_loss: 0.7174, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 21700
+lambda_1: -0.1589, lambda_2: 89.6395 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000000000000010000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001363, lagrangian_loss: 0.000064, attention_score_distillation_loss: 0.000020
+loss: 0.003172, lagrangian_loss: 0.000028, attention_score_distillation_loss: 0.000020
+ETA: 0:06:13 | Epoch 188 finished. Took 33.19 seconds.
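
The "layerwise remain" row is fully determined by "infer remain": token pruning is cumulative with depth, so each entry is the running product of the per-location keep ratios, with leading 1.0 entries for the early layers that are never pruned. Reproducing the step-21700 numbers above:

    # infer remain per pruned location, from the step-21700 block above
    infer_remain = [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]

    layerwise = [1.0, 1.0]            # the first two layers are not pruned
    frac = 1.0
    for keep in infer_remain:
        frac *= keep                  # tokens dropped here stay dropped later
        layerwise.append(round(frac, 2))

    print(layerwise)
    # -> [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
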
+----------------------------------------------------------------------
+time: 2023-07-19 16:11:07
+Evaluating: f1: 0.8694, eval_loss: 0.7135, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21750
+lambda_1: 0.2661, lambda_2: 89.9294 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010000000000000000010000000000
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002358, lagrangian_loss: -0.000120, attention_score_distillation_loss: 0.000020
+loss: 0.001429, lagrangian_loss: -0.000066, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:11:22
+Evaluating: f1: 0.8596, eval_loss: 0.747, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21800
+lambda_1: 0.2150, lambda_2: 90.0482 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.21 0.11 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000001
+10001011110010000001000000000000000000000000010001
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001851, lagrangian_loss: -0.000117, attention_score_distillation_loss: 0.000020
+loss: 0.002425, lagrangian_loss: 0.000040, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:11:36
+Evaluating: f1: 0.8679, eval_loss: 0.7305, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 21850
+lambda_1: -0.2101, lambda_2: 90.3547 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.52 0.39 0.44 0.2 0.11 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000001
+10001011110010000001010000000000000000000000000001
+10000101000010000000000000000000000000000000000001
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+ETA: 0:05:39 | Epoch 189 finished. Took 35.33 seconds.
+loss: 0.000704, lagrangian_loss: -0.000092, attention_score_distillation_loss: 0.000020
+loss: 0.000859, lagrangian_loss: 0.000433, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:11:51
+Evaluating: f1: 0.8709, eval_loss: 0.7177, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 21900
+lambda_1: -0.2950, lambda_2: 90.5115 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000000000000010000000000000000
+10000000000100001001000010010000000000000000000000
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001505, lagrangian_loss: -0.000078, attention_score_distillation_loss: 0.000020
+loss: 0.001651, lagrangian_loss: -0.000106, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:12:06
+Evaluating: f1: 0.8679, eval_loss: 0.7343, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 21950
+lambda_1: -0.0568, lambda_2: 90.6472 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000000000000010000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001825, lagrangian_loss: 0.000020, attention_score_distillation_loss: 0.000020
+ETA: 0:05:05 | Epoch 190 finished. Took 33.0 seconds.
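
Each block of ten 50-character rows above is the binarized keep/drop decision over the 50 token bins at the ten pruned locations, and the fraction of ones in a row is exactly that location's "infer remain" entry. A small helper (the function name is hypothetical) for pulling those rows into an array:

    import numpy as np

    def parse_masks(rows):
        """Ten '0'/'1' strings of length 50 -> (10, 50) int array."""
        return np.array([[int(c) for c in row] for row in rows])

    rows = [
        "11111111111111111111111111111111111111111111111111",
        "10111011111111101100111111101111110111001101110110",
    ]   # first two rows of the step-21900 block; append the other eight
    print(parse_masks(rows).mean(axis=1))   # -> [1.   0.76]
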
+loss: 0.001877, lagrangian_loss: -0.000002, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:12:20
+Evaluating: f1: 0.8709, eval_loss: 0.7104, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 22000
+lambda_1: 0.0952, lambda_2: 90.8486 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010000000000010000000000000000
+10000101000010000000000000000000010000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002109, lagrangian_loss: 0.000229, attention_score_distillation_loss: 0.000020
+loss: 0.001293, lagrangian_loss: 0.000091, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:12:35
+Evaluating: f1: 0.8664, eval_loss: 0.748, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 22050
+lambda_1: 0.0097, lambda_2: 90.9873 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010000000000000000000000000001
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002194, lagrangian_loss: -0.000000, attention_score_distillation_loss: 0.000020
+loss: 0.000904, lagrangian_loss: 0.000022, attention_score_distillation_loss: 0.000020
+ETA: 0:04:31 | Epoch 191 finished. Took 33.17 seconds.
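
This run distills from a fine-tuned MRPC teacher, and the attention_score_distillation_loss pinned at 0.000020 for thousands of steps suggests the student's attention patterns locked onto the teacher's long ago. For reference, a sketch of the usual temperature-scaled logit distillation term that a weight like distill_ce_loss_alpha=0.0002 would multiply; how the script actually combines its loss terms is an assumption here:

    import torch
    import torch.nn.functional as F

    def distill_ce_loss(student_logits, teacher_logits, temp=2.0):
        # KL divergence between temperature-softened distributions; the
        # temp**2 factor keeps gradient magnitudes comparable across temps.
        log_p = F.log_softmax(student_logits / temp, dim=-1)
        q = F.softmax(teacher_logits / temp, dim=-1)
        return F.kl_div(log_p, q, reduction="batchmean") * temp ** 2

    student = torch.randn(8, 2)   # toy logits for a 2-class task like MRPC
    teacher = torch.randn(8, 2)
    weighted = 0.0002 * distill_ce_loss(student, teacher)
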
+----------------------------------------------------------------------
+time: 2023-07-19 16:12:50
+Evaluating: f1: 0.8586, eval_loss: 0.7496, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 22100
+lambda_1: -0.2219, lambda_2: 91.1646 lambda_3: 0.0000
+train remain: [0.97 0.78 0.5 0.54 0.51 0.39 0.44 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010000000000010000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000010010000000000000000000001
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002082, lagrangian_loss: 0.000022, attention_score_distillation_loss: 0.000020
+loss: 0.001520, lagrangian_loss: 0.000359, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:13:05
+Evaluating: f1: 0.8532, eval_loss: 0.7638, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22150
+lambda_1: -0.3488, lambda_2: 91.2887 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001827, lagrangian_loss: -0.000175, attention_score_distillation_loss: 0.000020
+loss: 0.001174, lagrangian_loss: -0.000187, attention_score_distillation_loss: 0.000020
+ETA: 0:03:57 | Epoch 192 finished. Took 33.23 seconds.
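
The ETA printed at each epoch boundary behaves like the number of epochs still to run times the recent per-epoch wall time (33-35 s throughout this stretch). An illustrative reconstruction, not the script's actual code:

    import datetime

    def eta(finished_epoch, total_epochs, seconds_per_epoch):
        remaining = (total_epochs - 1 - finished_epoch) * seconds_per_epoch
        return str(datetime.timedelta(seconds=round(remaining)))

    print(eta(192, 200, 33.23))   # '0:03:53', close to the logged 0:03:57
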
+----------------------------------------------------------------------
+time: 2023-07-19 16:13:19
+Evaluating: f1: 0.8566, eval_loss: 0.7657, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6757, expected_sparsity: 0.67, expected_sequence_sparsity: 0.8652, target_sparsity: 0.67, step: 22200
+lambda_1: -0.1045, lambda_2: 91.4392 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.52, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.19, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111110111111011001010010100010000000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001657, lagrangian_loss: -0.000016, attention_score_distillation_loss: 0.000020
+loss: 0.001459, lagrangian_loss: 0.000132, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:13:34
+Evaluating: f1: 0.8551, eval_loss: 0.7698, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22250
+lambda_1: 0.2273, lambda_2: 91.6546 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.44 0.2 0.1 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010001000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002587, lagrangian_loss: -0.000042, attention_score_distillation_loss: 0.000020
+loss: 0.000916, lagrangian_loss: -0.000066, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:13:49
+Evaluating: f1: 0.8611, eval_loss: 0.7683, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6684, expected_sequence_sparsity: 0.8646, target_sparsity: 0.67, step: 22300
+lambda_1: 0.0348, lambda_2: 91.8501 lambda_3: 0.0000
+train remain: [0.97 0.78 0.5 0.54 0.51 0.39 0.44 0.2 0.11 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.52, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+11111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000001
+10001011110010000001010000000000000000000000000001
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001516, lagrangian_loss: 0.000019, attention_score_distillation_loss: 0.000020
+ETA: 0:03:23 | Epoch 193 finished. Took 35.33 seconds.
+loss: 0.001377, lagrangian_loss: 0.000081, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:14:03
+Evaluating: f1: 0.8601, eval_loss: 0.7673, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22350
+lambda_1: -0.2663, lambda_2: 92.1052 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010001000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001168, lagrangian_loss: 0.000611, attention_score_distillation_loss: 0.000020
+loss: 0.001424, lagrangian_loss: -0.000017, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:14:18
+Evaluating: f1: 0.8646, eval_loss: 0.7443, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.669, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22400
+lambda_1: -0.2004, lambda_2: 92.2354 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.000935, lagrangian_loss: -0.000097, attention_score_distillation_loss: 0.000020
+ETA: 0:02:49 | Epoch 194 finished. Took 33.22 seconds.
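
Note how macs_sparsity moves in coarse jumps (0.6726 and 0.6757 in the surrounding blocks) while expected_sparsity moves smoothly: the MACs figure is computed from the discrete inference masks, so removing one more 1/50 bin at a single location shifts the multiply-accumulate count by a quantized amount. A rough sketch of how per-layer kept-sequence lengths could translate into a MACs ratio, assuming standard BERT-base shapes; the script's exact accounting (embeddings, which terms it counts, the reference length) is not shown:

    def layer_macs(seq_len, hidden=768, ffn=3072):
        attn_proj = 4 * seq_len * hidden * hidden   # Q, K, V, output projections
        attn_map = 2 * seq_len * seq_len * hidden   # QK^T and attention @ V
        ffn_macs = 2 * seq_len * hidden * ffn       # up- and down-projection
        return attn_proj + attn_map + ffn_macs

    def macs_sparsity(layerwise_remain, base_len=128):
        dense = len(layerwise_remain) * layer_macs(base_len)
        pruned = sum(layer_macs(base_len * r) for r in layerwise_remain)
        return 1.0 - pruned / dense

    layerwise = [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
    print(round(macs_sparsity(layerwise), 4))   # ~0.63 under these assumptions
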
+loss: 0.004615, lagrangian_loss: -0.000008, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:14:33
+Evaluating: f1: 0.8551, eval_loss: 0.7674, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.669, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22450
+lambda_1: 0.0140, lambda_2: 92.3896 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010000001010010000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.003741, lagrangian_loss: 0.000018, attention_score_distillation_loss: 0.000020
+loss: 0.001238, lagrangian_loss: 0.000010, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:14:47
+Evaluating: f1: 0.8546, eval_loss: 0.7638, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.669, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22500
+lambda_1: 0.0640, lambda_2: 92.5192 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.000877, lagrangian_loss: -0.000011, attention_score_distillation_loss: 0.000020
+loss: 0.001550, lagrangian_loss: -0.000019, attention_score_distillation_loss: 0.000020
+ETA: 0:02:15 | Epoch 195 finished. Took 32.91 seconds.
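
The two multiplier trajectories are worth a glance: lambda_1 wanders in a narrow band around zero while lambda_2 climbs monotonically (roughly 87.8 at step 21300 to 92.5 here). That is the signature of gradient ascent on the penalty at reg_learning_rate=0.01: the gradient with respect to lambda_2 is gap**2 >= 0, so it can only grow. A toy version of the coupled update; the actual optimizer and formula in the script are assumptions:

    import torch

    lambda_1 = torch.zeros(1, requires_grad=True)
    lambda_2 = torch.zeros(1, requires_grad=True)
    opt_reg = torch.optim.SGD([lambda_1, lambda_2], lr=0.01)

    gap = torch.tensor(0.6684 - 0.67)        # expected - target sparsity
    penalty = lambda_1 * gap + lambda_2 * gap ** 2

    opt_reg.zero_grad()
    (-penalty).backward()   # descending -penalty == ascending the penalty
    opt_reg.step()          # lambda_1 += lr * gap; lambda_2 += lr * gap**2
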
+----------------------------------------------------------------------
+time: 2023-07-19 16:15:02
+Evaluating: f1: 0.8571, eval_loss: 0.7567, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6726, expected_sparsity: 0.6689, expected_sequence_sparsity: 0.8648, target_sparsity: 0.67, step: 22550
+lambda_1: 0.0445, lambda_2: 92.6913 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.54 0.51 0.39 0.43 0.2 0.1 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.54, 0.5, 0.38, 0.44, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.2, 0.1, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111111111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010001000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.166347, lagrangian_loss: 0.000017, attention_score_distillation_loss: 0.000020
+loss: 0.001544, lagrangian_loss: 0.000131, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:15:17
+Evaluating: f1: 0.8646, eval_loss: 0.7374, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6757, expected_sparsity: 0.6701, expected_sequence_sparsity: 0.8652, target_sparsity: 0.67, step: 22600
+lambda_1: -0.3134, lambda_2: 92.9561 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.19, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002077, lagrangian_loss: 0.000380, attention_score_distillation_loss: 0.000020
+loss: 0.001404, lagrangian_loss: -0.000018, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:15:31
+Evaluating: f1: 0.8675, eval_loss: 0.7283, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 22650
+lambda_1: -0.3634, lambda_2: 93.1178 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.42 0.2 0.1 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002117, lagrangian_loss: -0.000319, attention_score_distillation_loss: 0.000020
+ETA: 0:01:41 | Epoch 196 finished. Took 35.35 seconds.
+loss: 0.001624, lagrangian_loss: -0.000043, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:15:46
+Evaluating: f1: 0.869, eval_loss: 0.7248, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 22700
+lambda_1: 0.0395, lambda_2: 93.4047 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.42 0.2 0.1 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001322, lagrangian_loss: 0.000101, attention_score_distillation_loss: 0.000020
+loss: 0.001156, lagrangian_loss: 0.000128, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:16:01
+Evaluating: f1: 0.8651, eval_loss: 0.7446, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6757, expected_sparsity: 0.6701, expected_sequence_sparsity: 0.8652, target_sparsity: 0.67, step: 22750
+lambda_1: 0.2353, lambda_2: 93.7072 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.39 0.43 0.2 0.1 0.13]
+infer remain: [1.0, 0.76, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.19, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010000001000000000000000000000000010001
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001924, lagrangian_loss: -0.000144, attention_score_distillation_loss: 0.000020
+ETA: 0:01:07 | Epoch 197 finished. Took 33.01 seconds.
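
For post-hoc analysis it is handy to scrape the "Evaluating:" records back out of this log, for instance to confirm that the 0.8908 at step 18850 quoted in every block really is the run's best F1. A throwaway parser (the file name is hypothetical):

    import re

    pattern = re.compile(r"f1: ([0-9.]+),.*?step: (\d+)")

    points = []
    with open("mrpc-log.txt") as fh:        # hypothetical local copy of this log
        for line in fh:
            if "Evaluating:" in line:
                match = pattern.search(line)
                if match:
                    points.append((int(match.group(2)), float(match.group(1))))

    best_step, best_f1 = max(points, key=lambda p: p[1])
    print(best_step, best_f1)   # expect 18850 0.8908 on the full log
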
+loss: 0.001262, lagrangian_loss: 0.000258, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:16:15
+Evaluating: f1: 0.867, eval_loss: 0.7414, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6757, expected_sparsity: 0.6701, expected_sequence_sparsity: 0.8652, target_sparsity: 0.67, step: 22800
+lambda_1: -0.2827, lambda_2: 94.1122 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.42 0.2 0.1 0.12]
+infer remain: [1.0, 0.76, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.76, 0.36, 0.19, 0.09, 0.04, 0.02, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101111110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.003389, lagrangian_loss: 0.000274, attention_score_distillation_loss: 0.000020
+loss: 0.001050, lagrangian_loss: -0.000213, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:16:30
+Evaluating: f1: 0.8596, eval_loss: 0.7526, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 22850
+lambda_1: -0.2575, lambda_2: 94.2527 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.5 0.38 0.42 0.2 0.09 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002498, lagrangian_loss: -0.000067, attention_score_distillation_loss: 0.000020
+loss: 0.001408, lagrangian_loss: 0.000325, attention_score_distillation_loss: 0.000020
+ETA: 0:00:33 | Epoch 198 finished. Took 33.34 seconds.
+----------------------------------------------------------------------
+time: 2023-07-19 16:16:45
+Evaluating: f1: 0.8596, eval_loss: 0.7494, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 22900
+lambda_1: 0.1379, lambda_2: 94.5999 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.42 0.2 0.1 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.002369, lagrangian_loss: 0.000137, attention_score_distillation_loss: 0.000020
+loss: 0.000617, lagrangian_loss: 0.000096, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:17:00
+Evaluating: f1: 0.8611, eval_loss: 0.7508, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 22950
+lambda_1: 0.0115, lambda_2: 94.8216 lambda_3: 0.0000
+train remain: [0.97 0.78 0.5 0.53 0.51 0.38 0.43 0.2 0.1 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001000000000000000000010000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+loss: 0.001884, lagrangian_loss: 0.000071, attention_score_distillation_loss: 0.000020
+loss: 0.001276, lagrangian_loss: -0.000035, attention_score_distillation_loss: 0.000020
+----------------------------------------------------------------------
+time: 2023-07-19 16:17:14
+Evaluating: f1: 0.8596, eval_loss: 0.7537, token_prune_loc: [False, True, True, True, True, True, True, True, True, True], macs_sparsity: 0.6789, expected_sparsity: 0.6732, expected_sequence_sparsity: 0.8665, target_sparsity: 0.67, step: 23000
+lambda_1: -0.2657, lambda_2: 95.0416 lambda_3: 0.0000
+train remain: [0.97 0.78 0.49 0.53 0.51 0.38 0.42 0.2 0.09 0.12]
+infer remain: [1.0, 0.74, 0.48, 0.52, 0.5, 0.38, 0.42, 0.2, 0.1, 0.12]
+layerwise remain: [1.0, 1.0, 1.0, 0.74, 0.36, 0.18, 0.09, 0.04, 0.01, 0.0, 0.0, 0.0]
+11111111111111111111111111111111111111111111111111
+10111011111111101100111111101011110111001101110110
+11111111101011110111001001000010100110100000000000
+11111111111011111100110001010010011010000000000100
+10111111111101111111110110001100100000000000000000
+10111111111111101000110010010000000000000000000000
+10111111010111111011001010010100010000000000000000
+10001011110010001001010000000000000000000000000000
+10000101000010000000000010000000000000000000000000
+10000000000100000001000000010000000000000000000011
+Best eval score so far: 0.8908 @ step 18850 epoch 163.91
+ETA: 0:00:00 | Epoch 199 finished. Took 35.49 seconds.
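
End of the run: 200 epochs complete, with the last evaluations holding around F1 0.86 at slightly above the 0.67 sparsity target, while the best score seen during training remains the 0.8908 from step 18850. A quick closing comparison using the final three evaluations above:

    final_f1 = [0.8596, 0.8611, 0.8596]   # steps 22900, 22950, 23000
    best_f1 = 0.8908                      # step 18850, epoch 163.91
    avg = sum(final_f1) / len(final_f1)
    print(f"final avg F1 {avg:.4f}, {best_f1 - avg:.4f} below the best eval")
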