- dataset_max_seq_length=1024, dataset_sample_size=1000000, per_device_train_batch_size=16
- dataset_max_seq_length=1024, dataset_sample_size=1000000, per_device_train_batch_size=4
- dataset_max_seq_length=2048, dataset_sample_size=500000, per_device_train_batch_size=4
- dataset_max_seq_length=512, dataset_sample_size=2000000, learning_rate=0.0001, per_device_train_batch_size=16
- dataset_max_seq_length=512, dataset_sample_size=2000000, per_device_train_batch_size=16, warmup_ratio=0.1
- dataset_max_seq_length=512, dataset_sample_size=2000000, per_device_train_batch_size=16
- dataset_max_seq_length=512, dataset_sample_size=2000000, per_device_train_batch_size=4