CUDA: OutOfMemoryError
#33 opened by 8497prashant
Hi, I am trying to fine-tune DNABERT2 and DNABERTS on my data. The dataset has approximately 800k sequences of around 500 nucleotides each, and I am fine-tuning on Kaggle's P100 GPU. Unless I choose a very small subset of the data, I get a CUDA out-of-memory error.
The following error occurs when running trainer.train() on the whole dataset; it processes about 35k sequences before crashing. The training arguments are shown below, followed by the error.
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=3,
weight_decay=0.01,
logging_dir='./logs',
logging_steps=10,
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="accuracy",
gradient_accumulation_steps=4,
fp16=True,
)
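From the traceback below, the failure seems to happen in the evaluation loop while the logits of every eval batch are concatenated on the GPU, not in the training step itself. In case it helps to confirm, this is a sketch of the change I was considering (eval_accumulation_steps and preprocess_logits_for_metrics are existing TrainingArguments/Trainer options; model, tokenizer, the datasets and compute_metrics stand in for my actual objects):

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    fp16=True,
    # ...same remaining arguments as above...
    # Offload accumulated eval predictions to CPU every 16 prediction steps
    # instead of keeping every batch of logits on the GPU until evaluation ends.
    eval_accumulation_steps=16,
)

# Shrink what gets accumulated in the first place: keep only the predicted
# class id per example instead of the full logits tensor.
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):  # some models return (logits, hidden_states, ...)
        logits = logits[0]
    return logits.argmax(dim=-1)

trainer = Trainer(
    model=model,                      # my DNABERT2 sequence-classification model
    args=training_args,
    train_dataset=train_dataset,      # placeholder for my tokenized train split
    eval_dataset=eval_dataset,        # placeholder for my tokenized eval split
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # would then receive class ids, not logits
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

If I understand the Trainer correctly, reducing the logits to class ids means my compute_metrics function would need to expect predictions that are already argmaxed, so please correct me if this is the wrong direction.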
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
<ipython-input-10-9f78b7486d29> in <cell line: 2>()
1 # Step 6: Train the model
----> 2 trainer.train()
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1631 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1632 )
-> 1633 return inner_training_loop(
1634 args=args,
1635 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1992
1993 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 1994 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1995
1996 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2234 )
2235 else:
-> 2236 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2237 self._report_to_hp_search(trial, self.state.global_step, metrics)
2238
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
2930
2931 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 2932 output = eval_loop(
2933 eval_dataloader,
2934 description="Evaluation",
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3138 if self.preprocess_logits_for_metrics is not None:
3139 logits = self.preprocess_logits_for_metrics(logits, labels)
-> 3140 preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
3141 self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
3142
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in <genexpr>(.0)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
--> 115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
116 elif isinstance(tensors, Mapping):
117 return type(tensors)(
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in torch_pad_and_concatenate(tensor1, tensor2, padding_index)
78
79 # Now let's fill the result tensor
---> 80 result = tensor1.new_full(new_shape, padding_index)
81 result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
82 result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
OutOfMemoryError: CUDA out of memory. Tried to allocate 6.99 GiB. GPU 0 has a total capacity of 15.89 GiB of which 6.99 GiB is free. Process 3145 has 8.90 GiB memory in use. Of the allocated memory 8.33 GiB is allocated by PyTorch, and 281.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
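The error message itself suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. If that is worth trying, my understanding is that it has to be set before the first CUDA allocation, e.g. at the top of the notebook (although the reserved-but-unallocated amount here is only about 281 MiB, so I am not sure fragmentation is the main problem):

import os

# Must be set before PyTorch makes its first CUDA allocation
# (i.e. before the model is loaded onto the GPU).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"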
Please let me know how to resolve this.