Support gradient checkpointing

#3
by maxall4 - opened
Files changed (2)
  1. model.py +7 -1
  2. modeling_hyena.py +25 -0
model.py CHANGED
@@ -350,6 +350,8 @@ class StripedHyena(nn.Module):
         self.blocks = nn.ModuleList(
             get_block(config, layer_idx, flash_fft=self.flash_fft) for layer_idx in range(config.num_layers)
         )
+        self.gradient_checkpointing = False
+        self._gradient_checkpointing_func = None

     def forward(self, x, inference_params_dict=None, padding_mask=None):
         L = x.shape[1]
@@ -379,7 +381,11 @@ class StripedHyena(nn.Module):
         x = x * padding_mask[..., None]

         for _, block in enumerate(self.blocks):
-            x, _ = block(x, inference_params=None, padding_mask=padding_mask)
+            if self.gradient_checkpointing and self.training:
+                x, _ = self._gradient_checkpointing_func(block.__call__, x, None, padding_mask)
+            else:
+                x, _ = block(x, inference_params=None, padding_mask=padding_mask)
+
         return x, None

     def initialize_inference_params(self):
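
For reference, a minimal, self-contained sketch of what the checkpointed branch does (not part of the PR; ToyBlock is a made-up stand-in with the same call shape as a StripedHyena block). Once checkpointing is enabled, _gradient_checkpointing_func resolves to torch.utils.checkpoint.checkpoint, which runs the block without storing its intermediate activations and recomputes them during backward:

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class ToyBlock(nn.Module):
    # Hypothetical stand-in with the same (x, inference_params, padding_mask) -> (x, state) interface.
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, inference_params=None, padding_mask=None):
        return torch.relu(self.proj(x)), None


block = ToyBlock(16)
x = torch.randn(2, 8, 16, requires_grad=True)

# Mirrors the PR's call shape: checkpoint(block.__call__, x, None, padding_mask).
# The forward pass runs without saving activations; the block is re-executed
# during .backward() to rebuild them.
y, _ = checkpoint(block.__call__, x, None, None, use_reentrant=True)
y.sum().backward()

The memory saving scales with the number of checkpointed blocks, at the cost of one extra forward recomputation per block during the backward pass.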
modeling_hyena.py CHANGED
@@ -2,6 +2,7 @@
 """StripedHyena custom code port for the Hugging Face Hub"""

 import torch
+import functools
 from torch.nn import functional as F
 from .configuration_hyena import StripedHyenaConfig
 from transformers import PreTrainedModel
@@ -50,8 +51,32 @@ class StripedHyenaModelForCausalLM(StripedHyenaPreTrainedModel):
     def force_dtype(self):
         self.backbone.to_bfloat16_except_poles_residues()

+    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+        if not self.supports_gradient_checkpointing:
+            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+
+        if gradient_checkpointing_kwargs is None:
+            gradient_checkpointing_kwargs = {"use_reentrant": True}
+
+        # TODO: support the DeepSpeed activation checkpointing implementation
+        gradient_checkpointing_func = functools.partial(
+            torch.utils.checkpoint.checkpoint, **gradient_checkpointing_kwargs
+        )
+
+        self._set_gradient_checkpointing(
+            enable=True, gradient_checkpointing_func=gradient_checkpointing_func
+        )
+
+        if getattr(self, "_hf_peft_config_loaded", False):
+            # When using PEFT + gradient checkpointing + Trainer, the inputs must have requires_grad=True.
+            # PEFT does the same: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
+            # With PEFT only the LoRA layers have requires_grad=True, so the outputs of the frozen
+            # layers must still propagate gradients for gradient flow to reach the adapters.
+            self.enable_input_require_grads()
+
     def _set_gradient_checkpointing(self, enable, gradient_checkpointing_func):
         self.backbone.gradient_checkpointing = enable
+        self.backbone._gradient_checkpointing_func = gradient_checkpointing_func

     def get_input_embeddings(self):
         return self.backbone.embedding_layer
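
End to end, enabling the feature from user code would look roughly like the sketch below. This is an assumed usage example, not part of the PR: the checkpoint name is illustrative, and it presumes supports_gradient_checkpointing is set to True on the PreTrainedModel subclass so the guard in gradient_checkpointing_enable passes.

import torch
from transformers import AutoModelForCausalLM

# Illustrative checkpoint name; any StripedHyena repo carrying this custom modeling code would work.
model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/StripedHyena-Hessian-7B",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Wires torch.utils.checkpoint.checkpoint into the backbone's block loop (see model.py above).
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.train()  # checkpointing only takes effect while backbone.training is True

With use_reentrant=False the non-reentrant checkpoint implementation is used, which is the variant Transformers has been recommending; when no kwargs are passed, the PR defaults to use_reentrant=True.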