fixed flash_attention backward_compat

#3
by itlevy - opened
Files changed (4)
  1. NOTICE +0 -5
  2. README.md +3 -5
  3. modeling_decilm.py +2 -46
  4. variable_cache.py +9 -14
NOTICE DELETED
@@ -1,5 +0,0 @@
- Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-
- NVIDIA CORPORATION, its affiliates and licensors retain all intellectual property and proprietary rights in and to this material, related documentation and any modifications thereto. Any use, reproduction, disclosure or distribution of this material and related documentation without an express license agreement from NVIDIA CORPORATION or its affiliates is strictly prohibited.
-
- Llama 3.1 is licensed under the Llama 3.1 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.
README.md CHANGED
@@ -8,9 +8,9 @@ tags:
  - llama-3
  - pytorch
  license: other
- license_name: nvidia-open-model-license
+ license_name: nvidia-ai-foundation-models-community-license
  license_link: >-
- https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
+ https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-ai-foundation-models-community-license-agreement/
  ---

  # Llama-3_1-Nemotron-51B-instruct
@@ -22,8 +22,7 @@ Llama-3_1-Nemotron-51B-instruct is a model which offers a great tradeoff between


  ## License
- This model is released under the [NVIDIA Open Model License Agreement](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf).
- Additional Information: [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/). Built with Llama.
+ [NVIDIA AI Foundation Models Community License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-ai-foundation-models-community-license-agreement/). Additional Information: [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/). Built with Llama.

  ## How was the model developed

@@ -33,7 +32,6 @@ The KD step included 40 billion tokens consisting of a mixture of 3 datasets - F
  Links to [NIM](https://build.nvidia.com/nvidia/llama-3_1-nemotron-51b-instruct), [blog](https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/) and [huggingface](https://huggingface.co/nvidia/Llama-3_1-Nemotron-51B-Instruct)


-
  This results in a final model that is aligned for human chat preferences.

  **Model Developers:** NVIDIA
modeling_decilm.py CHANGED
@@ -25,7 +25,7 @@ import torch.utils.checkpoint
  from torch import nn
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
  from transformers import GenerationConfig
- from transformers.generation.utils import NEED_SETUP_CACHE_CLASSES_MAPPING, GenerationMixin, GenerateOutput
+ from transformers.generation.utils import NEED_SETUP_CACHE_CLASSES_MAPPING
  from transformers.modeling_utils import PreTrainedModel
  from transformers.utils import (
      add_start_docstrings,
@@ -1131,7 +1131,7 @@ class DeciLMModel(DeciLMPreTrainedModel):
      return causal_mask


- class DeciLMForCausalLM(DeciLMPreTrainedModel, GenerationMixin):
+ class DeciLMForCausalLM(DeciLMPreTrainedModel):
      _tied_weights_keys = ["lm_head.weight"]

      def __init__(self, config):
@@ -1311,50 +1311,6 @@ class DeciLMForCausalLM(DeciLMPreTrainedModel, GenerationMixin):
          )
          return model_inputs

-     def _maybe_initialize_input_ids_for_generation(
-             self,
-             inputs: Optional[torch.Tensor] = None,
-             bos_token_id: Optional[torch.Tensor] = None,
-             model_kwargs: Optional[dict[str, torch.Tensor]] = None,
-     ) -> torch.LongTensor:
-         """
-         Patching hf bug that creates wrong cache length if only inputs_embeds are passed to the model
-         """
-         input_ids = super()._maybe_initialize_input_ids_for_generation(
-             inputs=inputs, bos_token_id=bos_token_id, model_kwargs=model_kwargs)
-         if (
-                 "inputs_embeds" in model_kwargs
-                 and input_ids is not None
-                 and input_ids.shape[1] == 0
-         ):
-             batch_size, input_sequence_length = model_kwargs["inputs_embeds"].shape[:2]
-             input_ids = torch.zeros((batch_size, input_sequence_length), dtype=torch.long, device=self.device)
-         return input_ids
-
-     def generate(
-             self,
-             inputs: Optional[torch.Tensor] = None,
-             *args,
-             **kwargs,
-     ) -> Union[GenerateOutput, torch.LongTensor]:
-         """
-         Patching hf bug that creates wrong cache length if only inputs_embeds are passed to the model
-         """
-         only_passed_inputs_embeds = (
-                 "inputs_embeds" in kwargs and
-                 "input_ids" not in kwargs and
-                 inputs is None
-         )
-         if only_passed_inputs_embeds:
-             input_sequence_length = kwargs["inputs_embeds"].shape[1]
-
-         generation_output = super().generate(inputs=inputs, *args, **kwargs)
-
-         if only_passed_inputs_embeds and isinstance(generation_output, torch.Tensor):
-             generation_output = generation_output[:, input_sequence_length:]
-
-         return generation_output
-

  @add_start_docstrings(
  """
variable_cache.py CHANGED
@@ -32,21 +32,17 @@ class VariableCache(Cache_4_44_2, Cache):
      The cache of each layer is allocated to the same gpu as the layer itself.
      """

-     def __init__(
-             self,
-             *, # key-word only, no positional args allowed to avoid mix-ups with newer transformers versions
-             config: DeciLMConfig,
-             batch_size: int = None,
-             max_cache_len: int = None,
-             dtype: torch.dtype = torch.float32,
-             max_batch_size: Optional[int] = None,
-             **kwargs: Any,
-     ) -> None:
+     def __init__(self,
+                  config: DeciLMConfig,
+                  max_batch_size: int,
+                  max_cache_len: int | None,
+                  device: torch.device | str | None = None,
+                  dtype: torch.dtype | None = None,
+                  ):
          Cache_4_44_2.__init__(self)

-         self.config = deepcopy(config)
-         self.max_batch_size = batch_size or max_batch_size
-         self.batch_size = self.max_batch_size
+         self.config = config
+         self.max_batch_size = max_batch_size
          self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
          self.dtype = dtype

@@ -83,7 +79,6 @@ class VariableCache(Cache_4_44_2, Cache):
          if attention_config.no_op or attention_config.replace_with_linear:
              return None
          config = deepcopy(self.config)
-         config.num_hidden_layers = 1
          config.num_key_value_heads = self.config.num_attention_heads // attention_config.n_heads_in_group
          return StaticCache(config, self.max_batch_size, self.max_cache_len, device, self.dtype)

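
The VariableCache constructor now takes plain positional arguments (config, max_batch_size, max_cache_len, device, dtype), mirroring the positional way this same file builds StaticCache in transformers 4.44.x, instead of the previous keyword-only signature. A minimal sketch of constructing the cache directly, reusing model and inputs from the sketch above; the flat import path and the assumption that generate()'s cache-setup path (NEED_SETUP_CACHE_CLASSES_MAPPING) instantiates it the same positional way are mine, not shown in this diff:

import torch

# VariableCache is defined in this repo's variable_cache.py; with trust_remote_code it is
# loaded dynamically, so this import path is illustrative only.
from variable_cache import VariableCache

past_key_values = VariableCache(
    model.config,     # DeciLMConfig
    1,                # max_batch_size: now a required positional argument
    2048,             # max_cache_len: falls back to config.max_position_embeddings if None
    model.device,     # device
    torch.bfloat16,   # dtype
)
# Passing a pre-built cache explicitly should also work with the stock generate():
output_ids = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=64)
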