Update raven_modeling_minimal.py

raven_modeling_minimal.py CHANGED (+17 -13)
@@ -1,4 +1,4 @@
-"""Minimal modeling.py file for HF compatibility and funny zero-shot experiments."""
+"""Minimal modeling.py file for HF compatibility and funny zero-shot experiments. Usability for finetuning not guaranteed"""
 
 import torch
 import math
@@ -289,7 +289,7 @@ class RavenForCausalLM(RavenPreTrainedModel):
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
         labels: Optional[torch.Tensor] = None,
-
+        num_steps: Optional[torch.Tensor] = None,
         past_key_values: Optional[Cache] = None,
         output_details: dict = {
             "return_logits": True,
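The new `num_steps` argument exposes the recurrence depth of the model at call time. A minimal usage sketch, assuming `model` is a loaded `RavenForCausalLM` and `input_ids` an already-tokenized batch; the call shapes follow from the dispatch logic in the `iterate_forward` hunk further down and are illustrations, not part of the commit:

```python
out = model(input_ids)                     # depth drawn from randomized_iteration_sampler
out = model(input_ids, num_steps=32)       # 32 recurrent steps, all under no_grad
out = model(input_ids, num_steps=(16, 4))  # 16 no-grad steps, then 4 steps with grad
```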
@@ -302,12 +302,12 @@ class RavenForCausalLM(RavenPreTrainedModel):
         cache_position: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> CausalLMOutputRecurrentLatents:
-
+        # Support multiple position formats:
         if position_ids is None and cache_position is None:
             freqs_cis = self.freqs_cis[:, : input_ids.shape[1]]
         elif position_ids is not None:
             freqs_cis = self.freqs_cis.index_select(1, position_ids.squeeze())
-        elif cache_position is not None:
+        elif cache_position is not None:
             freqs_cis = self.freqs_cis[:, cache_position]
 
         if input_embeds is None:
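The added comment marks the three position formats under which the precomputed rotary table `freqs_cis` is indexed. A self-contained illustration with assumed shapes (the table size and the sequence positions are made up for the example):

```python
import torch

freqs_cis = torch.randn(1, 4096, 32)  # stand-in for the model's precomputed rotary table

prefill = freqs_cis[:, :128]                             # no positions given: first seq_len entries
explicit = freqs_cis.index_select(1, torch.arange(128))  # position_ids given, e.g. packed batches
decode = freqs_cis[:, torch.tensor([128])]               # cache_position: only the new token's slot
```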
@@ -331,7 +331,7 @@ class RavenForCausalLM(RavenPreTrainedModel):
             block_idx,
             attention_mask,
             past_key_values,
-
+            num_steps,
         )
         latent_states = x.clone().detach()
 
@@ -371,16 +371,16 @@ class RavenForCausalLM(RavenPreTrainedModel):
         block_idx,
         mask,
         past_key_values: Optional[Cache] = None,
-
+        num_steps: Optional[torch.Tensor] = None,
     ):
         x = xk = self.initialize_state(input_embeds) if input_states is None else input_states.clone()
 
-        if
+        if num_steps is None:
             num_steps_no_grad, num_steps_with_grad = self.randomized_iteration_sampler()  # type: ignore
-        elif len(
-            num_steps_no_grad, num_steps_with_grad =
+        elif hasattr(num_steps, "__len__") and len(num_steps) > 1:
+            num_steps_no_grad, num_steps_with_grad = num_steps
         else:
-            num_steps_no_grad, num_steps_with_grad =
+            num_steps_no_grad, num_steps_with_grad = num_steps, torch.tensor(0)
 
         with torch.no_grad():
             # ultra annoying in ddp due to
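The branches above accept three forms for `num_steps`: `None` defers to the training-time sampler, a length-2 sequence is taken as an explicit `(no_grad, with_grad)` split, and anything else is treated as a plain step count run entirely under `no_grad`. The `hasattr` guard is what lets Python ints reach the scalar branch. A standalone sketch of the same dispatch (illustrative only; the sampler case is stubbed):

```python
import torch

def resolve_num_steps(num_steps):
    if num_steps is None:
        raise NotImplementedError("the model samples (no_grad, with_grad) itself")
    elif hasattr(num_steps, "__len__") and len(num_steps) > 1:
        no_grad, with_grad = num_steps  # explicit pair
    else:
        no_grad, with_grad = num_steps, torch.tensor(0)  # scalar: all steps without grad
    return no_grad, with_grad

print(resolve_num_steps((16, 4)))               # (16, 4)
print(resolve_num_steps(torch.tensor([8, 2])))  # (tensor(8), tensor(2))
print(resolve_num_steps(32))                    # (32, tensor(0))
```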
@@ -421,13 +421,13 @@ class RavenForCausalLM(RavenPreTrainedModel):
 
         return n.to(dtype=torch.long), k.to(dtype=torch.long)
 
-    def initialize_state(self, input_embeds):
+    def initialize_state(self, input_embeds, deterministic: bool = False):
         x = torch.randn_like(input_embeds)
         std = self.config.init_values["std"]
         torch.nn.init.trunc_normal_(x, mean=0.0, std=std, a=-3 * std, b=3 * std)
         if self.emb_scale != 1:
             x = x * self.emb_scale
-        return x
+        return x if not deterministic else x.zero_()
 
     def prepare_inputs_for_generation(
         self,
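`initialize_state` draws the starting latent state from a truncated normal; with the new `deterministic` flag the freshly drawn state is zeroed in place instead, which makes repeated forward passes on the same input comparable. Hypothetical usage:

```python
x0 = model.initialize_state(input_embeds)                      # random init (default)
x0 = model.initialize_state(input_embeds, deterministic=True)  # all-zero state, reproducible
```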
@@ -442,7 +442,11 @@ class RavenForCausalLM(RavenPreTrainedModel):
         model_inputs["cache_position"] = cache_position
         current_input_length = input_ids.shape[1]
         if past_key_values is not None:
-
+            if type(past_key_values) == DynamicCache:
+                # Need to use custom cache, detect and replace HF dynamic cache if generate injects it
+                assert past_key_values.get_seq_length() == 0
+                past_key_values = HuginnDynamicCache()
+            model_inputs["past_key_values"] = past_key_values if kwargs["use_cache"] else None
             input_ids = input_ids[:, cache_position]  # type: ignore
         model_inputs["input_ids"] = input_ids.clone(memory_format=torch.contiguous_format)
 
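Hugging Face's `generate()` injects a plain `DynamicCache` when no cache is supplied; the new branch detects that case, asserts the cache is still empty, and swaps in the model's own `HuginnDynamicCache` before any key/value pairs are stored. Nothing changes on the caller's side; a hedged sketch:

```python
# The injected DynamicCache is replaced inside prepare_inputs_for_generation,
# so the custom cache does not need to be constructed manually.
out = model.generate(input_ids, max_new_tokens=64, use_cache=True)
```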