Commit d6b992d
Parent(s): a57bc51

Longformer attn config (#4)
- Longformer attn config (929ceeb15a1635aa4858e9e2e20a6860ee4eb32d)
Co-authored-by: Plasmarine <[email protected]>

- modeling_cocom.py +48 -22
modeling_cocom.py CHANGED

@@ -1,4 +1,4 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PreTrainedModel, PretrainedConfig, AutoModel,LongformerForCausalLM, LongformerTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PreTrainedModel, PretrainedConfig, AutoModel,LongformerForCausalLM, LongformerTokenizer, LongformerConfig
 import torch
 import math
 from peft import get_peft_model, LoraConfig, TaskType
@@ -71,7 +71,8 @@ class COCOMConfig(PretrainedConfig):
                 lora = False,
                 training_form="both",
                 lora_r=16,
-                attn_implementation="
+                attn_implementation="longformer",
+                attention_window=512,
                 device_map = "cuda",
                 **kwargs):
        super().__init__(**kwargs)
@@ -95,6 +96,28 @@ class COCOM(PreTrainedModel):
         super().__init__(cfg)
         # define models
         attn_impl = cfg.attn_implementation
+
+        if cfg.attn_implementation == "longformer":
+            # Initialize Longformer
+            longformer_config = LongformerConfig.from_pretrained(cfg.decoder_model_name)
+            longformer_config.attention_window = 512  # Modify based on context window size
+            self.decoder = LongformerForCausalLM.from_pretrained(
+                cfg.decoder_model_name,
+                config=longformer_config,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                device_map=cfg.device_map
+            )
+        else:
+            # Original decoder initialization
+            self.decoder = AutoModelForCausalLM.from_pretrained(
+                cfg.decoder_model_name,
+                torch_dtype=torch.float16,
+                attn_implementation=attn_impl,
+                low_cpu_mem_usage=True,
+                device_map=cfg.device_map
+            )
+
         # model could be loaded in three quantization modes: no, int4, int8
         if cfg.quantization == "no":
             self.decoder = AutoModelForCausalLM.from_pretrained(
@@ -193,15 +216,20 @@ class COCOM(PreTrainedModel):
         self.compr_rate = cfg.compr_rate
         self.local_rank = os.getenv('LOCAL_RANK', '0')
 
-    def compress_and_replace_emb(self, enc_input_ids, enc_attention_mask, dec_input_ids):
+    def compress_and_replace_emb(self, enc_input_ids, enc_attention_mask, dec_input_ids, dec_attention_mask):
         indices = range(0, enc_input_ids.size(0) + 1, self.generation_top_k)
+
+        # Perform compression
         if self.compr:
             compressed_embs = self.compr(enc_input_ids, enc_attention_mask)
-            input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, indices)
         else:
             compressed_embs = self.compr_decoder(enc_input_ids, enc_attention_mask)
-            input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, indices)
+
+        # Replace embeddings with compressed ones
+        input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, dec_attention_mask, indices)
+
         return input_embeds
+
 
     def compr_decoder(self, input_ids, attention_mask):
         emb = self.decoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
@@ -212,19 +240,23 @@ class COCOM(PreTrainedModel):
     def replace_embeddings(self, compressed_embs, dec_input_ids, indices):
         # Embed the decoder input
         inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
+
+        # Number of compressed embeddings
         num_embs = compressed_embs.size(1)
-        if self.sep:
-            slot_len = num_embs + 1
-        else:
-            slot_len = num_embs
-        #
+
+        # Define slot length for memory tokens
+        slot_len = num_embs + 1 if self.sep else num_embs
+
+        # Find the first memory token indices
         first_mem_token_indices = torch.argmax((dec_input_ids == self.decoder_tokenizer.mem_token_id).int(), dim=1)
         batch_size = inputs_embeds.size(0)
-
+
+        # Replace memory tokens with compressed embeddings
         for i in range(batch_size):
             for j in range(indices[i], indices[i + 1]):
-                start_idx = first_mem_token_indices[i].item() + (j-indices[i]) * slot_len
+                start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
                 inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
+
         return inputs_embeds
 
 
@@ -235,19 +267,13 @@ class COCOM(PreTrainedModel):
                 dec_attention_mask: torch.LongTensor = None,
                 labels: torch.LongTensor = None):
 
-
-        # enc_attention_mask: attention mask of enc_input_ids
-        # dec_input_ids: stores the prompts (including mem tokens), dimention (batch_size, token_length)
-        # dec_attention_mask: attention mask of dec_input_ids
-
-        # Perform compression with gradient tracking
-        inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids)
+        inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids, dec_attention_mask)
 
-        #
+        # Detach inputs_embeds if training compressor only
         if (self.training_form == "compressor") and (self.compr is None):
-            inputs_embeds
+            inputs_embeds = inputs_embeds.detach()
 
-        #
+        # Pass through the decoder
         decoder_outputs = self.decoder(inputs_embeds=inputs_embeds, attention_mask=dec_attention_mask, labels=labels)
 
         return {"loss": decoder_outputs.loss, "logits": decoder_outputs.logits}
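Below is a minimal usage sketch of the options this commit introduces. It only exercises configuration fields that are visible in the diff (decoder_model_name, quantization, attn_implementation, attention_window, device_map); the checkpoint name and any remaining constructor arguments are illustrative assumptions, not part of the commit.

# Hypothetical usage sketch -- not part of the commit. Field names other than
# decoder_model_name, quantization, attn_implementation, attention_window and
# device_map (all visible in the diff) are assumptions.
from modeling_cocom import COCOM, COCOMConfig

cfg = COCOMConfig(
    decoder_model_name="allenai/longformer-base-4096",  # placeholder checkpoint
    quantization="no",                  # take the non-quantized loading path
    attn_implementation="longformer",   # new: selects the Longformer decoder branch
    attention_window=512,               # new: local attention window size
    device_map="cuda",
)

# With attn_implementation="longformer", the new branch in COCOM.__init__ builds a
# LongformerConfig, overrides attention_window, and loads the decoder through
# LongformerForCausalLM instead of AutoModelForCausalLM.
model = COCOM(cfg)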
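For clarity, here is a self-contained toy walk-through of the memory-slot arithmetic that replace_embeddings relies on (slot_len, first_mem_token_indices, and the per-example ranges taken from indices). All shapes, the mem token id and the separator setting below are made-up example values, not taken from the repository.

import torch

# Toy illustration of the memory-slot arithmetic used in replace_embeddings.
# Shapes, the mem token id and the separator setting are example assumptions.
batch_size, seq_len, hidden = 2, 16, 8
top_k, num_embs = 2, 3          # 2 compressed contexts per example, 3 memory embeddings each
sep = True
slot_len = num_embs + 1 if sep else num_embs   # each context slot = mem tokens (+ optional sep)

mem_token_id = 99
dec_input_ids = torch.zeros(batch_size, seq_len, dtype=torch.long)
for j in range(top_k):                          # lay out top_k memory slots starting at position 4
    start = 4 + j * slot_len
    dec_input_ids[:, start:start + num_embs] = mem_token_id

inputs_embeds = torch.zeros(batch_size, seq_len, hidden)
compressed_embs = torch.randn(batch_size * top_k, num_embs, hidden)

# indices[i]..indices[i+1] selects the rows of compressed_embs belonging to example i
indices = range(0, batch_size * top_k + 1, top_k)
first_mem_token_indices = torch.argmax((dec_input_ids == mem_token_id).int(), dim=1)

for i in range(batch_size):
    for j in range(indices[i], indices[i + 1]):
        start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
        inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]

# the first memory slot of example 0 now holds its first compressed context
assert torch.equal(inputs_embeds[0, 4:4 + num_embs], compressed_embs[0])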