zhiqu22 committed
Commit 23f06b4 · 1 Parent(s): 6ff51cd

improve kv cache

Files changed (1)
  1. modeling_mitre.py +143 -50
modeling_mitre.py CHANGED
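In short: after the first decoding step only the register slots of the source are kept in each layer's key/value cache, the attention mask is stashed in registering_cache and extended by one column per step instead of being rebuilt, and padding masking moves from build_future_mask into forward. A rough sketch of the cache clipping with made-up shapes (the names and sizes below are illustrative, not taken from the model config):

import torch

# toy settings, purely illustrative
batch, heads, head_dim = 2, 4, 16
src_length, max_register_num, tgt_len = 10, 3, 1

# per-layer cache after the first decoder pass over [source + registers + first target]:
# shape (batch, heads, src_length + tgt_len, head_dim)
cache_key = torch.randn(batch, heads, src_length + tgt_len, head_dim)
cache_value = torch.randn(batch, heads, src_length + tgt_len, head_dim)

# later steps only attend to the register slots and the generated targets,
# so the plain source positions can be dropped from the cache
clipped_rep = (
    cache_key[:, :, src_length - max_register_num:, :],
    cache_value[:, :, src_length - max_register_num:, :],
)
print(clipped_rep[0].shape)  # torch.Size([2, 4, 4, 16]): 3 register slots + 1 target slot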
@@ -280,22 +280,48 @@ class MitreDecoder(MitrePreTrainedModel):
         registers = input_ids[range(batch_size), torch.argmax(input_ids, dim=-1)].unsqueeze(1).repeat(1, max_register_nums)
         return registers, register_nums, total_token_nums
 
-    def combine_src_and_registers(self, input_ids, registers, register_nums, total_token_nums):
+    def get_token_indices(self, input_ids, total_token_nums, register_nums):
+        '''
+        return a token_indices for selecting source tokens from expanded_src_tokens
+        '''
+        token_indices = torch.arange(total_token_nums).expand(input_ids.size(0), -1).to(input_ids.device)
+        token_indices = token_indices + register_nums.unsqueeze(1)
+        return token_indices
+
+    def get_batch_indices(self, input_ids, token_indices):
+        '''
+        return a batch_indices for selecting source tokens from expanded_src_tokens
+        '''
+        batch_indices = torch.arange(input_ids.shape[0]).unsqueeze(1).expand(-1, token_indices.size(1)).contiguous()
+        return batch_indices
+
+    def combine_src_and_registers(self, input_ids, registers):
         '''
         return a expanded_src_tokens for positional embedding.
         '''
         pads = torch.full_like(registers, self.padding_idx)
         expanded_src_tokens = torch.cat((pads, input_ids, registers), dim=1)
-        indices = torch.arange(total_token_nums).expand(input_ids.size(0), -1).to(input_ids.device)
-        indices = indices + register_nums.unsqueeze(1)
-
-        batch_indices = torch.arange(input_ids.shape[0]).unsqueeze(1).expand(-1, indices.size(1)).contiguous()
-        return expanded_src_tokens, batch_indices, indices
+        return expanded_src_tokens
+
+    def source_tokens_embedding_with_positions(self, expanded_src_tokens, total_token_nums, batch_indices, indices):
+        '''
+        return the embeds of source tokens
+        '''
+        inputs_embeds = self.embed_tokens(expanded_src_tokens)
+        inputs_embeds_1 = inputs_embeds[:,:total_token_nums,:] + self.src_embed_positions(expanded_src_tokens[:,:total_token_nums])
+        inputs_embeds_2 = inputs_embeds[:,total_token_nums:,:] + self.register_embed_positions(expanded_src_tokens[:,total_token_nums:])
+        inputs_embeds = torch.cat((inputs_embeds_1, inputs_embeds_2), dim=1)
+        inputs_embeds = inputs_embeds[batch_indices, indices]
+
+        return inputs_embeds
 
     def fill_with_neg_inf(self, t):
         return t.float().fill_(float("-inf")).type_as(t)
+
+    def check_contiguous(self, t: torch.Tensor):
+        return t if t.is_contiguous() else t.contiguous()
 
-    def build_future_mask(self, embeds, src_length, register_nums, padding_mask=None, past_key_values_length=0):
+    def build_future_mask(self, embeds, src_length, register_nums, past_key_values_length=0):
         b = register_nums.size(0)
         ns = src_length - register_nums
         if past_key_values_length == 0:
@@ -331,11 +357,6 @@ class MitreDecoder(MitrePreTrainedModel):
             batch_mask[batch_indices[target_indices], row_indices[target_indices], col_indices[target_indices]] = float('-inf')
             # shape: batch_size, head_num (1 for broadcasting), seq_len, seq_len
             batch_mask = batch_mask.unsqueeze(1)
-            # 6. masking pads
-            if padding_mask is not None:
-                if padding_mask.any():
-                    padding_mask = padding_mask.to(batch_mask.device).unsqueeze(1).unsqueeze(2)
-                    batch_mask = batch_mask.masked_fill(padding_mask == 1, float('-inf'))
 
         elif past_key_values_length > 0:
             # in generation
@@ -350,7 +371,6 @@ class MitreDecoder(MitrePreTrainedModel):
             batch_mask[batch_indices[target_to_source_mask], token_indices[target_to_source_mask]] = float('-inf')
             batch_mask = batch_mask.unsqueeze(1)
 
-        # ensure contiguous
         batch_mask = batch_mask.view(b, 1, batch_mask.shape[-2], batch_mask.shape[-1])
         return batch_mask
 
@@ -359,13 +379,12 @@ class MitreDecoder(MitrePreTrainedModel):
         self,
         input_ids: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
         registering_cache: dict = None,
     ):
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
@@ -374,33 +393,49 @@ class MitreDecoder(MitrePreTrainedModel):
         # past_key_values_length
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
-        decoder_input_shape = decoder_input_ids.size()
-        decoder_input_ids = decoder_input_ids.view(-1, decoder_input_shape[-1])
-        padding_mask = None
-
         if past_key_values_length > 0:
             register_nums = registering_cache["register_nums"]
             src_length = registering_cache["src_length"]
 
         if input_ids is not None and past_key_values_length == 0:
-            # .view() additionally ensure that the memory is contiguous
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-
-            registers, register_nums, total_token_nums = self.create_registers(input_ids)
-            expanded_src_tokens, batch_indices, indices = self.combine_src_and_registers(input_ids, registers, register_nums, total_token_nums)
-
-            # positional embedding for source tokens and registers
-            inputs_embeds = self.embed_tokens(expanded_src_tokens)
-            inputs_embeds_1 = inputs_embeds[:,:total_token_nums,:] + self.src_embed_positions(expanded_src_tokens[:,:total_token_nums])
-            inputs_embeds_2 = inputs_embeds[:,total_token_nums:,:] + self.register_embed_positions(expanded_src_tokens[:,total_token_nums:])
-            inputs_embeds = torch.cat((inputs_embeds_1, inputs_embeds_2), dim=1)
-            inputs_embeds = inputs_embeds[batch_indices, indices]
-
-            # padding mask
-            source_tokens = expanded_src_tokens[batch_indices, indices]
-            src_length = source_tokens.shape[1]
+            # ensure contiguous
+            input_ids = self.check_contiguous(input_ids)
+            decoder_input_ids = self.check_contiguous(decoder_input_ids)
 
+            if attention_mask is None:
+                # create registers from input_ids
+                registers, register_nums, total_token_nums = self.create_registers(input_ids)
+                # 'expanded_src_tokens' is the combination of input_ids, registers, and pads.
+                expanded_src_tokens = self.combine_src_and_registers(input_ids, registers)
+                token_indices = self.get_token_indices(input_ids, total_token_nums, register_nums)
+                batch_indices = self.get_batch_indices(input_ids, token_indices)
+                # source tokens (input_ids + registers)
+                source_tokens = expanded_src_tokens[batch_indices, token_indices]
+
+            else:
+                # although we do not pass the attention mask in training or in the 1st step of generation,
+                # we still leave this block here.
+                if registering_cache is None or \
+                    not all(key in registering_cache for key in \
+                    ("register_nums", "total_token_nums", "expanded_src_tokens",\
+                    "batch_indices", "token_indices", "source_tokens")):
+                    raise ValueError(
+                        "If you generate registers by external codes, \
+                        you must provide 'register_nums', 'total_token_nums', \
+                        'expanded_src_tokens', 'batch_indices', 'token_indices' \
+                        and 'source_tokens' in 'registering_cache' in the training."
+                    )
+                register_nums, total_token_nums = registering_cache["register_nums"], registering_cache["total_token_nums"]
+                expanded_src_tokens = registering_cache["expanded_src_tokens"]
+                batch_indices, token_indices = registering_cache["batch_indices"], registering_cache["token_indices"]
+                source_tokens = registering_cache["source_tokens"]
+
+            # ensure contiguous
+            expanded_src_tokens = self.check_contiguous(expanded_src_tokens)
+            source_tokens = self.check_contiguous(source_tokens)
+
+            # get embeds with positions for source tokens (input_ids + registers)
+            inputs_embeds = self.source_tokens_embedding_with_positions(expanded_src_tokens, total_token_nums, batch_indices, token_indices)
 
             # replace the inference trigger with langtok
             # namely, enc-tgt-dec-tgt strategy
@@ -408,16 +443,47 @@ class MitreDecoder(MitrePreTrainedModel):
             decoder_input_ids[:, 0] = source_tokens[:, -1]
 
             tokens = torch.cat([source_tokens, decoder_input_ids], dim=1)
-            padding_mask = tokens.eq(self.padding_idx)
+            src_length = source_tokens.shape[1]
 
         decoder_inputs_embeds = self.embed_tokens(decoder_input_ids)
         decoder_inputs_embeds = decoder_inputs_embeds + self.tgt_embed_positions(decoder_input_ids, past_key_values_length, src_length=src_length)
+
         if past_key_values_length == 0:
             hidden_states = torch.cat([inputs_embeds, decoder_inputs_embeds], dim=1)
         else:
             hidden_states = decoder_inputs_embeds
 
-        attention_mask = self.build_future_mask(hidden_states, src_length, register_nums, padding_mask, past_key_values_length)
+        # ensure contiguous
+        hidden_states = self.check_contiguous(hidden_states)
+
+        # if attention_mask is NOT given, we build the attention mask from current hyperparams
+        # if attention_mask is given, check the shape of attention mask
+        if attention_mask is None:
+            attention_mask = self.build_future_mask(hidden_states, src_length, register_nums, past_key_values_length)
+        else:
+            bsz, src_len = hidden_states.shape[0], hidden_states.shape[1]
+            tgt_len = hidden_states.shape[1] if past_key_values_length == 0 else past_key_values_length + 1
+            if attention_mask.size() != (bsz, 1, src_len, tgt_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, src_len, tgt_len)}, but is {attention_mask.size()}"
+                )
+
+        # ensure contiguous
+        attention_mask = self.check_contiguous(attention_mask)
+
+        # this is a param to truncate the kv cache
+        # in training, it's None, namely, not activated.
+        max_register_num = None
+        # masking pads for attention_mask in the training or the 1st step of generation
+        if past_key_values_length == 0:
+            # if in generation, activate
+            max_register_num = register_nums.max().item() if use_cache else None
+
+            padding_mask = tokens.eq(self.padding_idx)
+            if padding_mask.any():
+                padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)
+                attention_mask = attention_mask.masked_fill(padding_mask == 1, float('-inf'))
+
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         if self.gradient_checkpointing and self.training:
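With the padding logic now living in forward (training and the first generation step), pads are folded into the additive float mask via masked_fill. A minimal, self-contained illustration of that pattern on toy tensors; the real mask additionally encodes the register/future structure built by build_future_mask, which is omitted here:

import torch

padding_idx = 1
tokens = torch.tensor([[5, 6, 7, padding_idx],
                       [8, 9, padding_idx, padding_idx]])
bsz, seq_len = tokens.shape

# additive float mask, broadcast over heads: (batch, 1, seq_len, seq_len)
attention_mask = torch.zeros(bsz, 1, seq_len, seq_len)

padding_mask = tokens.eq(padding_idx)                      # (batch, seq_len)
if padding_mask.any():
    padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)  # (batch, 1, 1, seq_len)
    attention_mask = attention_mask.masked_fill(padding_mask == 1, float('-inf'))

print(attention_mask[1, 0, 0])  # tensor([0., 0., -inf, -inf])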
@@ -429,8 +495,6 @@ class MitreDecoder(MitrePreTrainedModel):
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        all_cross_attentions = () if output_attentions else None
         next_decoder_cache = () if use_cache else None
 
         for idx, decoder_layer in enumerate(self.layers):
@@ -458,7 +522,16 @@ class MitreDecoder(MitrePreTrainedModel):
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[1],)
+                if past_key_values_length > 0:
+                    next_decoder_cache += (layer_outputs[1],)
+                else:
+                    cache_key, cache_value = layer_outputs[1]
+                    clipped_rep = (
+                        cache_key[:, :, src_length - max_register_num:, :],
+                        cache_value[:, :, src_length - max_register_num:, :]
+                    )
+                    next_decoder_cache += (clipped_rep,)
+
 
         if past_key_values_length == 0:
             hidden_states = hidden_states[:,src_length:,:]
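To see why clipping the first-step cache pays off, a back-of-the-envelope comparison is enough; every number below is an invented example setting, not MITRE's actual configuration:

def cache_bytes(cached_src_positions, generated=1,
                layers=24, heads=16, head_dim=64, batch_beam=8, elem_bytes=2):
    # 2x for keys and values, fp16 elements assumed
    per_layer = 2 * batch_beam * heads * (cached_src_positions + generated) * head_dim * elem_bytes
    return layers * per_layer

src_length, max_register_num = 256, 8
full = cache_bytes(src_length)           # caching every source position
clipped = cache_bytes(max_register_num)  # caching register slots only
print(f"{full / 2**20:.1f} MiB vs {clipped / 2**20:.1f} MiB cached after the first step")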
@@ -475,13 +548,19 @@ class MitreDecoder(MitrePreTrainedModel):
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
             hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-            cross_attentions=all_cross_attentions,
         )
-        model_output.registering_cache = {
-            "register_nums": register_nums,
-            "src_length": src_length
-        }
+
+        # the registering cache used in generation
+        # in the 1st step, we truncate the kv cache to save cost, so we have to change the src_length
+        if use_cache:
+            model_output.registering_cache = {
+                "register_nums": register_nums,
+                "src_length": src_length if past_key_values_length > 0 else max_register_num,
+                "attention_mask": attention_mask if past_key_values_length > 0 else None
+            }
+        else:
+            model_output.registering_cache = None
+
         return model_output
 
 
@@ -579,6 +658,7 @@ class MitreModel(MitrePreTrainedModel):
         self,
         input_ids: Optional[torch.LongTensor] = None,
         decoder_input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
@@ -594,6 +674,7 @@ class MitreModel(MitrePreTrainedModel):
         decoder_outputs = self.decoder(
             input_ids=input_ids,
             decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
             past_key_values=past_key_values,
             use_cache=use_cache,
             output_hidden_states=output_hidden_states,
@@ -634,15 +715,18 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
         self,
         input_ids: Optional[torch.LongTensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         registering_cache: dict = None,
     ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
+
         outputs = self.model(
             input_ids=input_ids,
             decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
             past_key_values=past_key_values,
             use_cache=use_cache,
             output_hidden_states=output_hidden_states,
@@ -674,8 +758,8 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
         return reordered_past
 
     @staticmethod
-    def _reorder_register_nums(register_nums, beam_idx):
-        return register_nums.index_select(0, beam_idx.to(register_nums.device))
+    def _reorder_register_cache(t, beam_idx):
+        return t.index_select(dim=0, index=beam_idx.to(t.device))
 
     @staticmethod
     def _expand_inputs_for_generation(
@@ -752,6 +836,7 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
         this_peer_finished = False
         past_key_values = None
         registering_cache = None
+        attention_mask = None
 
         logits_processor = LogitsProcessorList()
         stopping_criteria = StoppingCriteriaList()
@@ -763,6 +848,12 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
 
             if past_key_values is not None:
                 decoder_input_ids_for_generation = decoder_input_ids[:, -1:]
+                attention_mask = registering_cache["attention_mask"]
+                # Get the mask the first time the kv cache is used.
+                # After that, we can simply repeat 0. (the last column of the mask) to get the next mask.
+                # As a result, we avoid generating the mask from scratch under the kv cache and save memory.
+                if attention_mask is not None:
+                    attention_mask = torch.cat((attention_mask, attention_mask[..., -1:]), dim=-1)
             else:
                 decoder_input_ids_for_generation = decoder_input_ids
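The mask bookkeeping above can be looked at in isolation: during incremental decoding the query length is 1, so the cached mask only has to grow along the key dimension, and copying its last column is sufficient. A tiny sketch, assuming a (batch, 1, 1, kv_len) additive mask:

import torch

# additive mask carried over from the previous step: (batch, 1, 1, kv_len)
attention_mask = torch.tensor([[[[float('-inf'), 0.0, 0.0]]]])

# the newest position is visible to the next query, so its column is just
# a copy of the previous last column (0.)
attention_mask = torch.cat((attention_mask, attention_mask[..., -1:]), dim=-1)
print(attention_mask)  # tensor([[[[-inf, 0., 0., 0.]]]])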
 
@@ -817,8 +908,10 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
             del outputs
 
             past_key_values = self._reorder_cache(past_key_values, beam_idx)
-            registering_cache["register_nums"] = self._reorder_register_nums(registering_cache["register_nums"], beam_idx)
-
+            registering_cache["register_nums"] = self._reorder_register_cache(registering_cache["register_nums"], beam_idx)
+            if registering_cache["attention_mask"] is not None:
+                registering_cache["attention_mask"] = self._reorder_register_cache(registering_cache["attention_mask"], beam_idx)
+
             cur_len = cur_len + 1
 
             if beam_scorer.is_done:
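register_nums and the cached attention mask are batch-first, so the same index_select that reorders the key/value cache keeps them aligned with the surviving beams. A toy example of what _reorder_register_cache does:

import torch

def _reorder_register_cache(t, beam_idx):
    # mirrors the committed helper: select rows along the batch/beam dimension
    return t.index_select(dim=0, index=beam_idx.to(t.device))

register_nums = torch.tensor([2, 3, 2, 4])   # one entry per beam
beam_idx = torch.tensor([1, 1, 0, 3])        # beams chosen by the scorer this step
print(_reorder_register_cache(register_nums, beam_idx))  # tensor([3, 3, 2, 4])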
 