# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer language model generate mode."""

from typing import Any, Tuple

import beam_search
import decoder_stack
import gin
import jax
import jax.numpy as jnp

from aglib.meliad.transformer import models


@gin.configurable
class DecoderOnlyLanguageModelGenerate(models.DecoderOnlyLanguageModel):
  """Decoder-only language model in inference mode."""

  decoder_factory = decoder_stack.DecoderStackGenerate

  num_heads: int = gin.REQUIRED
  head_size: int = gin.REQUIRED

  def get_fake_input(self) -> dict[str, Any]:
    """Returns a fake input dict with the extra fields used in generate mode."""
    fake_input_dict = super().get_fake_input()
    b = self.task_config.batch_size
    n = self.num_heads
    h = self.head_size
    fake_input_dict.update({
        'dstate': tuple(
            [{
                'current_index': jnp.array([0] * b, dtype=jnp.int32),
                'keys': jnp.zeros((b, 2048, n, h), dtype=jnp.bfloat16),
                'values': jnp.zeros((b, 2048, n, h), dtype=jnp.bfloat16),
                'recurrent_kvq': None,
                'relative_position_bias': jnp.zeros(
                    (b, n, 1, 1024), dtype=jnp.bfloat16
                ),
            }]
            * 12
        ),
        'eos': jnp.zeros([1024], dtype=jnp.bfloat16),
        'mask': jnp.ones([1024], dtype=jnp.bfloat16),
        'length': 1,
        'temperature': 1.0,
    })
    return fake_input_dict

  def __call__(self, inputs: ...) -> tuple[Any, dict[str, Any]]:
    # Make sure this code is not used in untested cases.
    if self.mode not in ['init', 'beam_search']:
      raise ValueError(f'{type(self)} cannot do mode {self.mode}')
    if self.decoder.supports_generate():
      raise ValueError(f'{type(self)}.decoder must not support generate()')

    self.decoder(
        input_tokens=inputs['targets'][:, 0:1],
        target_tokens=None,
        start_of_sequence=inputs['start_of_sequence'],
    )

    b = inputs['targets'].shape[0]
    no_start_of_seq = jnp.array([False] * b, dtype=jnp.bool_)

    # This fn is used in both beam_search and topk_sampling.
    def tokens_to_logits_fn(
        input_token: jnp.ndarray, dstate: tuple[dict[str, jnp.ndarray], ...]
    ) -> tuple[jnp.ndarray, tuple[dict[str, jnp.ndarray], ...]]:
      (logits, dstate, _) = self.decoder(
          input_tokens=input_token,
          target_tokens=None,
          start_of_sequence=no_start_of_seq,
          decoder_state=dstate,
      )
      return logits[:, -1, :], dstate

    # The last token is used to seed beam_search.
    last_token = jax.lax.dynamic_slice_in_dim(
        inputs['targets'], inputs['length'] - 1, 1, axis=1
    )

    inputs['targets'] = inputs['targets'][:, 0:-1]
    dstate = jax.lax.cond(
        inputs['start_of_sequence'][0],
        lambda: self.generate(inputs)[0],
        lambda: inputs['dstate'],
    )

    # Then run beam search, initialized with last_token and dstate.
    finished_seqs, finished_scores, dstate = beam_search.beam_search_flat(
        last_token,
        dstate,
        tokens_to_logits_fn,
        max_decode_len=512,
        eos=inputs['eos'].reshape((1, 1, -1)),
        mask=inputs['mask'].reshape((1, 1, -1)),
    )

    # Return a dummy loss together with the beam-search outputs.
    return 0.0, {
        'finished_seqs': finished_seqs,
        'finished_scores': finished_scores,
        'dstate': dstate,
    }

  def generate(
      self, inputs: ...
  ) -> tuple[tuple[dict[str, jnp.ndarray], ...], jnp.ndarray]:
    """Generate an output sequence.

    Args:
      inputs: the same as the argument to __call__.

    Returns:
      A tuple (dstate, logits): the final decoder state and the logits from
      the last decoding step.
    """
    input_tokens = inputs['targets']  # [b, seq_len]
    start_of_sequence = inputs['start_of_sequence']  # [b]
    target_tokens = jnp.pad(input_tokens[:, 1:], [(0, 0), (0, 1)])
    batch_size = target_tokens.shape[0]

    # Assume all sequences start at the same time.
    start0 = inputs['start_of_sequence'][0]
    dstate = jax.lax.cond(
        start0,
        lambda: self.decoder.init_decoder_state_vanilla(  # pylint: disable=g-long-lambda
            1024, start_of_sequence
        ),
        lambda: inputs['dstate'],
    )

    first_token = input_tokens[:, 0:1]
    no_start_of_seq = jnp.array([False] * batch_size, dtype=jnp.bool_)
    temperature = 1
    if 'temperature' in inputs:
      temperature = inputs['temperature']

    num_steps = inputs['length']
    if self.mode == 'beam_search':
      num_steps -= 1

    def cond_fn(scan_state) -> jnp.bool_:
      _, _, i, _ = scan_state
      return i < num_steps

    def loop_fn(scan_state: Any) -> Tuple[Any, Any, Any, Any]:
      (dstate, input_token, i, _) = scan_state

      (logits, dstate, _) = self.decoder(
          input_tokens=input_token,
          target_tokens=None,
          start_of_sequence=no_start_of_seq,
          decoder_state=dstate,
      )

      logits = logits / temperature
      # Feed the next ground-truth token back in (forced decoding).
      output_token = jax.lax.dynamic_slice_in_dim(target_tokens, i, 1, axis=1)

      return (dstate, output_token, i + 1, logits)

    # Loop over the sequence length, carrying the decoder state.
    dummy_logits = jnp.zeros((batch_size, 1, 1024))
    initial_scan_state = (dstate, first_token, 0, dummy_logits)
    dstate, _, _, logits = jax.lax.while_loop(
        cond_fn, loop_fn, initial_scan_state
    )
    return dstate, logits
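

# ------------------------------------------------------------------------------
# Minimal, self-contained sketch (not part of the model) of the decoding
# pattern used by `DecoderOnlyLanguageModelGenerate.generate` above: a
# `jax.lax.while_loop` whose carry holds (decoder_state, next_input_token,
# step, logits). A toy `_stub_decoder` stands in for the real decoder stack;
# every name prefixed with `_example` or `_stub` is hypothetical and exists
# only for illustration.
def _example_decoding_loop(
    batch_size: int = 2, vocab_size: int = 8, num_steps: int = 5
) -> tuple[jnp.ndarray, jnp.ndarray]:
  """Runs a toy while_loop decode and returns (final_state, last_logits)."""

  def _stub_decoder(input_token, dstate):
    # Fake decoder obeying the (logits, dstate, _) contract used above:
    # it ignores the token and simply increments a scalar "state".
    del input_token
    logits = jnp.zeros((batch_size, 1, vocab_size))
    return logits, dstate + 1, None

  def cond_fn(scan_state):
    _, _, i, _ = scan_state
    return i < num_steps

  def loop_fn(scan_state):
    dstate, input_token, i, _ = scan_state
    logits, dstate, _ = _stub_decoder(input_token, dstate)
    # Greedy feedback for the sketch; the real `generate` instead slices the
    # next ground-truth token out of `target_tokens` (forced decoding).
    next_token = jnp.argmax(logits[:, -1, :], axis=-1).astype(jnp.int32)
    return dstate, next_token[:, None], i + 1, logits

  first_token = jnp.zeros((batch_size, 1), dtype=jnp.int32)
  dummy_logits = jnp.zeros((batch_size, 1, vocab_size))
  init_state = (jnp.zeros((), dtype=jnp.int32), first_token, 0, dummy_logits)
  dstate, _, _, logits = jax.lax.while_loop(cond_fn, loop_fn, init_state)
  return dstate, logits


# Usage note for the sketch: `_example_decoding_loop()` returns a scalar state
# equal to num_steps and zero logits of shape (batch_size, 1, vocab_size).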