stts

Runtime error

App Files Files Community

Afrinetwork7 committed on Aug 20, 2024

Commit

b2d7654

verified ·

1 Parent(s): 928c402

Upload 6 files

Browse files

Files changed (6) hide show

whisper_jax/whisper_jax___init__.py +21 -0
whisper_jax/whisper_jax_layers.py +1310 -0
whisper_jax/whisper_jax_modeling_flax_whisper.py +1686 -0
whisper_jax/whisper_jax_partitioner.py +939 -0
whisper_jax/whisper_jax_pipeline.py +506 -0
whisper_jax/whisper_jax_train_state.py +130 -0

whisper_jax/whisper_jax___init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Package version string.
+__version__ = "0.0.1"
+# Re-export the package's main entry points at the top level.
+# NOTE(review): the files uploaded in this commit are named
+# `whisper_jax_<module>.py` (and this one `whisper_jax___init__.py`), so these
+# relative imports presumably only resolve once the files are renamed to
+# `modeling_flax_whisper.py`, `partitioner.py`, etc. — confirm.
+from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
+from .partitioner import PjitPartitioner
+from .pipeline import FlaxWhisperPipline
+from .train_state import InferenceState

whisper_jax/whisper_jax_layers.py ADDED Viewed

	@@ -0,0 +1,1310 @@

+# coding=utf-8
+# Copyright 2023 The T5X Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dense attention classes and mask/weighting functions."""
+# pylint: disable=attribute-defined-outside-init,g-bare-generic
+import dataclasses
+import functools
+import operator
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Union
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax import linen as nn
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.dtypes import promote_dtype
+from jax import lax, random
+# from flax.linen.partitioning import param_with_axes, with_sharding_constraint
+# Short aliases for the flax partitioning helpers used throughout this module.
+param_with_axes = nn_partitioning.param_with_axes
+with_sharding_constraint = nn_partitioning.with_sharding_constraint
+# Type annotations
+Array = jnp.ndarray
+DType = jnp.dtype
+PRNGKey = jnp.ndarray
+Shape = Iterable[int]
+Activation = Callable[..., Array]
+PrecisionLike = Union[None, str, lax.Precision, Tuple[str, str], Tuple[lax.Precision, lax.Precision]]
+DotGeneralT = Callable[..., Array]
+ConvGeneralDilatedT = Callable[..., Array]
+PaddingLike = Union[str, int, Sequence[Union[int, Tuple[int, int]]]]
+LaxPadding = Union[str, Sequence[Tuple[int, int]]]
+# Parameter initializers.
+# `Initializer` is called as (key, shape, dtype); `NdInitializer` additionally
+# receives the kernel's input and output axes (see `nd_dense_init`).
+Initializer = Callable[[PRNGKey, Shape, DType], Array]
+InitializerAxis = Union[int, Tuple[int, ...]]
+NdInitializer = Callable[[PRNGKey, Shape, DType, InitializerAxis, InitializerAxis], Array]
+# Default initializer for embedding tables: fan-in scaled normal with out_axis=0.
+default_embed_init = nn.initializers.variance_scaling(1.0, "fan_in", "normal", out_axis=0)
+# ------------------------------------------------------------------------------
+# Temporary inlined JAX N-d initializer code
+# TODO(levskaya): remove once new JAX release is out.
+# ------------------------------------------------------------------------------
+def _compute_fans(shape: jax.core.NamedShape, in_axis=-2, out_axis=-1):
+    """Inlined JAX `nn.initializer._compute_fans`."""
+    if isinstance(in_axis, int):
+        in_size = shape[in_axis]
+    else:
+        in_size = int(np.prod([shape[i] for i in in_axis]))
+    if isinstance(out_axis, int):
+        out_size = shape[out_axis]
+    else:
+        out_size = int(np.prod([shape[i] for i in out_axis]))
+    receptive_field_size = shape.total / in_size / out_size
+    fan_in = in_size * receptive_field_size
+    fan_out = out_size * receptive_field_size
+    return fan_in, fan_out
+def variance_scaling(scale, mode, distribution, in_axis=-2, out_axis=-1, dtype=jnp.float_):
+    """Inlined JAX `nn.initializer.variance_scaling`.
+    Returns an initializer callable `init(key, shape, dtype=dtype)`.
+    NOTE(review): as written, `init` returns `jnp.zeros(shape, dtype)` on its
+    first line, so the variance-scaling code below that line never runs and
+    `key`, `scale`, `mode`, `distribution`, `in_axis` and `out_axis` are
+    ignored (an invalid `mode` or `distribution` is therefore never reported).
+    Presumably this is a shortcut for models whose weights are loaded from a
+    checkpoint — confirm before using this initializer to train from scratch.
+    """
+    def init(key, shape, dtype=dtype):
+        # Early return: everything after this line in `init` is unreachable.
+        return jnp.zeros(shape, dtype=dtype)
+        dtype = jax.dtypes.canonicalize_dtype(dtype)
+        shape = jax.core.as_named_shape(shape)
+        fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
+        if mode == "fan_in":
+            denominator = fan_in
+        elif mode == "fan_out":
+            denominator = fan_out
+        elif mode == "fan_avg":
+            denominator = (fan_in + fan_out) / 2
+        else:
+            raise ValueError("invalid mode for variance scaling initializer: {}".format(mode))
+        variance = jnp.array(scale / denominator, dtype=dtype)
+        if distribution == "truncated_normal":
+            # constant is stddev of standard normal truncated to (-2, 2)
+            stddev = jnp.sqrt(variance) / jnp.array(0.87962566103423978, dtype)
+            return random.truncated_normal(key, -2, 2, shape, dtype) * stddev
+        elif distribution == "normal":
+            return random.normal(key, shape, dtype) * jnp.sqrt(variance)
+        elif distribution == "uniform":
+            return random.uniform(key, shape, dtype, -1) * jnp.sqrt(3 * variance)
+        else:
+            raise ValueError("invalid distribution for variance scaling " "initializer: {}".format(distribution))
+    return init
+# ------------------------------------------------------------------------------
+def nd_dense_init(scale, mode, distribution):
+    """Initializer with in_axis, out_axis set at call time."""
+    def init_fn(key, shape, dtype, in_axis, out_axis):
+        fn = variance_scaling(scale, mode, distribution, in_axis, out_axis)
+        return fn(key, shape, dtype)
+    return init_fn
+def dot_product_attention(
+    query: Array,
+    key: Array,
+    value: Array,
+    bias: Optional[Array] = None,
+    dropout_rng: Optional[PRNGKey] = None,
+    dropout_rate: float = 0.0,
+    deterministic: bool = False,
+    dtype: DType = jnp.float32,
+    float32_logits: bool = False,
+):
+    """Computes dot-product attention given query, key, and value.
+    This is the core function for applying attention based on
+    https://arxiv.org/abs/1706.03762. It calculates the attention weights given
+    query and key and combines the values using the attention weights.
+    Args:
+      query: queries for calculating attention with shape of `[batch, q_length,
+        num_heads, qk_depth_per_head]`.
+      key: keys for calculating attention with shape of `[batch, kv_length,
+        num_heads, qk_depth_per_head]`.
+      value: values to be used in attention with shape of `[batch, kv_length,
+        num_heads, v_depth_per_head]`.
+      bias: bias for the attention weights. This should be broadcastable to the
+        shape `[batch, num_heads, q_length, kv_length]` This can be used for
+        incorporating causal masks, padding masks, proximity bias, etc.
+      dropout_rng: JAX PRNGKey: to be used for dropout
+      dropout_rate: dropout rate
+      deterministic: bool, deterministic or not (to apply dropout)
+      dtype: the dtype of the computation (default: float32)
+      float32_logits: bool, if True then compute logits in float32 to avoid
+        numerical issues with bfloat16.
+    Returns:
+      Output of shape `[batch, length, num_heads, v_depth_per_head]`.
+    """
+    assert key.ndim == query.ndim == value.ndim, "q, k, v must have same rank."
+    assert query.shape[:-3] == key.shape[:-3] == value.shape[:-3], "q, k, v batch dims must match."
+    assert query.shape[-2] == key.shape[-2] == value.shape[-2], "q, k, v num_heads must match."
+    assert key.shape[-3] == value.shape[-3], "k, v lengths must match."
+    assert query.shape[-1] == key.shape[-1], "q, k depths must match."
+    # Casting logits and softmax computation for float32 for model stability.
+    if float32_logits:
+        query = query.astype(jnp.float32)
+        key = key.astype(jnp.float32)
+    # `attn_weights`: [batch, num_heads, q_length, kv_length]
+    attn_weights = jnp.einsum("bqhd,bkhd->bhqk", query, key)
+    # Apply attention bias: masking, dropout, proximity bias, etc.
+    if bias is not None:
+        attn_weights = attn_weights + bias.astype(attn_weights.dtype)
+    # Normalize the attention weights across `kv_length` dimension.
+    attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+    # Apply attention dropout.
+    if not deterministic and dropout_rate > 0.0:
+        keep_prob = 1.0 - dropout_rate
+        # T5 broadcasts along the "length" dim, but unclear which one that
+        # corresponds to in positional dimensions here, assuming query dim.
+        dropout_shape = list(attn_weights.shape)
+        dropout_shape[-2] = 1
+        keep = random.bernoulli(dropout_rng, keep_prob, dropout_shape)
+        keep = jnp.broadcast_to(keep, attn_weights.shape)
+        multiplier = keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype)
+        attn_weights = attn_weights * multiplier
+    # Take the linear combination of `value`.
+    return jnp.einsum("bhqk,bkhd->bqhd", attn_weights, value)
+# `lax.dynamic_slice_in_dim` vmapped over its second argument (the start index)
+# only: in_axes=(None, 0, None, None), so each start index yields its own slice.
+dynamic_vector_slice_in_dim = jax.vmap(lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None))
+class MultiHeadDotProductAttention(nn.Module):
+    """Multi-head dot-product attention.
+    Attributes:
+      num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
+        should be divisible by the number of heads.
+      head_dim: dimension of each head.
+      dtype: the dtype of the computation.
+      dropout_rate: dropout rate
+      kernel_init: initializer for the kernel of the Dense layers.
+      float32_logits: bool, if True then compute logits in float32 to avoid
+        numerical issues with bfloat16.
+    """
+    num_heads: int
+    head_dim: int
+    dtype: DType = jnp.float32
+    dropout_rate: float = 0.0
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
+    float32_logits: bool = False  # computes logits in float32 for stability.
+    @nn.compact
+    def __call__(
+        self,
+        inputs_q: Array,
+        inputs_kv: Array,
+        mask: Optional[Array] = None,
+        bias: Optional[Array] = None,
+        *,
+        decode: bool = False,
+        deterministic: bool = False,
+    ) -> Array:
+        """Applies multi-head dot product attention on the input data.
+        Projects the inputs into multi-headed query, key, and value vectors,
+        applies dot-product attention and project the results to an output vector.
+        There are two modes: decoding and non-decoding (e.g., training). The mode is
+        determined by `decode` argument. For decoding, this method is called twice,
+        first to initialize the cache and then for an actual decoding process. The
+        two calls are differentiated by the presence of 'cached_key' in the variable
+        dict. In the cache initialization stage, the cache variables are initialized
+        as zeros and will be filled in the subsequent decoding process.
+        In the cache initialization call, `inputs_q` has a shape [batch, length,
+        q_features] and `inputs_kv`: [batch, length, kv_features]. During the
+        incremental decoding stage, query, key and value all have the shape [batch,
+        1, qkv_features] corresponding to a single step.
+        Args:
+          inputs_q: input queries of shape `[batch, q_length, q_features]`.
+          inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
+          mask: attention mask of shape `[batch, num_heads, q_length, kv_length]`.
+          bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
+          decode: Whether to prepare and use an autoregressive cache.
+          deterministic: Disables dropout if set to True.
+        Returns:
+          output of shape `[batch, length, q_features]`.
+        Raises:
+          ValueError: in decode mode with an initialized cache, if the query
+            does not have shape `(batch, 1, num_heads, head_dim)`.
+        """
+        # Shared factory for the q/k/v projections: each maps the last input
+        # axis to (num_heads, head_dim).
+        projection = functools.partial(
+            DenseGeneral,
+            axis=-1,
+            features=(self.num_heads, self.head_dim),
+            kernel_axes=("embed", "heads", "kv"),
+            dtype=self.dtype,
+        )
+        # NOTE: T5 does not explicitly rescale the attention logits by
+        #       1/sqrt(depth_kq)!  This is folded into the initializers of the
+        #       linear transformations, which is equivalent under Adafactor.
+        depth_scaling = jnp.sqrt(self.head_dim).astype(self.dtype)
+        # Query kernel initializer: the regular initializer divided by
+        # sqrt(head_dim), i.e. the scaling is applied at init time only.
+        def query_init(*args):
+            return self.kernel_init(*args) / depth_scaling
+        # Project inputs_q to multi-headed q/k/v
+        # dimensions are then [batch, length, num_heads, head_dim]
+        query = projection(kernel_init=query_init, name="query")(inputs_q)
+        key = projection(kernel_init=self.kernel_init, name="key")(inputs_kv)
+        value = projection(kernel_init=self.kernel_init, name="value")(inputs_kv)
+        query = with_sharding_constraint(query, ("batch", "length", "heads", "kv"))
+        key = with_sharding_constraint(key, ("batch", "length", "heads", "kv"))
+        value = with_sharding_constraint(value, ("batch", "length", "heads", "kv"))
+        if decode:
+            # Detect if we're initializing by absence of existing cache data.
+            is_initialized = self.has_variable("cache", "cached_key")
+            # The key and value have dimension [batch, length, num_heads, head_dim],
+            # but we cache them as [batch, num_heads, head_dim, length] as a TPU
+            # fusion optimization. This also enables the "scatter via one-hot
+            # broadcast" trick, which means we do a one-hot broadcast instead of a
+            # scatter/gather operations, resulting in a 3-4x speedup in practice.
+            # `swap_dims` works on a shape tuple: it moves the third-from-last
+            # entry (length) to the end.
+            def swap_dims(x):
+                return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
+            cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
+            cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
+            cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+            if is_initialized:
+                batch, num_heads, head_dim, length = cached_key.value.shape
+                # During fast autoregressive decoding, we feed one position at a time,
+                # and cache the keys and values step by step.
+                # Sanity shape check of cached key against input query.
+                expected_shape = (batch, 1, num_heads, head_dim)
+                if expected_shape != query.shape:
+                    raise ValueError(
+                        "Autoregressive cache shape error, "
+                        "expected query shape %s instead got %s." % (expected_shape, query.shape)
+                    )
+                # Create a OHE of the current index. NOTE: the index is increased below.
+                cur_index = cache_index.value
+                one_hot_indices = jax.nn.one_hot(cur_index, length, dtype=key.dtype)
+                # In order to update the key, value caches with the current key and
+                # value, we move the length axis to the back, similar to what we did for
+                # the cached ones above.
+                # Note these are currently the key and value of a single position, since
+                # we feed one position at a time.
+                one_token_key = jnp.moveaxis(key, -3, -1)
+                one_token_value = jnp.moveaxis(value, -3, -1)
+                # Update key, value caches with our new 1d spatial slices.
+                # We implement an efficient scatter into the cache via one-hot
+                # broadcast and addition.
+                key = cached_key.value + one_token_key * one_hot_indices
+                value = cached_value.value + one_token_value * one_hot_indices
+                cached_key.value = key
+                cached_value.value = value
+                cache_index.value = cache_index.value + 1
+                # Move the keys and values back to their original shapes.
+                key = jnp.moveaxis(key, -1, -3)
+                value = jnp.moveaxis(value, -1, -3)
+                # Causal mask for cached decoder self-attention: our single query
+                # position should only attend to those key positions that have already
+                # been generated and cached, not the remaining zero elements.
+                # NOTE(review): `combine_masks` is not defined in the visible part
+                # of this file — presumably a module-level helper further down; confirm.
+                mask = combine_masks(
+                    mask,
+                    jnp.broadcast_to(
+                        jnp.arange(length) <= cur_index,
+                        # (1, 1, length) represent (head dim, query length, key length)
+                        # query length is 1 because during decoding we deal with one
+                        # index.
+                        # The same mask is applied to all batch elements and heads.
+                        (batch, 1, 1, length),
+                    ),
+                )
+                # Grab the correct relative attention bias during decoding. This is
+                # only required during single step decoding.
+                if bias is not None:
+                    # The bias is a full attention matrix, but during decoding we only
+                    # have to take a slice of it.
+                    # This is equivalent to bias[..., cur_index:cur_index+1, :].
+                    bias = dynamic_vector_slice_in_dim(jnp.squeeze(bias, axis=0), jnp.reshape(cur_index, (-1)), 1, -2)
+        # Convert the boolean attention mask to an attention bias.
+        if mask is not None:
+            # attention mask in the form of attention bias
+            # (0.0 where mask > 0, -1e10 elsewhere, so masked positions get a
+            # negligible weight after the softmax).
+            attention_bias = lax.select(
+                mask > 0, jnp.full(mask.shape, 0.0).astype(self.dtype), jnp.full(mask.shape, -1e10).astype(self.dtype)
+            )
+        else:
+            attention_bias = None
+        # Add provided bias term (e.g. relative position embedding).
+        # NOTE(review): `combine_biases` is not defined in the visible part of
+        # this file — presumably a module-level helper further down; confirm.
+        if bias is not None:
+            attention_bias = combine_biases(attention_bias, bias)
+        # A dropout RNG is only requested when dropout will actually be applied.
+        dropout_rng = None
+        if not deterministic and self.dropout_rate > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        # Apply attention.
+        x = dot_product_attention(
+            query,
+            key,
+            value,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout_rate,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            float32_logits=self.float32_logits,
+        )
+        # Back to the original inputs dimensions.
+        out = DenseGeneral(
+            features=inputs_q.shape[-1],  # output dim is set to the input dim.
+            axis=(-2, -1),
+            kernel_init=self.kernel_init,
+            kernel_axes=("heads", "kv", "embed"),
+            dtype=self.dtype,
+            name="out",
+        )(x)
+        return out
+def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int]:
+    # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
+    return tuple([ax if ax >= 0 else ndim + ax for ax in axes])
+def _canonicalize_tuple(x):
+    if isinstance(x, Iterable):
+        return tuple(x)
+    else:
+        return (x,)
+# ------------------------------------------------------------------------------
+# DenseGeneral for attention layers.
+# ------------------------------------------------------------------------------
+class DenseGeneral(nn.Module):
+    """A linear transformation (without bias) with flexible axes.
+    Attributes:
+      features: tuple with numbers of output features.
+      axis: tuple with axes to apply the transformation on.
+      dtype: the dtype of the computation (default: float32).
+      kernel_init: initializer function for the weight matrix.
+    """
+    features: Union[Iterable[int], int]
+    axis: Union[Iterable[int], int] = -1
+    dtype: DType = jnp.float32
+    params_dtype: DType = jnp.float32
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
+    kernel_axes: Tuple[str, ...] = ()
+    use_bias: bool = True
+    bias_init: Any = nn.initializers.zeros
+    @nn.compact
+    def __call__(self, inputs: Array) -> Array:
+        """Applies a linear transformation to the inputs along multiple dimensions.
+        Args:
+          inputs: The nd-array to be transformed.
+        Returns:
+          The transformed input.
+        """
+        features = _canonicalize_tuple(self.features)
+        axis = _canonicalize_tuple(self.axis)
+        inputs = jnp.asarray(inputs, self.dtype)
+        axis = _normalize_axes(axis, inputs.ndim)
+        kernel_shape = tuple([inputs.shape[ax] for ax in axis]) + features
+        kernel_in_axis = np.arange(len(axis))
+        kernel_out_axis = np.arange(len(axis), len(axis) + len(features))
+        kernel = param_with_axes(
+            "kernel",
+            self.kernel_init,
+            kernel_shape,
+            self.params_dtype,
+            kernel_in_axis,
+            kernel_out_axis,
+            axes=self.kernel_axes,
+        )
+        if self.use_bias:
+            bias = param_with_axes("bias", self.bias_init, features, self.params_dtype, axes=(self.kernel_axes[-1],))
+        kernel = jnp.asarray(kernel, self.dtype)
+        contract_ind = tuple(range(0, len(axis)))
+        y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
+        if self.use_bias:
+            bias = jnp.asarray(bias, self.dtype)
+            # y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,))
+            y += jnp.reshape(bias, (1,) * (len(features) - y.ndim) + bias.shape[:])
+        return y
+def _convert_to_activation_function(fn_or_string: Union[str, Callable]) -> Callable:
+    """Convert a string to an activation function."""
+    if fn_or_string == "linear":
+        return lambda x: x
+    elif isinstance(fn_or_string, str):
+        return getattr(nn, fn_or_string)
+    elif callable(fn_or_string):
+        return fn_or_string
+    else:
+        raise ValueError("don't know how to convert %s to an activation function" % (fn_or_string,))
+class MlpBlock(nn.Module):
+    """Transformer MLP / feed-forward block.
+    Attributes:
+      intermediate_dim: Shared dimension of hidden layers.
+      activations: Type of activations for each layer.  Each element is either
+        'linear', a string function name in flax.linen, or a function.
+      kernel_init: Kernel function, passed to the dense layers.
+      deterministic: Whether the dropout layers should be deterministic.
+      intermediate_dropout_rate: Dropout rate used after the intermediate layers.
+      dtype: Type for the dense layer.
+    """
+    intermediate_dim: int = 2048
+    activations: Sequence[Union[str, Callable]] = ("relu",)
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "truncated_normal")
+    intermediate_dropout_rate: float = 0.1
+    dtype: Any = jnp.float32
+    @nn.compact
+    def __call__(self, inputs, decode: bool = False, deterministic: bool = False):
+        """Applies Transformer MlpBlock module."""
+        # Iterate over specified MLP input activation functions.
+        # e.g. ('relu',) or ('gelu', 'linear') for gated-gelu.
+        activations = []
+        for idx, act_fn in enumerate(self.activations):
+            dense_name = "wi" if len(self.activations) == 1 else f"wi_{idx}"
+            x = DenseGeneral(
+                self.intermediate_dim,
+                dtype=self.dtype,
+                kernel_init=self.kernel_init,
+                kernel_axes=("embed", "mlp"),
+                name=dense_name,
+            )(inputs)
+            x = _convert_to_activation_function(act_fn)(x)
+            activations.append(x)
+        # Take elementwise product of above intermediate activations.
+        x = functools.reduce(operator.mul, activations)
+        # Apply dropout and final dense output projection.
+        x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))(
+            x, deterministic=deterministic
+        )  # Broadcast along length.
+        x = with_sharding_constraint(x, ("batch", "length", "mlp"))
+        output = DenseGeneral(
+            inputs.shape[-1], dtype=self.dtype, kernel_init=self.kernel_init, kernel_axes=("mlp", "embed"), name="wo"
+        )(x)
+        return output
+class Embed(nn.Module):
+    """A parameterized function from integers [0, n) to d-dimensional vectors.
+    Attributes:
+      num_embeddings: number of embeddings.
+      features: number of feature dimensions for each embedding.
+      dtype: the dtype of the embedding vectors (default: float32).
+      embedding_init: embedding initializer.
+      one_hot: performs the gather with a one-hot contraction rather than a true
+        gather. This is currently needed for SPMD partitioning.
+    """
+    num_embeddings: int
+    features: int
+    cast_input_dtype: Optional[DType] = None
+    dtype: DType = jnp.float32
+    params_dtype: DType = jnp.float32
+    attend_dtype: Optional[DType] = None
+    embedding_init: Initializer = default_embed_init
+    one_hot: bool = True
+    embedding: Array = dataclasses.field(init=False)
+    def setup(self):
+        self.embedding = param_with_axes(
+            "embedding",
+            self.embedding_init,
+            (self.num_embeddings, self.features),
+            self.params_dtype,
+            axes=("vocab", "embed"),
+        )
+    def __call__(self, inputs: Array) -> Array:
+        """Embeds the inputs along the last dimension.
+        Args:
+          inputs: input data, all dimensions are considered batch dimensions.
+        Returns:
+          Output which is embedded input data.  The output shape follows the input,
+          with an additional `features` dimension appended.
+        """
+        if self.cast_input_dtype:
+            inputs = inputs.astype(self.cast_input_dtype)
+        if not jnp.issubdtype(inputs.dtype, jnp.integer):
+            raise ValueError("Input type must be an integer or unsigned integer.")
+        if self.one_hot:
+            iota = lax.iota(jnp.int32, self.num_embeddings)
+            one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=self.dtype)
+            output = jnp.dot(one_hot, jnp.asarray(self.embedding, self.dtype))
+        else:
+            output = jnp.asarray(self.embedding, self.dtype)[inputs]
+            output = with_sharding_constraint(output, ("batch", "length", "embed"))
+        return output
+    def attend(self, query: Array) -> Array:
+        """Attend over the embedding using a query array.
+        Args:
+          query: array with last dimension equal the feature depth `features` of the
+            embedding.
+        Returns:
+          An array with final dim `num_embeddings` corresponding to the batched
+          inner-product of the array of query vectors against each embedding.
+          Commonly used for weight-sharing between embeddings and logit transform
+          in NLP models.
+        """
+        dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
+        return jnp.dot(query, jnp.asarray(self.embedding, dtype).T)
+class RelativePositionBiases(nn.Module):
+    """Adds T5-style relative positional embeddings to the attention logits.
+    Attributes:
+      num_buckets: Number of buckets to bucket distances between key and query
+        positions into.
+      max_distance: Maximum distance before everything is lumped into the last
+        distance bucket.
+      num_heads: Number of heads in the attention layer. Each head will get a
+        different relative position weighting.
+      dtype: Type of arrays through this module.
+      embedding_init: initializer for relative embedding table.
+    """
+    num_buckets: int
+    max_distance: int
+    num_heads: int
+    dtype: Any
+    embedding_init: Callable[..., Array] = nn.linear.default_embed_init
+    @staticmethod
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        """Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger
+        buckets for larger absolute relative_positions.  All relative
+        positions >=max_distance  map to the same bucket.  All relative
+        positions <=-max_distance map to the same bucket.  This should allow for
+        more graceful generalization to longer sequences than the model has been
+        trained on.
+        Args:
+          relative_position: an int32 array
+          bidirectional: a boolean - whether the attention is bidirectional
+          num_buckets: an integer
+          max_distance: an integer
+        Returns:
+          a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            # Half of the buckets are used for each sign of the relative position.
+            num_buckets //= 2
+            ret += (n < 0).astype(np.int32) * num_buckets
+            n = np.abs(n)
+        else:
+            n = np.maximum(n, 0)
+        # now n is in the range [0, inf)
+        # The first half of the (remaining) buckets holds exact distances; the
+        # second half grows logarithmically up to `max_distance`.
+        max_exact = num_buckets // 2
+        is_small = n < max_exact
+        # The machine epsilon keeps the log argument positive when n == 0; those
+        # entries are discarded by the `np.where` below.
+        val_if_large = max_exact + (
+            np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps)
+            / np.log(max_distance / max_exact)
+            * (num_buckets - max_exact)
+        ).astype(np.int32)
+        val_if_large = np.minimum(val_if_large, num_buckets - 1)
+        ret += np.where(is_small, n, val_if_large)
+        return ret
+    @nn.compact
+    def __call__(self, qlen, klen, bidirectional=True):
+        """Produce relative position embedding attention biases.
+        Args:
+          qlen: attention query length.
+          klen: attention key length.
+          bidirectional: whether to allow positive memory-query relative position
+            embeddings.
+        Returns:
+          output: `(1, num_heads, q_len, k_len)` attention bias
+        """
+        # TODO(levskaya): should we be computing this w. numpy as a program
+        # constant?
+        context_position = np.arange(qlen, dtype=jnp.int32)[:, None]
+        memory_position = np.arange(klen, dtype=jnp.int32)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(
+            relative_position,
+            bidirectional=bidirectional,
+            num_buckets=self.num_buckets,
+            max_distance=self.max_distance,
+        )
+        # Learned table with one bias value per (head, bucket); created in
+        # float32 and cast to the module dtype below.
+        relative_attention_bias = param_with_axes(
+            "rel_embedding",
+            self.embedding_init,
+            (self.num_heads, self.num_buckets),
+            jnp.float32,
+            axes=("heads", "relpos_buckets"),
+        )
+        relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
+        # Instead of using a slow gather, we create a leading-dimension one-hot
+        # array from rp_bucket and use it to perform the gather-equivalent via a
+        # contraction, i.e.:
+        # (num_head, num_buckets) x (num_buckets one-hot, qlen, klen).
+        # This is equivalent to relative_attention_bias[:, rp_bucket]
+        bcast_iota = lax.broadcasted_iota(jnp.int32, (self.num_buckets, 1, 1), 0)
+        rp_bucket_one_hot = jnp.array(rp_bucket[jnp.newaxis, ...] == bcast_iota, dtype=self.dtype)
+        # --> shape (num_heads, qlen, klen)
+        values = lax.dot_general(
+            relative_attention_bias, rp_bucket_one_hot, (((1,), (0,)), ((), ()))  # lhs, rhs contracting dims
+        )  # no batched dims
+        # Add a singleton batch dimension.
+        # --> shape (1, num_heads, qlen, klen)
+        return values[jnp.newaxis, ...]
+# ------------------------------------------------------------------------------
+# T5 Layernorm - no subtraction of mean or bias (commented out; kept for reference only).
+# ------------------------------------------------------------------------------
+# class LayerNorm(nn.Module):
+#   """T5 Layer normalization operating on the last axis of the input data."""
+#   epsilon: float = 1e-6
+#   dtype: Any = jnp.float32
+#   scale_init: Initializer = nn.initializers.ones
+#   @nn.compact
+#   def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+#     """Applies layer normalization on the input."""
+#     x = jnp.asarray(x, jnp.float32)
+#     features = x.shape[-1]
+#     mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
+#     y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
+#     scale = param_with_axes(
+#         'scale', self.scale_init, (features,), jnp.float32, axes=('embed',))
+#     scale = jnp.asarray(scale, self.dtype)
+#     return y * scale
+class LayerNorm(nn.Module):
+    """Layer normalization (https://arxiv.org/abs/1607.06450).
+    Operates on the last axis of the input data.
+    It normalizes the activations of the layer for each given example in a
+    batch independently, rather than across a batch like Batch Normalization.
+    i.e. applies a transformation that maintains the mean activation within
+    each example close to 0 and the activation standard deviation close to 1.
+    Attributes:
+      epsilon: A small float added to variance to avoid dividing by zero.
+      dtype: the dtype of the computation (default: float32).
+      use_bias:  If True, bias (beta) is added.
+      use_scale: If True, multiply by scale (gamma). When the next layer is linear
+        (also e.g. nn.relu), this can be disabled since the scaling will be done
+        by the next layer.
+      bias_init: Initializer for bias, by default, zero.
+      scale_init: Initializer for scale, by default, one.
+    """
+    epsilon: float = 1e-6
+    dtype: Any = jnp.float32
+    params_dtype: DType = jnp.float32
+    use_bias: bool = True
+    use_scale: bool = True
+    bias_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.zeros
+    scale_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.ones
+    @nn.compact
+    def __call__(self, x):
+        """Applies layer normalization on the input.
+        Args:
+          x: the inputs
+        Returns:
+          Normalized inputs (the same shape as inputs).
+        """
+        x = jnp.asarray(x, jnp.float32)
+        features = x.shape[-1]
+        mean = jnp.mean(x, axis=-1, keepdims=True)
+        mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
+        var = mean2 - lax.square(mean)
+        mul = lax.rsqrt(var + self.epsilon)
+        if self.use_scale:
+            scale = param_with_axes("scale", self.scale_init, (features,), self.params_dtype, axes=("embed",))
+            mul = mul * jnp.asarray(scale, self.dtype)
+        y = (x - mean) * mul
+        if self.use_bias:
+            bias = param_with_axes("bias", self.bias_init, (features,), self.params_dtype, axes=("embed",))
+            y = y + jnp.asarray(bias, self.dtype)
+        return jnp.asarray(y, self.dtype)
+# ------------------------------------------------------------------------------
+# Mask-making utility functions.
+# ------------------------------------------------------------------------------
+def make_attention_mask(
+    query_input: Array,
+    key_input: Array,
+    pairwise_fn: Callable = jnp.multiply,
+    extra_batch_dims: int = 0,
+    dtype: DType = jnp.float32,
+) -> Array:
+    """Mask-making helper for attention weights.
+    In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`, the
+    attention weights will be `[batch, heads, len_q, len_kv]` and this
+    function will produce `[batch, 1, len_q, len_kv]`.
+    Args:
+      query_input: a batched, flat input of query_length size
+      key_input: a batched, flat input of key_length size
+      pairwise_fn: broadcasting elementwise comparison function
+      extra_batch_dims: number of extra batch dims to add singleton axes for, none
+        by default
+      dtype: mask return dtype
+    Returns:
+      A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
+    """
+    # [batch, len_q, len_kv]
+    mask = pairwise_fn(
+        # [batch, len_q] -> [batch, len_q, 1]
+        jnp.expand_dims(query_input, axis=-1),
+        # [batch, len_q] -> [batch, 1, len_kv]
+        jnp.expand_dims(key_input, axis=-2),
+    )
+    # [batch, 1, len_q, len_kv]. This creates the head dim.
+    mask = jnp.expand_dims(mask, axis=-3)
+    mask = jnp.expand_dims(mask, axis=tuple(range(extra_batch_dims)))
+    return mask.astype(dtype)
+def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.float32) -> Array:
+    """Make a causal mask for self-attention.
+    In case of 1d inputs (i.e., `[batch, len]`, the self-attention weights
+    will be `[batch, heads, len, len]` and this function will produce a
+    causal mask of shape `[batch, 1, len, len]`.
+    Note that a causal mask does not depend on the values of x; it only depends on
+    the shape. If x has padding elements, they will not be treated in a special
+    manner.
+    Args:
+      x: input array of shape `[batch, len]`
+      extra_batch_dims: number of batch dims to add singleton axes for, none by
+        default
+      dtype: mask return dtype
+    Returns:
+      A `[batch, 1, len, len]` shaped causal mask for 1d attention.
+    """
+    idxs = jnp.broadcast_to(jnp.arange(x.shape[-1], dtype=jnp.int32), x.shape)
+    return make_attention_mask(idxs, idxs, jnp.greater_equal, extra_batch_dims=extra_batch_dims, dtype=dtype)
+def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
+    """Combine attention masks.
+    Args:
+      *masks: set of attention mask arguments to combine, some can be None.
+      dtype: final mask dtype
+    Returns:
+      Combined mask, reduced by logical and, returns None if no masks given.
+    """
+    masks = [m for m in masks if m is not None]
+    if not masks:
+        return None
+    assert all(
+        (x.ndim == masks[0].ndim for x in masks)
+    ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
+    mask, *other_masks = masks
+    for other_mask in other_masks:
+        mask = jnp.logical_and(mask, other_mask)
+    return mask.astype(dtype)
+def combine_biases(*masks: Optional[Array]):
+    """Combine attention biases.
+    Args:
+      *masks: set of attention bias arguments to combine, some can be None.
+    Returns:
+      Combined mask, reduced by summation, returns None if no masks given.
+    """
+    masks = [m for m in masks if m is not None]
+    if not masks:
+        return None
+    assert all(
+        (x.ndim == masks[0].ndim for x in masks)
+    ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
+    mask, *other_masks = masks
+    for other_mask in other_masks:
+        mask = mask + other_mask
+    return mask
+def make_decoder_mask(
+    decoder_target_tokens: Array,
+    dtype: DType,
+    decoder_causal_attention: Optional[Array] = None,
+    decoder_segment_ids: Optional[Array] = None,
+) -> Array:
+    """Compute the self-attention mask for a decoder.
+    Decoder mask is formed by combining a causal mask, a padding mask and an
+    optional packing mask. If decoder_causal_attention is passed, it makes the
+    masking non-causal for positions that have value of 1.
+    A prefix LM is applied to a dataset which has a notion of "inputs" and
+    "targets", e.g., a machine translation task. The inputs and targets are
+    concatenated to form a new target. `decoder_target_tokens` is the concatenated
+    decoder output tokens.
+    The "inputs" portion of the concatenated sequence can attend to other "inputs"
+    tokens even for those at a later time steps. In order to control this
+    behavior, `decoder_causal_attention` is necessary. This is a binary mask with
+    a value of 1 indicating that the position belonged to "inputs" portion of the
+    original dataset.
+    Example:
+      Suppose we have a dataset with two examples.
+      ds = [{"inputs": [6, 7], "targets": [8]},
+            {"inputs": [3, 4], "targets": [5]}]
+      After the data preprocessing with packing, the two examples are packed into
+      one example with the following three fields (some fields are skipped for
+      simplicity).
+         decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
+           decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
+      decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
+      where each array has [batch, length] shape with batch size being 1. Then,
+      this function computes the following mask.
+                        mask = [[[[1, 1, 0, 0, 0, 0, 0],
+                                  [1, 1, 0, 0, 0, 0, 0],
+                                  [1, 1, 1, 0, 0, 0, 0],
+                                  [0, 0, 0, 1, 1, 0, 0],
+                                  [0, 0, 0, 1, 1, 0, 0],
+                                  [0, 0, 0, 1, 1, 1, 0],
+                                  [0, 0, 0, 0, 0, 0, 0]]]]
+      mask[b, 1, :, :] represents the mask for the example `b` in the batch.
+      Because mask is for a self-attention layer, the mask's shape is a square of
+      shape [query length, key length].
+      mask[b, 1, i, j] = 1 means that the query token at position i can attend to
+      the key token at position j.
+    Args:
+      decoder_target_tokens: decoder output tokens. [batch, length]
+      dtype: dtype of the output mask.
+      decoder_causal_attention: a binary mask indicating which position should
+        only attend to earlier positions in the sequence. Others will attend
+        bidirectionally. [batch, length]
+      decoder_segment_ids: decoder segmentation info for packed examples. [batch,
+        length]
+    Returns:
+      the combined decoder mask.
+    """
+    masks = []
+    # The same mask is applied to all attention heads. So the head dimension is 1,
+    # i.e., the mask will be broadcast along the heads dim.
+    # [batch, 1, length, length]
+    causal_mask = make_causal_mask(decoder_target_tokens, dtype=dtype)
+    # Positions with value 1 in `decoder_causal_attneition` can attend
+    # bidirectionally.
+    if decoder_causal_attention is not None:
+        # [batch, 1, length, length]
+        inputs_mask = make_attention_mask(
+            decoder_causal_attention, decoder_causal_attention, jnp.logical_and, dtype=dtype
+        )
+        masks.append(jnp.logical_or(causal_mask, inputs_mask).astype(dtype))
+    else:
+        masks.append(causal_mask)
+    # Padding mask.
+    masks.append(make_attention_mask(decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=dtype))
+    # Packing mask
+    if decoder_segment_ids is not None:
+        masks.append(make_attention_mask(decoder_segment_ids, decoder_segment_ids, jnp.equal, dtype=dtype))
+    return combine_masks(*masks, dtype=dtype)
+def canonicalize_padding(padding: PaddingLike, rank: int) -> LaxPadding:
+    """ "Canonicalizes conv padding to a jax.lax supported format."""
+    if isinstance(padding, str):
+        return padding
+    if isinstance(padding, int):
+        return [(padding, padding)] * rank
+    if isinstance(padding, Sequence) and len(padding) == rank:
+        new_pad = []
+        for p in padding:
+            if isinstance(p, int):
+                new_pad.append((p, p))
+            elif isinstance(p, tuple) and len(p) == 2:
+                new_pad.append(p)
+            else:
+                break
+        if len(new_pad) == rank:
+            return new_pad
+    raise ValueError(
+        f"Invalid padding format: {padding}, should be str, int,"
+        f" or a sequence of len {rank} where each element is an"
+        f" int or pair of ints."
+    )
+def _conv_dimension_numbers(input_shape):
+    """Computes the dimension numbers based on the input shape."""
+    ndim = len(input_shape)
+    lhs_spec = (0, ndim - 1) + tuple(range(1, ndim - 1))
+    rhs_spec = (ndim - 1, ndim - 2) + tuple(range(0, ndim - 2))
+    out_spec = lhs_spec
+    return lax.ConvDimensionNumbers(lhs_spec, rhs_spec, out_spec)
+class _Conv(nn.Module):
+    """Convolution Module wrapping `lax.conv_general_dilated[_local]`.
+    Attributes:
+      features: number of convolution filters.
+      kernel_size: shape of the convolutional kernel. For 1D convolution,
+        the kernel size can be passed as an integer. For all other cases, it must
+        be a sequence of integers.
+      strides: an integer or a sequence of `n` integers, representing the
+        inter-window strides (default: 1).
+      padding: either the string `'SAME'`, the string `'VALID'`, the string
+        `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
+        high)` integer pairs that give the padding to apply before and after each
+        spatial dimension. A single int is interpeted as applying the same padding
+        in all dims and passign a single int in a sequence causes the same padding
+        to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
+        left-pad the convolution axis, resulting in same-sized output.
+      input_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of `inputs`
+        (default: 1). Convolution with input dilation `d` is equivalent to
+        transposed convolution with stride `d`.
+      kernel_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of the convolution
+        kernel (default: 1). Convolution with kernel dilation
+        is also known as 'atrous convolution'.
+      feature_group_count: integer, default 1. If specified divides the input
+        features into groups.
+      use_bias: whether to add a bias to the output (default: True).
+      mask: Optional mask for the weights during masked convolution. The mask must
+            be the same shape as the convolution weight matrix.
+      dtype: the dtype of the computation (default: infer from input and params).
+      params_dtype: the dtype passed to parameter initializers (default: float32).
+      precision: numerical precision of the computation see `jax.lax.Precision`
+        for details.
+      kernel_init: initializer for the convolutional kernel.
+      bias_init: initializer for the bias.
+    """
+    features: int
+    kernel_size: Sequence[int]
+    strides: Union[None, int, Sequence[int]] = 1
+    padding: PaddingLike = "SAME"
+    input_dilation: Union[None, int, Sequence[int]] = 1
+    kernel_dilation: Union[None, int, Sequence[int]] = 1
+    feature_group_count: int = 1
+    use_bias: bool = True
+    mask: Optional[Array] = None
+    dtype: Optional[DType] = None
+    params_dtype: DType = jnp.float32
+    precision: PrecisionLike = None
+    kernel_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.lecun_normal()
+    bias_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.zeros
+    conv_general_dilated: ConvGeneralDilatedT = lax.conv_general_dilated
+    kernel_axes: Tuple[str, ...] = ()
+    @property
+    def shared_weights(self) -> bool:  # type: ignore
+        """Defines whether weights are shared or not between different pixels.
+        Returns:
+          `True` to use shared weights in convolution (regular convolution).
+          `False` to use different weights at different pixels, a.k.a.
+          "locally connected layer", "unshared convolution", or "local convolution".
+        """
+        ...
+    @nn.compact
+    def __call__(self, inputs: Array) -> Array:
+        """Applies a (potentially unshared) convolution to the inputs.
+        Args:
+          inputs: input data with dimensions (*batch_dims, spatial_dims...,
+            features). This is the channels-last convention, i.e. NHWC for a 2d
+            convolution and NDHWC for a 3D convolution. Note: this is different from
+            the input convention used by `lax.conv_general_dilated`, which puts the
+            spatial dimensions last.
+            Note: If the input has more than 1 batch dimension, all batch dimensions
+            are flattened into a single dimension for the convolution and restored
+            before returning.  In some cases directly vmap'ing the layer may yield
+            better performance than this default flattening approach.  If the input
+            lacks a batch dimension it will be added for the convolution and removed
+            n return, an allowance made to enable writing single-example code.
+        Returns:
+          The convolved data.
+        """
+        if isinstance(self.kernel_size, int):
+            raise TypeError(
+                "Expected Conv kernel_size to be a"
+                " tuple/list of integers (eg.: [3, 3]) but got"
+                f" {self.kernel_size}."
+            )
+        else:
+            kernel_size = tuple(self.kernel_size)
+        def maybe_broadcast(x: Optional[Union[int, Sequence[int]]]) -> Tuple[int, ...]:
+            if x is None:
+                # backward compatibility with using None as sentinel for
+                # broadcast 1
+                x = 1
+            if isinstance(x, int):
+                return (x,) * len(kernel_size)
+            return tuple(x)
+        # Combine all input batch dimensions into a single leading batch axis.
+        num_batch_dimensions = inputs.ndim - (len(kernel_size) + 1)
+        if num_batch_dimensions != 1:
+            input_batch_shape = inputs.shape[:num_batch_dimensions]
+            total_batch_size = int(np.prod(input_batch_shape))
+            flat_input_shape = (total_batch_size,) + inputs.shape[num_batch_dimensions:]
+            inputs = jnp.reshape(inputs, flat_input_shape)
+        # self.strides or (1,) * (inputs.ndim - 2)
+        strides = maybe_broadcast(self.strides)
+        input_dilation = maybe_broadcast(self.input_dilation)
+        kernel_dilation = maybe_broadcast(self.kernel_dilation)
+        padding_lax = canonicalize_padding(self.padding, len(kernel_size))
+        if padding_lax == "CIRCULAR":
+            kernel_size_dilated = [(k - 1) * d + 1 for k, d in zip(kernel_size, kernel_dilation)]
+            zero_pad: List[Tuple[int, int]] = [(0, 0)]
+            pads = zero_pad + [((k - 1) // 2, k // 2) for k in kernel_size_dilated] + [(0, 0)]
+            inputs = jnp.pad(inputs, pads, mode="wrap")
+            padding_lax = "VALID"
+        elif padding_lax == "CAUSAL":
+            if len(kernel_size) != 1:
+                raise ValueError("Causal padding is only implemented for 1D convolutions.")
+            left_pad = kernel_dilation[0] * (kernel_size[0] - 1)
+            pads = [(0, 0), (left_pad, 0), (0, 0)]
+            inputs = jnp.pad(inputs, pads)
+            padding_lax = "VALID"
+        dimension_numbers = _conv_dimension_numbers(inputs.shape)
+        in_features = jnp.shape(inputs)[-1]
+        if self.shared_weights:
+            # One shared convolutional kernel for all pixels in the output.
+            assert in_features % self.feature_group_count == 0
+            kernel_shape = kernel_size + (in_features // self.feature_group_count, self.features)
+        else:
+            if self.feature_group_count != 1:
+                raise NotImplementedError(
+                    f"`lax.conv_general_dilated_local` does not support "
+                    f"`feature_group_count != 1`, got `{self.feature_group_count}`."
+                )
+            # Need to know the spatial output shape of a standard convolution to
+            # create the unshared convolution kernel.
+            conv_output_shape = jax.eval_shape(
+                lambda lhs, rhs: self.conv_general_dilated(  # pylint: disable=g-long-lambda
+                    lhs=lhs,
+                    rhs=rhs,
+                    window_strides=strides,
+                    padding=padding_lax,
+                    dimension_numbers=dimension_numbers,
+                    lhs_dilation=input_dilation,
+                    rhs_dilation=kernel_dilation,
+                ),
+                inputs,
+                jax.ShapedArray(kernel_size + (in_features, self.features), inputs.dtype),
+            ).shape
+            # One (unshared) convolutional kernel per each pixel in the output.
+            kernel_shape = conv_output_shape[1:-1] + (np.prod(kernel_size) * in_features, self.features)
+        if self.mask is not None and self.mask.shape != kernel_shape:
+            raise ValueError(
+                "Mask needs to have the same shape as weights. " f"Shapes are: {self.mask.shape}, {kernel_shape}"
+            )
+        kernel = param_with_axes(
+            "kernel",
+            self.kernel_init,
+            kernel_shape,
+            self.params_dtype,
+            axes=self.kernel_axes,
+        )
+        if self.mask is not None:
+            kernel *= self.mask
+        if self.use_bias:
+            if self.shared_weights:
+                # One bias weight per output channel, shared between pixels.
+                bias_shape = (self.features,)
+            else:
+                # One bias weight per output entry, unshared betwen pixels.
+                bias_shape = conv_output_shape[1:]
+            bias = param_with_axes(
+                "bias",
+                self.bias_init,
+                bias_shape,
+                self.params_dtype,
+                axes=(self.kernel_axes[-1],),
+            )
+        else:
+            bias = None
+        inputs, kernel, bias = promote_dtype(inputs, kernel, bias, dtype=self.dtype)
+        if self.shared_weights:
+            y = self.conv_general_dilated(
+                inputs,
+                kernel,
+                strides,
+                padding_lax,
+                lhs_dilation=input_dilation,
+                rhs_dilation=kernel_dilation,
+                dimension_numbers=dimension_numbers,
+                feature_group_count=self.feature_group_count,
+                precision=self.precision,
+            )
+        else:
+            y = lax.conv_general_dilated_local(
+                lhs=inputs,
+                rhs=kernel,
+                window_strides=strides,
+                padding=padding_lax,
+                filter_shape=kernel_size,
+                lhs_dilation=input_dilation,
+                rhs_dilation=kernel_dilation,
+                dimension_numbers=dimension_numbers,
+                precision=self.precision,
+            )
+        if self.use_bias:
+            bias = bias.reshape((1,) * (y.ndim - bias.ndim) + bias.shape)
+            y += bias
+        if num_batch_dimensions != 1:
+            output_shape = input_batch_shape + y.shape[1:]
+            y = jnp.reshape(y, output_shape)
+        return y
+class Conv(_Conv):
+    """Convolution Module wrapping `lax.conv_general_dilated`.
+    Regular convolution: the same kernel is shared by all output positions
+    (`shared_weights` is `True`).
+    Attributes:
+      features: number of convolution filters.
+      kernel_size: shape of the convolutional kernel. Must be a sequence of
+        integers (e.g. `(3,)` for a 1D convolution); the inherited `__call__`
+        raises a `TypeError` when a bare integer is passed.
+      strides: an integer or a sequence of `n` integers, representing the
+        inter-window strides (default: 1).
+      padding: either the string `'SAME'`, the string `'VALID'`, the string
+        `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
+        high)` integer pairs that give the padding to apply before and after each
+        spatial dimension. A single int is interpreted as applying the same padding
+        in all dims and passing a single int in a sequence causes the same padding
+        to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
+        left-pad the convolution axis, resulting in same-sized output.
+      input_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of `inputs`
+        (default: 1). Convolution with input dilation `d` is equivalent to
+        transposed convolution with stride `d`.
+      kernel_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of the convolution
+        kernel (default: 1). Convolution with kernel dilation
+        is also known as 'atrous convolution'.
+      feature_group_count: integer, default 1. If specified divides the input
+        features into groups.
+      use_bias: whether to add a bias to the output (default: True).
+      mask: Optional mask for the weights during masked convolution. The mask must
+            be the same shape as the convolution weight matrix.
+      dtype: the dtype of the computation (default: infer from input and params).
+      params_dtype: the dtype passed to parameter initializers (default: float32).
+      precision: numerical precision of the computation see `jax.lax.Precision`
+        for details.
+      kernel_init: initializer for the convolutional kernel.
+      bias_init: initializer for the bias.
+    """
+    @property
+    def shared_weights(self) -> bool:
+        """Always `True`: one kernel is shared across all output positions."""
+        return True

whisper_jax/whisper_jax_modeling_flax_whisper.py ADDED Viewed

	@@ -0,0 +1,1686 @@

+# coding=utf-8
+# Copyright 2023 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Flax whisper model."""
+import random
+from functools import partial
+from typing import Optional, Tuple
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+from transformers import WhisperConfig
+from transformers.generation.flax_logits_process import (
+    FlaxLogitsProcessor,
+    FlaxLogitsProcessorList,
+    FlaxWhisperTimeStampLogitsProcessor,
+)
+from transformers.modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxSeq2SeqLMOutput,
+    FlaxSeq2SeqModelOutput,
+)
+from transformers.modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_call_sample_docstring,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from whisper_jax import layers
+from whisper_jax.layers import with_sharding_constraint
+# Module-level logger, obtained through the `transformers.utils.logging` helper.
+logger = logging.get_logger(__name__)
+# Checkpoint and config names for documentation purposes; presumably consumed by
+# the docstring helper decorators imported above (usage is outside this view).
+_CHECKPOINT_FOR_DOC = "openai/whisper-tiny"
+_CONFIG_FOR_DOC = "WhisperConfig"
+# Docstring text with the general model description and the `config` / `dtype`
+# constructor parameters; presumably attached to the model classes through
+# `add_start_docstrings` (imported above) - usage is outside this view.
+WHISPER_START_DOCSTRING = r"""
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.) This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        config ([`WhisperConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
+            inference on GPUs or TPUs. If specified all the computation will be performed with the given `dtype`.
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
+            and [`~FlaxPreTrainedModel.to_bf16`].
+"""
+# Docstring text describing the arguments of the full model forward call;
+# presumably attached through `add_start_docstrings_to_model_forward` (imported
+# above) - usage is outside this view.
+WHISPER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
+            tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
+            is not used. By default the silence in the input log mel spectrogram are ignored.
+        decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
+            [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids) Whisper uses the `decoder_start_token_id` as
+            the starting token for `decoder_input_ids` generation.
+        decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
+            in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not use `position_ids` in the encoder as `input_features` is always the same size and doesn't
+            use masking, but this argument is preserved for compatibility. By default the silence in the input log mel
+            spectrogram are ignored.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
+            tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
+            is not used. By default the silence in the input log mel spectrogram are ignored.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+WHISPER_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`):
+            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
+            [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        encoder_outputs (`tuple(tuple(numpy.ndarray))`):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
+            `last_hidden_state`, of shape `(batch_size, sequence_length, hidden_size)`, is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility,
+            but it is not used. By default the silence in the input log mel spectrogram are ignored.
+        decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
+            in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        past_key_values (`Dict[str, numpy.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+class FlaxStaticForceTokensLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
+    token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
+    to `-inf` so that they are sampled at their corresponding index. This is a static version of the `transformers` logit
+    processor [`FlaxForceTokensLogitsProcessor`] that is compatible with sharded forced tokens.
+    Args:
+        force_token_map (`list`):
+            Map giving token ids and indices where they will be forced to be sampled.
+    """
+    def __init__(self, force_token_map):
+        # The generic `transformers` logit processor builds `force_token_array` as a dictionary - this is not a valid
+        # JAX type, and so we switch to using a JAX array instead
+        force_token_map = jnp.array(force_token_map)
+        # Converts the array of format [[index, token]] containing the tokens to be forced to an array, where the
+        # index of the array corresponds to the index of the token to be forced. For XLA compatibility,
+        # indexes without forced tokens will have a negative value. Note that the last token we ever need to force in
+        # Whisper is at position 3, so we only construct an array up to this index. The native version constructs a tensor
+        # dynamically according to the length of the `force_token_map`. Array shapes need to be concrete for XLA compatibility,
+        # so this is not permitted here.
+        force_token_array = jnp.ones(3, dtype=jnp.int32) * -1
+        for index, token in force_token_map:
+            force_token_array = force_token_array.at[index].set(token)
+        self.force_token_array = jnp.int32(force_token_array)
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        def _force_token(generation_idx):
+            batch_size = scores.shape[0]
+            current_token = self.force_token_array[generation_idx]
+            new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
+            updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
+            new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
+            return new_scores
+        scores = lax.cond(
+            cur_len >= self.force_token_array.shape[0],
+            # If the current length is geq than the length of force_token_array, the processor does nothing.
+            lambda: scores,
+            # Otherwise, it may force a certain token.
+            lambda: lax.cond(
+                self.force_token_array[cur_len] >= 0,
+                # Only valid (positive) tokens are forced
+                lambda: _force_token(cur_len),
+                # Otherwise, the processor does nothing.
+                lambda: scores,
+            ),
+        )
+        return scores
+class FlaxWhisperAttention(nn.Module):
+    config: WhisperConfig
+    embed_dim: int
+    num_heads: int
+    dropout: float = 0.0
+    causal: bool = False
+    bias: bool = True
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        dense = partial(
+            layers.DenseGeneral,
+            self.embed_dim,
+            axis=-1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "joined_kv"),
+        )
+        self.q_proj = dense(use_bias=self.bias)
+        self.k_proj = dense(use_bias=False)
+        self.v_proj = dense(use_bias=self.bias)
+        self.out_proj = layers.DenseGeneral(
+            self.embed_dim,
+            axis=-1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("joined_kv", "embed"),
+            use_bias=self.bias,
+        )
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_target_positions), dtype="bool"), dtype="bool"
+            )
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+        query_states = self.q_proj(hidden_states)
+        if is_cross_attention:
+            key_states = self.k_proj(key_value_states)
+            value_states = self.v_proj(key_value_states)
+        else:
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+        query_states = with_sharding_constraint(query_states, ("batch", "length", "heads", "kv"))
+        key_states = with_sharding_constraint(key_states, ("batch", "length", "heads", "kv"))
+        value_states = with_sharding_constraint(value_states, ("batch", "length", "heads", "kv"))
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                # max_length of cached_key is last dim
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[-1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask,
+                    (0, 0, mask_shift, 0),
+                    (1, 1, query_length, max_decoder_length),
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        dropout_rng = None
+        if not deterministic and self.dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights
+    def _split_heads(self, hidden_state) -> jnp.ndarray:
+        return hidden_state.reshape(hidden_state.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_state) -> jnp.ndarray:
+        return hidden_state.reshape(hidden_state.shape[:2] + (self.embed_dim,))
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        # The following code is largely copied from: https://github.com/google-research/t5x/blob/63d9addf628c6d8c547a407a32095fcb527bb20b/t5x/examples/scalable_t5/layers.py#L280-L284
+        is_initialized = self.has_variable("cache", "cached_key")
+        # The key and value have dimension [batch_size, seq_length, num_heads, head_dim],
+        # but we cache them as [batch_size, num_heads, head_dim, seq_length] as a TPU
+        # fusion optimization. This also enables the "scatter via one-hot
+        # broadcast" trick, which means we do a one-hot broadcast instead of a
+        # scatter/gather operations, resulting in a 3-4x speedup in practice.
+        def swap_dims(x):
+            return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            batch_size, num_heads, head_dim, seq_length = cached_key.value.shape
+            # During fast autoregressive decoding, we feed one position at a time,
+            # and cache the keys and values step by step.
+            # Sanity shape check of cached key against input query.
+            num_updated_cache_vectors = query.shape[1]
+            expected_shape = (batch_size, 1, num_heads, head_dim)
+            if num_updated_cache_vectors == 1 and expected_shape != query.shape:
+                raise ValueError(
+                    f"Autoregressive cache shape error, expected query shape {expected_shape} instead got {query.shape}"
+                )
+            # Create a OHE of the current index. NOTE: the index is increased below.
+            cur_index = cache_index.value
+            # In order to update the key, value caches with the current key and
+            # value, we move the seq_length axis to the back, similar to what we did for
+            # the cached ones above.
+            # Note these are currently the key and value of a single position, since
+            # we feed one position at a time.
+            one_token_key = jnp.moveaxis(key, -3, -1)
+            one_token_value = jnp.moveaxis(value, -3, -1)
+            # Update key, value caches with our new 1d spatial slices.
+            # We implement an efficient scatter into the cache via one-hot
+            # broadcast and addition.
+            if num_updated_cache_vectors > 1:
+                indices = jnp.eye(num_updated_cache_vectors, seq_length)[None, None]
+                key = cached_key.value + jnp.matmul(one_token_key, indices)
+                value = cached_value.value + jnp.matmul(one_token_value, indices)
+            else:
+                one_hot_indices = jax.nn.one_hot(cur_index, seq_length, dtype=key.dtype)
+                key = cached_key.value + one_token_key * one_hot_indices
+                value = cached_value.value + one_token_value * one_hot_indices
+            cached_key.value = key
+            cached_value.value = value
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # Move the keys and values back to their original shapes.
+            key = jnp.moveaxis(key, -1, -3)
+            value = jnp.moveaxis(value, -1, -3)
+            # causal mask for cached decoder self-attention: our single query position should only
+            # attend to those key positions that have already been generated and cached, not the
+            # remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(seq_length) < cur_index + num_updated_cache_vectors,
+                (batch_size,) + (1, num_updated_cache_vectors, seq_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer with MBart->Whisper
+class FlaxWhisperEncoderLayer(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.encoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.self_attn_layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.fc1 = layers.DenseGeneral(
+            self.config.encoder_ffn_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "mlp"),
+        )
+        self.fc2 = layers.DenseGeneral(
+            self.embed_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("mlp", "embed"),
+        )
+        self.final_layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        residual = hidden_states
+        layernorm_output = self.self_attn_layer_norm(hidden_states)
+        layernorm_output = with_sharding_constraint(layernorm_output, ("batch", "length", "embed"))
+        attn_output, attn_weights = self.self_attn(hidden_states=layernorm_output, attention_mask=attention_mask)
+        attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
+        attn_output = residual + attn_output
+        attn_output = with_sharding_constraint(attn_output, ("batch", "length", "embed"))
+        residual = attn_output
+        post_layer_norm = self.final_layer_norm(attn_output)
+        post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
+        fc1_output = self.activation_fn(self.fc1(post_layer_norm))
+        fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
+        fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
+        hidden_states = self.fc2(fc1_output)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayerCollection with MBart->Whisper
+class FlaxWhisperEncoderLayerCollection(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.layers = [
+            FlaxWhisperEncoderLayer(self.config, name=str(i), dtype=self.dtype, params_dtype=self.params_dtype)
+            for i in range(self.config.encoder_layers)
+        ]
+        self.layerdrop = self.config.encoder_layerdrop
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if not deterministic and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                    deterministic,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer with MBart->Whisper
+class FlaxWhisperDecoderLayer(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            causal=True,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.self_attn_layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+        self.encoder_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.encoder_attn_layer_norm = layers.LayerNorm(
+            dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype
+        )
+        self.fc1 = layers.DenseGeneral(
+            self.config.decoder_ffn_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "mlp"),
+        )
+        self.fc2 = layers.DenseGeneral(
+            self.embed_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("mlp", "embed"),
+        )
+        self.final_layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        residual = hidden_states
+        layer_norm_output = self.self_attn_layer_norm(hidden_states)
+        layer_norm_output = with_sharding_constraint(layer_norm_output, ("batch", "length", "embed"))
+        # Self Attention
+        self_attn_output, self_attn_weights = self.self_attn(
+            hidden_states=layer_norm_output, attention_mask=attention_mask, init_cache=init_cache
+        )
+        self_attn_output = self.dropout_layer(self_attn_output, deterministic=deterministic)
+        self_attn_output = residual + self_attn_output
+        self_attn_output = with_sharding_constraint(self_attn_output, ("batch", "length", "embed"))
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = self_attn_output
+            encoder_layer_norm_output = self.encoder_attn_layer_norm(self_attn_output)
+            encoder_layer_norm_output = with_sharding_constraint(
+                encoder_layer_norm_output, ("batch", "length", "embed")
+            )
+            cross_attn_output, cross_attn_weights = self.encoder_attn(
+                hidden_states=encoder_layer_norm_output,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            cross_attn_output = self.dropout_layer(cross_attn_output, deterministic=deterministic)
+            cross_attn_output = residual + cross_attn_output
+            cross_attn_output = with_sharding_constraint(cross_attn_output, ("batch", "length", "embed"))
+        # Fully Connected
+        residual = cross_attn_output
+        post_layer_norm = self.final_layer_norm(cross_attn_output)
+        post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
+        fc1_output = self.activation_fn(self.fc1(post_layer_norm))
+        fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
+        fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
+        hidden_states = self.fc2(fc1_output)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+        return outputs
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayerCollection with MBart->Whisper
+class FlaxWhisperDecoderLayerCollection(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.layers = [
+            FlaxWhisperDecoderLayer(self.config, name=str(i), dtype=self.dtype, params_dtype=self.params_dtype)
+            for i in range(self.config.decoder_layers)
+        ]
+        self.layerdrop = self.config.decoder_layerdrop
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+                # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if not deterministic and (dropout_probability < self.layerdrop):
+                layer_outputs = (None, None, None)
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    init_cache=init_cache,
+                    output_attentions=output_attentions,
+                    deterministic=deterministic,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+class FlaxWhisperEncoder(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.conv1 = layers.Conv(
+            self.config.d_model,
+            kernel_size=(3,),
+            padding=1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("channels", "num_mel", "embed"),
+        )
+        self.conv2 = layers.Conv(
+            self.config.d_model,
+            kernel_size=(3,),
+            strides=2,
+            padding=1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("channels", "embed", "num_mel"),
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.layers = FlaxWhisperEncoderLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.embed_positions = layers.Embed(
+            self.config.max_source_positions, self.config.d_model, dtype=self.dtype, params_dtype=self.params_dtype
+        )
+        self.layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        if input_features.shape[1:] != (self.config.num_mel_bins, self.config.max_source_positions * 2):
+            raise ValueError(
+                "input_features.shape[1:], must be equal to (self.config.num_mel_bins,"
+                f" self.config.max_source_positions * 2) (got {input_features.shape[1:]}, but should be"
+                f" ({self.config.num_mel_bins}, {self.config.max_source_positions * 2}))"
+            )
+        input_features = input_features.transpose(0, 2, 1)
+        hidden_states = jax.nn.gelu(self.conv1(input_features), approximate=False)
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "embed", "num_mel"))
+        hidden_states = jax.nn.gelu(self.conv2(hidden_states), approximate=False)
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        embed_positions = self.embed_positions(jnp.arange(self.config.max_source_positions))
+        hidden_states = hidden_states + embed_positions
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask=None,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_states = outputs[0]
+        last_hidden_states = self.layer_norm(last_hidden_states)
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            hidden_states = hidden_states[:-1] + (last_hidden_states,)
+        if not return_dict:
+            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=last_hidden_states,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+        )
+class FlaxWhisperDecoder(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.embed_tokens = layers.Embed(
+            self.config.vocab_size, self.config.d_model, dtype=self.dtype, params_dtype=self.params_dtype
+        )
+        self.embed_positions = layers.Embed(
+            self.config.max_target_positions, self.config.d_model, dtype=self.dtype, params_dtype=self.params_dtype
+        )
+        self.layers = FlaxWhisperDecoderLayerCollection(self.config, dtype=self.dtype, params_dtype=self.params_dtype)
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.layer_norm = layers.LayerNorm(dtype=self.dtype, epsilon=1e-5, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        position_ids: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        input_embeds = self.embed_tokens(input_ids)
+        position_embeds = self.embed_positions(position_ids)
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_states = outputs[0]
+        last_hidden_states = self.layer_norm(last_hidden_states)
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            hidden_states = hidden_states[:-1] + (last_hidden_states,)
+        if not return_dict:
+            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=last_hidden_states,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+class FlaxWhisperModule(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.encoder = FlaxWhisperEncoder(self.config, dtype=self.dtype, params_dtype=self.params_dtype)
+        self.decoder = FlaxWhisperDecoder(self.config, dtype=self.dtype, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        decoder_input_ids: jnp.ndarray,
+        decoder_attention_mask: jnp.ndarray,
+        decoder_position_ids: jnp.ndarray,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        encoder_outputs = self.encoder(
+            input_features,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+        return FlaxSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+    def _get_encoder_module(self):
+        return self.encoder
+    def _get_decoder_module(self):
+        return self.decoder
+class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
+    config_class = WhisperConfig
+    base_model_prefix: str = "model"
+    main_input_name = "input_features"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: WhisperConfig,
+        input_shape: Tuple[int, int, int] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        params_dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        if input_shape is None:
+            input_shape = (1, config.num_mel_bins, 2 * config.max_source_positions)
+        module = self.module_class(config=config, dtype=dtype, params_dtype=params_dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_features = jnp.zeros(input_shape, dtype="f4")
+        input_features = input_features.at[(..., -1)].set(self.config.eos_token_id)
+        decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        batch_size, sequence_length = decoder_input_ids.shape
+        decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        random_params = self.module.init(
+            rngs,
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+        )["params"]
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartPreTrainedModel.init_cache with Bart->Whisper
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Build the decoder cache by running `self.module.init` on the decoder only, with dummy inputs and
+        `init_cache=True`.
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+                cross-attention of the decoder.
+        Returns:
+            The unfrozen `"cache"` collection of the variables produced by the init call.
+        """
+        # init input variables to retrieve cache
+        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")  # dummy ids of shape (batch_size, max_length)
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+        )
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0),  # fixed key for this init call
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],  # first element of `encoder_outputs`
+            init_cache=True,
+            method=_decoder_forward,  # we only need to call the decoder to init the cache
+        )
+        return unfreeze(init_variables["cache"])
+    @add_start_docstrings(WHISPER_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=WhisperConfig)
+    def encode(
+        self,
+        input_features: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+        **kwargs,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        def _encoder_forward(module, input_features, **kwargs):
+            encode_module = module._get_encoder_module()
+            return encode_module(input_features, **kwargs)
+        return self.module.apply(
+            {"params": params or self.params},
+            input_features=jnp.array(input_features, dtype="f4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+    @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=WhisperConfig)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        encoder_hidden_states = encoder_outputs[0]
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+            if decoder_attention_mask is not None:
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+            else:
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxWhisperAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past = outputs
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past = outputs
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+        return outputs
+    @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        decoder_input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # prepare decoder inputs
+        if decoder_position_ids is None:
+            if decoder_attention_mask is not None:
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+            else:
+                batch_size, sequence_length = decoder_input_ids.shape
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+        return self.module.apply(
+            {"params": params or self.params},
+            input_features=jnp.array(input_features, dtype="f4"),
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+        )
+@add_start_docstrings(
+    "The bare Whisper Model transformer outputting raw hidden-states without any specific head on top.",
+    WHISPER_START_DOCSTRING,
+)
+class FlaxWhisperModel(FlaxWhisperPreTrainedModel):  # pretrained-model wrapper whose module is `FlaxWhisperModule`
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32  # class-level default; presumably the dtype the parameters are kept in -- TODO confirm
+    module_class = FlaxWhisperModule  # instantiated by `FlaxWhisperPreTrainedModel.__init__` via `self.module_class(...)`
+append_call_sample_docstring(FlaxWhisperModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)  # helper defined outside this chunk; presumably adds a usage sample to `__call__`'s docstring -- confirm
+class FlaxWhisperForConditionalGenerationModule(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.model = FlaxWhisperModule(config=self.config, dtype=self.dtype, params_dtype=self.params_dtype)
+        self.lm_head = layers.DenseGeneral(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "vocab"),
+        )
+    def _get_encoder_module(self):
+        return self.model.encoder
+    def _get_decoder_module(self):
+        return self.model.decoder
+    def __call__(
+        self,
+        input_features,
+        decoder_input_ids,
+        decoder_attention_mask: jnp.ndarray = None,
+        decoder_position_ids: jnp.ndarray = None,
+        position_ids: jnp.ndarray = None,
+        attention_mask: jnp.ndarray = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        outputs = self.model(
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.model.decoder.embed_tokens.variables["params"]["embedding"]
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return output
+        return FlaxSeq2SeqLMOutput(
+            logits=lm_logits,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+@add_start_docstrings("The Whisper Model with a language modeling head.", WHISPER_START_DOCSTRING)
+class FlaxWhisperForConditionalGeneration(FlaxWhisperPreTrainedModel):
+    module_class = FlaxWhisperForConditionalGenerationModule
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=WhisperConfig)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        encoder_hidden_states = encoder_outputs[0]
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+            if decoder_attention_mask is not None:
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+            else:
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length), dtype="i4")
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxWhisperAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            outputs = decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+            hidden_states = outputs[0]
+            if self.config.tie_word_embeddings:
+                shared_embedding = module.model.decoder.embed_tokens.variables["params"]["embedding"]
+                lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+            else:
+                lm_logits = module.lm_head(hidden_states)
+            return lm_logits, outputs
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        if past_key_values is None:
+            lm_logits, decoder_outputs = outputs
+        else:
+            (lm_logits, decoder_outputs), past = outputs
+        if return_dict:
+            outputs = FlaxCausalLMOutputWithCrossAttentions(
+                logits=lm_logits,
+                hidden_states=decoder_outputs.hidden_states,
+                attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+            )
+        else:
+            outputs = (lm_logits,) + decoder_outputs[1:]
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+        return outputs
+    def generate(
+        self,
+        input_features,
+        generation_config=None,
+        logits_processor=None,
+        return_timestamps=None,
+        task=None,
+        language=None,
+        is_multilingual=None,
+        **kwargs,
+    ):
+        if generation_config is None:
+            generation_config = self.generation_config
+        if return_timestamps is not None:
+            generation_config.return_timestamps = return_timestamps
+        if task is not None:
+            generation_config.task = task
+        if is_multilingual is not None:
+            generation_config.is_multilingual = is_multilingual
+        if language is not None:
+            generation_config.language = language
+        if kwargs is not None and "decoder_input_ids" in kwargs:
+            decoder_input_length = len(kwargs["decoder_input_ids"])
+        else:
+            decoder_input_length = 1
+        forced_decoder_ids = []
+        if hasattr(generation_config, "is_multilingual") and generation_config.is_multilingual:
+            if hasattr(generation_config, "language"):
+                forced_decoder_ids.append((1, generation_config.lang_to_id[generation_config.language]))
+            else:
+                forced_decoder_ids.append((1, None))
+            if hasattr(generation_config, "task"):
+                forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task]))
+            else:
+                forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
+        if (
+            hasattr(generation_config, "return_timestamps") and generation_config.return_timestamps
+        ) or return_timestamps:
+            logits_processor = [
+                FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, decoder_input_length)
+            ]
+        else:
+            if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
+                idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
+                forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
+        if len(forced_decoder_ids) > 0:
+            generation_config.forced_decoder_ids = forced_decoder_ids
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+    def pipeline_generate(
+        self,
+        input_features,
+        forced_decoder_ids,
+        return_timestamps=False,
+        generation_config=None,
+        **kwargs,
+    ):
+        if generation_config is None:
+            generation_config = self.generation_config
+        # override the generation config forced decoder ids in preference of the ones we have set
+        generation_config.forced_decoder_ids = None
+        logits_processor = FlaxLogitsProcessorList()
+        logits_processor.append(FlaxStaticForceTokensLogitsProcessor(forced_decoder_ids))
+        if hasattr(generation_config, "return_timestamps") and return_timestamps:
+            logits_processor.append(FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, 1))
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        max_length,
+        attention_mask: Optional[jax.Array] = None,
+        decoder_attention_mask: Optional[jax.Array] = None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is not None:
+            position_ids = decoder_attention_mask.cumsum(-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        return {
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "encoder_attention_mask": attention_mask,
+            "decoder_attention_mask": extended_attention_mask,
+            "decoder_position_ids": position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
+        return model_kwargs
+FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING = r"""
+    Returns:
+    Transcription example:
+    ```python
+    >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+    >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+    >>> input_features = inputs.input_features
+    >>> generated_ids = model.generate(input_ids=input_features)
+    >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    >>> transcription
+    ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
+    ```
+"""
+overwrite_call_docstring(
+    FlaxWhisperForConditionalGeneration, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING
+)
+append_replace_return_docstrings(
+    FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
+)

whisper_jax/whisper_jax_partitioner.py ADDED Viewed

	@@ -0,0 +1,939 @@

+# coding=utf-8
+# Copyright 2023 The T5X Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for partitioning."""
+import abc
+import collections
+import dataclasses
+import typing
+from typing import Any, Callable, Optional, Sequence, Tuple, Union
+import cached_property
+import jax
+import numpy as np
+from absl import logging
+from flax import traverse_util
+from flax.linen import partitioning as flax_partitioning
+from jax import numpy as jnp
+from jax import random
+from jax.experimental import multihost_utils
+from jax.experimental.mesh_utils import create_hybrid_device_mesh
+from jax.experimental.pjit import pjit as jax_pjit
+from jax.sharding import Mesh, PartitionSpec
+# Type aliases used throughout this module.
+# A JAX device object; left untyped.
+JaxDevice = Any
+TpuMesh = Tuple[int, int, int, int]  # (x, y, z, num_cores).
+# Two-element bounds used for non-TPU platforms (see `bounds_from_last_device`).
+OtherMesh = Tuple[int, int]
+HardwareMesh = Union[TpuMesh, OtherMesh]
+# The concrete pytree-definition type, obtained from the structure of a trivial tree.
+PyTreeDef = type(jax.tree_util.tree_structure(None))
+TrainState = Any
+# Sequence of (logical axis name, mesh axis name or None) pairs.
+LogicalAxisRules = Sequence[Tuple[str, Optional[str]]]
+# Type checkers see a plain `property`; at runtime the name is rebound from the
+# `cached_property` module to its `cached_property` decorator.
+if typing.TYPE_CHECKING:  # See b/163639353
+    cached_property = property  # pylint: disable=invalid-name
+else:
+    cached_property = cached_property.cached_property
+class AxisNames(tuple):
+    """Tuple of strings specifying name for each axis.
+    We create a separate class for this so JAX's pytree utilities can distinguish
+    it from a tuple that should be treated as a pytree, instead treating it as a
+    leaf.
+    """
+    def __new__(cls, *names):
+        return tuple.__new__(AxisNames, names)
+    def __repr__(self):
+        return "AxisNames%s" % tuple.__repr__(self)
+# pjit wrappers for cpu fallback.
+# ----------------------------------------------------------------------------
+# TODO(levskaya): This function is now no different than jax_pjit, but callers
+# currently depend on `backend` argument
+def pjit(
+    fun: Callable,  # pylint: disable=g-bare-generic
+    in_axis_resources,
+    out_axis_resources,
+    static_argnums: Union[int, Sequence[int]] = (),
+    donate_argnums: Union[int, Sequence[int]] = (),
+    backend: Optional[str] = None,
+):
+    """Wrapper for pjit."""
+    del backend
+    return jax_pjit(
+        fun, in_axis_resources, out_axis_resources, static_argnums=static_argnums, donate_argnums=donate_argnums
+    )
+# pjit wrappers for cpu fallback.
+# -----------------------------------------------------------------------------
+# TODO(levskaya): upstream this fallback behavior to jax pjit.
+def pjit_with_cpu_fallback(
+    fun: Callable,  # pylint: disable=g-bare-generic
+    in_axis_resources,
+    out_axis_resources,
+    static_argnums: Union[int, Sequence[int]] = (),
+    donate_argnums: Union[int, Sequence[int]] = (),
+    backend: Optional[str] = None,
+):
+    """Wrapper for pjit that calls normal jit on cpu."""
+    if jax.devices(backend)[0].platform == "cpu":
+        return jax.jit(fun, static_argnums=static_argnums, donate_argnums=donate_argnums)
+    else:
+        return jax_pjit(
+            fun, in_axis_resources, out_axis_resources, static_argnums=static_argnums, donate_argnums=donate_argnums
+        )
+def with_sharding_constraint(x, axis_resources):
+    """Wrapper for pjit with_sharding_constraint, no-op on cpu or outside pjit."""
+    if jax.devices()[0].platform == "cpu" or not global_mesh_defined():
+        return x
+    else:
+        return jax.experimental.pjit.with_sharding_constraint(x, axis_resources)
+# pjit Mesh creation functions.
+# -----------------------------------------------------------------------------
+def bounds_from_last_device(last_device: JaxDevice) -> HardwareMesh:
+    """Get the bound from the given last device."""
+    # Must be passed the device at the highest-coordinate corner of the
+    # relevant mesh, which is a requirement we know is satisfied by the last
+    # device in jax.devices().
+    if hasattr(last_device, "coords"):
+        x, y, z = last_device.coords
+        return x + 1, y + 1, z + 1, last_device.core_on_chip + 1
+    else:
+        # On non-TPU platforms, the "mesh" is hosts x devices per host in order
+        # to take advantage of faster within-host interconnect.
+        return jax.host_count(), jax.local_device_count()
+def get_coords(device: JaxDevice) -> HardwareMesh:
+    """Returns the coordinates of the given device."""
+    if hasattr(device, "coords"):
+        return (*device.coords, device.core_on_chip)
+    return (device.process_index, device.id % jax.local_device_count())
+def global_mesh_defined():
+    """Checks if global xmap/pjit mesh resource environment is defined."""
+    maps_env = jax.experimental.maps.thread_resources.env
+    return maps_env.physical_mesh.devices.shape != ()  # pylint: disable=g-explicit-bool-comparison
+def get_mesh(
+    model_parallel_submesh: HardwareMesh,
+    input_devices: Sequence[JaxDevice] = (),
+    input_local_devices: Sequence[JaxDevice] = (),
+    tile_by_host_if_needed: bool = True,
+    backend: Optional[str] = None,
+) -> Mesh:
+    """Construct an xmap/pjit Mesh for the given model-parallel submesh.
+    The resulting mesh has two resource axes: 'model', with the provided submesh
+    shape, and 'data', which covers the rest of the mesh.
+    Args:
+      model_parallel_submesh: a HardwareMesh spec, namely (x,y,z,core) on TPU for
+        a single model-parallel replica's "tile" in the physical device mesh. The
+        first three elements (`x`, `y`, and `z`) should be factors of the pod
+        slice; e.g., if you are using df_4x8, then `x` should be a factor of 4
+        (one of 1, 2, 4), `y` should be a factor of 8 (one of 1, 2, 4, 8), and `z`
+        must be 1, because TPU v3 slices are only 2D. `z` can be >1 for TPU v4
+        (and maybe later TPUs) that allow 3D slices. `core` is the number of cores
+        to use from each TPU node. As communication is usually fastest inside the
+        same node, if you need a tile of more than 1 core, then
+        you should first increase `core`: e.g., for TPU v3, (1,1,1,2) is better
+        than (2,1,1,1). To pick a good spec, try a few possible values until you
+        get high TPU utilization.
+      input_devices: the devices to use, will use jax.devices() if this is not
+        set.
+      input_local_devices: the local devices to use, will use jax.local_devices()
+        if this is not set.
+      tile_by_host_if_needed: JAX currently requires that the parts of any sharded
+        array that are located on one host's local devices form a single
+        contiguous slice. A best effort will be made to achieve this without
+        "tiling" the device assignment over hosts (which can reduce XLA collective
+        performance). If this flag is True, then the device assignment will be
+        tiled over hosts if necessary to satisfy this constraint and create a
+        buildable mesh; if false, mesh construction will fail instead.
+      backend: get devices from the pinned backend, if specified. This is
+        useful for explicitly specifying the devices other than relying on
+        jax_platform_name.
+    Returns:
+      A xmap / pjit Mesh containing the virtual device mesh with data, model axes.
+    Raises:
+      AssertionError: if the submesh does not evenly divide the global mesh, or if a
+        tiling precondition checked below does not hold.
+    """
+    input_devices = input_devices or jax.devices(backend)
+    input_local_devices = input_local_devices or jax.local_devices(0, backend)
+    # Sort input_devices based on coords, as backends might not return devices
+    # in order.
+    last_device = sorted(input_devices, key=get_coords)[-1]
+    last_input_local_devices = sorted(input_local_devices, key=get_coords)[-1]
+    logging.info(
+        "last device coords : %r\nlast local device coords: %r",
+        get_coords(last_device),
+        get_coords(last_input_local_devices),
+    )
+    global_hardware_mesh = bounds_from_last_device(last_device)
+    mesh_ndim = len(global_hardware_mesh)
+    local_hardware_mesh = bounds_from_last_device(last_input_local_devices)
+    mesh_err = (
+        f"each dimension of the model parallel submesh {model_parallel_submesh} "
+        "must be a factor of the corresponding dimension of the global device "
+        f"mesh {global_hardware_mesh}"
+    )
+    # Every global dimension must be divisible by the matching submesh dimension ...
+    assert not any(g % m for g, m in zip(global_hardware_mesh, model_parallel_submesh)), mesh_err
+    # ... and by the matching per-host (local) dimension.
+    assert not any(g % l for g, l in zip(global_hardware_mesh, local_hardware_mesh))
+    # Object array shaped like the global mesh, holding each device at its coordinates.
+    devices = np.empty(global_hardware_mesh, dtype=object)
+    for device in input_devices:
+        device_coords = get_coords(device)
+        devices[device_coords] = device
+    tile_by_host = tile_by_host_if_needed
+    # Only the 4-D (TPU-style) layout gets the axis-swapping special cases below.
+    if len(global_hardware_mesh) == 4:
+        # enable contiguous local chunks without host tiling by making Z major
+        global_hardware_mesh = typing.cast(Tuple[int, int, int, int], global_hardware_mesh)
+        model_parallel_submesh = typing.cast(Tuple[int, int, int, int], model_parallel_submesh)
+        gx, gy, gz, gc = global_hardware_mesh
+        mx, my, mz, mc = model_parallel_submesh
+        if (mx == gx > 1 and my == mz == 1) or (mx == 1 and my == gy > 1 and mz == gz > 1):
+            logging.info("ensuring YZ plane has a Z-major device order")
+            # YZ should be ZY
+            assert mc == gc, (mc, gc)
+            global_hardware_mesh = gx, gz, gy, gc
+            model_parallel_submesh = mx, mz, my, mc
+            devices = devices.swapaxes(1, 2)
+            tile_by_host = False
+        # NOTE(review): this test reuses gx..gc / mx..mc unpacked above, not the tuples
+        # possibly reassigned by the previous branch - confirm that is intended.
+        if (my == gy > 1 and mx == mz == 1) or (my == 1 and mx == gx > 1 and mz == gz > 1):
+            logging.info("ensuring XZ plane has a Z-major device order")
+            # XZ should be ZX
+            assert mc == gc, (mc, gc)
+            global_hardware_mesh = gz, gy, gx, gc
+            model_parallel_submesh = mz, my, mx, mc
+            devices = devices.swapaxes(0, 2)
+            tile_by_host = False
+    if tile_by_host:
+        logging.warning(
+            "Tiling device assignment mesh by hosts, which may lead to "
+            "reduced XLA collective performance. To avoid this, modify "
+            "the model parallel submesh or run with more tasks per host."
+        )
+        tile_err = (
+            "to tile the mesh by hosts, each dimension of the model parallel "
+            "submesh must be either a factor or a multiple of the corresponding "
+            "dimension of the per-host submesh"
+        )
+        def dh_dd_mh_md(g: int, m: int, l: int) -> Tuple[int, int, int, int]:
+            """Split a global mesh dimension into four tiling components.
+            Args:
+              g: global mesh bounds dimension size
+              m: model-parallel submesh bounds dimension size
+              l: local submesh bounds dimension size
+            Returns:
+              The resulting tuple divides the dimension into the hosts component of
+              the data-parallel submesh, the devices component of the data-parallel
+              submesh, the hosts component of the model-parallel submesh, and the
+              devices component of the model-parallel submesh.
+            """
+            d = g // m
+            if m >= l:
+                assert not m % l, tile_err
+                return (d, 1, m // l, l)
+            else:
+                assert not l % m, tile_err
+                return (d // (l // m), l // m, 1, m)
+        # e.g. [(x_data_hosts, x_data_devs, x_model_hosts, x_model_devs), ...]
+        dh_dd_mh_md_tups = map(dh_dd_mh_md, global_hardware_mesh, model_parallel_submesh, local_hardware_mesh)
+        # reshape to e.g. (x_dh, x_dd, x_mh, x_md, y_dh, ...)
+        devices = devices.reshape(*(s for t in dh_dd_mh_md_tups for s in t))  # pylint: disable=g-complex-comprehension
+        # TODO(jekbradbury): reorder local subgroups for ring locality
+        # Transpose to [data_host], [data_device], [model_host], [model_device]
+        # block ordering e.g. (x_dh, y_dh, ..., x_dd, y_dd, ...)
+        devices = devices.transpose(
+            *(4 * i for i in range(mesh_ndim)),
+            *(4 * i + 1 for i in range(mesh_ndim)),
+            *(4 * i + 2 for i in range(mesh_ndim)),
+            *(4 * i + 3 for i in range(mesh_ndim)),
+        )
+    else:
+        # e.g. [(x_data, x_model), (y_data, y_model), ...]
+        model_data_tups = [(g // m, m) for g, m in zip(global_hardware_mesh, model_parallel_submesh)]
+        # reshape to e.g. (x_data, x_model, y_data, y_model...)
+        devices = devices.reshape(*(s for t in model_data_tups for s in t))  # pylint: disable=g-complex-comprehension
+        # TODO(jekbradbury): reorder small subgroups for ring locality
+        # transpose to e.g. (x_data, y_data, ..., x_model, ...)
+        devices = devices.transpose(*(2 * i for i in range(mesh_ndim)), *(2 * i + 1 for i in range(mesh_ndim)))
+    # reshape to (data, model)
+    devices = devices.reshape(-1, np.prod(model_parallel_submesh))
+    global_mesh = Mesh(devices, ["data", "model"])
+    logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
+    logging.info("global_mesh devices: %s", global_mesh.devices)
+    logging.info("global_mesh devices shape: %s", global_mesh.devices.shape)
+    return global_mesh
+def get_cpu_mesh() -> Mesh:
+    """Trivial mesh for CPU Testing."""
+    devices = np.empty((jax.host_count(), jax.local_device_count()), dtype=object)
+    for device in jax.devices():
+        devices[device.process_index, device.id % jax.local_device_count()] = device
+    return Mesh(devices, ["data", "model"])
+def get_gpu_mesh(num_partitions: int) -> Mesh:
+    """Mesh for GPUs that preferentially places 'model' on NVLink."""
+    nvlink_size = jax.local_device_count()
+    dcn_size = jax.process_count()
+    nvlink_mp = min(num_partitions, nvlink_size)
+    nvlink_dp, extra1 = divmod(nvlink_size, nvlink_mp)
+    dcn_mp, extra2 = divmod(num_partitions, nvlink_mp)
+    assert not (extra1 or extra2), (
+        "number of partitions on GPU must be a factor" " or multiple of the number of local devices"
+    )
+    dcn_dp = dcn_size // dcn_mp
+    devices = create_hybrid_device_mesh(
+        mesh_shape=[nvlink_dp, nvlink_mp], dcn_mesh_shape=[dcn_dp, dcn_mp], process_is_granule=True
+    )
+    global_mesh = Mesh(devices, ["data", "model"])
+    logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
+    logging.info("global_mesh devices: %s", global_mesh.devices)
+    return global_mesh
+def default_mesh(
+    num_partitions: int, model_parallel_submesh: Optional[HardwareMesh] = None, backend: Optional[str] = None
+) -> Mesh:
+    """Attempt to return a default mesh for simple cases.
+    Args:
+      num_partitions: number of partitions to use, will be ignored if
+        model_parallel_submesh is provided.
+      model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use as
+        the model-parallel device tile.
+      backend: get devices from the pinned backend, if specified. This is useful
+        for explicitly specifying the devices other than relying on
+        jax_platform_name.
+    Returns:
+      xmap/pjit 2D Mesh with 'data', 'model' mesh axes.
+    """
+    last_device = jax.devices(backend)[-1]
+    platform = last_device.platform
+    device_kind = last_device.device_kind
+    bounds = bounds_from_last_device(last_device)
+    if model_parallel_submesh:
+        return get_mesh(model_parallel_submesh, backend=backend)
+    if platform == "cpu":
+        return get_cpu_mesh()
+    elif platform == "gpu":
+        return get_gpu_mesh(num_partitions)
+    mps = None
+    if device_kind in ("TPU v2", "TPU v3"):
+        if num_partitions == 1:
+            mps = (1, 1, 1, 1)
+        elif num_partitions == 2:
+            mps = (1, 1, 1, 2)
+        elif num_partitions == 4:
+            mps = (2, 1, 1, 2)
+        elif num_partitions == 8:
+            mps = (2, 2, 1, 2)
+        elif num_partitions == 16:
+            mps = (4, 2, 1, 2)
+    # assume the use of megacore on TPU v4
+    elif (device_kind == "TPU v4" or device_kind == "TPU v4 lite") and bounds[3] == 1:
+        if num_partitions == 1:
+            mps = (1, 1, 1, 1)
+        elif num_partitions == 2:
+            mps = (1, 2, 1, 1)
+        elif num_partitions == 4:
+            if bounds[0] >= 4:
+                mps = (4, 1, 1, 1)
+            else:
+                mps = (2, 2, 1, 1)
+        elif num_partitions == 8:
+            if bounds[2] >= 8:
+                mps = (1, 1, 8, 1)
+            else:
+                mps = (4, 2, 1, 1)
+        elif num_partitions == 16:
+            if bounds[2] >= 16:
+                mps = (1, 1, 16, 1)
+            elif bounds[0] >= 8:
+                mps = (8, 2, 1, 1)
+            elif bounds[0] >= 4:
+                mps = (4, 4, 1, 1)
+            else:
+                mps = (2, 2, 4, 1)
+    if mps is None:
+        raise ValueError(
+            "No default mesh for this configuration: specify " "config.model_parallel_submesh explicitly."
+        )
+    return get_mesh(mps, backend=backend)
+# Data chunking helper.
+# -----------------------------------------------------------------------------
+@dataclasses.dataclass
+class LocalChunkInfo:
+    """Slice and replica index describing one host's chunk of a sharded array."""
+    # The logical slice of an array located on this host's local devices.
+    # (One `slice` object per array dimension.)
+    slice: Tuple[slice, ...]
+    # A unique index for this host/local chunk among chunks with the same slice.
+    replica_id: int
+class LocalChunker:
+    """Utility class to aid chunking of sharded arrays in multihost settings."""
+    def __init__(self, global_mesh: Mesh):
+        self.global_mesh = global_mesh
+        local_mesh = global_mesh.local_mesh
+        first_local_device = local_mesh.devices.reshape(-1)[0]
+        host_location = collections.OrderedDict(
+            zip(global_mesh.shape.keys(), list(zip(*np.nonzero(global_mesh.devices == first_local_device)))[0])
+        )
+        self.num_chunks = collections.OrderedDict()
+        self.chunk_ids = collections.OrderedDict()
+        self.mesh_axes = list(global_mesh.shape.keys())
+        for mesh_axis in self.mesh_axes:
+            num_devices_per_chunk = local_mesh.shape[mesh_axis]
+            self.num_chunks[mesh_axis] = global_mesh.shape[mesh_axis] // num_devices_per_chunk
+            self.chunk_ids[mesh_axis] = host_location[mesh_axis] // num_devices_per_chunk
+    def get_local_chunk_info(
+        self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
+    ) -> LocalChunkInfo:
+        """Get the local chunk info for a given array shape and sharded axes.
+        Args:
+          global_shape: the global, unsharded shape of the array to chunk.
+          mesh_axes: a sequence of names (or None) of equal rank to `global_shape`
+            that specifies which mesh dimensions the array is sharded along.
+        Returns:
+          LocalChunkInfo containing the logical slices of the array found on this
+          host's local devices, as well as the replica index for this chunk among
+          chunks with the same slice. The latter is used to determine which
+          host should write this chunk during checkpointing.
+        """
+        local_slice = [slice(None) for dim in global_shape]
+        sharded_mesh_axes = set()
+        for i, (mesh_axis, size) in enumerate(zip(mesh_axes, global_shape)):
+            if not mesh_axis:
+                continue
+            sharded_mesh_axes.add(mesh_axis)
+            if not isinstance(mesh_axis, str):
+                raise NotImplementedError("TODO(jekbradbury)")
+            chunk_id = self.chunk_ids[mesh_axis]
+            chunk_size = size // self.num_chunks[mesh_axis]
+            local_slice[i] = slice(chunk_id * chunk_size, (chunk_id + 1) * chunk_size)
+        replicated_mesh_axes = [mesh_axis for mesh_axis in self.mesh_axes if mesh_axis not in sharded_mesh_axes]
+        replica_id = 0
+        for mesh_axis in replicated_mesh_axes:
+            chunk_id = self.chunk_ids[mesh_axis]
+            replica_id = replica_id * self.num_chunks[mesh_axis] + chunk_id
+        return LocalChunkInfo(tuple(local_slice), replica_id)
+def standard_logical_axis_rules(
+    activation_partitioning_dims: int = 1,
+    parameter_partitioning_dims: int = 1,
+    additional_rules: Optional[LogicalAxisRules] = None,
+) -> LogicalAxisRules:
+    """Default sharding rules for T5X model in terms of logical axis names.
+    Args:
+      activation_partitioning_dims: enables 2-D activation sharding when set to 2.
+      parameter_partitioning_dims: enables 2-D parameter sharding when set to 2.
+      additional_rules: additional rules (a sequence of tuples) that will be
+        appended to the standard rules.
+    Returns:
+      Sequence of logical axis rules
+    """
+    logging.info(
+        "`activation_partitioning_dims` = %d, `parameter_partitioning_dims` = %d",
+        activation_partitioning_dims,
+        parameter_partitioning_dims,
+    )
+    if activation_partitioning_dims == 1 and parameter_partitioning_dims == 1:
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("embed", None),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),  # joined heads+kv dim in 2D attn param layouts
+        ]
+    elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 1:
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),
+            ("embed", "model"),
+        ]
+    elif activation_partitioning_dims == 1 and parameter_partitioning_dims == 2:
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),
+            ("embed", "data"),
+        ]
+    elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 2:
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),
+            ("embed", "model"),
+            ("embed", "data"),
+        ]
+    else:
+        raise ValueError(
+            f"`activation_partitioning_dims` = {activation_partitioning_dims} "
+            f"`parameter_partitioning_dims` = {parameter_partitioning_dims} "
+            "is not supported."
+        )
+    # Add the common rules for the replicated logical axes names.
+    replicated_rules = [
+        ("relpos_buckets", None),
+        ("abspos_buckets", None),
+        ("length", None),
+        ("layers", None),
+        ("stack", None),
+        ("mlp_activations", None),
+    ]
+    rules.extend(replicated_rules)
+    if additional_rules:
+        rules.extend(additional_rules)
+    return rules
+# NB: This needs to be top-level for the jax compilation cache.
+def _id_fn(x, ix):
+    """Identity function for copying parameters to the devices, sharded."""
+    # A pure identity such as `lambda x, *: x` can get optimized away, so we
+    # include a random.split as a cheap function that cannot be optimized away.
+    y = random.split(random.PRNGKey(jnp.array(ix, dtype=jnp.uint32)))
+    return x, y
+@dataclasses.dataclass
+class DataLayout:
+    """Represents data layout for the partitioned model."""
+    # NOTE(review): field meanings below are inferred from the names; confirm against
+    # `get_data_layout`, which fills this structure.
+    # Batch size associated with this layout.
+    batch_size: int
+    # Index of this host's data shard (presumably in [0, num_shards)).
+    shard_id: int
+    # Total number of data shards.
+    num_shards: int
+    # Whether this host is the first one within its replica set.
+    is_first_host_in_replica_set: bool
+# Aliases for readability only: both are arbitrary callables as far as typing goes.
+PartitionedCallable = Callable[..., Any]
+CompiledPartitionedCallable = Callable[..., Any]
+class BasePartitioner(metaclass=abc.ABCMeta):
+    """Interface for partitioning computations across hardware devices."""
+    def __init__(
+        self,
+        num_partitions: Optional[int] = None,
+        model_parallel_submesh: Optional[HardwareMesh] = None,
+        params_on_devices: bool = True,
+        backend: Optional[str] = None,
+    ):
+        """Configures the partitioner.
+        Args:
+          num_partitions: the number of partitions to use. Ignored if
+            `model_parallel_submesh` is provided.
+          model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use
+            as the model-parallel device tile. This submesh is used for the larger
+            of the two parameter dimensions, and, if 2-D activation sharding is
+            enabled, for the model dimension of activations. The rest of the mesh is
+            used for data parallelism and, if 2-D parameter sharding is enabled, the
+            other parameter dimension.
+          params_on_devices: whether to keep the params on devices, if False -
+            params stay in the host memory. Note that some partitioners might ignore
+            this setting, for example if they don't support storing all params on
+            device memory.
+          backend: get devices from the pinned backend, if specified. This is useful
+            for explicitly specifying the devices other than relying on
+            jax_platform_name.
+        """
+        if not num_partitions and not model_parallel_submesh:
+            raise ValueError("At least one of `num_partitions` or " "`model_parallel_submesh` must be set.")
+        if model_parallel_submesh is not None and len(model_parallel_submesh) != 4:
+            logging.error(
+                (
+                    "`model_parallel_submesh` must be either None or a 4-tuple. Got"
+                    " `model_parallel_submesh`=%s. A ValueError will be raised"
+                    " beginning March 1, 2022."
+                ),
+                model_parallel_submesh,
+            )
+        if bool(num_partitions) and bool(model_parallel_submesh):
+            logging.error(
+                "At most one of `num_partitions` or `model_parallel_submesh` can be "
+                "set. Got `num_partitions=%s` and `model_parallel_submesh`=%s. A "
+                "ValueError will be raised beginning March 21, 2022.",
+                num_partitions,
+                model_parallel_submesh,
+            )
+        self._num_partitions = num_partitions
+        self._model_parallel_submesh = model_parallel_submesh
+        self._params_on_devices = params_on_devices
+        self._data_axis = "data"
+        self._backend = backend
+    @property
+    def mesh(self) -> Mesh:
+        raise NotImplementedError
+    @property
+    def data_partition_spec(self) -> PartitionSpec:
+        return PartitionSpec(self._data_axis)
+    def get_data_layout(self, batch_size: Optional[int] = None, host_index: Optional[int] = None) -> DataLayout:
+        """Returns filled `DataLayout` based on the partitioned model layout.
+        Args:
+          batch_size: if set, indicates the requested batch size. The exception will
+            be raised if this batch size is not compatible with the layout. If not
+            set, the batch size is inferred from the layout.
+          host_index: indicates the host index to use for the calculations, if not
+            set - use JAX-provided one. Should be in [0, num_hosts) interval and the
+            order should match the order of corresponding CPU devices in
+            `jax.devices()`.
+        Returns:
+          Filled `DataLayout` structure.
+        """
+        if host_index is not None:
+            raise NotImplementedError("Explicit host_index is not yet implemented.")
+        if self._data_axis is None:
+            return DataLayout(
+                batch_size=batch_size,
+                shard_id=0,
+                num_shards=1,
+                is_first_host_in_replica_set=(jax.process_index() == 0),
+            )
+        mesh_size = self._local_chunker.global_mesh.shape[self._data_axis]
+        batch_size = batch_size or mesh_size
+        if batch_size % mesh_size:
+            raise ValueError(
+                f"Batch size ({batch_size}) must be divisible by corresponding " f"mesh size ({mesh_size})."
+            )
+        num_shards = self._local_chunker.num_chunks[self._data_axis]
+        if batch_size % num_shards:
+            raise ValueError(f"Batch size ({batch_size}) must be divisible by number of " f"replicas ({num_shards}).")
+        replica_id = self._local_chunker.get_local_chunk_info((batch_size,), [self._data_axis]).replica_id
+        return DataLayout(
+            batch_size=int(batch_size),
+            shard_id=int(self._local_chunker.chunk_ids[self._data_axis]),
+            num_shards=int(num_shards),
+            is_first_host_in_replica_set=(replica_id == 0),
+        )
+    def get_local_chunk_info(
+        self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
+    ) -> LocalChunkInfo:
+        """Returns the local chunk info for a given array shape and sharded axes."""
+        return self._local_chunker.get_local_chunk_info(global_shape, mesh_axes)
+    @property
+    def params_on_devices(self):
+        return self._params_on_devices
+    def move_params_to_devices(self, train_state: TrainState, train_state_axes: TrainState) -> TrainState:
+        """Moves the optimizer parameters to devices."""
+        p_id_fn = self.partition(
+            _id_fn,
+            in_axis_resources=(train_state_axes, None),
+            out_axis_resources=(train_state_axes, None),
+            donate_argnums=(0,),
+        )
+        if jax.config.jax_array and jax.process_count() > 1:
+            train_state = multihost_utils.host_local_array_to_global_array(train_state, self.mesh, train_state_axes)
+        train_state, _ = p_id_fn(train_state, jnp.ones((), dtype=jnp.uint32))
+        return train_state
+    @property
+    @abc.abstractmethod
+    def _local_chunker(self):
+        """Returns the chunker that matches the parameters of this partitioner."""
+        raise NotImplementedError
+    def get_logical_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
+        # By default, return None for the logical axes.
+        return train_state.restore_state(jax.tree_map(lambda x: None, train_state.state_dict()))
+    def get_mesh_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
+        raise NotImplementedError
+    @abc.abstractmethod
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PartitionedCallable:
+        """Partitions the computation using partitioner-specific implementation.
+        Args:
+          fn: the function to partition.
+          in_axis_resources: Pytree of structure matching that of arguments to `fn`,
+            with all actual arguments replaced by resource assignment
+            specifications. It is also valid to specify a pytree prefix (e.g. one
+            value in place of a whole subtree), in which case the leaves get
+            broadcast to all values in that subtree.
+            The valid resource assignment specifications are:
+              `None`: in which case the value will be replicated on all devices
+              `PartitionSpec`: a tuple of length at most equal to the rank of the
+                partitioned value. Each element can be a `None`, a mesh axis or a
+                tuple of mesh axes, and specifies the set of resources assigned to
+                partition the value's dimension matching its position in the spec.
+          out_axis_resources: Like `in_axis_resources`, but specifies resource
+            assignment for function outputs.
+          static_argnums: an optional int or collection of ints that specify which
+            positional arguments to treat as static (compile-time constant) in the
+            partitioned function.
+          donate_argnums: an optional int or collection of ints that specify which
+            argument buffers are "donated" to the computation. It is safe to donate
+            argument buffers if you no longer need them once the computation has
+            finished.
+        Returns:
+          A partitioned version of the input function.
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def compile(self, partitioned_fn: PartitionedCallable, *args) -> CompiledPartitionedCallable:
+        """Compiles and returns the partitioned function, or the original.
+        Args:
+          partitioned_fn: The partitioned function.
+          *args: Sample arguments to the partitioned function matching the input
+            shapes that will be passed to the compiled function.
+        Returns:
+          The compiled function, or the original if this partitioner does not
+          support compilation.
+        """
+        raise NotImplementedError
+class PjittedFnWithContext(PartitionedCallable):
+    """Wraps pjitted function to apply the appropriate contexts."""
+    def __init__(self, pjitted_fn, partition_mesh: Mesh, logical_axis_rules: flax_partitioning.LogicalRules = ()):
+        self._pjitted_fn = pjitted_fn
+        self._mesh = partition_mesh
+        self._logical_axis_rules = logical_axis_rules
+    def __call__(self, *args):
+        with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
+            return self._pjitted_fn(*args)
+    def lower(self, *args):
+        with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
+            return self._pjitted_fn.lower(*args)
+class BasePjitPartitioner(BasePartitioner):
+    """Partitioner that uses T5X version of jax.pjit."""
+    @cached_property
+    def _local_chunker(self) -> LocalChunker:
+        return LocalChunker(self.mesh)
+    @cached_property
+    def mesh(self) -> Mesh:
+        return default_mesh(self._num_partitions, self._model_parallel_submesh, self._backend)
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PjittedFnWithContext:
+        pjitted = pjit(
+            fn,
+            in_axis_resources=in_axis_resources,
+            out_axis_resources=out_axis_resources,
+            static_argnums=static_argnums,
+            donate_argnums=donate_argnums,
+            backend=self._backend,
+        )
+        return PjittedFnWithContext(pjitted, self.mesh)
+    def compile(self, partitioned_fn: PjittedFnWithContext, *args) -> CompiledPartitionedCallable:
+        return partitioned_fn.lower(*args).compile()
+class PjitPartitioner(BasePjitPartitioner):
+    """Partitioner that uses named axes and jax.pjit."""
+    def __init__(
+        self,
+        num_partitions: Optional[int] = None,
+        model_parallel_submesh: Optional[HardwareMesh] = None,
+        params_on_devices: bool = True,
+        backend: Optional[str] = None,
+        logical_axis_rules: Optional[LogicalAxisRules] = None,
+        use_cpu_pjit: Optional[bool] = False,
+    ):
+        """PjitPartitioner constructor.
+        See https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.mdx/usage/partitioning for details.
+        Args:
+          num_partitions: an integer that specifies the size of the model parallel
+            submesh to be automatically selected for the current topology. See
+            `model_parallel_submesh` for details on how this submesh is used.
+            Mutually exlusive with `model_parallel_submesh`.
+          model_parallel_submesh: is a 4-tuple that specifies the `(x, y, z, c)`
+            submesh model-parallel device tile, an axis of accelerator parallelism
+            orthogonal to data parallelism. Array axes in a model's parameters or
+            activations can be sharded over this submesh using axis rules (see
+            `logical_axis_rules`) that map them to 'model'. The effective number of
+            model sub-partitions is equal to `np.prod(model_parallel_submesh)` and
+            must evenly divide the total number of devices (i.e.,
+            `jax.device_count() % np.prod(model_parallel_submesh) == 0`). The rest
+            of the TPU mesh is the data parallel submesh, providing
+            `jax.device_count() // np.prod(model_parallel_submesh)` partitions. It
+            is used for data (batch) parallelism and to shard other array axes that
+            are mapped to 'data'. This argument is mutually exclusive with
+            `num_partitions`.
+          params_on_devices: whether to keep the params on devices, if False -
+            params stay in the host memory. Note that some partitioners might ignore
+            this setting, for example if they don't support storing all params on
+            device memory.
+          backend: get devices from the pinned backend, if specified. This is
+            useful for explicitly specifying the devices other than relying on
+            jax_platform_name.
+          logical_axis_rules: a priority-ordered sequence of KV tuples that maps
+            logical axis names to either `None` (not sharded), 'model' (to shard
+            across the model-parallel submesh), or 'data' (to shard across the
+            data-parallel submesh).
+          use_cpu_pjit: enables wrapper function for pjit which just jits the
+            function if using CPU backend.
+        """
+        super().__init__(
+            num_partitions=num_partitions,
+            model_parallel_submesh=model_parallel_submesh,
+            params_on_devices=params_on_devices,
+            backend=backend,
+        )
+        if logical_axis_rules is None:
+            logical_axis_rules = standard_logical_axis_rules()
+        self._logical_axis_rules = tuple(logical_axis_rules)
+        (self._data_axis,) = flax_partitioning.logical_to_mesh_axes(["batch"], logical_axis_rules)
+        self._use_cpu_pjit = use_cpu_pjit
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PjittedFnWithContext:
+        """Partitions the function using jax.pjit."""
+        if self._use_cpu_pjit:
+            pjit_fn = pjit_with_cpu_fallback
+        else:
+            pjit_fn = pjit
+        pjitted = pjit_fn(
+            fn,
+            in_axis_resources=in_axis_resources,
+            out_axis_resources=out_axis_resources,
+            static_argnums=static_argnums,
+            donate_argnums=donate_argnums,
+            backend=self._backend,
+        )
+        return PjittedFnWithContext(pjitted, self.mesh, self._logical_axis_rules)
+    @property
+    def logical_axis_rules(self):
+        """Returns the logical axis rules."""
+        return self._logical_axis_rules
+    def get_logical_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
+        return train_state.as_logical_axes()
+    def get_mesh_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
+        logical_axes = self.get_logical_axes(train_state)
+        def _logical_to_mesh_axes(param_name, logical_axes):
+            if logical_axes is None:
+                return None
+            elif logical_axes is traverse_util.empty_node:
+                return traverse_util.empty_node
+            try:
+                return flax_partitioning.logical_to_mesh_axes(logical_axes, self._logical_axis_rules)
+            except ValueError as e:
+                raise ValueError(f"Failed to map logical axes for {param_name}") from e
+        flat_logical_axes = traverse_util.flatten_dict(logical_axes.state_dict(), keep_empty_nodes=True, sep="/")
+        flat_mesh_axes = {k: _logical_to_mesh_axes(k, v) for k, v in flat_logical_axes.items()}
+        return logical_axes.restore_state(traverse_util.unflatten_dict(flat_mesh_axes, sep="/"))

whisper_jax/whisper_jax_pipeline.py ADDED Viewed

	@@ -0,0 +1,506 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import jax
+import jax.numpy as jnp
+import numpy as np
+import requests
+from flax import jax_utils
+from flax.core.frozen_dict import freeze
+from flax.training.common_utils import shard
+from jax.sharding import PartitionSpec as P
+from transformers import WhisperProcessor, is_tokenizers_available, WhisperFeatureExtractor, WhisperTokenizerFast
+from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE, WhisperTokenizer
+from transformers.pipelines.audio_utils import ffmpeg_read
+from transformers.utils import logging
+from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
+from .partitioner import PjitPartitioner
+from .train_state import InferenceState
+# Module-level logger, obtained from the `transformers.utils.logging` helpers.
+logger = logging.get_logger(__name__)
+# 2D parameter and activation partitioning for DP
+logical_axis_rules_dp = (
+    ("batch", "data"),
+    ("mlp", None),
+    ("heads", None),
+    ("vocab", None),
+    ("embed", None),
+    ("embed", None),
+    ("joined_kv", None),
+    ("kv", None),
+    ("length", None),
+    ("num_mel", None),
+    ("channels", None),
+)
+class FlaxWhisperPipline:
+    def __init__(
+        self,
+        checkpoint="openai/whisper-large-v2",
+        dtype=jnp.float32,
+        batch_size=None,
+        max_length=None,
+    ):
+        """
+        Args
+            checkpoint (`str`, *optional*, defaults to `"openai/whisper-large-v2"):
+                The Whisper checkpoint to use with the pipeline. Must be an available checkpoint on the Hugging Face Hub
+                with Flax weights.
+            dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+                The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+                `jax.numpy.bfloat16` (on TPUs). This can be used to enable half-precision inference on GPUs or TPUs.
+                If specified all the computation will be performed with the given `dtype`. **Note that this only
+                specifies the dtype of the computation and does not influence the dtype of model parameters.**
+            batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
+                The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
+                a batch size in the `__init__` method will be superseded by any batch size passed to the `__call__` method.
+            max_length (`int`, *optional*):
+                The maximum numbers of tokens to generate. Defaults to `model.config.max_length`.
+        """
+        self.checkpoint = checkpoint
+        self.dtype = dtype
+        self.processor = WhisperProcessor.from_pretrained(self.checkpoint)
+        self.feature_extractor = self.processor.feature_extractor
+        # potentially load fast tokenizer if available
+        tokenizer_cls = WhisperTokenizerFast if is_tokenizers_available() else WhisperTokenizer
+        self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)
+        self.model, self.params = FlaxWhisperForConditionalGeneration.from_pretrained(
+            self.checkpoint,
+            _do_init=False,
+            dtype=self.dtype,
+        )
+        self.max_length = max_length if max_length is not None else self.model.generation_config.max_length
+        self.min_batch_size = jax.local_device_count()
+        self.batch_size = (
+            batch_size if batch_size is not None else self.min_batch_size
+        )  # we need a minimum of 1 batch per-device
+        def generate(params, input_features, forced_decoder_ids, return_timestamps):
+            output_ids = self.model.pipeline_generate(
+                input_features,
+                params=params,
+                forced_decoder_ids=forced_decoder_ids,
+                return_timestamps=return_timestamps,
+                max_length=self.max_length,
+            )
+            return output_ids
+        # use pmap for DP by default - this is compatible on a Colab TPU v2
+        self.params = jax_utils.replicate(self.params)
+        self.p_generate = jax.pmap(
+            generate, "input_features", in_axes=(0, 0, None), out_axes=0, static_broadcasted_argnums=(3,)
+        )
+        self.is_sharded = False
+    def shard_params(self, num_mp_partitions=1, logical_axis_rules=logical_axis_rules_dp):
+        """Shards `self.params` with a `PjitPartitioner` and rebuilds `self.p_generate` on top of it.
+        Args:
+            num_mp_partitions: forwarded to `PjitPartitioner` as `num_partitions`.
+            logical_axis_rules: forwarded to `PjitPartitioner` as `logical_axis_rules`.
+        Side effects: replaces `self.params` and `self.p_generate`, and sets `self.is_sharded` to True.
+        """
+        def init_fn():
+            # Builds dummy inputs and calls `module.init` on them; this function is
+            # only handed to `jax.eval_shape` below.
+            input_shape = (1, self.model.config.num_mel_bins, 2 * self.model.config.max_source_positions)
+            input_features = jnp.zeros(input_shape, dtype="f4")
+            input_features = input_features.at[(..., -1)].set(self.model.config.eos_token_id)
+            decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+            batch_size, sequence_length = decoder_input_ids.shape
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+            rng = jax.random.PRNGKey(0)
+            init_params = self.model.module.init(
+                rng,
+                input_features=input_features,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                decoder_position_ids=decoder_position_ids,
+                return_dict=False,
+            )
+            return init_params
+        # Axis names metadata: only the "params_axes" entry of the result is used.
+        param_axes = jax.eval_shape(init_fn)["params_axes"]
+        # Create InferenceState, since the partitioner expects it
+        state = InferenceState(
+            step=jnp.array(0),
+            params=freeze(self.model.params_shape_tree),
+            params_axes=freeze(param_axes),
+            flax_mutables=None,
+            flax_mutables_axes=param_axes,
+        )
+        partitioner = PjitPartitioner(num_partitions=num_mp_partitions, logical_axis_rules=logical_axis_rules)
+        mesh_axes = partitioner.get_mesh_axes(state)
+        params_spec = mesh_axes.params
+        # NOTE(review): the params are passed through `self.model.to_bf16` here,
+        # irrespective of `self.dtype` - confirm this is intended.
+        p_shard_params = partitioner.partition(self.model.to_bf16, (params_spec,), params_spec)
+        # This will auto-magically run in mesh context
+        # NOTE(review): assumes `self.params` is still the replicated copy created
+        # in `__init__` - confirm behaviour if this method is called twice.
+        self.params = p_shard_params(freeze(jax_utils.unreplicate(self.params)))
+        self.is_sharded = True
+        def generate(params, input_features, forced_decoder_ids, return_timestamps):
+            output_ids = self.model.pipeline_generate(
+                input_features,
+                params=params,
+                forced_decoder_ids=forced_decoder_ids,
+                return_timestamps=return_timestamps,
+                max_length=self.max_length,
+            )
+            return output_ids
+        # Use pjit for generate only once we've sharded the params
+        # `return_timestamps` (argument 3) is static; `input_features` and the
+        # outputs are partitioned over the "data" mesh axis.
+        self.p_generate = partitioner.partition(
+            generate,
+            in_axis_resources=(params_spec, P("data"), None),
+            out_axis_resources=P("data"),
+            static_argnums=(3,),
+        )
+    def generate(self, input_features, language=None, task=None, return_timestamps=False):
+        forced_decoder_ids = self.get_forced_decoder_ids(
+            language=language, task=task, return_timestamps=return_timestamps
+        )
+        if not self.is_sharded:
+            # if we're using pmap we need to manually replicate the input data across devices and gather the output tokens
+            output_ids = self.p_generate(
+                freeze(self.params), shard(input_features), forced_decoder_ids, return_timestamps
+            ).sequences
+            output_ids = jax.device_get(output_ids.reshape(-1, self.max_length))
+        else:
+            # pjit handles replication / gathering for us auto-magically
+            output_ids = self.p_generate(
+                freeze(self.params), input_features, forced_decoder_ids, return_timestamps
+            ).sequences
+        return output_ids
+    def get_forced_decoder_ids(self, generation_config=None, task=None, language=None, return_timestamps=False):
+        if generation_config is None:
+            generation_config = self.model.generation_config
+        if hasattr(generation_config, "is_multilingual"):
+            is_multilingual = generation_config.is_multilingual
+        else:
+            is_multilingual = None
+        forced_decoder_ids = []
+        if is_multilingual:
+            if language is not None:
+                language = language.lower()
+                if language in generation_config.lang_to_id.keys():
+                    language_token = language
+                elif language in TO_LANGUAGE_CODE.values():
+                    language_token = f"<|{language}|>"
+                elif language in TO_LANGUAGE_CODE.keys():
+                    language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
+                else:
+                    if len(language) == 2:
+                        # ISO 639-1 language code
+                        acceptable_languages = list(TO_LANGUAGE_CODE.values())
+                    elif "<" in language or "|" in language or ">" in language:
+                        # generation config language code
+                        acceptable_languages = list(generation_config.lang_to_id.keys())
+                    else:
+                        # language passed as a string
+                        acceptable_languages = list(TO_LANGUAGE_CODE.keys())
+                    raise ValueError(
+                        f"Unsupported language: {language}. Language should be one of:" f" {acceptable_languages}."
+                    )
+                forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
+            if task is not None:
+                forced_decoder_ids.append((2, generation_config.task_to_id[task]))
+            else:
+                forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
+        if not return_timestamps:
+            if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
+                idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
+                forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
+        return forced_decoder_ids
+    def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
+        inputs_len = inputs.shape[0]
+        step = chunk_len - stride_left - stride_right
+        all_chunk_start_idx = np.arange(0, inputs_len, step)
+        num_samples = len(all_chunk_start_idx)
+        num_batches = math.ceil(num_samples / batch_size)
+        batch_idx = np.array_split(np.arange(num_samples), num_batches)
+        for idx in batch_idx:
+            chunk_start_idx = all_chunk_start_idx[idx]
+            chunk_end_idx = chunk_start_idx + chunk_len
+            chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
+            processed = self.feature_extractor(
+                chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
+            )
+            _stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
+            is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
+            _stride_right = np.where(is_last, 0, stride_right)
+            chunk_lens = [chunk.shape[0] for chunk in chunks]
+            strides = [
+                (chunk_l, _stride_l, _stride_r)
+                for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
+            ]
+            yield {"stride": strides, **processed}
+    def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
+        if isinstance(inputs, np.ndarray):
+            logger.warning(
+                "Numpy array passed as input - no sampling rate checks will be performed."
+                "It is strongly recommended to pass the input as a dictionary with an 'array' key "
+                "containing the numpy array representing the audio, and a 'sampling_rate' key "
+                "containing the sampling rate associated with the audio array."
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+        if isinstance(inputs, str):
+            if inputs.startswith("http://") or inputs.startswith("https://"):
+                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+                # like http_huggingface_co.png
+                inputs = requests.get(inputs).content
+            else:
+                with open(inputs, "rb") as f:
+                    inputs = f.read()
+        if isinstance(inputs, bytes):
+            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+        stride = None
+        if isinstance(inputs, dict):
+            stride = inputs.get("stride", None)
+            # Accepting `"array"` which is the key defined in `datasets` for
+            # better integration
+            if not ("sampling_rate" in inputs and "array" in inputs):
+                raise ValueError(
+                    "When passing a dictionary to FlaxWhisperPipline, the dict needs to contain an 'array' key "
+                    "containing the numpy array representing the audio, and a 'sampling_rate' key "
+                    "containing the sampling rate associated with the audio array."
+                )
+            in_sampling_rate = inputs.get("sampling_rate")
+            inputs = inputs.get("array", None)
+            if in_sampling_rate != self.feature_extractor.sampling_rate:
+                try:
+                    import librosa
+                except ImportError as err:
+                    raise ImportError(
+                        "To support resampling audio files, please install 'librosa' and 'soundfile'."
+                    ) from err
+                inputs = librosa.resample(
+                    inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
+                )
+                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
+            else:
+                ratio = 1
+        if not isinstance(inputs, np.ndarray):
+            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
+        if stride is not None:
+            if stride[0] + stride[1] > inputs.shape[0]:
+                raise ValueError("Stride is too large for input")
+            # Stride needs to get the chunk length here, it's going to get
+            # swallowed by the `feature_extractor` later, and then batching
+            # can add extra data in the inputs, so we need to keep track
+            # of the original length in the stride so we can cut properly.
+            stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
+        if chunk_length_s:
+            if stride_length_s is None:
+                stride_length_s = chunk_length_s / 6
+            if isinstance(stride_length_s, (int, float)):
+                stride_length_s = [stride_length_s, stride_length_s]
+            chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
+            stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
+            stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)
+            if chunk_len < stride_left + stride_right:
+                raise ValueError("Chunk length must be superior to stride length")
+            for item in self.chunk_iter_with_batch(
+                inputs,
+                chunk_len,
+                stride_left,
+                stride_right,
+                batch_size,
+            ):
+                yield item
+        else:
+            processed = self.feature_extractor(
+                inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
+            )
+            if stride is not None:
+                processed["stride"] = stride
+            yield processed
+    def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
+        # unpack the outputs from list(dict(list)) to list(dict)
+        model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]
+        time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
+        # Send the chunking back to seconds, it's easier to handle in whisper
+        sampling_rate = self.feature_extractor.sampling_rate
+        for output in model_outputs:
+            if "stride" in output:
+                chunk_len, stride_left, stride_right = output["stride"]
+                # Go back in seconds
+                chunk_len /= sampling_rate
+                stride_left /= sampling_rate
+                stride_right /= sampling_rate
+                output["stride"] = chunk_len, stride_left, stride_right
+        text, optional = self.tokenizer._decode_asr(
+            model_outputs,
+            return_timestamps=return_timestamps,
+            return_language=return_language,
+            time_precision=time_precision,
+        )
+        return {"text": text, **optional}
+    def forward(self, model_inputs, batch_size=None, language=None, task=None, return_timestamps=False):
+        # We need to keep track of some additional input arguments for post-processing so need to forward these on after running generation
+        input_features = model_inputs.pop("input_features")
+        input_batch_size = input_features.shape[0]
+        if input_batch_size != batch_size:
+            padding = np.zeros([batch_size - input_batch_size, *input_features.shape[1:]], input_features.dtype)
+            input_features = np.concatenate([input_features, padding])
+        pred_ids = self.generate(input_features, language=language, task=task, return_timestamps=return_timestamps)[
+            :input_batch_size
+        ]
+        # tokenizer's decode method expects an extra dim - we insert it here for convenience
+        out = {"tokens": pred_ids[:, None, :]}
+        stride = model_inputs.pop("stride", None)
+        if stride is not None:
+            out["stride"] = stride
+        return out
+    def __call__(
+        self,
+        inputs,
+        chunk_length_s=30.0,
+        stride_length_s=None,
+        batch_size=None,
+        language=None,
+        task=None,
+        return_timestamps=None,
+        generate_kwargs=None,
+    ):
+        """
+        Transcribe an audio input sequence to a text transcription, optionally with timestamps.
+        Args:
+            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs is either:
+                    - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
+                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+                    - `bytes` is the byte content of an audio file and is interpreted by *ffmpeg* in the
+                      same way.
+                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+                        Raw audio assumed to be at the correct sampling rate (16kHz). Note that no further sampling
+                        rate check will be done.
+                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+                      pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "array":
+                      np.array}`. Optionally an additional argument `"stride": (left: int, right: int)` can be used to
+                       ask the pipeline to treat the first `left` samples and last `right` samples to be ignored in
+                       decoding (but used at inference to provide more context to the model). In general, this additional
+                       stride argument is not required.
+            chunk_length_s (`float`, *optional*, defaults to 30.0):
+                The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled. By default, the chunk
+                length is set 30.0s, equal to Whisper's context window.
+            stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
+                The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
+                the model to *see* more context and infer letters better than without this context but the pipeline
+                discards the stride bits at the end to make the final reconstitution as perfect as possible.
+                <Tip>
+                For more information on how to effectively use `stride_length_s`, refer to the [ASR chunking
+                blog post](https://huggingface.co/blog/asr-chunking).
+                </Tip>
+            batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
+                The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
+                a batch size in the `__call__` method will supersede any batch size passed to the `__init__`.
+            task (`str`, *optional*):
+                Task to use for generation, either `"transcribe"` or `"translate"`. Defaults to `"transcribe"`.
+            language (`str`, *optional*):
+                Language token to use for generation, can be either in the form of `"<|en|>"`, `"en"` or `"english"`.
+                Defaults to `None`, meaning the language is automatically inferred from the audio input.
+            return_timestamps (*optional*, `bool`):
+                Whether to return timestamps in the prediction. Defaults to False. If set to true, the pipeline
+                will return two keys in the output dictionary: `"text"` containing the text transcription, and `"chunks"`
+                containing the transcription segments chunked by their utterance-level timestamps.
+        Return:
+            `Dict`: A dictionary with the following keys:
+                - **text** (`str` ) -- The recognised text.
+                - **chunks** (*optional(, `List[Dict]`)
+                    When using `return_timestamps`, the `chunks` will become a list containing all the various text
+                    chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamps": (0.5,0.9), {"text":
+                    "there", "timestamps": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
+                    `"".join(chunk["text"] for chunk in output["chunks"])`.
+        """
+        batch_size = batch_size if batch_size is not None else self.batch_size
+        if batch_size % self.min_batch_size != 0:
+            raise ValueError(
+                f"Batch size must be a multiple of the number of JAX devices, but got batch size {batch_size} and num devices {self.min_batch_size}."
+            )
+        dataloader = self.preprocess_batch(
+            inputs, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, batch_size=batch_size
+        )
+        model_outputs = []
+        # iterate over our chunked audio samples
+        for batch in dataloader:
+            model_outputs.append(
+                self.forward(
+                    batch, batch_size=batch_size, language=language, task=task, return_timestamps=return_timestamps
+                )
+            )
+        post_processed = self.postprocess(model_outputs, return_timestamps=return_timestamps)
+        return post_processed

whisper_jax/whisper_jax_train_state.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# coding=utf-8
+# Copyright 2023 The T5X Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for partitioning."""
+from typing import Any, Mapping, MutableMapping, Optional, Tuple
+import flax.core
+import flax.serialization
+import flax.struct
+import jax.numpy as jnp
+from flax import traverse_util
+from flax.core import scope as flax_scope
+from flax.linen import partitioning as flax_partitioning
+EMPTY_DICT = flax.core.freeze({})  # frozen empty dict; default for `InferenceState.flax_mutables`
+FrozenDict = flax_scope.FrozenDict  # short alias for the flax scope type
+FrozenVariableDict = flax_scope.FrozenVariableDict  # short alias for the flax scope type
+MutableVariableDict = flax_scope.MutableVariableDict  # short alias for the flax scope type
+VariableDict = flax_scope.VariableDict  # short alias for the flax scope type
+def _validate_params_axes(params_axes, params):
+    axis_names = flax_partitioning.get_axis_names(params_axes)
+    missing_params_axes = set(traverse_util.flatten_dict(params, sep="/")) - set(
+        traverse_util.flatten_dict(axis_names, sep="/")
+    )
+    if missing_params_axes:
+        raise ValueError(f"Missing axis names for parameters: {missing_params_axes}")
+def _split_variables_and_axes(variables_and_axes: FrozenVariableDict) -> Tuple[FrozenVariableDict, FrozenVariableDict]:
+    """Splits `variables_and_axes` into two separate dicts with the same keys."""
+    # For each `key`, `key_axes` (if any) are its axes in `variables_and_axes`.
+    variables = {}
+    axes = {}
+    for k, v in variables_and_axes.items():
+        if k.endswith("_axes"):
+            axes[k[:-5]] = v  # k without "_axes".
+            _validate_params_axes(v, variables_and_axes[k[:-5]])  # k without "_axes".
+        else:
+            variables[k] = v
+    return flax.core.freeze(variables), flax.core.freeze(axes)
+class InferenceState(flax.struct.PyTreeNode):
+    """State compatible with FlaxOptimTrainState without optimizer state."""
+    step: jnp.ndarray
+    params: flax_scope.FrozenVariableDict
+    params_axes: Optional[flax_scope.FrozenVariableDict] = None
+    flax_mutables: flax_scope.FrozenDict = EMPTY_DICT
+    flax_mutables_axes: Optional[flax_scope.FrozenVariableDict] = None
+    @classmethod
+    def create(cls, model_variables: FrozenVariableDict) -> "InferenceState":
+        other_variables, params = model_variables.pop("params")
+        if "params_axes" in other_variables:
+            other_variables, params_axes = other_variables.pop("params_axes")
+            _validate_params_axes(params_axes, params)
+        else:
+            params_axes = None
+        # Split other_variables into mutables and their corresponding axes.
+        flax_mutables, flax_mutables_axes = _split_variables_and_axes(other_variables)
+        flax_mutables_axes = flax_mutables_axes or None
+        return InferenceState(
+            step=jnp.array(0),
+            params=params,
+            params_axes=params_axes,
+            flax_mutables=flax_mutables,
+            flax_mutables_axes=flax_mutables_axes,
+        )
+    @property
+    def param_states(self) -> FrozenVariableDict:
+        """The optimizer states of the parameters as a PyTree."""
+        raise NotImplementedError("InferenceState has no optimizer states.")
+    def apply_gradient(self, *args, **kwargs) -> "InferenceState":
+        raise NotImplementedError("InferenceState does not support `apply_gradient`.")
+    def state_dict(self) -> MutableMapping[str, Any]:
+        state_dict = {"target": flax.core.unfreeze(self.params), "state": {"step": self.step}}
+        if self.flax_mutables:
+            state_dict["flax_mutables"] = flax.core.unfreeze(self.flax_mutables)
+        return state_dict
+    def replace_step(self, step: jnp.ndarray) -> "InferenceState":
+        return self.replace(step=step)
+    def replace_params(self, params: FrozenVariableDict) -> "InferenceState":
+        return self.replace(params=params)
+    def replace_flax_mutables(self, flax_mutables: FrozenDict) -> "InferenceState":
+        return self.replace(flax_mutables=flax_mutables)
+    def restore_state(self, state_dict: Mapping[str, Any]) -> "InferenceState":
+        return self.replace(
+            params=flax.core.freeze(state_dict["target"]),
+            step=state_dict["state"]["step"],
+            flax_mutables=flax.core.freeze(state_dict["flax_mutables"])
+            if "flax_mutables" in state_dict
+            else EMPTY_DICT,
+        )
+    def as_logical_axes(self) -> "InferenceState":
+        # Set step to None so that when the logical axes are processed by the
+        # flax.partitioning.logical_to_mesh_axes function, it will be skipped
+        # because jax.tree_map will short circut and never call the function on the
+        # step.
+        flax_mutables_axes = self.flax_mutables_axes or EMPTY_DICT
+        return InferenceState(
+            step=None,
+            params=flax_partitioning.get_axis_names(self.params_axes),
+            flax_mutables=flax_partitioning.get_axis_names(flax_mutables_axes),
+        )