init

Files changed (11) hide show

cache.py +44 -0
config.json +4 -4
configuration_hyena.py +92 -0
engine.py +389 -0
layers.py +155 -0
model.py +472 -0
modeling_hyena.py +145 -0
positional_embeddings.py +113 -0
streamer.py +106 -0
tokenizer.py +116 -0
utils.py +96 -0

cache.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+from torch import Tensor
+from dataclasses import dataclass, field
+from typing import Optional
+# https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
+@dataclass
+class InferenceParams:
+    """Bookkeeping handed to the main model so it can compute and store
+    context efficiently while running inference."""
+
+    max_seqlen: int
+    max_batch_size: int
+    seqlen_offset: int = 0
+    batch_size_offset: int = 0
+    key_value_memory_dict: dict = field(default_factory=dict)
+    lengths_per_sample: Optional[Tensor] = None
+
+    def reset(self, max_seqlen, max_batch_size):
+        """Install new size limits and rewind the sequence offset to zero.
+
+        `lengths_per_sample`, when present, is zeroed in place.
+        `batch_size_offset` and `key_value_memory_dict` are not touched.
+        """
+        self.max_seqlen, self.max_batch_size = max_seqlen, max_batch_size
+        self.seqlen_offset = 0
+        lengths = self.lengths_per_sample
+        if lengths is None:
+            return
+        lengths.zero_()
+@dataclass
+class RecurrentInferenceParams:
+    """Inference state handed to blocks that run in recurrent mode."""
+
+    fir_filter_length: int = 3
+    state_dim: int = 16
+    seqlen_offset: int = 0
+    fir_state_dict: dict = field(default_factory=dict)
+    state_dict: dict = field(default_factory=dict)
+
+    def reset(self):
+        """Restore the three scalar fields to their default values.
+
+        `fir_state_dict` and `state_dict` are left as they are.
+        """
+        self.fir_filter_length, self.state_dim, self.seqlen_offset = 3, 16, 0

config.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "_commit_hash": "1cc23830f62c268082475776fb449af8428eb703",
-  "_name_or_path": "LongSafari/Evo-1",
   "architectures": [
     "StripedHyenaModelForCausalLM"
   ],
@@ -10,8 +10,8 @@
     24
   ],
   "auto_map": {
-    "AutoConfig": "LongSafari/Evo-1--configuration_hyena.StripedHyenaConfig",
-    "AutoModelForCausalLM": "LongSafari/Evo-1--modeling_hyena.StripedHyenaModelForCausalLM"
   },
   "column_split": false,
   "column_split_hyena": true,

 {
+  "_commit_hash": null,
+  "_name_or_path": "togethercomputer/evo-1-phase-2",
   "architectures": [
     "StripedHyenaModelForCausalLM"
   ],
     24
   ],
   "auto_map": {
+    "AutoConfig": "configuration_hyena.StripedHyenaConfig",
+    "AutoModelForCausalLM": "modeling_hyena.StripedHyenaModelForCausalLM"
   },
   "column_split": false,
   "column_split_hyena": true,

configuration_hyena.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from transformers import PretrainedConfig
+import json
+class StripedHyenaConfig(PretrainedConfig):
+    """`PretrainedConfig` subclass holding the StripedHyena hyper-parameters.
+
+    Every named constructor argument is stored as an attribute of the same
+    name; any remaining keyword arguments are forwarded to
+    `PretrainedConfig.__init__`.
+    """
+
+    model_type = "stripedhyena"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        num_filters=4096,
+        inner_mlp_size=14336,
+        attn_layer_idxs=None,
+        hyena_layer_idxs=None,
+        num_layers=32,
+        tie_embeddings=False,
+        short_filter_length=3,
+        num_attention_heads=32,
+        proj_groups=4,
+        hyena_filter_groups=1,
+        split_k0=True,
+        column_split_hyena=True,
+        column_split=False,
+        model_parallel_size=1,
+        pipe_parallel_size=1,
+        short_filter_bias=True,
+        mha_out_proj_bias=False,
+        qkv_proj_bias=False,
+        final_norm=True,
+        use_cache=True,
+        use_flash_attention_2=True,
+        use_flash_rmsnorm=True,
+        use_flash_depthwise=False,
+        use_flashfft=False,
+        inference_mode=False,
+        prefill_style="fft",
+        max_seqlen=32768,
+        eps=1e-5,
+        state_size=2,
+        rotary_emb_base=500000,
+        smeared_gqa=False,
+        make_vocab_size_divisible_by=8,
+        log_intermediate_values=False,
+        **kwargs,
+    ):
+        """Store the given hyper-parameters on the instance.
+
+        `attn_layer_idxs` and `hyena_layer_idxs` default to a fresh empty list
+        per instance (a literal `[]` default would be one list object shared
+        by every config created without these arguments).
+        """
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.inner_mlp_size = inner_mlp_size
+        # A caller-supplied list is stored as-is (no copy), as before.
+        self.attn_layer_idxs = [] if attn_layer_idxs is None else attn_layer_idxs
+        self.hyena_layer_idxs = [] if hyena_layer_idxs is None else hyena_layer_idxs
+        self.num_layers = num_layers
+        self.tie_embeddings = tie_embeddings
+        self.short_filter_length = short_filter_length
+        self.num_attention_heads = num_attention_heads
+        self.proj_groups = proj_groups
+        self.hyena_filter_groups = hyena_filter_groups
+        self.split_k0 = split_k0
+        self.column_split_hyena = column_split_hyena
+        self.column_split = column_split
+        self.model_parallel_size = model_parallel_size
+        self.pipe_parallel_size = pipe_parallel_size
+        self.short_filter_bias = short_filter_bias
+        self.mha_out_proj_bias = mha_out_proj_bias
+        self.qkv_proj_bias = qkv_proj_bias
+        self.final_norm = final_norm
+        self.use_cache = use_cache
+        self.use_flash_attention_2 = use_flash_attention_2
+        self.use_flash_rmsnorm = use_flash_rmsnorm
+        self.use_flash_depthwise = use_flash_depthwise
+        self.use_flashfft = use_flashfft
+        self.inference_mode = inference_mode
+        self.prefill_style = prefill_style
+        self.max_seqlen = max_seqlen
+        self.eps = eps
+        self.state_size = state_size
+        self.rotary_emb_base = rotary_emb_base
+        self.smeared_gqa = smeared_gqa
+        self.make_vocab_size_divisible_by = make_vocab_size_divisible_by
+        self.log_intermediate_values = log_intermediate_values
+        super().__init__(**kwargs)
+
+    def to_dict(self):
+        """Return a shallow `{attribute name: value}` mapping of the instance attributes."""
+        return {attr: getattr(self, attr) for attr in self.__dict__}
+
+    @classmethod
+    def from_original_config(cls, config_path, **kwargs):
+        """Build a config from a JSON file whose keys are constructor arguments.
+
+        Args:
+            config_path: path of the JSON file to read.
+            **kwargs: extra constructor arguments; a key that also appears in
+                the file raises `TypeError` (duplicate keyword argument).
+        """
+        # JSON is UTF-8 by specification; do not rely on the platform default.
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return cls(**config, **kwargs)

engine.py ADDED Viewed

	@@ -0,0 +1,389 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+import gc
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import conv1d_cpp
+except:
+    pass
+from .utils import column_split
+# Names accepted for `iir_prefill_style`; `HyenaInferenceEngine.__init__`
+# asserts that the value it receives is one of these.
+# NOTE(review): `parallel_iir` dispatches on a separate `prefill_style`
+# argument using "fft" / "recurrence", neither of which is spelled exactly
+# like this here ("modal-fft", ...) — confirm how the two options relate.
+IIR_PREFILL_MODES = [
+    "recurrence",
+    "modal-fft",
+    "hybrid-modal-recurrence",
+    "modal-scan",
+    "canonical-fft",
+    "iir-fir-caching",
+]
+def canonicalize_modal_system(poles, residues):
+    """Placeholder for canonicalizing a modal system; not implemented.
+
+    Args:
+        poles (Tensor): poles of the system.
+        residues (Tensor): residues of the system.
+
+    Returns:
+        Tuple[Tensor, Tensor]: intended to be the canonicalized poles and
+        residues; currently nothing is returned.
+
+    Raises:
+        NotImplementedError: always.
+    """
+    raise NotImplementedError
+def list_tensors(idx):
+    """Debug helper: dump every tensor the garbage collector currently tracks.
+
+    For each tensor, its type and size are printed, and a line with the type,
+    size and first element is appended to `tensors_{idx}.txt`. Any exception
+    raised while handling an object is swallowed and the scan moves on.
+    """
+    log_path = f"tensors_{idx}.txt"
+    for candidate in gc.get_objects():
+        try:
+            if not (torch.is_tensor(candidate) and isinstance(candidate, torch.Tensor)):
+                continue
+            shape = candidate.size()
+            # dump to log
+            print(type(candidate), shape)
+            first = candidate[0]
+            with open(log_path, "a") as log:
+                log.write(f"{type(candidate)} {shape} {first}\n")
+        except Exception:
+            pass
+class HyenaInferenceEngine:
+    def __init__(
+        self,
+        fir_fn=None,
+        iir_prefill_style="modal-fft",
+        layer_idx=None,
+    ) -> None:
+        """Record the FIR callable, the prefill style name and the layer index.
+
+        `iir_prefill_style` must be one of `IIR_PREFILL_MODES`; this is
+        enforced with an assertion. `low_mem_mode` starts out disabled.
+        """
+        assert iir_prefill_style in IIR_PREFILL_MODES, f"iir_prefill_style must be one of {IIR_PREFILL_MODES}"
+        self.fir_fn = fir_fn
+        self.iir_prefill_style = iir_prefill_style
+        self.layer_idx = layer_idx
+        self.low_mem_mode = False
+    def parallel_fir(
+        self,
+        fir_fn,
+        u,
+        weight,
+        bias,
+        L,
+        fir_length=3,
+        inference_params=None,
+        prefill_mode=None,
+        padding_mask=None,
+    ):
+        """Apply the short (FIR) filter to a whole sequence and cache its tail.
+
+        Args:
+            fir_fn: `torch.nn.functional.conv1d`, or any other callable that
+                is invoked as `fir_fn(u)` and carries its own weights.
+            u: input, sequence on dim 1 and channels last (see the permutes).
+            weight: depthwise kernel, used only on the `conv1d` path.
+            bias: per-channel bias for the `conv1d` path, or None for no bias.
+            L: number of output positions to keep.
+            fir_length: kernel length; the input is padded by `fir_length - 1`.
+            inference_params: when not None, the last `fir_length - 1` inputs
+                are returned as the FIR state for later step-wise decoding.
+            prefill_mode: unused.
+            padding_mask: optional tensor multiplied into the output.
+
+        Returns:
+            `(z_pre, fir_state)`; `z_pre` has channels on dim 1 and sequence
+            last, `fir_state` is None when `inference_params` is None.
+        """
+        uses_conv1d = fir_fn == torch.nn.functional.conv1d
+        # prepare input layout, dimensions and dispatch to fir kernel
+        if not uses_conv1d:
+            z_pre = fir_fn(u)[:, :L]  # B, L, D
+            z_pre = z_pre.permute(0, 2, 1)
+        else:
+            u = u.permute(0, 2, 1)  # B, D, L
+            z_pre = fir_fn(
+                u,
+                weight,
+                bias=None,  # added manually below: passing it here is a source of small error
+                stride=1,
+                padding=fir_length - 1,
+                groups=u.shape[1],
+            )[..., :L]
+            # The filter may be configured without a bias; skip the add then
+            # instead of failing on `None[None, :, None]`.
+            if bias is not None:
+                z_pre = z_pre + bias[None, :, None]
+        # handle padding post fir, the only place with biases
+        if isinstance(padding_mask, torch.Tensor):
+            z_pre = z_pre * padding_mask[:, None]
+        if inference_params is None:
+            return z_pre, None
+        # handle seqlen last and dim last cases for `u`
+        if uses_conv1d:
+            fir_state = u[..., -fir_length + 1 :]
+        else:
+            fir_state = u[:, -fir_length + 1 :].permute(0, 2, 1)
+        return z_pre, fir_state
+    def parallel_iir(
+        self,
+        z_pre,
+        h,
+        D,
+        L,
+        poles,
+        residues,
+        t,
+        dims,
+        layer_idx,
+        inference_params=None,
+        prefill_style="fft",
+        fftconv_fn=None,
+        padding_mask=None,
+        use_flashfft=False,
+        column_split_hyena=False,
+        long_fir_threshold=None,
+    ):
+        """Apply the long (IIR-derived) filter and the gating to a whole sequence.
+
+        `z_pre` is split along dim 1 into `x2`, `x1`, `v`. The product
+        `x1 * v` is filtered with `h` (flash FFT conv, plain FFT, a truncated
+        direct conv when `long_fir_threshold` is set, or the explicit
+        recurrence), `D * x1v` is added and the result is gated by `x2`.
+        When `inference_params` is given, the recurrent state for later
+        step-wise decoding is also stored according to `prefill_style`
+        ("fft" or "recurrence"; anything else raises `NotImplementedError`).
+
+        `dims` is `(hidden_size, num_attention_heads,
+        hidden_size_per_attention_head, state_size, hyena_filter_groups)`.
+        `padding_mask` is accepted but not used here.
+
+        Returns:
+            The gated output with dims 1 and 2 swapped relative to `z_pre`.
+        """
+        fft_size = 2 * L
+        hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims
+        # Spectrum of x1v; only the plain-FFT path below fills it in.
+        X_s = None
+        # Compatibility with training infra that column splits the projections
+        if column_split_hyena:
+            z = z_pre.reshape(
+                z_pre.shape[0],
+                num_attention_heads,
+                3 * hidden_size_per_attention_head,
+                z_pre.shape[2],
+            )
+            x2, x1, v = (
+                z[:, :, :hidden_size_per_attention_head],
+                z[
+                    :,
+                    :,
+                    hidden_size_per_attention_head : 2 * hidden_size_per_attention_head,
+                ],
+                z[:, :, 2 * hidden_size_per_attention_head :],
+            )
+            x2, x1, v = (
+                x2.reshape(x2.shape[0], -1, x2.shape[-1]),
+                x1.reshape(x1.shape[0], -1, x1.shape[-1]),
+                v.reshape(v.shape[0], -1, v.shape[-1]),
+            )
+        else:
+            x2, x1, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1)
+        x1v = x1 * v
+        if inference_params is not None and prefill_style == "recurrence":
+            y = self.prefill_via_direct_recurrence(
+                inference_params=inference_params,
+                x1v=x1v,
+                L=L,
+                poles=poles,
+                residues=residues,
+            )
+        else:
+            if use_flashfft and (L % 2) == 0:  # only works with even L
+                y = fftconv_fn(
+                    x1v.to(dtype=torch.bfloat16).contiguous(),
+                    h.to(dtype=torch.float32),
+                )
+            elif long_fir_threshold is None:
+                H = torch.fft.rfft(h.to(dtype=torch.float32), n=fft_size) / fft_size
+                X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size)
+                X = X_s[..., : H.shape[-1]]
+                if len(z_pre.shape) > 3:
+                    H = H.unsqueeze(1)
+                y = torch.fft.irfft(X * H, n=fft_size, norm="forward")[..., :L]
+            else:
+                assert h.shape[0] == 1, "batch size must be 1 for long_fir_threshold"
+                h = h[0][:, None]  # rearrange to d, 1, l for depthwise conv1d
+                h = h[..., :long_fir_threshold]
+                y = F.conv1d(
+                    x1v,
+                    h.to(dtype=x1v.dtype),
+                    stride=1,
+                    groups=x1v.shape[1],
+                    padding=h.shape[-1] - 1,
+                )[..., :L]
+        y = y.to(dtype=x1v.dtype)
+        y = (y + x1v * D.unsqueeze(-1)) * x2
+        if inference_params is not None:
+            if prefill_style == "fft":
+                if X_s is None and not use_flashfft:
+                    # The truncated direct-conv path never computes the input
+                    # spectrum that the modal-FFT prefill needs; build it here
+                    # rather than referencing an unbound name.
+                    X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size)
+                self.prefill_via_modal_fft(
+                    inference_params=inference_params,
+                    x1v=x1v,
+                    X_s=X_s,
+                    L=L,
+                    t=t,
+                    poles=poles,
+                    dims=dims,
+                    layer_idx=layer_idx,
+                    use_flashfft=use_flashfft,
+                    fftconv_fn=fftconv_fn,
+                )
+            elif prefill_style == "recurrence":
+                # recurrent prefill is done before
+                pass
+            else:
+                raise NotImplementedError
+            if self.low_mem_mode:
+                # TODO: smarter gc
+                del z_pre, x2, x1, v, x1v, h, poles, residues
+                torch.cuda.empty_cache()
+        return y.permute(0, 2, 1)
+    def step_fir(self, u, fir_state, weight, bias=None):
+        """Advance the short FIR filter by one time step.
+
+        Note:
+        `fir_state` contains the last `short_filter_length - 1` elements of `u`: `u_{L-2}, u_{L-1}, ...`
+        We assume dimensions of `short_filter_weight` to be `[d, 1, short_filter_len]` (SISO / multi SISO layout).
+
+        Args:
+            u: the new input sample.
+            fir_state: cached previous inputs, history on the last dim.
+            weight: filter taps; the last tap multiplies `u`, the others the cache.
+            bias: optional bias added to the output; skipped when None.
+
+        Returns:
+            `(y, fir_state)` where `fir_state` is a new tensor shifted by one
+            position with `u` written into its last slot.
+        """
+        h0, h = weight[..., 0, -1], weight[..., 0, :-1]
+        h0, h = h0[None], h[None]
+        y = h0 * u + torch.sum(fir_state * h, dim=-1)
+        # `bias` defaults to None; adding None to a tensor raises TypeError.
+        if bias is not None:
+            y = y + bias
+        # update
+        fir_state = torch.roll(fir_state, -1, dims=2)
+        fir_state[..., -1] = u
+        return y, fir_state
+    def step_iir(self, x2, x1, v, D, residues, poles, iir_state, iir_groups=1):
+        """Advance the modal recurrence by one step and produce the gated output.
+
+        `residues` and `poles` arrive as real tensors whose last dim holds
+        (real, imag) pairs; they are viewed as complex after a float32 cast.
+
+        Returns:
+            `(y, iir_state)` with `iir_state = poles * iir_state + x1 * v` and
+            `y = x2 * (Re(sum(residues * iir_state)) + D * x1 * v)`.
+
+        Raises:
+            NotImplementedError: if `iir_groups > 1`.
+        """
+        gated_input = x1 * v
+        residues_c = torch.view_as_complex(residues.to(torch.float32))
+        poles_c = torch.view_as_complex(poles.to(torch.float32))
+        # squeeze the dummy seqlen dimension
+        # D, state_dim, 1 -> 1, D, state_dim
+        residues_c = residues_c[..., 0][None]
+        poles_c = poles_c[..., 0][None]
+        next_state = poles_c * iir_state + gated_input[..., None]
+        readout = torch.sum(residues_c * next_state, dim=-1).real
+        if iir_groups > 1:
+            raise NotImplementedError
+        return x2 * (readout + D * gated_input), next_state
+    def prefill_via_fir_caching(self, u, inference_params, L, *args, **kwargs):
+        """Placeholder: treat the IIR filter as a FIR and decode from a cache.
+
+        Not implemented; always raises `NotImplementedError`.
+        """
+        raise NotImplementedError(":)")
+    def prefill_via_direct_recurrence(
+        self, inference_params, x1v, L, residues, poles, *args, **kwargs
+    ) -> torch.Tensor:
+        """
+        Compute the IIR state via explicit SSM recurrence (modal form)
+        This is the most memory efficient prefilling method for Hyena filters.
+
+        For every position `i < L` the state is advanced with
+        `state = poles * state + x1v[:, :, i]` and the output is
+        `Re(sum(residues * state))` over the state dimension, i.e. exactly
+        what `step_iir` computes one step at a time. The arithmetic is done
+        on complex tensors so the real and imaginary updates both read the
+        previous state and the read-out includes the imaginary cross term.
+
+        Args:
+            inference_params: object whose `state_dict[self.layer_idx]`
+                receives the final complex64 state.
+            x1v: gated input, sequence on the last dim.
+            L: number of positions to process.
+            residues, poles: real tensors with a dummy sequence dim at index 2
+                and (real, imag) pairs on the last dim.
+
+        Returns:
+            Output with the shape and dtype of `x1v`.
+        """
+        # suppress dummy seqlen dimension: d, sdim, 1, reim -> 1, d, sdim (complex)
+        poles_c = torch.view_as_complex(poles.to(torch.float32).contiguous())[..., 0][None]
+        residues_c = torch.view_as_complex(residues.to(torch.float32).contiguous())[..., 0][None]
+        state_dim = poles_c.shape[-1]
+        # state: b, d, sdim (complex64)
+        state = torch.zeros(
+            x1v.shape[0], x1v.shape[1], state_dim, dtype=torch.complex64, device=x1v.device
+        )
+        output = torch.zeros_like(x1v)  # b, d, l
+        for i in range(L):
+            state = poles_c * state + x1v[:, :, i, None].to(torch.float32)
+            output[:, :, i] = torch.sum(residues_c * state, dim=-1).real
+        inference_params.state_dict[self.layer_idx] = state
+        return output
+    def prefill_via_hybrid_recurrence(self, inference_params, u, log_poles, x1v_f_a, L, *args, **kwargs):
+        """
+        Placeholder: IIR state via a hybrid recurrence-convolution over blocks.
+        Not implemented; always raises `NotImplementedError`.
+        """
+        raise NotImplementedError(":)")
+    def prefill_via_scan(self, u, inference_params=None, *args, **kwargs):
+        """Placeholder for a scan-based prefill; always raises `NotImplementedError`."""
+        raise NotImplementedError
+    def prefill_via_canonical_fft(self, u, inference_params=None, *args, **kwargs):
+        """
+        Placeholder: IIR state via a single FFT with the SSM denominator in companion form.
+        Described as the most memory efficient "parallelized" prefilling method for Hyena.
+        From: https://arxiv.org/abs/2310.18780
+        Not implemented; always raises `NotImplementedError`.
+        """
+        raise NotImplementedError(":)")
+    def prefill_via_modal_fft(
+        self,
+        inference_params,
+        x1v,
+        L,
+        poles,
+        t,
+        dims,
+        layer_idx,
+        X_s=None,
+        use_flashfft=False,
+        fftconv_fn=None,
+        state_dtype=torch.complex64,
+        *args,
+        **kwargs,
+    ):
+        """
+        Compute the IIR state via a single FFT, using the poles of the SSM in modal form.
+
+        The state at position `L - 1` is written into
+        `inference_params.state_dict`; nothing is returned.
+
+        Args:
+            inference_params: object exposing a `state_dict` mapping.
+            x1v: gated input, sequence on the last dim.
+            L: sequence length; the FFT size is `2 * L`.
+            poles: real tensor with (real, imag) pairs on the last dim.
+            t: exponents applied to the poles (`poles**t`).
+                NOTE(review): presumably the time indices 0..L-1 — confirm against the caller.
+            dims: `(hidden_size, _, _, state_size, hyena_filter_groups)`.
+            layer_idx: dictionary key used by the non-flash branch.
+            X_s: FFT of `x1v` with size `2 * L`; required when `use_flashfft` is False.
+            use_flashfft: selects the `fftconv_fn` branch.
+            fftconv_fn: convolution callable used by the flash branch.
+            state_dtype: dtype of the stored state in the non-flash branch.
+
+        Raises:
+            NotImplementedError: flash branch with `hyena_filter_groups > 1`.
+            AssertionError: non-flash branch called without `X_s`.
+        """
+        # When the model has a long convolution derived from a SSM in modal form and prefill_style is "fft",
+        # we split the filter into poles and residues and reuse FFT computation on the input.
+        # This optimization is currently not supported when using flashfftconv.
+        hidden_size, _, _, state_size, hyena_filter_groups = dims
+        if use_flashfft:
+            # using real states
+            # Real and imaginary pole components are flattened into one axis, so
+            # each is raised to `t` independently as a real number.
+            poles = poles.squeeze().reshape(poles.shape[0], -1)[..., None]
+            state_s = poles**t
+            if hyena_filter_groups > 1:
+                raise NotImplementedError
+            # Replicate the input once per (state, re/im) slot and fold that
+            # axis into the channel dim so a single conv call covers all of them.
+            x1v = x1v[:, :, None].repeat(1, 1, 2 * state_size, 1)
+            x1v = x1v.reshape(x1v.shape[0], -1, x1v.shape[-1])
+            state_s = state_s[None]
+            state = fftconv_fn(
+                x1v.contiguous(),
+                state_s.to(dtype=torch.float32),
+            )
+            # Keep only the last position and unfold back to (..., state_size, 2).
+            state = state[..., L - 1].reshape(x1v.shape[0], hidden_size, state_size, 2)
+            state = torch.view_as_complex(state.contiguous().to(dtype=torch.float32))
+            # NOTE(review): this branch keys on `self.layer_idx` while the branch
+            # below keys on the `layer_idx` argument, and `state_dtype` is not
+            # applied here — confirm both are intentional.
+            inference_params.state_dict[self.layer_idx] = state
+        else:
+            assert X_s is not None
+            bs = x1v.shape[0]
+            fft_size = 2 * L
+            poles = torch.view_as_complex(poles.to(torch.float32))
+            state_s = poles**t
+            state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1)  # B, D, state_dim, 2 * L
+            if hyena_filter_groups > 1:
+                state_S = state_S.repeat_interleave(hidden_size // hyena_filter_groups, 1)
+            # Multiply spectra (a new state axis is inserted into X_s), invert,
+            # and keep the value at the last real position.
+            state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size)
+            inference_params.state_dict[layer_idx] = state[..., L - 1].to(dtype=state_dtype)
+    def _compute_state(self, log_poles, u, t, L, *args, **kwargs):
+        """
+        Compute the IIR state given an input `u` and log_poles of the modal system.
+
+        Both spectra are taken with the full complex FFT of size `2 * L` so
+        they have the same number of bins and can be multiplied elementwise
+        (a one-sided `rfft` of `u` yields only `L + 1` bins).
+
+        Returns:
+            Complex tensor with a state axis inserted before the sequence
+            axis, truncated to the first `L` positions.
+        """
+        bs = u.shape[0]
+        fft_size = 2 * L
+        U = torch.fft.fft(u.to(torch.float32), n=fft_size)
+        x = (log_poles * t).exp()
+        # [batch, hidden_size, state_dim, 2 * seqlen]
+        X = torch.fft.fft(x, n=fft_size).repeat(bs, 1, 1, 1)
+        state = torch.fft.ifft(U[..., None, :] * X, n=fft_size)[..., :L]
+        return state

layers.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+import torch.nn as nn
+from .utils import grab_first_if_tuple
+def grab_first_if_tuple(x):
+    """Return `x[0]` when the class of `x` is named exactly "tuple", else `x` itself."""
+    is_plain_tuple = x.__class__.__name__ == "tuple"
+    return x[0] if is_plain_tuple else x
+class RMSNorm(torch.nn.Module):
+    """Root-mean-square normalization over the last dim with a learned scale.
+
+    `config` must expose `eps`, `hidden_size` and a dict-style `get`. When
+    `use_flash_rmsnorm` is set, the fused `flash_attn` kernel is used instead
+    of the pure PyTorch expression.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.eps, self.hidden_size = config.eps, config.hidden_size
+        # Assigning an nn.Parameter attribute already registers it under "scale".
+        self.scale = torch.nn.Parameter(torch.ones(self.hidden_size))
+        self.use_flash_rmsnorm = config.get("use_flash_rmsnorm", False)
+        if self.use_flash_rmsnorm:
+            try:
+                from flash_attn.ops.rms_norm import rms_norm as rmsnorm_func
+            except Exception as err:
+                # Still surfaced as ImportError with the install hint, but the
+                # real cause is chained and KeyboardInterrupt/SystemExit pass through.
+                raise ImportError(
+                    "For `use_flash_rmsnorm`: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/layer_norm`"
+                ) from err
+            self.rmsnorm_func = rmsnorm_func
+
+    def forward(self, x):
+        """Normalize `x` along its last dim and multiply by `scale`."""
+        if self.use_flash_rmsnorm:
+            return self.rmsnorm_func(x, self.scale, self.eps)
+        else:
+            # ||x||_2 / sqrt(hidden_size) is the RMS; eps is added to the RMS itself.
+            y = x / (x.norm(2, dim=-1, keepdim=True) * self.hidden_size ** (-1.0 / 2) + self.eps)
+            return self.scale * y
+class ParallelGatedMLP(nn.Module):
+    """Gated MLP: `l3(act(l1(z)) * l2(z))`, all three projections bias-free."""
+
+    def __init__(
+        self,
+        config,
+    ):
+        super().__init__()
+        multiple_of = config.get("inner_size_multiple_of", 64)
+        self.act_type = config.get("mlp_activation", "silu")
+        activations = {"gelu": F.gelu, "silu": F.silu}
+        if self.act_type not in activations:
+            raise NotImplementedError
+        self.act = activations[self.act_type]
+        self.multiple_of = multiple_of * config.model_parallel_size
+        # Default width: 8/3 of the hidden size, rounded up to `multiple_of`.
+        default_width = int(2 * config.hidden_size * 4 / 3)
+        num_chunks = (default_width + self.multiple_of - 1) // self.multiple_of
+        inner_size = self.multiple_of * num_chunks
+        # An explicit `inner_mlp_size` in the config wins over the default.
+        if config.get("inner_mlp_size", None) is not None:
+            inner_size = config.inner_mlp_size
+        hidden = config.hidden_size
+        self.l1 = nn.Linear(in_features=hidden, out_features=inner_size, bias=False)
+        self.l2 = nn.Linear(in_features=hidden, out_features=inner_size, bias=False)
+        self.l3 = nn.Linear(in_features=inner_size, out_features=hidden, bias=False)
+
+    def forward(self, z):
+        """Run the gated MLP; tuple-valued layer outputs are reduced to their first item."""
+        gate = grab_first_if_tuple(self.l1(z))
+        value = grab_first_if_tuple(self.l2(z))
+        projected = self.l3(self.act(gate) * value)
+        return grab_first_if_tuple(projected)
+class Embedding(nn.Module):
+    """Token embedding table that also serves as the output projection."""
+
+    _train_dtype = "bf16"
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
+
+    def embed(self, input_ids, position_ids=None, tokentype_ids=None):
+        """Look up embeddings for `input_ids`; the other arguments are ignored."""
+        embeddings = self.word_embeddings(input_ids)
+        return embeddings
+
+    def unembed(self, u):
+        """Project hidden states onto the vocabulary with the tied embedding table.
+
+        The table is `(vocab_size, hidden_size)`, so it is transposed before
+        the product, matching `VocabParallelEmbedding.unembed`.
+        """
+        weight = self.word_embeddings.weight
+        return torch.matmul(u, weight.T)
+class VocabParallelEmbedding(nn.Embedding):
+    """Embedding table optionally sharded over the vocabulary across a process group.
+
+    Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/embedding.py
+    """
+
+    def __init__(self, config):
+        vocab_size = config.vocab_size
+        process_group = config.get("process_group", None)
+        padding_idx = config.get("padding_idx", None)
+        self.process_group = process_group
+        world_size = 1
+        if process_group is not None:
+            world_size = torch.distributed.get_world_size(process_group)
+            if vocab_size % world_size != 0:
+                raise ValueError(
+                    f"vocab_size ({vocab_size}) must be divisible by " f"world_size ({world_size})"
+                )
+            if world_size > 1 and padding_idx is not None:
+                raise RuntimeError("ParallelEmbedding does not support padding_idx")
+        # Each rank holds an equal slice of the vocabulary.
+        super().__init__(
+            vocab_size // world_size,
+            embedding_dim=config.hidden_size,
+            padding_idx=padding_idx,
+        )
+
+    def embed(self, x: Tensor) -> Tensor:
+        """Embed token ids, summing the per-rank partial results when sharded."""
+        if self.process_group is None:
+            return self.forward(x)
+        rank = torch.distributed.get_rank(self.process_group)
+        shard_size = self.num_embeddings
+        shard_start = rank * shard_size
+        shard_end = (rank + 1) * shard_size
+        # True where the id belongs to another rank's shard and must be masked.
+        foreign_ids = (x < shard_start) | (x >= shard_end)
+        x = x - shard_start
+        x[foreign_ids] = 0
+        embeddings = self.forward(x)
+        embeddings[foreign_ids] = 0.0
+        # Reduce to the global process group
+        torch.distributed.all_reduce(embeddings, group=self.process_group)
+        return embeddings
+
+    def unembed(self, u: Tensor) -> Tensor:
+        """Project onto the vocabulary with the transposed table (unsharded only)."""
+        if self.process_group is not None:
+            raise NotImplementedError
+        return u @ self.weight.T

model.py ADDED Viewed

	@@ -0,0 +1,472 @@

+# Copyright (c) Together
+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Michael Poli
+# Note: MP and PP utilities are removed for ease of use and editing.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .cache import InferenceParams, RecurrentInferenceParams
+from .engine import HyenaInferenceEngine
+from .layers import ParallelGatedMLP, RMSNorm, VocabParallelEmbedding
+from .utils import column_split, print_rank_0
+try:
+    from flash_attn.modules.mha import MHA
+except ImportError:
+    "flash_attn not installed"
+try:
+    from .positional_embeddings import swap_mha_rope
+except ImportError:
+    "could not import swap_mha_rope from positional_embeddings.py"
+# dummy import to force huggingface to bundle the tokenizer
+from .tokenizer import ByteTokenizer
+class AttentionBlock(nn.Module):
+    """Pre-norm attention followed by a pre-norm gated MLP, each with a residual add."""
+
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.config = config
+        self.pre_norm = RMSNorm(config)
+        self.post_norm = RMSNorm(config)
+        self.layer_idx = layer_idx
+        self.proj_groups = config.get("proj_groups", 1)
+        dtype = config.get("attn_block_dtype", torch.bfloat16)
+        mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size_per_attention_head = head_dim
+        self.counter = 0
+        mha_kwargs = dict(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_heads_kv=config.num_attention_heads // self.proj_groups,
+            rotary_emb_dim=head_dim,
+            qkv_proj_bias=config.get("qkv_proj_bias", True),
+            rotary_emb_base=config.get("rotary_emb_base", 10000),
+            causal=True,
+            layer_idx=layer_idx,
+            out_proj_bias=config.get("mha_out_proj_bias", True),
+            use_flash_attn=self.config.use_flash_attn,
+        )
+        self.inner_mha_cls = MHA(**mha_kwargs).to(dtype=dtype)
+        # check if using interpolated rotary pos emb from config, and swap the rope emb
+        if config.get("use_interpolated_rotary_pos_emb", False):
+            rope_kwargs = {'scaling_factor': config.get("rotary_emb_scaling_factor", 1.)}
+            swap_mha_rope(mha=self.inner_mha_cls, kwargs_new_rope=rope_kwargs)
+        if self.config.get("smeared_gqa", False):
+            self.inner_mha_cls.num_heads_kv = self.inner_mha_cls.num_heads
+        rotary = self.inner_mha_cls.rotary_emb
+        rotary.register_buffer("inv_freq", rotary.inv_freq)
+        self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        """Run attention then the MLP; returns `(output, None)`.
+
+        A tensor `padding_mask` is multiplied into the activations before and
+        after attention.
+        """
+        has_mask = type(padding_mask) is torch.Tensor
+        if has_mask:
+            # workaround for masking bug in FA. This works because Wqkv does not have bias
+            # and attention scores will be also automatically zeroed.
+            u = u * padding_mask[..., None]
+        attn_out = self.inner_mha_cls(
+            self.pre_norm(u),
+            inference_params=inference_params,
+        )
+        u = attn_out + u
+        if has_mask:  # guard against bias
+            u = u * padding_mask[..., None]
+        u = self.mlp(self.post_norm(u)) + u
+        return u, None
+class ParallelHyenaFilter(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        """Create the parameters and helpers of one Hyena filter layer.
+
+        Reads its hyper-parameters from `config` (attribute access plus a
+        dict-style `get`), allocates the randomly initialised short-filter
+        weight/bias, the skip term `D`, and the modal `poles` / `residues`,
+        and builds a `HyenaInferenceEngine` for this `layer_idx`.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Falls back to one group per hidden channel when the key is absent.
+        self.hyena_filter_groups = config.get("hyena_filter_groups", self.config.hidden_size)
+        self.use_flashfft = config.get("use_flashfft", False)
+        self.state_size = config.state_size
+        self.hidden_size = config.hidden_size
+        self.num_filters = config.num_filters
+        self.inference_mode = config.get("inference_mode", True)
+        self.counter = 0
+        self.column_split_hyena = config.get("column_split_hyena", True)
+        assert self.hidden_size % self.num_filters == 0 and self.num_filters <= self.hidden_size
+        self.D = nn.Parameter(torch.zeros(self.hidden_size))
+        # attention heads are not used except to split post short_filter
+        # projections in the same way as the checkpoint
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+        # after preprocessing here we can save the new checkpoint
+        self.short_filter_length = config.short_filter_length
+        # One depthwise kernel per channel of the 3 * hidden_size wide projection.
+        self.short_filter_weight = nn.Parameter(torch.randn(3 * config.hidden_size, 1, config.short_filter_length))
+        # The bias is optional; it is None when `config.short_filter_bias` is falsy.
+        self.short_filter_bias = (
+            nn.Parameter(torch.randn(3 * config.hidden_size)) if config.short_filter_bias else None
+        )
+        # The engine is built with its default `iir_prefill_style`.
+        self.engine = HyenaInferenceEngine(layer_idx=layer_idx)
+        self.use_flash_depthwise = config.get("use_flash_depthwise", False)
+        self.data_dtype = None
+        if self.use_flash_depthwise:
+            # NOTE(review): `FlashDepthwiseConv1d` is not among the imports visible
+            # at the top of this file in this view — confirm it is in scope.
+            self.fir_fn = FlashDepthwiseConv1d(
+                channels=3 * self.hidden_size,
+                kernel_size=self.short_filter_length,
+                padding=self.short_filter_length - 1,
+                weights=self.short_filter_weight,
+                bias=self.short_filter_bias,
+                device=None,
+                dtype=self.config.get("depthwise_dtype", torch.bfloat16),
+            )
+        else:
+            self.fir_fn = F.conv1d
+        self.fftconv_fn = None
+        self.long_fir_threshold = config.get("long_fir_threshold", None)
+        if self.long_fir_threshold is not None:
+            assert self.use_flashfft is False, "long_fir_threshold not compatible with fused flashfft"
+        self.num_systems = self.hidden_size // self.hyena_filter_groups
+        # Last dim of size 2 holds the (real, imag) parts of each pole.
+        poles = torch.randn(self.num_systems, self.state_size, 1, 2)
+        # TODO: bring over init from internals
+        poles[..., 0] = 1e-2 * torch.randn(self.num_systems, self.state_size, 1)
+        poles[..., 1] = 1e-3 * torch.randn(self.num_systems, self.state_size, 1)
+        self.poles = nn.Parameter(poles)
+        self.residues = nn.Parameter(torch.randn(self.num_systems, self.state_size, 1, 2))
+        # Cached filter; `parallel_forward` calls `compute_filter` while this is None.
+        self.h = None
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        """Dispatch to step-wise or whole-sequence processing.
+
+        The step-wise path is taken once this layer already has an entry in
+        `inference_params.fir_state_dict`; otherwise the whole sequence is
+        processed in parallel.
+        """
+        already_prefilled = (
+            inference_params is not None and self.layer_idx in inference_params.fir_state_dict
+        )
+        if not already_prefilled:
+            return self.parallel_forward(u, inference_params, padding_mask)
+        return self.sequential_forward(u, inference_params)
+    def parallel_forward(self, u, inference_params=None, padding_mask=None):
+        """Process a whole sequence: short FIR filter, then the long filter with gating.
+
+        When `inference_params` is truthy the FIR state returned by the engine
+        is stored under this layer's index. Returns `(y, inference_params)`.
+        """
+        seqlen = u.shape[1]
+        z_pre, fir_state = self.engine.parallel_fir(
+            self.fir_fn,
+            u,
+            self.short_filter_weight,
+            self.short_filter_bias,
+            seqlen,
+            fir_length=self.short_filter_length,
+            inference_params=inference_params,
+            padding_mask=padding_mask,
+        )
+        if inference_params:
+            inference_params.fir_state_dict[self.layer_idx] = fir_state
+        # Use the cached filter when there is one, otherwise build it for this length.
+        if self.h is not None:
+            h = self.h
+        else:
+            h, _, _, _ = self.compute_filter(seqlen, u.device)
+        if self.hyena_filter_groups > 1:
+            h = h.repeat_interleave(self.hidden_size // self.hyena_filter_groups, 1)
+        # if inference_params is not None, we plan to perform generation:
+        # prefilling is handled by the engine.
+        dims = (
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size_per_attention_head,
+            self.state_size,
+            self.hyena_filter_groups,
+        )
+        iir_options = dict(
+            t=self.t,
+            poles=self.poles,
+            residues=self.residues,
+            dims=dims,
+            inference_params=inference_params,
+            layer_idx=self.layer_idx,
+            prefill_style=self.config.get("prefill_style", "fft"),
+            use_flashfft=self.use_flashfft,
+            fftconv_fn=self.fftconv_fn,
+            column_split_hyena=self.column_split_hyena,
+            long_fir_threshold=self.long_fir_threshold,
+            padding_mask=padding_mask,
+        )
+        y = self.engine.parallel_iir(z_pre, h, self.D, seqlen, **iir_options)
+        return y, inference_params
+    def sequential_forward(self, u, inference_params):
+        if self.data_dtype is None:
+            self.data_dtype = u.dtype
+        if len(u.shape) > 2:
+            u = u[:, -1]
+        fir_state, iir_state = (
+            inference_params.fir_state_dict[self.layer_idx],
+            inference_params.state_dict[self.layer_idx],
+        )
+        z_pre, fir_state = self.engine.step_fir(
+            u, fir_state, weight=self.short_filter_weight, bias=self.short_filter_bias
+        )
+        x2, x1, v = (
+            column_split(z_pre, self.num_attention_heads, self.hidden_size_per_attention_head)
+            if self.column_split_hyena
+            else z_pre.split([self.hidden_size, self.hidden_size, self.hidden_size], dim=1)
+        )
+        y, iir_state = self.engine.step_iir(
+            x2,
+            x1,
+            v,
+            self.D,
+            self.residues,
+            self.poles,
+            iir_state,
+            iir_groups=self.hyena_filter_groups,
+        )
+        inference_params.fir_state_dict[self.layer_idx] = fir_state
+        inference_params.state_dict[self.layer_idx] = iir_state
+        y = y.to(dtype=self.data_dtype)
+        return y[:, None], inference_params
+    def update_time(self, L, device):
+        """
+        Set [0, 1, ..., L-1] where L is the length of the current batch of inputs.
+        If L is greater than the length of the previous batch, then the time vector is
+        reinitialized. Otherwise, the time vector is truncated from cache.
+        """
+        if not hasattr(self, "t"):
+            self.t = torch.arange(L, device=device)[None, None]
+        elif self.t.shape[-1] < L:
+            self.t = torch.arange(L, device=device)[None, None]
+        else:
+            self.t = self.t[..., :L]
+    def compute_filter(self, L, device):
+        self.update_time(L, device)
+        filter_dtype = torch.float32
+        residues, log_poles = (
+            torch.view_as_complex(self.residues.to(filter_dtype)),
+            torch.view_as_complex(self.poles.to(filter_dtype)).log(),
+        )
+        h = (residues * (log_poles * self.t).exp()).real.sum(1)[None]
+        return h, filter_dtype, log_poles, residues
+class ParallelGatedConvBlock(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.low_mem_mode = config.get("low_mem_mode", False)
+        dtype = config.get("hyena_block_dtype", torch.float32)
+        mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+        self.pre_norm, self.post_norm = RMSNorm(config).to(dtype=dtype), RMSNorm(config).to(dtype=dtype)
+        self.filter = ParallelHyenaFilter(config, layer_idx).to(dtype=dtype)
+        self.projections = nn.Linear(config.hidden_size, 3 * config.hidden_size)
+        self.out_filter_dense = nn.Linear(config.hidden_size, config.hidden_size).to(dtype)
+        self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+        self.proj_norm_fn = self.proj_norm
+        self.res_mlp_norm_fn = self.res_mlp_norm
+        if self.config.get("compile", False):
+            self.proj_norm_fn = torch.compile(self.proj_norm, fullgraph=True, dynamic=False, mode="reduce-overhead")
+            self.res_mlp_norm_fn = torch.compile(
+                self.res_mlp_norm, fullgraph=True, dynamic=False, mode="reduce-overhead"
+            )
+    def proj_norm(self, x):
+        return self.projections(self.pre_norm(x))
+    def res_mlp_norm(self, x):
+        return self.mlp(self.post_norm(x)) + x
+    def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+        z = self.proj_norm_fn(u)
+        if type(padding_mask) == torch.Tensor:  # guard against bias
+            z = z * padding_mask[..., None]
+        z, inference_params = self.filter(z, inference_params=inference_params, padding_mask=padding_mask)
+        z_in = self.out_filter_dense(z) + u
+        if type(padding_mask) == torch.Tensor:  # guard against bias
+            z_in = z_in * padding_mask[..., None]
+        y = self.res_mlp_norm_fn(z_in)
+        return y, inference_params
+def get_block(config, layer_idx, flash_fft=None):
+    if layer_idx in config.attn_layer_idxs:
+        return AttentionBlock(config, layer_idx)
+    elif layer_idx in config.hyena_layer_idxs:
+        block = ParallelGatedConvBlock(config, layer_idx)
+        if config.get("use_flashfft", "False"):
+            block.filter.fftconv_fn = flash_fft
+        return block
+    else:
+        raise NotImplementedError
+class StripedHyena(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embedding_layer = VocabParallelEmbedding(config)
+        self.norm = RMSNorm(config) if config.get("final_norm", True) else None
+        self.unembed = self.embedding_layer if config.tie_embeddings else VocabParallelEmbedding(config)
+        if config.get("use_flashfft", "False"):
+            from flashfftconv import FlashFFTConv
+            self.flash_fft = FlashFFTConv(2 * config.seqlen, dtype=torch.bfloat16)
+        else:
+            self.flash_fft = None
+        self.blocks = nn.ModuleList(
+            get_block(config, layer_idx, flash_fft=self.flash_fft) for layer_idx in range(config.num_layers)
+        )
+    def forward(self, x, inference_params_dict=None, padding_mask=None):
+        L = x.shape[1]
+        x = self.embedding_layer.embed(x)
+        if inference_params_dict is not None:
+            x, inference_params_dict_out = self.stateful_forward(
+                x,
+                inference_params_dict=inference_params_dict,
+            )
+        else:
+            x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
+        x = self.norm(x)
+        x = self.unembed.unembed(x)
+        return x, inference_params_dict_out
+    def stateful_forward(self, x, inference_params_dict=None):
+        for block_idx, block in enumerate(self.blocks):
+            block_name = "mha" if block_idx in self.config.attn_layer_idxs else "hyena"
+            inference_params = inference_params_dict[block_name]
+            x, _ = block(x, inference_params=inference_params)
+        return x, inference_params_dict
+    def stateless_forward(self, x, padding_mask=None):
+        if type(padding_mask) == torch.Tensor:
+            x = x * padding_mask[..., None]
+        for _, block in enumerate(self.blocks):
+            x, _ = block(x, inference_params=None, padding_mask=padding_mask)
+        return x, None
+    def initialize_inference_params(self):
+        print_rank_0("Initializing inference params...")
+        inference_params_dict = {
+            "mha": InferenceParams(
+                max_seqlen=self.config.get("max_seqlen", 8192),
+                max_batch_size=self.config.get("max_batch_size", 1),
+                seqlen_offset=0,
+            ),
+            "hyena": RecurrentInferenceParams(
+                fir_filter_length=self.config.short_filter_length,
+                state_dim=self.config.state_size,
+                seqlen_offset=0,
+            ),
+        }
+        return inference_params_dict
+    def precompute_filters(self, L, device):
+        for block_idx, block in enumerate(self.blocks):
+            if type(block) == ParallelGatedConvBlock:
+                if type(block.filter) == ParallelHyenaFilter:
+                    L = block.filter.long_fir_threshold or L
+                    print_rank_0(f"Precomputing filters, L={L}...")
+                    filter_dtype = torch.float16 if L >= 2048 else torch.float32
+                    block.filter._set_time(L, device)
+                    residues, poles = (
+                        torch.view_as_complex(block.filter.residues.to(torch.float16)),
+                        torch.view_as_complex(block.filter.poles.to(torch.float16)),
+                    )
+                    block.filter.h = (residues * poles**block.filter.t).real.sum(1)[None]
+                    block.filter.h = block.filter.h.to(dtype=filter_dtype)
+    def load_poles_residues(self, path):
+        "Load different poles and residues for each layer."
+        for block_idx, block in enumerate(self.blocks):
+            if type(block) == ParallelGatedConvBlock:
+                if type(block.filter) == ParallelHyenaFilter:
+                    print(f"Loading poles and residues for block {block_idx}")
+                    poles = torch.load(path + f"/approx_poles_{block_idx+1}.pt", map_location="cpu")
+                    poles = torch.view_as_real(poles)
+                    residues = torch.load(path + f"/approx_residues_{block_idx+1}.pt", map_location="cpu")
+                    residues = torch.view_as_real(residues)
+                    poles = poles.permute(1, 0, 2).unsqueeze(-2)
+                    residues = residues.permute(1, 0, 2).unsqueeze(-2)
+                    block.filter.poles = nn.Parameter(poles)
+                    block.filter.residues = nn.Parameter(residues)
+    def to_bfloat16_except_poles_residues(self):
+        """Convert all parameters to bfloat16 except for the poles and residues.
+        Particularly important for longer prompts.
+        """
+        for k, p in self.named_parameters():
+            if "poles" not in k and "residues" not in k:
+                p.data = p.data.to(torch.bfloat16)
+    def load_from_split_converted_state_dict(self, path):
+        print("Loading from split converted state dict")
+        embedding_weight = torch.load(path + "/layer_00.pt")["word_embeddings.weight"]
+        self.embedding_layer.weight = nn.Parameter(embedding_weight.to(self.embedding_layer.weight.dtype))
+        print("Loading embedding weight ok")
+        if self.config.get("final_norm", False) is not None:
+            idx = len(self.blocks) + 1
+            final_norm_scale = torch.load(path + f"/layer_{idx:02d}.pt")["norm.scale"]
+            self.norm.scale = nn.Parameter(final_norm_scale.to(self.norm.scale.dtype))
+            print("loading final norm ok")
+        if not self.config.get("tie_embeddings", True):
+            idx = len(self.blocks) + 2
+            embedding_weight = torch.load(path + f"/layer_{idx:02d}.pt")["word_embeddings.weight"]
+            self.unembed.weight = nn.Parameter(embedding_weight.to(self.unembed.weight.dtype))
+            print("loading unembed weight ok")
+        for block_idx, block in enumerate(self.blocks):
+            print("loading block {}...".format(block_idx))
+            # strict = False if type(block) == ParallelGatedConvBlock else True
+            # some blocks (optionally) go through a round of conv distillation on some parameters
+            strict = True  # safer to be strict and account for every layer
+            loaded_dict = torch.load(path + f"/layer_{block_idx + 1:02d}.pt")
+            block.load_state_dict(loaded_dict, strict=strict)

modeling_hyena.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# -*- coding: utf-8 -*-
+"""StripedHyena custom code port for the Hugging Face Hub"""
+import torch
+from torch.nn import functional as F
+from .configuration_hyena import StripedHyenaConfig
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
+from transformers.utils import logging
+from typing import Optional, Tuple, Union
+from .model import StripedHyena
+from .utils import dotdict
+from .cache import InferenceParams
+from .engine import HyenaInferenceEngine
+from .layers import RMSNorm
+from .utils import dotdict, column_split
+logger = logging.get_logger(__name__)
+class StripedHyenaPreTrainedModel(PreTrainedModel):
+    """Base class tying the StripedHyena port to Hugging Face's `PreTrainedModel`.
+
+    It only declares class-level settings. The attribute names below are hooks
+    read by `PreTrainedModel`; see the transformers documentation for their
+    exact semantics.
+    """
+    # Configuration class associated with this model family.
+    config_class = StripedHyenaConfig
+    # NOTE(review): the causal-LM subclass stores its backbone as `backbone`,
+    # not `sh` -- confirm this prefix is what the loading code expects.
+    base_model_prefix = "sh"
+    # Disabled here; the causal-LM subclass sets it to True.
+    supports_gradient_checkpointing = False
+    # Block classes named here are presumably kept on a single device when the
+    # model is sharded -- TODO confirm against the transformers docs.
+    _no_split_modules = ["AttentionBlock", "ParallelGatedConvBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    # Regex patterns for checkpoint keys that may be missing / unexpected on load.
+    _keys_to_ignore_on_load_missing = [r"freq"]
+    _keys_to_ignore_on_load_unexpected = [r"fftconv", r"twiddle_factors"]
+    _supports_flash_attn_2 = True
+class StripedHyenaModelForCausalLM(StripedHyenaPreTrainedModel):
+    supports_gradient_checkpointing = True
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+        model_config = dotdict(config.to_dict())
+        self.backbone = StripedHyena(model_config)
+        self.backbone.gradient_checkpointing = False
+        self.config = config
+        vocab_size = config.vocab_size
+        if vocab_size % config.make_vocab_size_divisible_by != 0:
+            vocab_size += config.make_vocab_size_divisible_by - (
+                vocab_size % config.make_vocab_size_divisible_by
+            )
+        self.vocab_size = vocab_size
+        self.post_init()
+        self.force_dtype()
+    def force_dtype(self):
+        self.backbone.to_bfloat16_except_poles_residues()
+    def _set_gradient_checkpointing(self, enable, gradient_checkpointing_func):
+        self.backbone.gradient_checkpointing = enable
+    def get_input_embeddings(self):
+        return self.backbone.embedding_layer
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        past_key_values=None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if use_cache:
+            if self.backbone.gradient_checkpointing and self.backbone.training:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+            elif labels is not None:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with loss calculation. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        inputs = input_ids
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = self.backbone.initialize_inference_params()
+                batch_size = input_ids.shape[0]
+                past_key_values["mha"].max_batch_size = batch_size
+                past_key_values["hyena"].max_batch_size = batch_size
+            else:
+                seqlen_offset = past_key_values["mha"].seqlen_offset
+                if seqlen_offset == 0:
+                    # second loop through generate will have prompt_len + 1 as seqlen
+                    seqlen_offset = input_ids.shape[-1] - 1
+                    past_key_values["hyena"].seqlen_offset = seqlen_offset
+                    past_key_values["mha"].seqlen_offset = seqlen_offset
+                else:
+                    past_key_values["mha"].seqlen_offset += 1
+                    past_key_values["hyena"].seqlen_offset += 1
+                inputs = input_ids[
+                    :,
+                    -1:,
+                ]
+        logits, past_key_values = self.backbone(
+            inputs,
+            padding_mask=attention_mask,
+            inference_params_dict=past_key_values if use_cache else None,
+        )
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = F.cross_entropy(shift_logits, shift_labels)
+        if return_dict:
+            return CausalLMOutputWithPast(
+                logits=logits,
+                hidden_states=None,
+                past_key_values=past_key_values if use_cache else None,
+                loss=loss,
+            )
+        else:
+            return logits
+    @classmethod
+    def can_generate(cls) -> bool:
+        return True
+    def prepare_inputs_for_generation(
+        self, input_ids, attention_mask=None, past_key_values=None, **kwargs
+    ):
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+        }

positional_embeddings.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# This software is distributed under the terms of the Apache License, Version 2.0
+# Author: Armin Thomas, Eric Nguyen
+import torch
+import copy
+from einops import rearrange
+from flash_attn.layers.rotary import RotaryEmbedding
+from flash_attn.modules.mha import MHA
+# simple wrapper for flash-attn RoPE with linear scaling:
+class LinearlyScaledRotaryEmbedding(RotaryEmbedding):
+    def __init__(
+        self,
+        dim: int,
+        scaling_factor: float=1.,
+        base=10000.0,
+        interleaved=False,
+        scale_base=None,
+        pos_idx_in_fp32=True,
+        device=None,
+    ):
+        super().__init__(
+            dim=dim,
+            base=base,
+            interleaved=interleaved,
+            scale_base=scale_base,
+            pos_idx_in_fp32=pos_idx_in_fp32,
+            device=device
+        )
+        self._linear_scaling_factor = scaling_factor
+    # adpated from: https://github.com/Dao-AILab/flash-attention/blob/43ceab630bc6c27712428da5a33fc9cb5c369d91/flash_attn/layers/rotary.py#L368
+    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
+        # Reset the tables if the sequence length has changed,
+        # if we're on a new device (possibly due to tracing for instance),
+        # or if we're switching from inference mode to training
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached is None
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+            or (self.training and self._cos_cached.is_inference())
+        ):
+            self._seq_len_cached = seqlen
+            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
+            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
+            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
+            if self.pos_idx_in_fp32:
+                t = torch.arange(seqlen, device=device, dtype=torch.float32)
+                # linear scaling:
+                t = t / self._linear_scaling_factor
+                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
+                # will be large. Having it in bf16 will lose a lot of precision and cause the
+                # cos & sin output to change significantly.
+                # We want to recompute self.inv_freq if it was not loaded in fp32
+                if self.inv_freq.dtype != torch.float32:
+                    inv_freq = self._compute_inv_freq(device=device)
+                else:
+                    inv_freq = self.inv_freq
+            else:
+                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+                # linear scaling:
+                t = t / self._linear_scaling_factor
+                inv_freq = self.inv_freq
+            # Don't do einsum, it converts fp32 to fp16 under AMP
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+            freqs = torch.outer(t, inv_freq)
+            if self.scale is None:
+                self._cos_cached = torch.cos(freqs).to(dtype)
+                self._sin_cached = torch.sin(freqs).to(dtype)
+            else:
+                power = (
+                    torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
+                    - seqlen // 2
+                ) / self.scale_base
+                scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
+                # We want the multiplication by scale to happen in fp32
+                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
+                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
+                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
+                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
+# swap out RoPE of existing mha:
+def swap_mha_rope(
+    mha,
+    new_rope: torch.nn.Module=LinearlyScaledRotaryEmbedding,
+    kwargs_new_rope: dict=None
+):
+    # determine mha dtype and device:
+    dtype = mha.Wq.weight.dtype if mha.cross_attn else mha.Wqkv.weight.dtype
+    device = mha.Wq.weight.device if mha.cross_attn else mha.Wqkv.weight.device
+    # determine RoPE settings:
+    kwargs_old_rope = dict(
+        dim = mha.rotary_emb.dim,
+        base = mha.rotary_emb.base,
+        interleaved = mha.rotary_emb.interleaved,
+        scale_base = mha.rotary_emb.scale_base,
+        pos_idx_in_fp32 = mha.rotary_emb.pos_idx_in_fp32,
+        device = mha.rotary_emb.inv_freq.device
+    )
+    # delete old RoPE:
+    del mha.rotary_emb
+    # create new RoPE:
+    kwargs_new_rope = kwargs_new_rope or {'scaling_factor': 1.0}
+    scaled_rope = new_rope(
+        **kwargs_new_rope,
+        **kwargs_old_rope
+    ).to(dtype)
+    # attach new RoPE to mha:
+    mha.rotary_emb = scaled_rope
+    # make new sure RoPE is correctly registered:
+    assert isinstance(mha.rotary_emb, new_rope)
+    return mha

streamer.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from transformers import AutoTokenizer
+class BaseStreamer:
+    """
+    Base class from which `.generate()` streamers should inherit.
+    """
+    def put(self, value):
+        """Function that is called by `.generate()` to push new tokens"""
+        raise NotImplementedError()
+    def end(self):
+        """Function that is called by `.generate()` to signal the end of generation"""
+        raise NotImplementedError()
+class ByteStreamer(BaseStreamer):
+    """
+    Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.
+    <Tip warning={true}>
+    The API for the streamer classes is still under development and may change in the future.
+    </Tip>
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+    Examples:
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+        >>> tok = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+        >>> streamer = TextStreamer(tok)
+        >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
+        >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
+        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
+        ```
+    """
+    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.decode_kwargs = decode_kwargs
+        # variables used in the streaming process
+        self.token_cache = []
+        self.print_len = 0
+        self.next_tokens_are_prompt = True
+    def put(self, value):
+        """
+        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
+        """
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError("TextStreamer only supports batch size 1")
+        elif len(value.shape) > 1:
+            value = value[0]
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+        # Add the new token to the cache and decodes the entire thing.
+        self.token_cache.extend(value.tolist())
+        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+        # After the symbol for a new line, we flush the cache.
+        if text.endswith("\n"):
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        else:
+            printable_text = text[self.print_len : self.print_len + 1]
+            self.print_len += len(printable_text)
+        self.on_finalized_text(printable_text)
+    def end(self):
+        """Flushes any remaining cache and prints a newline to stdout."""
+        # Flush the cache, if it exists
+        if len(self.token_cache) > 0:
+            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        else:
+            printable_text = ""
+        self.next_tokens_are_prompt = True
+        self.on_finalized_text(printable_text, stream_end=True)
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Prints the new text to stdout. If the stream is ending, also prints a newline."""
+        print(text, flush=True, end="" if not stream_end else None)

tokenizer.py ADDED Viewed

	@@ -0,0 +1,116 @@

+# based on https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
+from abc import ABC
+import json
+import pathlib
+import torch
+import tqdm
+from tokenizers import Tokenizer
+from abc import abstractmethod
+from typing import Any, List, Union
+import numpy as np
+class HFAutoTokenizer:
+    def __init__(self, vocab_file):
+        self.tokenizer = Tokenizer.from_file(vocab_file)
+        self.eos = "</s>"
+        self.bos = "<s>"
+        self.eos_id = self.tokenize(self.eos)
+        self.bos_id = self.tokenize(self.bos)
+        self.vsize = 32000
+    def encode_to_list(self, text):
+        return self.tokenizer.encode(text, add_special_tokens=False)
+    def tokenize_file(self, input_file, output_file, verbose=False):
+        if verbose:
+            print(f"Tokenizing file: {input_file}")
+        if pathlib.Path(output_file).exists():
+            print(f"Output file {output_file} already exists, skipping")
+            return
+        with open(input_file, "r") as fin, open(output_file, "w") as fout:
+            for line in tqdm.tqdm(fin):
+                if verbose:
+                    print(f"Tokenizing line: {line[-200:]}")
+                data = json.loads(line.strip())
+                if "text" not in data.keys():
+                    break
+                tokenized_data = self.tokenize(data["text"])
+                fout.write(json.dumps({"tokens": tokenized_data}) + "\n")
+    def tokenize(self, text: str, *args, **kwargs):
+        ids = self.tokenizer.encode(text)
+        if type(ids) == list:
+            return torch.tensor(ids)
+        else:
+            return torch.tensor(ids.ids)
+    def tokenize_batch(self, text_batch):
+        return self.tokenizer.encode_batch(text_batch)
+    def detokenize(self, token_ids, skip_special_tokens=False):
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+    def detokenize_batch(self, token_ids_batch, skip_special_tokens=False):
+        out = []
+        for token_ids in token_ids_batch:
+            out.append(
+                self.detokenize(
+                    [t.item() for t in token_ids],
+                    skip_special_tokens=skip_special_tokens,
+                )
+            )
+        return out
+    @property
+    def eod(self):
+        return self.eod_id
+    @property
+    def vocab_size(self):
+        return 32000
+class ByteTokenizer:
+    """UTF-8 Encoder."""
+    def __init__(self):
+        self.vocab_size = 512
+        self.eod_id = 0
+        self.eos_id = 0
+        self.eos_token = 0
+        self.eos_token_id = 0
+        self.pad_id = 1
+    def clamp(self, n):
+        return max(32, min(n, self.vocab_size))
+    def decode_token(self, token: int):
+        return str(chr(self.clamp(token)))
+    def __call__(self, text: str, *args, **kwargs):
+        ids = torch.tensor(self.tokenize(text), dtype=torch.long).unsqueeze(0)
+        return {"input_ids": ids}
+    def tokenize(self, text: str):
+        return list(np.fromstring(text, dtype=np.uint8))
+    def tokenize_batch(self, text_batch: Union[List[str], str]):
+        if isinstance(text_batch, list):
+            return [self.tokenize(s) for s in text_batch]
+        else:
+            return self.tokenize(text_batch)
+    def decode(self, token_ids):
+        return "".join(list(map(self.decode_token, token_ids)))
+    def decode_batch(self, token_ids: Union[List[str], str]):
+        if isinstance(token_ids, list):
+            return [self.decode(s) for s in token_ids]
+        # elif if tensor, convert to list first
+        elif isinstance(token_ids, torch.Tensor):
+            return [self.decode(s) for s in token_ids.tolist()]
+        else:
+            return self.decode(token_ids)

utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import torch
+def grab_first_if_tuple(x):
+    if x.__class__.__name__ == "tuple":
+        return x[0]
+    else:
+        return x
+def column_split(x, num_heads, head_size):
+    """Split a tensor with `num_heads` alongside the head dimension, instead of
+    across heads. Fixed to three projections
+    """
+    x_reshaped = x.reshape(
+        x.shape[0],
+        num_heads,
+        3 * head_size,
+    )
+    x2, x1, v = (
+        x_reshaped[:, :, :head_size],
+        x_reshaped[
+            :,
+            :,
+            head_size : 2 * head_size,
+        ],
+        x_reshaped[:, :, 2 * head_size :],
+    )
+    x2, x1, v = (
+        x2.reshape(x2.shape[0], -1),
+        x1.reshape(x1.shape[0], -1),
+        v.reshape(v.shape[0], -1),
+    )
+    return x2, x1, v
+def get_init_from_string(init_str):
+    if type(init_str) == str:
+        if init_str == "torch.nn.init.zeros_":
+            return torch.nn.init.zeros_
+        elif init_str == "torch.nn.init.xavier_uniform_":
+            return torch.nn.init.xavier_uniform_
+        elif init_str == "torch.nn.init.xavier_normal_":
+            return torch.nn.init.xavier_normal_
+        else:
+            raise ValueError(f"Unrecognized init {init_str}")
+def print_rank_0(message, debug=False, end="\n"):
+    """Print from rank 0 only."""
+    if torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            print(message, flush=True, end=end)
+    else:
+        print(message, flush=True, end=end)
+class dotdict(dict):
+    """dict subclass whose keys can also be read, set and deleted as attributes.
+
+    Attribute reads fall back to ``dict.get``, so a missing key reads as
+    ``None`` instead of raising ``AttributeError``.
+    """
+    # Invoked only when normal attribute lookup fails; missing keys give None.
+    __getattr__ = dict.get
+    # Attribute assignment stores a dict item rather than an instance attribute.
+    __setattr__ = dict.__setitem__
+    # Attribute deletion removes the key; a missing key raises KeyError.
+    __delattr__ = dict.__delitem__
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks amd return the
+    first and last index of the vocabulary belonging to the `rank`
+    partition: Note that indices in [first, last]"""
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size
+        )