leafspark committed on
Commit 56811f1 · verified · 1 Parent(s): 6be7fa2
Files changed (6)
  1. README.md +3 -3
  2. SCRIPT_README.md +22 -0
  3. generate.py +177 -0
  4. modello_italia.py +403 -0
  5. requirements.txt +5 -0
  6. tokenizer.model +3 -0
README.md CHANGED
@@ -1,3 +1,3 @@
- ---
- license: mit
- ---
+ ### Instructions
+
+ To run the model `italia.bin` along with its tokenizer `tokenizer.model`, you'll need the inference script. Once you have it, either move these two files into the `inference_script` folder or specify the correct path within the script.
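A minimal sketch of the check implied above (the `inference_script` folder name comes from these instructions; adjust the path if you keep the files elsewhere):

```python
# Sketch only: verify that italia.bin and tokenizer.model sit where generate.py will look for them.
from pathlib import Path

checkpoint_dir = Path("inference_script")  # or whatever directory you point the script at
for name in ("italia.bin", "tokenizer.model"):
    assert (checkpoint_dir / name).is_file(), f"missing {name} in {checkpoint_dir}"
```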
SCRIPT_README.md ADDED
@@ -0,0 +1,22 @@
+ ```python
+ # Modello Italia inference script and model
+ # Copyright 2024 iGenius
+ #
+ # Licensed under the MIT License (see LICENSE-MIT).
+ # This code also contains code from the original project licensed under the Apache License 2.0 (see LICENSE-APACHE).
+ # This script contains modifications of the original code from Lightning AI.
+ ```
+
+ ### Instructions
+
+ 1. First, move the model and the tokenizer from `/modello_italia_9b` to the current directory, or ensure that the path is correctly specified.
+
+ 2. Install dependencies by running the following command in the terminal:
+ ```terminal
+ pip install -r requirements.txt
+ ```
+
+ 3. To run the generation, use the following command:
+ ```terminal
+ python generate.py --checkpoint_dir <model_path> --max_new_tokens 500 --temperature 0.2 --prompt "Ciao, chi sei?"
+ ```
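For reference, a rough programmatic equivalent of the command above, written as a sketch that calls `main` from `generate.py` directly (the actual CLI is wired up through `jsonargparse`):

```python
# Sketch: run generation from Python instead of the shell.
from pathlib import Path

from generate import main

main(
    prompt="Ciao, chi sei?",
    max_new_tokens=500,
    temperature=0.2,
    checkpoint_dir=Path("."),  # directory containing italia.bin and tokenizer.model
)
```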
generate.py ADDED
@@ -0,0 +1,177 @@
+ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+ # Derived from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/generate/base.py
+
+ import os
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import torch
+
+ # support running without installing as a package
+ wd = Path(__file__).parent.parent.resolve()
+ sys.path.append(str(wd))
+
+ from modello_italia import Italia, ItaliaConfig, Tokenizer
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ MI_SYSTEM_PROMPT_SHORT = (
+     "Tu sei Modello Italia, un modello di linguaggio naturale addestrato da iGenius."
+ )
+
+
+ def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor:
+     if torch._dynamo.is_compiling():
+         # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly
+         distribution = torch.empty_like(probs).exponential_(1)
+         return torch.argmax(probs / distribution, dim=-1, keepdim=True)
+     return torch.multinomial(probs, num_samples=1)
+
+
+ def sample(
+     logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None
+ ) -> torch.Tensor:
+     logits = logits[0, -1]
+     # optionally crop the logits to only the top k options
+     if top_k is not None:
+         v, i = torch.topk(logits, min(top_k, logits.size(-1)))
+         # do not use `torch.where` as in nanogpt because it will repeat top-k collisions
+         logits = torch.full_like(logits, float("-inf")).scatter_(-1, i, v)
+     # optionally scale the logits and sample from a probability distribution
+     if temperature > 0.0:
+         probs = torch.nn.functional.softmax(logits / temperature, dim=-1)
+         return multinomial_num_samples_1(probs)
+     return torch.argmax(logits, dim=-1, keepdim=True)
+
+
+ def next_token(
+     model: Italia, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any
+ ) -> torch.Tensor:
+     logits = model(x, input_pos)
+     next = sample(logits, **kwargs)
+     return next.to(dtype=x.dtype)
+
+
+ @torch.inference_mode()
+ def generate(
+     model: Italia,
+     prompt: torch.Tensor,
+     tokenizer: Tokenizer,
+     max_returned_tokens: int,
+     *,
+     temperature: float = 1.0,
+     top_k: Optional[int] = None,
+     eos_id: Optional[int] = None,
+ ) -> torch.Tensor:
+     """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
+
+     The implementation of this function is modified from A. Karpathy's nanoGPT.
+
+     Args:
+         model: The model to use.
+         prompt: Tensor of shape (T) with indices of the prompt sequence.
+         max_returned_tokens: The maximum number of tokens to return (given plus generated).
+         tokenizer: Tokenizer instance to decode generated tokens.
+         temperature: Scales the predicted logits by 1 / temperature.
+         top_k: If specified, only sample among the tokens with the k highest probabilities.
+     """
+     T = prompt.size(0)
+     assert max_returned_tokens > T
+
+     device = prompt.device
+     tokens = [prompt]
+     input_pos = torch.tensor([T], device=device)
+     token = next_token(
+         model,
+         torch.arange(0, T, device=device),
+         prompt.view(1, -1),
+         temperature=temperature,
+         top_k=top_k,
+     ).clone()
+     tokens.append(token)
+     for _ in range(2, max_returned_tokens - T + 1):
+         token = next_token(
+             model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k
+         ).clone()
+         tokens.append(token)
+
+         if token == tokenizer.eos_id:
+             break
+         os.system('cls' if os.name == 'nt' else 'clear')
+         print(tokenizer.decode(torch.cat(tokens)[T:]))
+         input_pos = input_pos.add_(1)
+     return torch.cat(tokens)
+
+
+ @torch.inference_mode()
+ def main(
+     prompt: str = "Ciao, chi sei?",
+     *,
+     num_samples: int = 1,
+     max_new_tokens: int = 200,
+     top_k: Optional[int] = 200,
+     temperature: float = 0.4,
+     checkpoint_dir: Path = Path("."),
+ ) -> None:
+     """Generates text samples based on a pre-trained model and tokenizer.
+
+     Args:
+         prompt: The prompt string to use for generating the samples.
+         num_samples: The number of text samples to generate.
+         max_new_tokens: The number of generation steps to take.
+         top_k: The number of top most probable tokens to consider in the sampling process.
+         temperature: A value controlling the randomness of the sampling process. Higher values result in more random
+             samples.
+         checkpoint_dir: The checkpoint directory to load.
+     """
+
+     config = ItaliaConfig()
+     checkpoint_path = checkpoint_dir / "italia.bin"
+     tokenizer = Tokenizer(checkpoint_dir)
+     prompt = f"<|system|>{MI_SYSTEM_PROMPT_SHORT}\n<|user|>{prompt}\n<|assistant|>"
+     encoded = tokenizer.encode(prompt, device=device)
+     prompt_length = encoded.size(0)
+     max_returned_tokens = prompt_length + max_new_tokens
+
+     print(f"Loading model {str(checkpoint_path)!r}")
+
+     t0 = time.perf_counter()
+
+     model = Italia(config)
+     model.load_state_dict(torch.load(checkpoint_path, mmap=True))
+     model.to(device)
+
+     print(
+         f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.",
+         file=sys.stderr,
+     )
+     model.max_seq_length = max_returned_tokens
+     model.set_kv_cache(batch_size=1, device=device)
+     model.eval()
+
+     for _ in range(num_samples):
+         t0 = time.perf_counter()
+         y = generate(
+             model,
+             encoded,
+             tokenizer,
+             max_returned_tokens,
+             temperature=temperature,
+             top_k=top_k,
+         )
+         t = time.perf_counter() - t0
+         for block in model.transformer.h:
+             block.attn.kv_cache.reset_parameters()
+
+         #print(tokenizer.decode(y))
+         tokens_generated = y.size(0) - prompt_length
+         print(f"\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec")
+
+
+ if __name__ == "__main__":
+     from jsonargparse import CLI
+
+     torch.set_float32_matmul_precision("high")
+     CLI(main)
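For context, `main` above wraps the user prompt in a system/user/assistant template before encoding it. A small illustration of the resulting string, with values copied from the script (shown only as a sketch):

```python
# Sketch: the prompt string that main() builds before tokenization.
MI_SYSTEM_PROMPT_SHORT = (
    "Tu sei Modello Italia, un modello di linguaggio naturale addestrato da iGenius."
)
user_prompt = "Ciao, chi sei?"
prompt = f"<|system|>{MI_SYSTEM_PROMPT_SHORT}\n<|user|>{user_prompt}\n<|assistant|>"
print(prompt)
```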
modello_italia.py ADDED
@@ -0,0 +1,403 @@
+ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+ # Derived from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/model.py
+
+ import math
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+
+
+ from dataclasses import dataclass
+
+
+ from pathlib import Path
+ from typing import Optional, Union
+ from sentencepiece import SentencePieceProcessor
+ import torch
+
+
+ @dataclass
+ class ItaliaConfig:
+     block_size: int = 4096
+     vocab_size: int = 50_000
+     padding_multiple: int = 512
+     padded_vocab_size: int = 50176
+     head_size: int = 160
+     n_layer: int = 34
+     n_head: int = 32
+     n_embd: int = 5120
+     rotary_percentage: float = 0.4
+     parallel_residual: bool = True
+     bias: bool = True
+     lm_head_bias: bool = True
+     n_query_groups: int = 32
+     shared_attention_norm: bool = True
+     norm_eps: float = 1e-5
+     intermediate_size: int = 12800
+     rope_condense_ratio: int = 1
+     rope_n_elem: int = 64
+     rope_base: int = 10000
+
+
+ class Tokenizer:
+     def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
+         checkpoint_dir = Path(checkpoint_dir)
+         if not checkpoint_dir.exists():
+             raise NotADirectoryError(
+                 f"The checkpoint directory does not exist: {str(checkpoint_dir)}"
+             )
+
+         self.use_bos = True
+         self.bos_id = None
+         self.eos_id = None
+
+         if (vocabulary_path := checkpoint_dir / "tokenizer.model").is_file():
+             self.processor = SentencePieceProcessor(model_file=str(vocabulary_path))
+             self.backend = "sentencepiece"
+             self.bos_id = self.processor.bos_id()
+             self.eos_id = self.processor.eos_id()
+         else:
+             raise FileNotFoundError(
+                 f"tokenizer.model not found in {str(checkpoint_dir)}"
+             )
+
+     @property
+     def vocab_size(self) -> int:
+         return self.processor.vocab_size()
+
+     def token_to_id(self, token: str) -> int:
+         return self.processor.piece_to_id(token)
+
+     def encode(
+         self,
+         string: str,
+         device: Optional[torch.device] = None,
+         max_length: int = -1,
+     ) -> torch.Tensor:
+
+         tokens = self.processor.encode(string)
+         tokens = [self.bos_id] + tokens
+
+         if max_length > 0:
+             tokens = tokens[:max_length]
+         return torch.tensor(tokens, dtype=torch.int, device=device)
+
+     def decode(self, tensor: torch.Tensor) -> str:
+         tokens = [tensor.item()] if tensor.ndim == 0 else tensor.tolist()
+         return self.processor.decode(tokens).strip()
+
+
+ class Italia(nn.Module):
+     def __init__(self, config: ItaliaConfig) -> None:
+         super().__init__()
+         assert config.padded_vocab_size is not None
+         self.config = config
+
+         self.lm_head = nn.Linear(
+             config.n_embd, config.padded_vocab_size, bias=config.lm_head_bias
+         )
+         self.transformer = nn.ModuleDict(
+             dict(
+                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
+                 h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
+                 ln_f=nn.LayerNorm(config.n_embd, eps=config.norm_eps),
+             )
+         )
+         self.max_seq_length = self.config.block_size
+         self.mask_cache: Optional[torch.Tensor] = None
+
+     @property
+     def max_seq_length(self) -> int:
+         return self._max_seq_length
+
+     @max_seq_length.setter
+     def max_seq_length(self, value: int) -> None:
+         """
+         When doing inference, the sequences used might be shorter than the model's context length.
+         This allows setting a smaller number to avoid allocating unused memory.
+         """
+         if value > self.config.block_size:
+             raise ValueError(
+                 f"Cannot attend to {value}, block size is only {self.config.block_size}"
+             )
+         self._max_seq_length = value
+         if not hasattr(self, "cos"):
+             cos, sin = self.rope_cache()
+             self.register_buffer("cos", cos, persistent=False)
+             self.register_buffer("sin", sin, persistent=False)
+
+         elif value != self.cos.size(0):
+             self.cos, self.sin = self.rope_cache(device=self.cos.device)
+
+     def reset_parameters(self) -> None:
+         self.cos, self.sin = self.rope_cache()
+
+     def forward(
+         self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         T = idx.size(1)
+         if self.max_seq_length < T:
+             raise ValueError(
+                 f"Cannot forward sequence of length {T}, max seq length is only {self.max_seq_length}."
+             )
+
+         if input_pos is not None:  # use the kv cache
+             cos = self.cos.index_select(0, input_pos)
+             sin = self.sin.index_select(0, input_pos)
+             if self.mask_cache is None:
+                 raise TypeError("You need to call `gpt.set_kv_cache()`")
+             mask = self.mask_cache.index_select(2, input_pos)
+         else:
+             cos = self.cos[:T]
+             sin = self.sin[:T]
+             mask = None
+
+         x = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
+         for block in self.transformer.h:
+             x = block(x, cos, sin, mask, input_pos)
+         x = self.transformer.ln_f(x)
+         return self.lm_head(x)  # (b, t, vocab_size)
+
+     def rope_cache(
+         self, device: Optional[torch.device] = None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         return build_rope_cache(
+             seq_len=self.max_seq_length,
+             n_elem=self.config.rope_n_elem,
+             device=device,
+             condense_ratio=self.config.rope_condense_ratio,
+             base=self.config.rope_base,
+         )
+
+     def set_kv_cache(
+         self,
+         batch_size: int,
+         rope_cache_length: Optional[int] = None,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ) -> None:
+         if rope_cache_length is None:
+             rope_cache_length = self.cos.size(-1)
+         max_seq_length = self.max_seq_length
+
+         for block in self.transformer.h:
+             block.attn.kv_cache = block.attn.build_kv_cache(
+                 batch_size, max_seq_length, rope_cache_length, device, dtype
+             )
+
+         if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length:
+             self.mask_cache = build_mask_cache(max_seq_length, device)
+
+     def clear_kv_cache(self) -> None:
+         self.mask_cache = None
+         for block in self.transformer.h:
+             block.attn.kv_cache = None
+
+
+ class Block(nn.Module):
+     def __init__(self, config: ItaliaConfig) -> None:
+         super().__init__()
+         self.norm_1 = nn.LayerNorm(config.n_embd, eps=config.norm_eps)
+         self.attn = CausalSelfAttention(config)
+         self.mlp = MLP(config)
+         self.config = config
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cos: torch.Tensor,
+         sin: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         input_pos: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         n_1 = self.norm_1(x)
+         h = self.attn(n_1, cos, sin, mask, input_pos)
+         n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
+         x = self.mlp(n_2) + h + x
+         return x
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config: ItaliaConfig) -> None:
+         super().__init__()
+         shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
+         linear_module = nn.Linear
+         self.attn = linear_module(config.n_embd, shape, bias=config.bias)
+         self.proj = linear_module(config.n_embd, config.n_embd, bias=config.bias)
+         self.kv_cache: Optional[KVCache] = None
+
+         self.config = config
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cos: torch.Tensor,
+         sin: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         input_pos: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         B, T, _ = (
+             x.size()
+         )  # batch size, sequence length, embedding dimensionality (n_embd)
+
+         qkv = self.attn(x)
+
+         # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`)
+         q_per_kv = self.config.n_head // self.config.n_query_groups
+         total_qkv = q_per_kv + 2  # each group has 1+ queries, 1 key, and 1 value
+         qkv = qkv.view(
+             B, T, self.config.n_query_groups, total_qkv, self.config.head_size
+         )
+         qkv = qkv.permute(0, 2, 3, 1, 4)  # (B, n_query_groups, total_qkv, T, hs)
+
+         # split batched computation into three
+         q, k, v = qkv.split((q_per_kv, 1, 1), dim=2)
+
+         q = q.reshape(B, -1, T, self.config.head_size)  # (B, nh_q, T, hs)
+         k = k.reshape(B, -1, T, self.config.head_size)  # (B, nh_k, T, hs)
+         v = v.reshape(B, -1, T, self.config.head_size)  # (B, nh_v, T, hs)
+
+         q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin)
+         k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin)
+         q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1)
+         k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1)
+
+         if input_pos is not None:
+             if not isinstance(self.kv_cache, KVCache):
+                 raise TypeError("You need to call `gpt.set_kv_cache()`")
+             k, v = self.kv_cache(input_pos, k, v)
+
+         y = self.scaled_dot_product_attention(q, k, v, mask)
+
+         y = y.reshape(
+             B, T, self.config.n_embd
+         )  # re-assemble all head outputs side by side
+
+         # output projection
+         return self.proj(y)
+
+     def scaled_dot_product_attention(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         scale = 1.0 / math.sqrt(self.config.head_size)
+         y = torch.nn.functional.scaled_dot_product_attention(
+             q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None
+         )
+         return y.transpose(1, 2)
+
+     def build_kv_cache(
+         self,
+         batch_size: int,
+         max_seq_length: int,
+         rope_cache_length: Optional[int] = None,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ) -> "KVCache":
+         heads = 1 if self.config.n_query_groups == 1 else self.config.n_head
+         v_shape = (batch_size, heads, max_seq_length, self.config.head_size)
+         if rope_cache_length is None:
+             if self.config.rotary_percentage != 1.0:
+                 raise TypeError(
+                     "Please pass the `rope_cache_length=gpt.cos.size(-1)` value"
+                 )
+             k_shape = v_shape
+         else:
+             k_shape = (
+                 batch_size,
+                 heads,
+                 max_seq_length,
+                 rope_cache_length + self.config.head_size - self.config.rope_n_elem,
+             )
+         return KVCache(k_shape, v_shape, device=device, dtype=dtype)
+
+
+ class MLP(nn.Module):
+     def __init__(self, config: ItaliaConfig) -> None:
+         super().__init__()
+         self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
+         self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
+
+         self.config = config
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.fc(x)
+         x = torch.nn.functional.gelu(x, approximate="tanh")
+         return self.proj(x)
+
+
+ def build_rope_cache(
+     seq_len: int,
+     n_elem: int,
+     device: Optional[torch.device] = None,
+     base: int = 10000,
+     condense_ratio: int = 1,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """Enhanced Transformer with Rotary Position Embedding.
+
+     Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
+     transformers/rope/__init__.py. MIT License:
+     https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
+     """
+     # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+     theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
+
+     # Create position indexes `[0, 1, ..., seq_len - 1]`
+     seq_idx = torch.arange(seq_len, device=device) / condense_ratio
+
+     # Calculate the product of position index and $\theta_i$
+     idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)
+
+     return torch.cos(idx_theta), torch.sin(idx_theta)
+
+
+ def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+     head_size = x.size(-1)
+     x1 = x[..., : head_size // 2]  # (B, nh, T, hs/2)
+     x2 = x[..., head_size // 2 :]  # (B, nh, T, hs/2)
+     rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
+     roped = (x * cos) + (rotated * sin)
+     return roped.to(dtype=x.dtype)
+
+
+ class KVCache(nn.Module):
+     def __init__(
+         self,
+         k_shape: Tuple[int, int, int, int],
+         v_shape: Tuple[int, int, int, int],
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ) -> None:
+         super().__init__()
+         self.register_buffer(
+             "k", torch.zeros(k_shape, device=device, dtype=dtype), persistent=False
+         )
+         self.register_buffer(
+             "v", torch.zeros(v_shape, device=device, dtype=dtype), persistent=False
+         )
+
+     def forward(
+         self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # move the buffer to the activation dtype for when AMP is used
+         self.k = self.k.to(k.dtype)
+         self.v = self.v.to(v.dtype)
+         # update the cache
+         k = self.k.index_copy_(2, input_pos, k)
+         v = self.v.index_copy_(2, input_pos, v)
+         return k, v
+
+     def reset_parameters(self) -> None:
+         torch.nn.init.zeros_(self.k)
+         torch.nn.init.zeros_(self.v)
+
+
+ def build_mask_cache(
+     max_seq_length: int, device: Optional[torch.device] = None
+ ) -> torch.Tensor:
+     ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool)
+     return torch.tril(ones).unsqueeze(0).unsqueeze(0)
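As a quick sanity check of `ItaliaConfig` (a sketch, not part of the shipped code): with 32 heads of size 160 the embedding width is 5120, and the fused attention projection packs queries, keys, and values for all 32 query groups.

```python
# Sketch: derived dimensions of the configuration defined in modello_italia.py.
from modello_italia import ItaliaConfig

cfg = ItaliaConfig()
assert cfg.n_head * cfg.head_size == cfg.n_embd  # 32 * 160 == 5120
qkv_width = (cfg.n_head + 2 * cfg.n_query_groups) * cfg.head_size
print(qkv_width)  # 15360: output width of CausalSelfAttention.attn
```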
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ --find-links https://download.pytorch.org/whl/torch_stable.html
+
+ torch>=2.2.0
+ jsonargparse[cli]
+ sentencepiece
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd74bea2ba620d87e0a2127d9a21196b862a5cc7942ba4638eb2159bbab3340c
+ size 1090536