Upload MyLLaMa

Browse files

Files changed (5) hide show

README.md +199 -0
config.json +16 -0
configure_for_hf.py +54 -0
llama.py +430 -0
model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "architectures": [
+    "MyLLaMa"
+  ],
+  "auto_map": {
+    "AutoConfig": "configure_for_hf.MyLLaMaConfig",
+    "AutoModelForCausalLM": "configure_for_hf.MyLLaMa"
+  },
+  "embed_dim": 1536,
+  "model_type": "LLaMa",
+  "n_chckpnt_segments": 24,
+  "n_heads": 24,
+  "n_layers": 24,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0"
+}

configure_for_hf.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from torch import nn
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    PretrainedConfig,
+    PreTrainedModel,
+)
+from .llama import CustomAttentionLLaMa
+class MyLLaMaConfig(PretrainedConfig):
+    model_type = "LLaMa"
+    def __init__(
+        self,
+        embed_dim: int = 1536,
+        n_layers: int = 24,
+        n_heads: int = 24,
+        n_chckpnt_segments: int = 24,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_chckpnt_segments = n_chckpnt_segments
+        super().__init__(**kwargs)
+class MyLLaMa(PreTrainedModel):
+    config_class = MyLLaMaConfig
+    def __init__(self, config: MyLLaMaConfig):
+        super().__init__(config)
+        self.model = CustomAttentionLLaMa(
+            config.embed_dim,
+            config.n_layers,
+            config.n_heads,
+            dropout=0,
+            n_chckpnt_segments=config.n_chckpnt_segments,
+        )
+    def forward(self, tensor, labels=None):
+        logits = self.model(tensor)["logits"]
+        if labels is not None:
+            loss = nn.functional.cross_entropy(logits, labels)
+            return {"loss": loss, "logits": logits}
+        return {"logits": logits}
+AutoConfig.register("LLaMa", MyLLaMaConfig)
+AutoModel.register(MyLLaMaConfig, MyLLaMa)
+AutoModelForCausalLM.register(MyLLaMaConfig, MyLLaMa)

llama.py ADDED Viewed

	@@ -0,0 +1,430 @@

+import torch
+from torch import Tensor, nn
+from torch.nn import Sequential
+from torch.utils.checkpoint import checkpoint, checkpoint_sequential
+from xformers.components.attention.utils import maybe_merge_masks
+from xformers.components import MultiHeadDispatch
+from xformers.components.attention import ScaledDotProduct
+from transformers import AutoTokenizer
+class RotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim_per_head: int,
+        max_seq_len: int = 4096,
+        interpolation_ratio: float | None = 0.25,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__()
+        self.dim_per_head = dim_per_head
+        self.max_seq_len = max_seq_len
+        freqs = 1.0 / (
+            10000
+            ** (
+                torch.arange(0, dim_per_head, 2, device=device, dtype=dtype).float() / 6
+            )
+        )
+        freqs = torch.repeat_interleave(freqs, 2)
+        r = (
+            freqs
+            * torch.arange(max_seq_len, device=device, dtype=dtype).float()[:, None]
+        )
+        if interpolation_ratio is not None:
+            r = r * interpolation_ratio
+        r1 = r.cos()
+        self.register_buffer("r1", r1)
+        r2 = r.sin()
+        self.register_buffer("r2", r2)
+        aranged = torch.arange(dim_per_head, device=device, dtype=dtype)
+        mask1 = torch.where(
+            aranged % 2 == 1,
+            aranged - 1,
+            aranged + 1,
+        ).float()
+        self.register_buffer("mask1", mask1)
+        mask2 = torch.where(aranged % 2 == 0, -1, 1).float()
+        self.register_buffer("mask2", mask2)
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input tensor. shape: (bs, seq_len, n_heads, dim_per_head)
+        Returns:
+            Tensor: input tensor with rotary embeddings. shape: (bs, seq_len, n_heads, dim_per_head)
+        """
+        assert (
+            x.ndim == 4
+        ), "input must have 4 dimensions: (bs, n_heads, seq_len, dim_per_head)"
+        assert x.shape[3] % 2 == 0, "dim_per_head must be divisible by 2"
+        x = x.transpose(1, 2)
+        return (
+            x * self.r1[None, : x.shape[1], None, :]
+            + x[
+                :,
+                :,
+                :,
+                self.mask1,
+            ]
+            * self.mask2
+            * self.r2[None, : x.shape[1], None, :]
+        ).transpose(1, 2)
+    def extra_repr(self) -> str:
+        return f"dim_per_head={self.dim_per_head}, max_seq_len={self.max_seq_len}"
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-9):
+        super().__init__()
+        self.dim = dim
+        self.gamma = nn.Parameter(
+            data=torch.nn.init.normal_(torch.zeros((dim,))), requires_grad=True
+        )
+        self.eps = eps
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input tensor. shape: (bs, seq_len, embed_dim)
+        Returns:
+            Tensor: input tensor with rotary embeddings. shape: (bs, seq_len, embed_dim)
+        """
+        assert x.ndim == 3, "input must have 3 dimensions: (bs, seq_len, embed_dim)"
+        return (
+            x
+            / torch.sqrt_(torch.mean(torch.square(x), dim=-1) + self.eps)[:, :, None]
+            * self.gamma
+        )
+    def extra_repr(self) -> str:
+        return f"dim={self.dim}, eps={self.eps}"
+class SiLU(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input
+        """
+        return x * x.sigmoid()
+class SwiGLU(nn.Module):
+    def __init__(self, dim: int) -> None:
+        super().__init__()
+        self.linear_inp1 = nn.Linear(dim, (8 * dim) // 3, bias=False)
+        self.linear_inp2 = nn.Linear(dim, (8 * dim) // 3, bias=False)
+        self.linear_out = nn.Linear((8 * dim) // 3, dim, bias=False)
+        self.silu = SiLU()
+        # nn.init.xavier_uniform_(self.linear_inp1.weight)
+        # nn.init.xavier_uniform_(self.linear_inp2.weight)
+        # nn.init.xavier_uniform_(self.linear_out.weight)
+    def forward(self, x: Tensor):
+        """
+        Args:
+            x (Tensor): input tensor
+        """
+        return self.linear_out(self.silu(self.linear_inp1(x)) * self.linear_inp2(x))
+class MistralTokenizer(nn.Module):
+    def __init__(self, max_length=1024, *args, **kwargs):
+        super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "mistralai/Mistral-7B-v0.1", *args, **kwargs
+        )
+        self.tokenizer.add_special_tokens({"pad_token": "<pad>"})
+        self.special_tokens_ids = {
+            token: id
+            for token, id in zip(
+                self.tokenizer.special_tokens_map.keys(), self.tokenizer.all_special_ids
+            )
+        }
+        self.max_length = max_length
+        self.pad_token_id = self.tokenizer.pad_token_id
+    def forward(self, text):
+        return self.tokenizer(
+            text,
+            return_tensors="pt",
+            return_attention_mask=False,
+            max_length=self.max_length,
+            truncation=True,
+            padding=True,
+            padding_side="right",
+        )
+    def convert_ids_to_tokens(self, ids):
+        return self.tokenizer.convert_ids_to_tokens(ids)
+    def decode(self, x):
+        return self.tokenizer.batch_decode(x)
+    def __len__(self):
+        return len(self.tokenizer)
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        emb_size: int,
+        n_heads: int,
+        dropout: float = 0.0,
+        use_rotary_embeddings: bool = False,
+        bias_qkv: bool = False,
+        bias_out: bool = False,
+    ):
+        super().__init__()
+        self.emb_size = emb_size
+        self.n_heads = n_heads
+        assert (
+            self.emb_size % n_heads == 0
+        ), "Embedding size needs to be divisible by heads"
+        self.head_dim = emb_size // n_heads
+        self.use_rotary_embeddings = use_rotary_embeddings
+        if self.use_rotary_embeddings:
+            self.rotary_embed = RotaryEmbedding(self.head_dim)
+        self.qkv = nn.Linear(emb_size, emb_size * 3, bias=bias_qkv)
+        self.dropout = nn.Dropout(dropout)
+        self.out = nn.Linear(emb_size, emb_size, bias=bias_out)
+        self.scaling = self.head_dim**-0.5
+    def forward(self, x: Tensor, att_mask: Tensor = None):
+        qkv = self.qkv(x).chunk(3, dim=-1)
+        q, k, v = map(
+            lambda t: t.reshape(x.shape[0], -1, self.n_heads, self.head_dim).transpose(
+                1, 2
+            ),
+            qkv,
+        )  # [batch_size, n_heads, seq_len, head_dim]
+        if self.use_rotary_embeddings:
+            q, k = self.rotary_embed(q), self.rotary_embed(k)
+        dots = (
+            torch.matmul(q, k.transpose(-1, -2)) * self.scaling
+        )  # [batch_size, n_heads, seq_len, seq_len]
+        if att_mask is not None:
+            dots = dots + att_mask
+        attn = self.dropout(torch.softmax(dots, dim=-1))
+        out = (
+            torch.matmul(attn, v).transpose(1, 2).reshape(x.shape[0], -1, self.emb_size)
+        )
+        out = self.out(out)
+        return out
+class LLaMADecoderLayer(nn.Module):
+    def __init__(
+        self,
+        emb_size: int,
+        n_heads: int,
+        dropout: float,
+    ) -> None:
+        super().__init__()
+        self.emb_size = emb_size
+        self.multihead_attn = MultiHeadDispatch(
+            dim_model=emb_size,
+            num_heads=n_heads,
+            attention=ScaledDotProduct(
+                dropout=dropout,
+            ),
+            bias=(False, False, False, False),
+            use_rotary_embeddings=True,
+        )
+        self.rmsnorm1 = nn.RMSNorm(emb_size, eps=1e-9)
+        self.rmsnorm2 = nn.RMSNorm(emb_size, eps=1e-9)
+        self.swiglu = SwiGLU(emb_size)
+        self.n_heads = n_heads
+    def forward(self, in_tuple) -> Tensor:
+        """
+        Args:
+            in_tuple (tuple[Tensor, Tensor, Tensor]): tuple, containing 3 tensors:
+                x (Tensor): input tensor    (bs, seq_len, dim)
+                attn_mask (Tensor): attention mask  (seq_len, seq_len)
+                padding_mask (Tensor): padding mask (bs, seq_len)
+        Returns:
+            Tensor: output tensor
+        """
+        assert len(in_tuple) == 2, "input tuple must have 2 elements"
+        x, mask = in_tuple
+        x = self.multihead_attn(self.rmsnorm1(x), att_mask=mask) + x
+        return self.swiglu(self.rmsnorm2(x)) + x, mask
+class CustomAttentionLLaMaDecoder(LLaMADecoderLayer):
+    def __init__(
+        self,
+        emb_size: int,
+        n_heads: int,
+        dropout: float,
+    ) -> None:
+        super().__init__(emb_size, n_heads, dropout)
+        self.multihead_attn = MultiHeadAttention(
+            emb_size=emb_size,
+            n_heads=n_heads,
+            bias_qkv=False,
+            bias_out=False,
+            use_rotary_embeddings=True,
+            dropout=dropout,
+        )
+        self.rmsnorm1 = RMSNorm(emb_size, eps=1e-9)
+        self.rmsnorm2 = RMSNorm(emb_size, eps=1e-9)
+class LLaMaBase(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int = 512,
+        n_layers: int = 2,
+        n_heads: int = 8,
+        dropout: int = 0.0,
+        n_chckpnt_segments: int = 1,
+        tokenizer=MistralTokenizer(),
+        **kwargs,
+    ):
+        """
+        Args:
+            n_feats (int): number of input features.
+            n_class (int): number of classes.
+            fc_hidden (int): number of hidden features.
+        """
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.vocab_len = len(tokenizer)
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.n_layers = n_layers
+        self.embed_dim = embed_dim
+        self.n_segments = n_chckpnt_segments
+        self.embed = nn.Embedding(
+            self.vocab_len, embed_dim, padding_idx=self.tokenizer.pad_token_id
+        )
+        self.head = nn.Linear(embed_dim, self.vocab_len, bias=False)
+    def forward(self, src: Tensor, attn_mask: Tensor, pad_mask: Tensor, **batch):
+        """
+        Model forward method.
+        Args:
+            tokenized (Tensor): input text. shape: (batch_size, seq_len)
+        Returns:
+            output (dict): output dict containing logits.
+        """
+        raise NotImplementedError
+    def __str__(self):
+        """
+        Model prints with the number of parameters.
+        """
+        all_parameters = sum([p.numel() for p in self.parameters()])
+        trainable_parameters = sum(
+            [p.numel() for p in self.parameters() if p.requires_grad]
+        )
+        embedding_parameters = sum([p.numel() for p in self.embed.parameters()])
+        result_info = super().__str__()
+        result_info = result_info + f"\nAll parameters: {all_parameters}"
+        result_info = result_info + f"\nTrainable parameters: {trainable_parameters}"
+        result_info = (
+            result_info
+            + f"\nWithout embedding: {trainable_parameters - embedding_parameters}"
+        )
+        return result_info
+class CustomAttentionLLaMa(LLaMaBase):
+    def __init__(
+        self,
+        embed_dim: int = 512,
+        n_layers: int = 2,
+        n_heads: int = 8,
+        dropout: int = 0.0,
+        n_chckpnt_segments: int = 1,
+        tokenizer=MistralTokenizer(),
+        **kwargs,
+    ):
+        """
+        Args:
+            n_feats (int): number of input features.
+            n_class (int): number of classes.
+            fc_hidden (int): number of hidden features.
+        """
+        super().__init__(
+            embed_dim,
+            n_layers,
+            n_heads,
+            dropout,
+            n_chckpnt_segments,
+            tokenizer,
+        )
+        self.decoders = nn.Sequential(
+            *[
+                CustomAttentionLLaMaDecoder(
+                    emb_size=embed_dim, n_heads=self.n_heads, dropout=dropout
+                )
+                for _ in range(n_layers)
+            ]
+        )
+        self.rmsnorm = RMSNorm(embed_dim, eps=1e-9)
+    def forward(self, src: Tensor, attn_mask: Tensor, pad_mask: Tensor, **batch):
+        """
+        Model forward method.
+        Args:
+            tokenized (Tensor): input text. shape: (batch_size, seq_len)
+        Returns:
+            output (dict): output dict containing logits.
+        """
+        x = self.embed(src)  # embeds shape: [batch_size, seq_len, embed_dim]
+        sizes = x.shape
+        mask = maybe_merge_masks(
+            attn_mask, pad_mask, sizes[0], sizes[1], self.n_heads
+        ).view(x.shape[0], self.n_heads, sizes[1], sizes[1])
+        x, _ = checkpoint_sequential(self.decoders, self.n_segments, input=(x, mask))
+        # for decoder in self.decoders:
+        #     x, _, _ = decoder((x, attn_mask, pad_mask))
+        logits = self.head(self.rmsnorm(x))
+        return {
+            "logits": logits.permute(0, 2, 1)
+        }  # logits shape: [batch_size, vocab_len, seq_len]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6380bd512c40cdd9705099299688b1b4965a8da9da94f8f9d4b29a5b3ac5bf06
+size 3161813608