hudsongouge committed
Commit adf0368 · 0 Parent(s)

Update space
README.md ADDED
@@ -0,0 +1,14 @@
---
title: DAT Byte
emoji: 💬
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 5.34.2
app_file: app.py
pinned: false
license: apache-2.0
short_description: A chat interface for the DAT Byte LLM.
---

A chat and raw-completion playground for the DAT-Byte LLM, built with [Gradio](https://gradio.app) and served locally from an ONNX export via ONNX Runtime.
app.py ADDED
@@ -0,0 +1,265 @@
import gradio as gr
import os
import onnxruntime as ort
from inference.onnx_inference import generate_text, sequence_breaker_strings
from inference.model import ByteTokenizer

# --- Globals ---
MODEL_OPTIONS = [
    ("DAT-Byte Small (200M)", "small", True),
    ("DAT-Byte Medium", "medium", False),
    ("DAT-Byte Large", "large", False),
]

ONNX_PATH = "models/small.onnx"  # Path to the exported DAT-Byte Small ONNX model

# Cache for the ONNX session
SESSION_CACHE = {}
TOKENIZER = ByteTokenizer()

# Prepare sequence breakers
SEQUENCE_BREAKER_IDS = {TOKENIZER.im_start_id, TOKENIZER.im_end_id}
for s in sequence_breaker_strings:
    # These are single-byte tokens, so encode returns a list with one ID
    try:
        SEQUENCE_BREAKER_IDS.add(
            TOKENIZER.encode(s.encode("utf-8"), add_special_tokens=False)[0]
        )
    except IndexError:
        print(f"Warning: Could not encode sequence breaker string: {s}")


# --- Model Loading ---
def get_session(model_key):
    if model_key != "small":
        raise ValueError("Only DAT-Byte Small is available.")
    if model_key not in SESSION_CACHE:
        if not os.path.exists(ONNX_PATH):
            raise FileNotFoundError(f"ONNX model not found at {ONNX_PATH}")
        # Using CPUExecutionProvider as per the project's goal
        SESSION_CACHE[model_key] = ort.InferenceSession(
            ONNX_PATH, providers=["CPUExecutionProvider"]
        )
    return SESSION_CACHE[model_key]


# --- Gradio Callbacks ---
def chat_respond(
    message,
    history,
    model_name,
    max_tokens,
    temperature,
    top_k,
    dry_range,
    dry_allowed_length,
    dry_base,
    dry_multiplier,
    user_role="user",
    assistant_role="assistant",
):
    history = history or []
    model_key = next(
        (key for name, key, enabled in MODEL_OPTIONS if name == model_name and enabled),
        None,
    )
    if not model_key:
        history.append({"role": "user", "content": message})
        history.append(
            {"role": "assistant", "content": f"Model '{model_name}' is not available."}
        )
        return history

    try:
        session = get_session(model_key)
    except Exception as e:
        history.append({"role": "user", "content": message})
        history.append(
            {"role": "assistant", "content": f"[Model loading error: {str(e)}]"}
        )
        return history

    prompt = ""
    for turn in history:
        prompt += f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
    prompt += (
        f"<|im_start|>{user_role}\n{message}<|im_end|>\n<|im_start|>{assistant_role}\n"
    )

    generated_text, _ = generate_text(
        session=session,
        tokenizer=TOKENIZER,
        prompt=prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        stop_sequences=["<|im_end|>".encode("utf-8")],
        dry_sequence_breakers=SEQUENCE_BREAKER_IDS,
        dry_range=dry_range,
        dry_allowed_length=dry_allowed_length,
        dry_base=dry_base,
        dry_multiplier=dry_multiplier,
    )
    generated_text = generated_text.decode("utf-8", "ignore")

    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": generated_text})
    return history


def completion_respond(
    prompt,
    model_name,
    max_tokens,
    temperature,
    top_k,
    dry_range,
    dry_allowed_length,
    dry_base,
    dry_multiplier,
):
    model_key = next(
        (key for name, key, enabled in MODEL_OPTIONS if name == model_name and enabled),
        None,
    )
    if not model_key:
        return f"[Model '{model_name}' is not available or unknown.]"

    try:
        session = get_session(model_key)
    except Exception as e:
        return f"[Model loading error: {str(e)}]"

    generated_text, _ = generate_text(
        session=session,
        tokenizer=TOKENIZER,
        prompt=prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        dry_sequence_breakers=SEQUENCE_BREAKER_IDS,
        dry_range=dry_range,
        dry_allowed_length=dry_allowed_length,
        dry_base=dry_base,
        dry_multiplier=dry_multiplier,
    )
    # generate_text returns raw bytes; decode before handing the string to Gradio
    return generated_text.decode("utf-8", "ignore")


# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# DAT-Byte Playground (ONNX Accelerated)")
    with gr.Row():
        with gr.Column(scale=1):
            model_selector = gr.Radio(
                [opt[0] for opt in MODEL_OPTIONS],
                value=MODEL_OPTIONS[0][0],
                label="Model",
                interactive=True,
            )
            gr.Markdown("**Note:** Only DAT-Byte Small is currently available.")
            mode_selector = gr.Radio(
                ["Chat", "Raw Completion"], value="Chat", label="Mode"
            )
            max_tokens = gr.Slider(
                minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
            )
            temperature = gr.Slider(
                minimum=0.05, maximum=2.0, value=0.5, step=0.05, label="Temperature"
            )
            top_k = gr.Slider(minimum=0, maximum=256, value=15, step=1, label="Top-k")
            with gr.Accordion("DRY Sampling (Don't Repeat Yourself)", open=False):
                dry_range = gr.Slider(
                    minimum=0, maximum=2048, value=1024, step=32, label="Range"
                )
                dry_allowed_length = gr.Slider(
                    minimum=1, maximum=64, value=20, step=1, label="Allowed Length"
                )
                dry_base = gr.Slider(
                    minimum=1.0, maximum=5.0, value=2.0, step=0.1, label="Base"
                )
                dry_multiplier = gr.Slider(
                    minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Multiplier"
                )
            user_role_box = gr.Textbox("user", label="User Role", visible=True)
            assistant_role_box = gr.Textbox(
                "assistant", label="Assistant Role", visible=True
            )

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat", type="messages", height=600)
            with gr.Row():
                chat_input = gr.Textbox(
                    label="Message", placeholder="Type a message...", scale=4
                )
                send_button = gr.Button("Send", scale=1)
            completion_input = gr.Textbox(label="Prompt", visible=False)
            completion_output = gr.Textbox(label="Completion", visible=False)

    # UI Logic
    def update_mode(mode):
        is_chat = mode == "Chat"
        return (
            gr.update(visible=is_chat),  # chatbot
            gr.update(visible=is_chat),  # chat_input row
            gr.update(visible=not is_chat),  # completion_input
            gr.update(visible=not is_chat),  # completion_output
            gr.update(visible=is_chat),  # user_role_box
            gr.update(visible=is_chat),  # assistant_role_box
        )

    mode_selector.change(
        update_mode,
        [mode_selector],
        [
            chatbot,
            chat_input.parent,
            completion_input,
            completion_output,
            user_role_box,
            assistant_role_box,
        ],
    )

    # Event Handlers
    chat_inputs = [
        chat_input,
        chatbot,
        model_selector,
        max_tokens,
        temperature,
        top_k,
        dry_range,
        dry_allowed_length,
        dry_base,
        dry_multiplier,
        user_role_box,
        assistant_role_box,
    ]
    chat_args = {"fn": chat_respond, "inputs": chat_inputs, "outputs": [chatbot]}

    def clear_input():
        return ""

    clear_args = {"fn": clear_input, "inputs": [], "outputs": [chat_input]}

    send_button.click(**chat_args).then(**clear_args)
    chat_input.submit(**chat_args).then(**clear_args)

    completion_inputs = [
        completion_input,
        model_selector,
        max_tokens,
        temperature,
        top_k,
        dry_range,
        dry_allowed_length,
        dry_base,
        dry_multiplier,
    ]
    completion_input.submit(
        completion_respond,
        completion_inputs,
        [completion_output],
    )

if __name__ == "__main__":
    demo.launch()
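For reference, chat_respond flattens the running history into the <|im_start|>/<|im_end|> chat template before calling generate_text. A small sketch of the prompt string it builds for a two-turn history plus a new user message (illustrative values only, mirroring the loop above):

history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
]
prompt = ""
for turn in history:
    prompt += f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
prompt += "<|im_start|>user\nHow are you?<|im_end|>\n<|im_start|>assistant\n"
print(prompt)  # the exact text fed to the ONNX model; it completes the assistant turn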
commit.sh ADDED
@@ -0,0 +1 @@
git commit -am 'Update space' && git push
export_onnx.py ADDED
@@ -0,0 +1,58 @@
import torch
from inference.model import DiffTransformerLLM
from inference.inference import load_model
import argparse
import os


def main():
    parser = argparse.ArgumentParser(description="Export DiffTransformerLLM to ONNX")
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="Path to model checkpoint (.pt)"
    )
    parser.add_argument(
        "--onnx_path", type=str, default="model.onnx", help="Output ONNX file path"
    )
    parser.add_argument(
        "--seq_len", type=int, default=32, help="Dummy input sequence length"
    )
    args = parser.parse_args()

    device = torch.device("cpu")
    print(f"Loading model from {args.checkpoint}")
    model = load_model(args.checkpoint, device=device, fp16=False, quantize=False)
    model.eval()

    # Prepare dummy input
    batch_size = 1
    seq_len = args.seq_len
    input_ids = torch.randint(0, 259, (batch_size, seq_len), dtype=torch.long)

    # Create a dummy causal mask. This will be a dynamic input to the ONNX model.
    causal_mask = torch.triu(
        torch.ones(1, seq_len, seq_len, dtype=torch.bool), diagonal=1
    )
    attn_mask = torch.zeros(1, seq_len, seq_len, dtype=torch.float32)
    attn_mask.masked_fill_(causal_mask, float("-inf"))

    # Export to ONNX
    print(f"Exporting to ONNX: {args.onnx_path}")
    torch.onnx.export(
        model,
        (input_ids, attn_mask),
        args.onnx_path,
        input_names=["input_ids", "attn_mask"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "seq_len"},
            "attn_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
            "logits": {0: "batch_size", 1: "seq_len"},
        },
        opset_version=17,
        do_constant_folding=True,
    )
    print(f"ONNX export complete: {args.onnx_path}")


if __name__ == "__main__":
    main()
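A quick way to sanity-check an exported file (a separate sketch, not part of this script) is to open it with ONNX Runtime and confirm the graph exposes the input/output names declared above; the path assumes the default location used by app.py:

import onnxruntime as ort

sess = ort.InferenceSession("models/small.onnx", providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()])   # expect: ['input_ids', 'attn_mask']
print([o.name for o in sess.get_outputs()])  # expect: ['logits']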
inference/__init__.py ADDED
File without changes
inference/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes).
inference/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (155 Bytes).
inference/__pycache__/inference.cpython-312.pyc ADDED
Binary file (11.4 kB).
inference/__pycache__/inference.cpython-313.pyc ADDED
Binary file (11.8 kB).
inference/__pycache__/model.cpython-312.pyc ADDED
Binary file (9.27 kB).
inference/__pycache__/model.cpython-313.pyc ADDED
Binary file (9.51 kB).
inference/__pycache__/onnx_inference.cpython-312.pyc ADDED
Binary file (5.24 kB).
inference/__pycache__/onnx_inference.cpython-313.pyc ADDED
Binary file (7.44 kB).
inference/__pycache__/optimized_diffattn.cpython-312.pyc ADDED
Binary file (7.76 kB).
inference/__pycache__/optimized_diffattn.cpython-313.pyc ADDED
Binary file (7.79 kB).
inference/__pycache__/rotary.cpython-312.pyc ADDED
Binary file (2.52 kB).
inference/__pycache__/rotary.cpython-313.pyc ADDED
Binary file (2.41 kB).
inference/inference.py ADDED
@@ -0,0 +1,335 @@
import torch
import torch.nn.functional as F
import argparse
import os
import torch.quantization
from .model import (
    DiffTransformerLLM,
    ByteTokenizer,
    IM_START_TOKEN,
    IM_END_TOKEN,
    PAD_TOKEN,
)

force_CPU = True


def list_checkpoints(checkpoint_dir="checkpoints"):
    """List all available checkpoints in the directory."""
    if not os.path.exists(checkpoint_dir):
        print(f"Checkpoint directory {checkpoint_dir} not found.")
        return []

    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
    return sorted(checkpoints)


def load_model(checkpoint_path, device=None, fp16=True, quantize=True):
    """Load a trained model from a checkpoint, applying optimizations as needed."""
    if device is None:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
        )

    print(f"Loading checkpoint from {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # Hyperparams
    vocab_size = 259  # 256 bytes + 3 special tokens
    embed_dim = 768
    num_layers = 28
    num_heads = 12
    ffn_hidden_dim = embed_dim * 4
    max_seq_len = 512
    dropout = 0.1  # For inference you can set dropout=0

    # Model
    model = DiffTransformerLLM(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        ffn_hidden_dim=ffn_hidden_dim,
        max_seq_len=max_seq_len,
        dropout=dropout,
    )

    # The checkpoint is the state dict itself
    state_dict = checkpoint

    # Load the state dict into the float32 model first
    model.load_state_dict(state_dict)
    model.eval()

    # Apply device-specific optimizations
    # (quantize=False is used by the ONNX export path, which needs the float32 model)
    if device.type == "cpu" and quantize:
        print("Optimizing for CPU with dynamic quantization (int8).")
        # Set the quantization engine
        torch.backends.quantized.engine = "qnnpack"
        # Quantize the linear layers to int8 for performance
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
    elif device.type == "cuda" and fp16:
        print("Casting model to fp16 for CUDA.")
        model = model.half()

    model = model.to(device)

    print("Model loaded successfully.")
    return model


def generate_text(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    device=None,
    stop_sequences=[],
):
    """
    Generate text from a prompt using the trained model.

    Args:
        model: The trained DiffTransformerLLM model
        tokenizer: ByteTokenizer instance
        prompt: Text prompt to start generation (as a string)
        max_new_tokens: Maximum number of new tokens to generate
        temperature: Controls randomness. Lower is more deterministic.
        top_k: If > 0, only sample from the top k most likely tokens
        top_p: If > 0, sample from the smallest set of tokens whose cumulative probability exceeds p
        repetition_penalty: Penalize repetition. 1.0 means no penalty.
        device: Device to run inference on
        stop_sequences: List of strings that end generation when produced

    Returns:
        A tuple of (generated completion, prompt + completion)
    """
    if device is None:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
        )

    # Convert prompt to bytes and tokenize - process as-is without adding special tokens
    prompt_bytes = prompt.encode("utf-8", errors="replace")
    input_ids = (
        torch.tensor(
            tokenizer.encode(prompt_bytes, add_special_tokens=False), dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )
    stop_sequences = [
        tokenizer.encode(
            seq.encode("utf-8", errors="replace"), add_special_tokens=False
        )
        for seq in stop_sequences
    ]

    # Track generated token IDs
    generated_ids = input_ids.clone()
    generated_bytes = b""

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Only use the last max_seq_len tokens if we exceed the model's context length
            if generated_ids.size(1) > model.max_seq_len:
                input_ids = generated_ids[:, -model.max_seq_len :]
            else:
                input_ids = generated_ids

            # Forward pass to get logits for the next token
            logits = model(input_ids)

            # Get logits for the next token (last position)
            next_token_logits = logits[:, -1, :].squeeze(0)

            # Apply temperature
            if temperature > 0:
                next_token_logits = next_token_logits / temperature

            # Apply repetition penalty
            if repetition_penalty > 1.0:
                for token_id in set(generated_ids[0].tolist()):
                    next_token_logits[token_id] /= repetition_penalty

            # Apply top-k filtering
            if top_k > 0:
                top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
                next_token_logits = torch.full_like(next_token_logits, float("-inf"))
                next_token_logits.scatter_(0, top_k_indices, top_k_logits)

            # Apply top-p (nucleus) filtering
            if 0 < top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(
                    next_token_logits, descending=True
                )
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=0), dim=0)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the indices to the right to keep the first token above the threshold
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                next_token_logits[indices_to_remove] = float("-inf")

            # Sample from the filtered distribution
            probs = F.softmax(next_token_logits, dim=0)
            next_token = torch.multinomial(probs, 1)

            # Append the generated token to the sequence
            generated_ids = torch.cat([generated_ids, next_token.unsqueeze(0)], dim=1)
            # Decode the new token and stream it to stdout
            token_bytes = tokenizer.decode([next_token.item()])
            generated_bytes += token_bytes
            try:
                print(token_bytes.decode("utf-8", errors="replace"), end="", flush=True)
            except Exception as e:
                print(f"<Error decoding token: {e}>", end="", flush=True)

            # Check whether a stop sequence has just been completed
            stop_generated = False
            stop_seq = None
            for stop_seq in stop_sequences:
                if generated_ids.tolist()[0][-len(stop_seq) :] == stop_seq:
                    stop_generated = True
                    break
            if stop_generated:
                # Remove the stop sequence from the generated IDs
                generated_ids = generated_ids[:, : -len(stop_seq)]
                generated_bytes = generated_bytes[: -len(stop_seq)]
                break

    # Decode to bytes and then to string
    try:
        generated_text = generated_bytes.decode("utf-8", errors="replace")
    except Exception as e:
        print(f"\nError decoding generated text: {e}")
        generated_text = "<decoding error>"

    return generated_text, prompt + generated_text


def main():
    parser = argparse.ArgumentParser(
        description="Text generation with DiffAttention LLM"
    )
    parser.add_argument("--checkpoint", type=str, help="Path to the checkpoint file")
    parser.add_argument(
        "--prompt",
        type=str,
        default="""\nHow many 'b's are in "barber"? \n""",
    )
    parser.add_argument(
        "--max_tokens",
        type=int,
        default=500,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Sampling temperature"
    )
    parser.add_argument(
        "--top_k", type=int, default=10, help="Top-k sampling parameter (0 to disable)"
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Top-p (nucleus) sampling parameter (0 to disable)",
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.2,
        help="Repetition penalty (1.0 for no penalty)",
    )
    parser.add_argument(
        "--list_checkpoints",
        action="store_true",
        help="List available checkpoints and exit",
    )
    args = parser.parse_args()

    # List checkpoints if requested
    if args.list_checkpoints:
        print("Available checkpoints:")
        checkpoints = list_checkpoints()
        for i, ckpt in enumerate(checkpoints):
            print(f"{i+1}. {ckpt}")
        return

    # If no checkpoint specified, use the latest one
    if not args.checkpoint:
        checkpoints = list_checkpoints()
        if not checkpoints:
            print("No checkpoints found. Please train the model first.")
            return

        # Find the latest epoch_end checkpoint
        end_checkpoints = [ckpt for ckpt in checkpoints if "end.pt" in ckpt]
        if end_checkpoints:
            latest_checkpoint = max(end_checkpoints)
        else:
            latest_checkpoint = max(checkpoints)

        checkpoint_path = os.path.join("checkpoints", latest_checkpoint)
    else:
        checkpoint_path = args.checkpoint

    # Set device
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
    )
    print(f"Using device: {device}")

    # Initialize tokenizer
    tokenizer = ByteTokenizer()

    # Load model
    model = load_model(checkpoint_path, device)

    # Generate text
    print(f"\nGenerating text with prompt: '{args.prompt}'")
    print(
        f"Parameters: temperature={args.temperature}, top_k={args.top_k}, top_p={args.top_p}, repetition_penalty={args.repetition_penalty}"
    )
    print("\nGenerating...")

    generated_text, full_text = generate_text(
        model=model,
        tokenizer=tokenizer,
        prompt=args.prompt,
        max_new_tokens=args.max_tokens,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        device=device,
    )

    print("\n\nGenerated completion only:")
    print("-" * 40)
    print(generated_text)
    print("-" * 40)

    print("\nFull generated text (prompt + completion):")
    print("-" * 40)
    print(full_text)
    print("-" * 40)


if __name__ == "__main__":
    main()
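Besides the CLI in main(), the module can be driven directly from Python; a minimal sketch, with a hypothetical checkpoint filename:

import torch
from inference.inference import load_model, generate_text
from inference.model import ByteTokenizer

tokenizer = ByteTokenizer()
# "checkpoints/epoch_1_end.pt" is a placeholder path; use a real checkpoint file
model = load_model("checkpoints/epoch_1_end.pt", device=torch.device("cpu"))
completion, full_text = generate_text(
    model, tokenizer, "Hello, world!", max_new_tokens=50, temperature=0.7, top_k=10
)
print(completion)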
inference/model.py ADDED
@@ -0,0 +1,189 @@
import torch
import torch.nn as nn
import math

from .optimized_diffattn import MultiheadDiffAttn

# --- Tokenizer Definition ---
# Vocabulary: 256 bytes + IM_START_TOKEN + IM_END_TOKEN + <pad>
IM_START_TOKEN = "<|im_start|>"
IM_END_TOKEN = "<|im_end|>"
PAD_TOKEN = "<pad>"

SPECIAL_TOKENS = [IM_START_TOKEN, IM_END_TOKEN, PAD_TOKEN]
VOCAB_SIZE = 256 + len(SPECIAL_TOKENS)

# Create token to id mapping
token_to_id = {}
id_to_token = {}

for i in range(256):
    token_to_id[bytes([i])] = i
    id_to_token[i] = bytes([i])

for i, token_str in enumerate(SPECIAL_TOKENS):
    token_id = 256 + i
    token_to_id[token_str] = token_id
    id_to_token[token_id] = token_str

PAD_ID = token_to_id[PAD_TOKEN]
IM_START_ID = token_to_id[IM_START_TOKEN]
IM_END_ID = token_to_id[IM_END_TOKEN]


class ByteTokenizer:
    def __init__(self):
        self.token_to_id = token_to_id
        self.id_to_token = id_to_token
        self.vocab_size = VOCAB_SIZE
        self.pad_id = PAD_ID
        self.im_start_id = IM_START_ID
        self.im_end_id = IM_END_ID

    def encode(self, text_bytes: bytes, add_special_tokens=True):
        ids = [self.token_to_id[bytes([b])] for b in text_bytes]
        if add_special_tokens:
            return [self.im_start_id] + ids + [self.im_end_id]
        return ids

    def decode(self, ids: list[int]):
        tokens = []
        for i in ids:
            token = self.id_to_token.get(i)
            if token is None:
                # Handle unknown token ID if necessary, or raise error
                tokens.append(b"?")  # Placeholder for unknown
            elif isinstance(token, bytes):
                tokens.append(token)
            # Special tokens (stored as str) are skipped when decoding to raw bytes
        return b"".join(tokens)


# --- RoPE Embeddings --- (Reused from previous script)
def get_rotary_embeddings(seq_len, dim_model, theta=10000.0):
    if dim_model % 2 != 0:
        raise ValueError(f"dim_model must be even, got {dim_model}")
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, dim_model, 2).float() * -(math.log(theta) / dim_model)
    )
    angles = position * div_term
    cos_emb = torch.cos(angles)
    sin_emb = torch.sin(angles)
    return cos_emb, sin_emb


# --- Model Definition ---
class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x):
        return self.fc2(self.dropout(self.act(self.fc1(x))))


class DiffTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, depth, ffn_hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiheadDiffAttn(embed_dim, depth, num_heads, dropout=dropout)
        self.ffn = FeedForward(embed_dim, ffn_hidden_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, rel_pos, attn_mask=None):
        # Pre-norm
        attn_out = self.attn(self.norm1(x), rel_pos, attn_mask)
        x = x + self.dropout(attn_out)
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.dropout(ffn_out)
        return x


class DiffTransformerLLM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_layers,
        num_heads,
        ffn_hidden_dim,
        max_seq_len,
        dropout=0.1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        # Positional embeddings are handled by RoPE, so no separate nn.Embedding for positions
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList(
            [
                DiffTransformerBlock(
                    embed_dim, num_heads, depth, ffn_hidden_dim, dropout
                )
                for depth in range(num_layers)
            ]
        )
        self.norm_out = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)

        # Tie weights
        self.token_embeddings.weight = self.lm_head.weight

        # RoPE precomputation
        # The head_dim for MultiheadDiffAttn is embed_dim // num_heads // 2
        self.rope_head_dim = embed_dim // num_heads // 2
        cos_emb, sin_emb = get_rotary_embeddings(max_seq_len, self.rope_head_dim)
        self.register_buffer("cos_emb", cos_emb, persistent=False)
        self.register_buffer("sin_emb", sin_emb, persistent=False)

    def forward(self, input_ids, attn_mask=None):
        batch_size, seq_len = input_ids.shape

        x = self.token_embeddings(input_ids) * math.sqrt(self.embed_dim)
        x = self.dropout(x)

        # Ensure RoPE embeddings are on the same device *and* dtype as activations
        rel_pos = (
            self.cos_emb[:seq_len, :].to(x.device, dtype=x.dtype),
            self.sin_emb[:seq_len, :].to(x.device, dtype=x.dtype),
        )

        # Create causal attention mask if not provided
        if attn_mask is None:
            # Standard causal mask for autoregressive decoding
            # MultiheadDiffAttn expects a mask where -inf indicates masked positions
            causal_mask = torch.triu(
                torch.ones(seq_len, seq_len, device=x.device) * float("-inf"),
                diagonal=1,
            )
        else:
            # A provided mask (e.g. a padding mask) is currently not converted or used;
            # a standard causal mask is rebuilt regardless, and padding is handled by
            # the loss function's ignore_index during training.
            causal_mask = torch.triu(
                torch.ones(seq_len, seq_len, device=x.device) * float("-inf"),
                diagonal=1,
            )

        for layer in self.layers:
            x = layer(x, rel_pos, attn_mask=causal_mask)

        x = self.norm_out(x)
        logits = self.lm_head(x)
        return logits

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
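The tokenizer is a plain byte-to-id mapping plus three special tokens, so encoding and decoding round-trip raw bytes exactly; a short sketch:

tok = ByteTokenizer()
ids = tok.encode("héllo".encode("utf-8"), add_special_tokens=False)
assert tok.decode(ids) == "héllo".encode("utf-8")  # byte-exact round trip

wrapped = tok.encode(b"hi")  # default wraps the bytes in <|im_start|> ... <|im_end|>
assert wrapped[0] == tok.im_start_id and wrapped[-1] == tok.im_end_id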
inference/onnx_inference.py ADDED
@@ -0,0 +1,202 @@
import onnxruntime as ort
import numpy as np
import torch
import time
import argparse
from typing import Set, Optional
from .model import ByteTokenizer

sequence_breaker_strings = ["\n", ":", '"', "*", "<", ">", "|"]


class DRYLogitsProcessor:
    """
    Don't Repeat Yourself (DRY) Logits Processor that penalizes repetitive sequences.
    """

    def __init__(
        self,
        multiplier: float = 0.5,
        base: float = 2.0,
        allowed_length: int = 1,
        sequence_breakers: Optional[Set[int]] = None,
        range: int = 512,
    ):
        """
        Args:
            multiplier: Base penalty multiplier
            base: Exponential base for penalty calculation
            allowed_length: Length of sequence that's allowed to repeat without penalty
            sequence_breakers: Set of token IDs that should break sequence matching
            range: Number of previous tokens to consider for repetition checking
        """
        self.multiplier = multiplier
        self.base = base
        self.allowed_length = allowed_length
        self.sequence_breakers = sequence_breakers or set()
        self.range = range

    def __call__(self, input_ids: np.ndarray, scores: np.ndarray) -> np.ndarray:
        """
        Apply DRY penalty to logits.

        Args:
            input_ids: Array of shape (batch_size, seq_len)
            scores: Array of shape (vocab_size,) with logits

        Returns:
            Modified scores with penalties applied
        """
        if self.range > 0:
            input_ids = input_ids[:, -self.range :]

        # Convert to torch tensors for easier manipulation
        input_tensor = torch.from_numpy(input_ids)
        scores_tensor = torch.from_numpy(scores)

        for input_ids_row in input_tensor:
            # Raw integer must be extracted here to check for set membership
            last_token = input_ids_row[-1].item()

            if last_token in self.sequence_breakers:
                continue

            # Exclude the last token as it always matches
            match_indices = (input_ids_row[:-1] == last_token).nonzero(as_tuple=False)

            # Stores the maximum matching sequence length for each next token
            match_lengths = {}

            for i in match_indices.squeeze(1):
                i = i.item()
                if i + 1 >= len(input_ids_row):
                    continue

                next_token = input_ids_row[i + 1].item()

                if next_token in self.sequence_breakers:
                    continue

                # We have already found that `last_token` matches at this index,
                # so the match is at least of length 1.
                match_length = 1

                # Extend the match backwards as far as possible
                while True:
                    j = i - match_length
                    if j < 0:
                        break  # Start of input reached

                    if match_length + 1 > len(input_ids_row):
                        break  # End of input reached

                    previous_token = input_ids_row[-(match_length + 1)].item()
                    if input_ids_row[j] != previous_token:
                        break  # Start of match reached

                    if previous_token in self.sequence_breakers:
                        break  # Sequence-breaking token reached

                    match_length += 1

                # Update the maximum match length for this next token
                if match_length >= match_lengths.get(next_token, 0):
                    match_lengths[next_token] = match_length

            # Apply penalties
            for token, match_length in match_lengths.items():
                if match_length >= self.allowed_length:
                    penalty = self.multiplier * (
                        self.base ** (match_length - self.allowed_length)
                    )
                    scores_tensor[token] -= penalty

        return scores_tensor.numpy()


def generate_text(
    session,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.8,
    top_k=25,  # There are only 256 bytes total
    stop_sequences=None,
    dry_multiplier: float = 0.0,  # Set to 0 to disable DRY by default
    dry_base: float = 2.0,
    dry_allowed_length: int = 20,  # 20 since this is byte level.
    dry_sequence_breakers: Optional[Set[int]] = None,
    dry_range: int = 512,
):
    """Generate text using an ONNX model with DRY sampling and stop sequences."""
    input_ids_list = tokenizer.encode(prompt.encode("utf-8"), add_special_tokens=False)
    input_ids = np.array([input_ids_list], dtype=np.int64)

    generated_token_ids = []
    start_time = time.time()

    for _ in range(max_new_tokens):
        seq_len = input_ids.shape[1]

        # Create a causal mask for the current sequence length.
        causal_mask = np.triu(np.ones((1, seq_len, seq_len), dtype=np.bool_), k=1)
        attn_mask = np.zeros((1, seq_len, seq_len), dtype=np.float32)
        attn_mask[causal_mask] = -np.inf

        ort_inputs = {"input_ids": input_ids, "attn_mask": attn_mask}

        try:
            ort_outs = session.run(None, ort_inputs)
        except Exception as e:
            print(f"ONNX Runtime Error: {e}")
            # Potentially return or handle the error gracefully
            return "[ONNX Error]", 0

        logits = ort_outs[0][0, -1, :]

        # Apply DRY penalty if enabled
        if dry_multiplier > 0:
            dry_processor = DRYLogitsProcessor(
                multiplier=dry_multiplier,
                base=dry_base,
                allowed_length=dry_allowed_length,
                sequence_breakers=dry_sequence_breakers,
                range=dry_range,
            )
            logits = dry_processor(input_ids, logits)

        # Apply temperature scaling
        logits = logits / temperature

        # Apply top-k filtering
        if top_k > 0:
            top_k = min(top_k, logits.shape[-1])
            indices_to_remove = logits.argsort()[:-top_k]
            logits[indices_to_remove] = -float("inf")

        # Sample from the distribution
        probs = torch.softmax(torch.from_numpy(logits), dim=-1).numpy()
        next_token_id = np.random.choice(len(probs), p=probs)

        if next_token_id == tokenizer.im_end_id:
            break

        input_ids = np.append(input_ids, [[next_token_id]], axis=1)
        generated_token_ids.append(next_token_id)

        if stop_sequences:
            current_output = tokenizer.decode(np.array(generated_token_ids))
            stop_generation = False
            for seq in stop_sequences:
                if current_output.endswith(seq):
                    stop_generation = True
                    # Remove the stop sequence from the generated text
                    generated_token_ids = generated_token_ids[: -len(seq)]
                    current_output = tokenizer.decode(np.array(generated_token_ids))
                    break
            if stop_generation:
                break

    final_text = tokenizer.decode(np.array(generated_token_ids))
    tps = len(generated_token_ids) / (time.time() - start_time)
    return final_text, tps
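The DRY processor only penalises a candidate token once the repeated run it would extend reaches allowed_length, and the penalty grows exponentially beyond that. A toy illustration with made-up token IDs and settings:

import numpy as np

proc = DRYLogitsProcessor(multiplier=1.0, base=2.0, allowed_length=2, range=0)
# The context ends with "10 11 12", and "10 11 12" already occurred earlier,
# so continuing with token 13 would extend a repeat of length 3.
context = np.array([[10, 11, 12, 13, 99, 10, 11, 12]])
scores = np.zeros(259, dtype=np.float32)
penalized = proc(context, scores)
print(penalized[13])  # -2.0: penalty of multiplier * base**(3 - allowed_length)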
inference/optimized_diffattn.py ADDED
@@ -0,0 +1,177 @@
import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

# Re-use rotary embedding helper from the original codebase
from .rotary import apply_rotary_emb

# -----------------------------------------------------------------------------
# Utility helpers (copied from the original implementation)
# -----------------------------------------------------------------------------


def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Efficiently repeat keys / values for GQA without allocating new memory."""
    bs, n_kv_heads, slen, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, None, :, :]
        .expand(bs, n_kv_heads, n_rep, slen, head_dim)
        .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
    )


def lambda_init_fn(depth: int) -> float:
    """Init schedule described in the DiffAttention paper."""
    return 0.8 - 0.6 * math.exp(-0.3 * depth)


# -----------------------------------------------------------------------------
# Optimised Multi-head DiffAttention implementation
# -----------------------------------------------------------------------------


class MultiheadDiffAttn(nn.Module):
    """Optimised DiffAttention block.

    Differences from the original implementation:
    1. Removes the dependency on Apex / FusedRMSNorm; uses native LayerNorm.
    2. Keeps all tensors on-device and works well with autocast fp16/bf16.
    3. Minimises Python-side tensor reshapes and kernel launches.
    """

    def __init__(
        self,
        embed_dim: int,
        depth: int,
        num_heads: int,
        num_kv_heads: Optional[int] = None,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads  # query heads (will be doubled internally)
        self.num_kv_heads = num_kv_heads or num_heads
        self.n_rep = (
            self.num_heads // self.num_kv_heads
        )  # replication factor for keys / values (GQA)
        self.attn_dropout = dropout  # Store dropout rate for attention

        # One half of a traditional head – DiffAttention uses pairs of heads
        self.head_dim = embed_dim // self.num_heads // 2
        assert (
            self.head_dim * self.num_heads * 2 == embed_dim
        ), "embed_dim must be divisible by num_heads * 2"
        self.scaling = self.head_dim**-0.5

        # Projections. We keep them separated because K/V are smaller (GQA)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim // self.n_rep, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim // self.n_rep, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # Add dropout for regularization
        self.dropout = nn.Dropout(dropout)

        # DiffAttention lambda parameters (learnable)
        self.lambda_init = lambda_init_fn(depth)
        self.lambda_q1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_q2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)

        # Use standard LayerNorm which has a highly-optimised CUDA kernel
        self.subln = nn.LayerNorm(2 * self.head_dim, eps=1e-5)

    # ---------------------------------------------------------------------
    # Forward
    # ---------------------------------------------------------------------
    def forward(
        self,
        x: torch.Tensor,  # [bsz, seq_len, embed_dim]
        rel_pos: tuple[torch.Tensor, torch.Tensor],
        attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        bsz, seq_len, _ = x.size()

        # ---- Projections --------------------------------------------------
        # Projections (run inside the outer autocast context so they stay in
        # the low-precision dtype and use tensor cores)
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Reshape into paired heads (2 × heads)
        q = q.view(bsz, seq_len, 2 * self.num_heads, self.head_dim)
        k = k.view(bsz, seq_len, 2 * self.num_kv_heads, self.head_dim)
        v = v.view(bsz, seq_len, self.num_kv_heads, 2 * self.head_dim)

        # Rotary position encodings (ensure dtype matches q)
        cos, sin = rel_pos
        cos = cos.to(dtype=q.dtype)
        sin = sin.to(dtype=q.dtype)
        q = apply_rotary_emb(q, cos, sin, interleaved=True)
        k = apply_rotary_emb(k, cos, sin, interleaved=True)

        # ---- Prepare tensors for matmul ----------------------------------
        # Shape conventions follow PyTorch’s `scaled_dot_product_attention`:
        # (bsz, heads, seq, head_dim)
        q = q.transpose(1, 2)  # [bsz, 2*heads, seq, head_dim]
        k = k.transpose(1, 2)  # [bsz, 2*kv_heads, seq, head_dim]
        v = v.transpose(1, 2)  # [bsz, kv_heads, seq, 2*head_dim]

        # Replicate k/v heads when using GQA
        k = repeat_kv(k, self.n_rep)  # [bsz, 2*heads, seq, head_dim]
        v = repeat_kv(v, self.n_rep)  # [bsz, heads, seq, 2*head_dim]

        # ---- Fused scaled dot-product attention (Flash / SDPA) -----------
        #
        # We avoid instantiating the full (seq×seq) score matrix. Instead we
        # run the fused attention kernel twice (positive/negative queries) and
        # combine the resulting context tensors with the λ weighting. This
        # keeps everything in fp16/bf16 and leverages Blackwell’s Flash/SDPA
        # path, giving ~30-80× speed-up vs. the naive implementation.
        # ------------------------------------------------------------------

        # Re-arrange the paired heads: [bsz, 2*H, S, D] → [bsz, H, 2, S, D]
        q_pairs = q.view(bsz, 2, self.num_heads, seq_len, self.head_dim).permute(
            0, 2, 1, 3, 4
        )
        k_pairs = k.view(bsz, 2, self.num_heads, seq_len, self.head_dim).permute(
            0, 2, 1, 3, 4
        )

        q_pos, q_neg = q_pairs[:, :, 0], q_pairs[:, :, 1]  # [bsz, H, S, D]
        k_pos, k_neg = k_pairs[:, :, 0], k_pairs[:, :, 1]

        # λ scalar (identical across heads / sequence)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1)).type_as(q_pos)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2)).type_as(q_pos)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init  # scalar tensor

        # --- Fused attention (only TWO SDPA calls) -------------------------
        # Only apply attention dropout while training; keep inference deterministic.
        attn_dropout_p = self.attn_dropout if self.training else 0.0
        ctx_pos = F.scaled_dot_product_attention(
            q_pos, k_pos, v, dropout_p=attn_dropout_p, is_causal=True
        )  # [bsz, H, S, 2*D]
        ctx_neg = F.scaled_dot_product_attention(
            q_neg, k_neg, v, dropout_p=attn_dropout_p, is_causal=True
        )  # [bsz, H, S, 2*D]

        # DiffAttention combination
        attn_out = ctx_pos - lambda_full * ctx_neg  # [bsz, H, S, 2*D]

        # LayerNorm & residual scaling
        attn_out = self.subln(attn_out) * (1.0 - self.lambda_init)

        # Collapse heads and project out
        attn_out = attn_out.transpose(1, 2).reshape(  # [bsz, seq, heads, 2*head_dim]
            bsz, seq_len, self.embed_dim
        )
        # Apply output projection and dropout
        out = self.out_proj(attn_out)
        return self.dropout(out)
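A small shape check for the block above, a sketch assuming the defaults used elsewhere in this repo (embed_dim=768, 12 heads, so each half-head is 32-dimensional) and reusing get_rotary_embeddings from inference.model:

import torch
from inference.model import get_rotary_embeddings
from inference.optimized_diffattn import MultiheadDiffAttn

attn = MultiheadDiffAttn(embed_dim=768, depth=0, num_heads=12)
x = torch.randn(2, 16, 768)                           # [bsz, seq_len, embed_dim]
cos, sin = get_rotary_embeddings(16, attn.head_dim)   # head_dim == 32 here
out = attn(x, (cos, sin))
assert out.shape == x.shape  # DiffAttention preserves the embedding dimension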
inference/rotary.py ADDED
@@ -0,0 +1,76 @@
# Copyright (c) 2023, Tri Dao.

from typing import Optional, Union

import torch


def apply_rotary_emb_torch(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets=0,
    cu_seqlens=None,
    max_seqlen=None,
):
    # Only supports the basic (not interleaved, not variable-length) case.
    rotary_dim = cos.shape[1] * 2
    x1 = x[..., :rotary_dim]
    x2 = x[..., rotary_dim:]

    # Split [even, odd] pairs
    x1_1, x1_2 = x1[..., ::2], x1[..., 1::2]  # (..., rotary_dim/2)

    # Reshape cos/sin for broadcasting
    # x: [batch, seqlen, nheads, rotary_dim]
    # cos/sin: [seqlen, rotary_dim/2]
    # reshape to [1, seqlen, 1, rotary_dim/2] to broadcast
    cos = cos.unsqueeze(0).unsqueeze(2)
    sin = sin.unsqueeze(0).unsqueeze(2)

    rot_x1 = x1_1 * cos - x1_2 * sin
    rot_x2 = x1_1 * sin + x1_2 * cos
    # Interleave last dimension: (..., rotary_dim/2, 2) -> (..., rotary_dim)
    rot_x = torch.stack([rot_x1, rot_x2], dim=-1).reshape_as(x1)
    out = torch.cat([rot_x, x2], dim=-1)
    return out


def apply_rotary_emb(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    """
    # We are forcing the use of the pure PyTorch implementation (`apply_rotary_emb_torch`)
    # for all devices. The custom Triton kernel (`ApplyRotaryEmb`) was causing a graph
    # break in `torch.compile`, pushing expensive operations to the CPU.
    # By using the pure PyTorch version, `torch.compile` can create a single, fully-optimized
    # graph, which should resolve the CPU bottleneck and improve GPU utilization.
    return apply_rotary_emb_torch(
        x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
    )
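Usage sketch for the fallback path above; shapes follow the docstring, and the cos/sin tables here are random placeholders rather than real rotary tables:

import torch
from inference.rotary import apply_rotary_emb

x = torch.randn(1, 8, 4, 32)   # (batch, seqlen, nheads, headdim)
cos = torch.randn(8, 16)       # (seqlen, rotary_dim / 2), rotary_dim == headdim here
sin = torch.randn(8, 16)
out = apply_rotary_emb(x, cos, sin, interleaved=True)
assert out.shape == x.shape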
models/small.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06daa397631d28d8b2c1eee51f0f992c4e69927cc770a20d8ed5e2c40f95cc33
size 796014268
requirements.txt ADDED
@@ -0,0 +1 @@
torch>=2.2.0
onnxruntime
numpy
test-trad.py ADDED
@@ -0,0 +1,119 @@
from inference.inference import generate_text, list_checkpoints, load_model, force_CPU
import argparse
import os
import torch
from inference.model import ByteTokenizer


def main():
    parser = argparse.ArgumentParser(
        description="Text generation with DiffAttention LLM"
    )
    parser.add_argument("--checkpoint", type=str, help="Path to the checkpoint file")
    parser.add_argument(
        "--prompt",
        type=str,
        default="""<|im_start|>system\nYou are a helpful chatbot<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n""",
    )
    parser.add_argument(
        "--max_tokens",
        type=int,
        default=500,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Sampling temperature"
    )
    parser.add_argument(
        "--top_k", type=int, default=1, help="Top-k sampling parameter (0 to disable)"
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Top-p (nucleus) sampling parameter (0 to disable)",
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.0,
        help="Repetition penalty (1.0 for no penalty)",
    )
    parser.add_argument(
        "--list_checkpoints",
        action="store_true",
        help="List available checkpoints and exit",
    )
    args = parser.parse_args()

    # List checkpoints if requested
    if args.list_checkpoints:
        print("Available checkpoints:")
        checkpoints = list_checkpoints()
        for i, ckpt in enumerate(checkpoints):
            print(f"{i+1}. {ckpt}")
        return

    # If no checkpoint specified, use the latest one
    if not args.checkpoint:
        checkpoints = list_checkpoints()
        if not checkpoints:
            print("No checkpoints found. Please train the model first.")
            return

        # Find the latest epoch_end checkpoint
        end_checkpoints = [ckpt for ckpt in checkpoints if "end.pt" in ckpt]
        if end_checkpoints:
            latest_checkpoint = max(end_checkpoints)
        else:
            latest_checkpoint = max(checkpoints)

        checkpoint_path = os.path.join("checkpoints", latest_checkpoint)
    else:
        checkpoint_path = args.checkpoint

    # Set device
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
    )
    print(f"Using device: {device}")

    # Initialize tokenizer
    tokenizer = ByteTokenizer()

    # Load model
    model = load_model(checkpoint_path, device)

    # Generate text
    print(f"\nGenerating text with prompt: '{args.prompt}'")
    print(
        f"Parameters: temperature={args.temperature}, top_k={args.top_k}, top_p={args.top_p}, repetition_penalty={args.repetition_penalty}"
    )
    print("\nGenerating...")

    generated_text, full_text = generate_text(
        model=model,
        tokenizer=tokenizer,
        prompt=args.prompt,
        max_new_tokens=args.max_tokens,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=args.repetition_penalty,
        device=device,
    )

    print("\n\nGenerated completion only:")
    print("-" * 40)
    print(generated_text)
    print("-" * 40)

    print("\nFull generated text (prompt + completion):")
    print("-" * 40)
    print(full_text)
    print("-" * 40)


if __name__ == "__main__":
    main()
test.py ADDED
@@ -0,0 +1,81 @@
from inference.onnx_inference import generate_text
import argparse
import onnxruntime as ort
from inference.model import ByteTokenizer

sequence_breaker_strings = ["\n", ":", '"', "*", "<", ">", "|"]


def main():
    parser = argparse.ArgumentParser(
        description="Inference with ONNX DiffTransformerLLM"
    )
    parser.add_argument(
        "--onnx_path", type=str, default="models/small.onnx", help="Path to ONNX model"
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="<|im_start|>system\nYou are a helpful chatbot<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n",
        help="Prompt for the model",
    )
    parser.add_argument("--max_tokens", type=int, default=100, help="Max new tokens")
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Temperature for sampling"
    )
    parser.add_argument("--top_k", type=int, default=1, help="Top-k for sampling")
    parser.add_argument(
        "--stop_sequence", type=str, action="append", help="Stop sequence(s)"
    )
    # DRY sampling args
    parser.add_argument(
        "--dry_range", type=int, default=1024, help="Range for DRY sampling"
    )
    parser.add_argument(
        "--dry_allowed_length",
        type=int,
        default=17,
        help="Allowed repeat length for DRY sampling",
    )
    parser.add_argument(
        "--dry_base", type=float, default=1.1, help="Base for DRY penalty"
    )
    parser.add_argument(
        "--dry_multiplier", type=float, default=0.0, help="Multiplier for DRY penalty"
    )

    args = parser.parse_args()

    print(f"Loading ONNX model from {args.onnx_path}")
    session = ort.InferenceSession(args.onnx_path, providers=["CPUExecutionProvider"])
    tokenizer = ByteTokenizer()

    sequence_breaker_ids = {tokenizer.im_start_id, tokenizer.im_end_id}
    for s in sequence_breaker_strings:
        # These are single-byte tokens, so encode returns a list with one ID
        sequence_breaker_ids.add(
            tokenizer.encode(s.encode("utf-8"), add_special_tokens=False)[0]
        )

    print(f"Prompt: {args.prompt}")
    print("--- Output ---")
    generated_text, tps = generate_text(
        session,
        tokenizer,
        args.prompt,
        max_new_tokens=args.max_tokens,
        temperature=args.temperature,
        top_k=args.top_k,
        stop_sequences=["<|im_end|>".encode("utf-8")],
        dry_sequence_breakers=sequence_breaker_ids,
        dry_range=args.dry_range,
        dry_allowed_length=args.dry_allowed_length,
        dry_base=args.dry_base,
        dry_multiplier=args.dry_multiplier,
    )
    # generate_text returns raw bytes
    print(generated_text.decode("utf-8", "ignore"))
    print("--------------")
    print(f"\nPerformance: {tps:.2f} tokens/second")


if __name__ == "__main__":
    main()