tyfeld committed
Commit ea359a8 · 1 Parent(s): 2f15a78
app.py ADDED
@@ -0,0 +1,871 @@
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from transformers import AutoTokenizer
6
+ from torchvision import transforms
7
+ from models import MAGVITv2, get_mask_schedule, MMadaModelLM
8
+ from training.prompting_utils import UniversalPrompting
9
+ from PIL import Image
10
+
11
+ def image_transform(image, resolution=256, normalize=True):
12
+ image = transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BICUBIC)(image)
13
+ image = transforms.CenterCrop((resolution, resolution))(image)
14
+ image = transforms.ToTensor()(image)
15
+ if normalize:
16
+ image = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)(image)
17
+ return image
18
+
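A minimal usage sketch (assuming the imports at the top of this file): the transform resizes, center-crops, and normalizes a PIL image into a (3, resolution, resolution) tensor with values roughly in [-1, 1].

    img = Image.new("RGB", (640, 480), (128, 128, 128))   # placeholder image for illustration
    t = image_transform(img, resolution=256)
    print(t.shape)                                         # torch.Size([3, 256, 256])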
19
+ def add_gumbel_noise(logits, temperature):
20
+ """
21
+ Adds Gumbel noise to logits for stochastic sampling.
22
+ Equivalent to argmax(logits + temperature * G) where G ~ Gumbel(0,1).
23
+ This version is more numerically stable than a version involving exp() and division.
24
+ """
25
+ if abs(temperature) < 1e-9: # Effectively zero temperature
26
+ return logits
27
+ # Use float64 for numerical precision when adding Gumbel noise
28
+ logits = logits.to(torch.float64)
29
+ # Standard Gumbel noise: -log(-log(U)), U ~ Uniform(0,1)
30
+ # Add small epsilon for numerical stability inside logs
31
+ noise = torch.rand_like(logits, dtype=torch.float64)
32
+ standard_gumbel_noise = -torch.log(-torch.log(noise + 1e-20) + 1e-20)
33
+ return logits + temperature * standard_gumbel_noise
34
+
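A minimal sampling sketch (assuming the imports above): adding Gumbel noise and taking the argmax draws a token from the temperature-scaled softmax, while temperature 0 reduces to a plain greedy argmax.

    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]], dtype=torch.float64)
    sampled = torch.argmax(add_gumbel_noise(logits, temperature=1.0), dim=-1)  # stochastic pick
    greedy = torch.argmax(add_gumbel_noise(logits, temperature=0.0), dim=-1)   # deterministic argmax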
35
+ def get_num_transfer_tokens(mask_index, steps):
36
+ mask_num = mask_index.sum(dim=1, keepdim=True)
37
+ # Ensure steps is at least 1 to avoid division by zero if mask_num is also 0 (though sum should be >=0)
38
+ steps = max(1, int(steps)) # Ensure steps is a positive integer
39
+ base = mask_num // steps
40
+ remainder = mask_num % steps
41
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.long) + base
42
+ for i in range(mask_num.size(0)): # Iterate over batch
43
+ if remainder[i] > 0 : # Ensure remainder is positive before indexing
44
+ num_transfer_tokens[i, :remainder[i].item()] += 1 # .item() for single value tensor to int
45
+ return num_transfer_tokens
46
+
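A quick sketch of the per-step schedule this produces: masked positions are split as evenly as possible across the steps, with the remainder front-loaded.

    mask_index = torch.tensor([[True] * 10 + [False] * 6])   # 10 masked positions
    print(get_num_transfer_tokens(mask_index, steps=4))      # tensor([[3, 3, 2, 2]])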
47
+ MODEL = None
48
+ TOKENIZER = None
49
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
50
+ MASK_ID = None
51
+ uni_prompting = None
52
+ VQ_MODEL = MAGVITv2().from_pretrained("/data_storage/shared/pretrained_models/models--showlab--magvitv2").to(DEVICE)
53
+
54
+ DEFAULT_MODEL_PATH = "/data_storage/lbw/MMaDA/mmada-training-stage3-llada-instruct-512-cot-uni/checkpoint-210000/unwrapped_model" # Default
55
+ CURRENT_MODEL_PATH = None
56
+
57
+ MODEL_CHOICES = [
58
+ "MMaDA-8B-Base",
59
+ "MMaDA-8B-MixCoT (coming soon)",
60
+ "MMaDA-8B-Max (coming soon)"
61
+ ]
62
+ MODEL_ACTUAL_PATHS = {
63
+ "MMaDA-8B-Base": DEFAULT_MODEL_PATH,
64
+ }
65
+
66
+ def clear_outputs_action():
67
+ return None, None
68
+
69
+ def _load_model_and_tokenizer_core(model_path_to_load, model_display_name_for_status):
70
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH, DEVICE, uni_prompting
71
+
72
+ if MODEL is not None and CURRENT_MODEL_PATH == model_path_to_load:
73
+ return f"Model '{model_display_name_for_status}' from '{model_path_to_load}' is already loaded. MASK_ID: {MASK_ID}"
74
+
75
+ CURRENT_MODEL_PATH = model_path_to_load
76
+
77
+ status_msg_parts = [f"Loading '{model_display_name_for_status}'..."]
78
+ try:
79
+ TOKENIZER = AutoTokenizer.from_pretrained(model_path_to_load, trust_remote_code=True)
80
+ status_msg_parts.append(f"Tokenizer for '{model_display_name_for_status}' loaded.")
81
+
82
+ MODEL = MMadaModelLM.from_pretrained(model_path_to_load, trust_remote_code=True, torch_dtype=torch.bfloat16).to(DEVICE).eval()
83
+ status_msg_parts.append(f"Model '{model_display_name_for_status}' loaded to {DEVICE}.")
84
+
85
+ uni_prompting = UniversalPrompting(TOKENIZER, max_text_len=512, special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=True)
86
+
87
+ if hasattr(TOKENIZER, 'mask_token_id') and TOKENIZER.mask_token_id is not None:
88
+ MASK_ID = TOKENIZER.mask_token_id
89
+ status_msg_parts.append(f"Using MASK_ID from tokenizer: {MASK_ID}.")
90
+ else:
91
+ MASK_ID = 126336
92
+ status_msg_parts.append(f"Using default MASK_ID: {MASK_ID}.")
93
+
94
+ if TOKENIZER.pad_token_id is None:
95
+ if TOKENIZER.eos_token_id is not None:
96
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
97
+ TOKENIZER.pad_token = TOKENIZER.eos_token
98
+ status_msg_parts.append(f"Set pad_token_id to eos_token_id ({TOKENIZER.eos_token_id}).")
99
+ else:
100
+ status_msg_parts.append("Warning: pad_token_id is None and no eos_token_id.")
101
+
102
+ if TOKENIZER.eos_token_id is None: # Important for cleaning up output in visualization
103
+ status_msg_parts.append("Warning: tokenizer.eos_token_id is None. EOS cleanup might not work.")
104
+
105
+ TOKENIZER.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}"
106
+
107
+ return " ".join(status_msg_parts)
108
+ except Exception as e:
109
+ MODEL = None
110
+ TOKENIZER = None
111
+ MASK_ID = None
112
+ CURRENT_MODEL_PATH = None
113
+ return f"Error loading model '{model_display_name_for_status}': {str(e)}"
114
+
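For reference, a sketch of the prompt string produced by the chat template assigned above (hypothetical question; `<bos>` stands for the tokenizer's actual BOS token):

    msgs = [{"role": "user", "content": "What is 2 + 2?"}]
    text = TOKENIZER.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
    # -> "<bos><|start_header_id|>user<|end_header_id|>\nWhat is 2 + 2?<|eot_id|>"
    #    "<|start_header_id|>assistant<|end_header_id|>\n"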
115
+ def handle_model_selection_change(selected_model_name_ui):
116
+ if "coming soon" in selected_model_name_ui.lower():
117
+ global MODEL, TOKENIZER, MASK_ID, CURRENT_MODEL_PATH
118
+ MODEL = None
119
+ TOKENIZER = None
120
+ MASK_ID = None
121
+ CURRENT_MODEL_PATH = None
122
+ return f"'{selected_model_name_ui}' is not yet available. Please select 'Model A'."
123
+
124
+ actual_path = MODEL_ACTUAL_PATHS.get(selected_model_name_ui)
125
+ if not actual_path:
126
+ return f"Path for '{selected_model_name_ui}' is not defined. Cannot load."
127
+
128
+ return _load_model_and_tokenizer_core(actual_path, selected_model_name_ui)
129
+
130
+
131
+ def get_highlighted_text_tuples(current_x_ids_batch, prompt_input_ids, prompt_len, tk, current_mask_id, raw_prompt_attention_mask):
132
+ if current_x_ids_batch is None or current_x_ids_batch.ndim == 0 or current_x_ids_batch.shape[0] == 0:
133
+ return [("Error in sequence data for visualization.", "ERROR")]
134
+ # Keep only the generated (answer) part; drop the prompt tokens
135
+ current_x_ids_batch = current_x_ids_batch[:, prompt_len:]
136
+ seq_ids = current_x_ids_batch[0].tolist()
137
+ eos_token_id = tk.eos_token_id # Get EOS token ID
138
+
139
+ # Stage 1: Build initial list of tuples with (token_str, label, token_id_int)
140
+ # This helps in identifying EOS tokens later without re-checking the type.
141
+ intermediate_tuples = []
142
+ for j, token_id_int in enumerate(seq_ids):
143
+ try:
144
+ token_str = tk.decode([token_id_int], skip_special_tokens=True, clean_up_tokenization_spaces=False)
145
+ except Exception: # Handle cases where a token ID might be problematic (e.g. with mock)
146
+ token_str = f"[ID:{token_id_int}]"
147
+
148
+ label = "ERROR"
149
+ if token_id_int == current_mask_id:
150
+ token_str = "[MASK]"
151
+ label = "MASK"
152
+ else:
153
+ label = "GEN"
154
+ intermediate_tuples.append((token_str, label, token_id_int))
155
+
156
+ return intermediate_tuples
157
+
158
+ @torch.no_grad()
159
+ def generate_viz_wrapper_t2i(prompt_text, steps, guidance_scale, mask_schedule="cosine"):
160
+ global MODEL, TOKENIZER, MASK_ID, DEVICE, uni_prompting
161
+
162
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
163
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
164
+ return
165
+ steps = int(steps)
166
+ guidance_scale = float(guidance_scale)
167
+
168
+ image_tokens = torch.ones((1, 1024), dtype=torch.long, device=DEVICE) * MASK_ID
169
+ prompt_text = [prompt_text]
170
+ input_ids, attention_mask = uni_prompting((prompt_text, image_tokens), 't2i_gen')
171
+
172
+ if guidance_scale > 0:
173
+ uncond_input_ids, uncond_attention_mask = uni_prompting(([''], image_tokens), 't2i_gen')
174
+ else:
175
+ uncond_input_ids, uncond_attention_mask = None, None
176
+
177
+ mask_schedule = get_mask_schedule(mask_schedule)
178
+ blank_image = Image.new("RGB", (512, 512), (255, 255, 255))
179
+ yield blank_image, "Starting generation..."
180
+ for image_step, status_msg_step in MODEL.t2i_generate_decoding_stepwise(
181
+ input_ids = input_ids,
182
+ uncond_input_ids = uncond_input_ids,
183
+ attention_mask = attention_mask,
184
+ uncond_attention_mask = uncond_attention_mask,
185
+ temperature=1.0,
186
+ timesteps = steps,
187
+ guidance_scale = guidance_scale,
188
+ noise_schedule = mask_schedule,
189
+ noise_type = "mask",
190
+ seq_len = 1024,
191
+ vq_model = VQ_MODEL,
192
+ uni_prompting=uni_prompting):
193
+ yield image_step, status_msg_step
194
+
195
+
196
+
197
+
198
+ @torch.no_grad()
199
+ def generate_viz_wrapper_lm(prompt_text, steps, gen_length, block_length, temperature,
200
+ cfg_scale, remasking_strategy, thinking_mode_lm):
201
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
202
+ print(f"thinking_mode_lm: {thinking_mode_lm}")
203
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
204
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
205
+ return
206
+
207
+ steps = int(steps)
208
+ gen_length = int(gen_length)
209
+ block_length = int(block_length)
210
+
211
+ if thinking_mode_lm:
212
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
213
+
214
+ try:
215
+ m = [{"role": "user", "content": prompt_text}]
216
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
217
+ except Exception as e:
218
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
219
+ processed_prompt_text = prompt_text
220
+ try:
221
+ if TOKENIZER.pad_token_id is None:
222
+ if TOKENIZER.eos_token_id is not None:
223
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
224
+ else: # Should have been caught by load_model, but double check
225
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
226
+ return
227
+
228
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
229
+ raw_prompt_attention_mask = None
230
+
231
+ except Exception as e:
232
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
233
+ return
234
+
235
+
236
+
237
+ batch_size = input_ids.shape[0]
238
+ prompt_len = input_ids.shape[1]
239
+
240
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
241
+ x[:, :prompt_len] = input_ids.clone()
242
+
243
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
244
+
245
+ if gen_length == 0:
246
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
247
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
248
+ return
249
+
250
+ if block_length <= 0 or gen_length % block_length != 0 :
251
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
252
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
253
+ return
254
+ num_blocks = gen_length // block_length
255
+
256
+ if steps <=0 or steps % num_blocks != 0:
257
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
258
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
259
+ return
260
+ steps_per_block = steps // num_blocks
261
+
262
+ for num_block_iter in range(num_blocks):
263
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
264
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
265
+
266
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
267
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
268
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
269
+
270
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
271
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
272
+ steps_per_block
273
+ )
274
+
275
+ for i_step_in_block in range(steps_per_block):
276
+ mask_index_global = (x == MASK_ID)
277
+
278
+ if cfg_scale > 0.:
279
+ un_x = x.clone()
280
+ # For unconditional pass, mask out the original prompt tokens that are not padding
281
+ # raw_prompt_attention_mask is (B, prompt_len)
282
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
283
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
284
+
285
+ x_cfg_input = torch.cat([x, un_x], dim=0)
286
+ # Pass attention_mask for CFG if model expects it, covering both parts
287
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
288
+ model_output = MODEL(x_cfg_input)
289
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
290
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
291
+ else:
292
+ # Not passing explicit attention_mask here; relies on model's internal handling.
293
+ model_output = MODEL(x)
294
+ logits = model_output.logits
295
+
296
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
297
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
298
+
299
+ if remasking_strategy == 'low_confidence':
300
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
301
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
302
+ elif remasking_strategy == 'random':
303
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
304
+ else:
305
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
306
+ return
307
+
308
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
309
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
310
+ confidence_for_selection = torch.where(
311
+ candidate_positions_for_unmasking,
312
+ x0_probs,
313
+ -torch.inf
314
+ )
315
+
316
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
317
+
318
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
319
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
320
+
321
+ for j_batch_idx in range(batch_size):
322
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
323
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
324
+
325
+ if k_val > 0:
326
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
327
+ conf_slice = confidence_for_selection[j_batch_idx]
328
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
329
+
330
+ # Check if there are enough valid (non -inf) confidences
331
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
332
+ actual_k = min(k_val, valid_conf_count)
333
+
334
+ if actual_k > 0:
335
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
336
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
337
+
338
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
339
+
340
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
341
+ total_overall_steps = num_blocks * steps_per_block
342
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
343
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
344
+
345
+ final_generated_ids = x[:, prompt_len:]
346
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
347
+
348
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
349
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
350
+
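The divisibility checks above, with the UI defaults, work out as in this small sketch:

    gen_length, block_length, steps = 512, 128, 256
    num_blocks = gen_length // block_length     # 4 blocks of 128 tokens each
    steps_per_block = steps // num_blocks       # 64 denoising steps per block
    assert gen_length % block_length == 0 and steps % num_blocks == 0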
351
+ @torch.no_grad()
352
+ def generate_viz_wrapper(uploaded_image_pil, prompt_text, steps, gen_length, block_length, temperature,
353
+ cfg_scale, remasking_strategy, thinking_mode_mmu):
354
+ global MODEL, TOKENIZER, MASK_ID, DEVICE
355
+
356
+ if MODEL is None or TOKENIZER is None or MASK_ID is None:
357
+ yield [("Error: Model not loaded. Please load the model first.", "ERROR")], "Model not loaded."
358
+ return
359
+
360
+ steps = int(steps)
361
+ gen_length = int(gen_length)
362
+ block_length = int(block_length)
363
+
364
+ if thinking_mode_mmu:
365
+ prompt_text = "You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think> </think> tags, i.e. <think> reasoning process here </think> answer here\n" + prompt_text
366
+
367
+ try:
368
+ m = [{"role": "user", "content": prompt_text}]
369
+ processed_prompt_text = TOKENIZER.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
370
+ except Exception as e:
371
+ yield [("Error applying chat template.", "ERROR")], f"Chat template error: {e}"
372
+ processed_prompt_text = prompt_text
373
+
374
+ image_vq_ids_tensor = None
375
+ if uploaded_image_pil is not None:
376
+ try:
377
+
378
+ image = image_transform(uploaded_image_pil, resolution=512).to(DEVICE)
379
+ image = image.unsqueeze(0)
380
+ image_vq_ids_tensor = VQ_MODEL.get_code(image) + 126349
381
+ except Exception as e:
382
+ yield [("Error processing image.", "ERROR")], f"Image to VQ tokens conversion failed: {str(e)}"
383
+ return
384
+
385
+
386
+ try:
387
+ if TOKENIZER.pad_token_id is None:
388
+ if TOKENIZER.eos_token_id is not None:
389
+ TOKENIZER.pad_token_id = TOKENIZER.eos_token_id
390
+ else:
391
+ yield [("Tokenizer Error", "ERROR")], "pad_token_id is not set in tokenizer."
392
+ return
393
+
394
+ input_ids = TOKENIZER(text=processed_prompt_text, return_tensors="pt", padding="longest", padding_side="left", truncation=True, max_length=MODEL.config.max_position_embeddings if hasattr(MODEL.config, 'max_position_embeddings') else 2048)['input_ids'].to(DEVICE)
395
+ raw_prompt_attention_mask = None
396
+ if image_vq_ids_tensor is not None:
397
+ if image_vq_ids_tensor.ndim == 1:
398
+ image_vq_ids_tensor = image_vq_ids_tensor.unsqueeze(0)
399
+
400
+ input_ids = torch.cat([
401
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126089])).to(DEVICE),
402
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126084])).to(DEVICE),
403
+ image_vq_ids_tensor,
404
+ (torch.ones(input_ids.shape[0], 1) * torch.tensor([126085])).to(DEVICE),
405
+ input_ids
406
+ ], dim=1).long()
407
+
408
+ else:
409
+ input_ids = input_ids
410
+
411
+
412
+ except Exception as e:
413
+ yield [("Error tokenizing prompt.", "ERROR")], f"Tokenization error: {e}"
414
+ return
415
+
416
+
417
+
418
+ batch_size = input_ids.shape[0]
419
+ prompt_len = input_ids.shape[1]
420
+
421
+ x = torch.full((batch_size, prompt_len + gen_length), MASK_ID, dtype=torch.long, device=DEVICE)
422
+ x[:, :prompt_len] = input_ids.clone()
423
+
424
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), "Starting generation: Prompt + Initial Masks"
425
+
426
+ if gen_length == 0:
427
+ final_text_output = TOKENIZER.batch_decode(x[:,prompt_len:], skip_special_tokens=True)
428
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_output[0] if final_text_output else ""
429
+ return
430
+
431
+ if block_length <= 0 or gen_length % block_length != 0 :
432
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
433
+ f"Error: gen_length ({gen_length}) must be divisible by block_length ({block_length}) and block_length > 0."
434
+ return
435
+ num_blocks = gen_length // block_length
436
+
437
+ if steps <=0 or steps % num_blocks != 0:
438
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), \
439
+ f"Error: steps ({steps}) must be positive and divisible by num_blocks ({num_blocks}). Steps: {steps}, Num Blocks: {num_blocks}"
440
+ return
441
+ steps_per_block = steps // num_blocks
442
+
443
+ for num_block_iter in range(num_blocks):
444
+ current_block_start_idx_in_x = prompt_len + num_block_iter * block_length
445
+ current_block_end_idx_in_x = prompt_len + (num_block_iter + 1) * block_length
446
+
447
+ block_masks_bool_current = torch.zeros_like(x, dtype=torch.bool)
448
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x] = \
449
+ (x[:, current_block_start_idx_in_x:current_block_end_idx_in_x] == MASK_ID)
450
+
451
+ num_transfer_tokens_for_this_block = get_num_transfer_tokens(
452
+ block_masks_bool_current[:, current_block_start_idx_in_x:current_block_end_idx_in_x],
453
+ steps_per_block
454
+ )
455
+
456
+ for i_step_in_block in range(steps_per_block):
457
+ mask_index_global = (x == MASK_ID)
458
+
459
+ if cfg_scale > 0.:
460
+ un_x = x.clone()
461
+ # For unconditional pass, mask out the original prompt tokens that are not padding
462
+ # raw_prompt_attention_mask is (B, prompt_len)
463
+ prompt_active_tokens_mask = raw_prompt_attention_mask.bool() # True where actual prompt tokens are
464
+ un_x[:, :prompt_len][prompt_active_tokens_mask] = MASK_ID
465
+
466
+ x_cfg_input = torch.cat([x, un_x], dim=0)
467
+ # Pass attention_mask for CFG if model expects it, covering both parts
468
+ # For simplicity, not passing explicit attention_mask here; relies on model's internal handling.
469
+ model_output = MODEL(x_cfg_input)
470
+ logits_cond, logits_uncond = torch.chunk(model_output.logits, 2, dim=0)
471
+ logits = logits_uncond + (cfg_scale + 1) * (logits_cond - logits_uncond)
472
+ else:
473
+ # Not passing explicit attention_mask here; relies on model's internal handling.
474
+ model_output = MODEL(x)
475
+ logits = model_output.logits
476
+
477
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
478
+ x0_predicted_tokens = torch.argmax(logits_with_noise, dim=-1)
479
+
480
+ if remasking_strategy == 'low_confidence':
481
+ probs = F.softmax(logits.to(torch.float64), dim=-1)
482
+ x0_probs = torch.gather(probs, dim=-1, index=x0_predicted_tokens.unsqueeze(-1)).squeeze(-1)
483
+ elif remasking_strategy == 'random':
484
+ x0_probs = torch.rand(x.shape, device=x.device, dtype=torch.float64)
485
+ else:
486
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), f"Error: Unknown remasking strategy '{remasking_strategy}'"
487
+ return
488
+
489
+ confidence_for_selection = torch.full_like(x0_probs, -torch.inf)
490
+ candidate_positions_for_unmasking = mask_index_global & block_masks_bool_current
491
+ confidence_for_selection = torch.where(
492
+ candidate_positions_for_unmasking,
493
+ x0_probs,
494
+ -torch.inf
495
+ )
496
+
497
+ x0_final_candidates = torch.where(mask_index_global, x0_predicted_tokens, x)
498
+
499
+ transfer_indices_bool = torch.zeros_like(x, dtype=torch.bool)
500
+ num_to_transfer_this_step_batch = num_transfer_tokens_for_this_block[:, i_step_in_block]
501
+
502
+ for j_batch_idx in range(batch_size):
503
+ k_val = min(num_to_transfer_this_step_batch[j_batch_idx].item(),
504
+ candidate_positions_for_unmasking[j_batch_idx].sum().item()) # ensure k isn't too large
505
+
506
+ if k_val > 0:
507
+ # Ensure confidence_for_selection[j_batch_idx] is 1D for topk
508
+ conf_slice = confidence_for_selection[j_batch_idx]
509
+ if conf_slice.ndim > 1: conf_slice = conf_slice.view(-1) # Should already be 1D from x0_probs
510
+
511
+ # Check if there are enough valid (non -inf) confidences
512
+ valid_conf_count = (conf_slice > -torch.inf).sum().item()
513
+ actual_k = min(k_val, valid_conf_count)
514
+
515
+ if actual_k > 0:
516
+ _, topk_indices_in_x = torch.topk(conf_slice, k=actual_k)
517
+ transfer_indices_bool[j_batch_idx, topk_indices_in_x] = True
518
+
519
+ x[transfer_indices_bool] = x0_final_candidates[transfer_indices_bool]
520
+
521
+ current_total_step = num_block_iter * steps_per_block + i_step_in_block + 1
522
+ total_overall_steps = num_blocks * steps_per_block
523
+ status_msg = f"Block {num_block_iter+1}/{num_blocks}, Step {i_step_in_block+1}/{steps_per_block} (Total: {current_total_step}/{total_overall_steps})"
524
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), status_msg
525
+
526
+ final_generated_ids = x[:, prompt_len:]
527
+ final_text_output = TOKENIZER.batch_decode(final_generated_ids, skip_special_tokens=True)
528
+
529
+ final_text_str = final_text_output[0] if final_text_output and len(final_text_output) > 0 else ""
530
+ yield get_highlighted_text_tuples(x, input_ids, prompt_len, TOKENIZER, MASK_ID, raw_prompt_attention_mask), final_text_str
531
+
532
+
533
+ css_styles = """
534
+ .gradio-container{font-family:'IBM Plex Sans',sans-serif;margin:auto;}
535
+ .gr-input {background:#f9f9f9 !important;border:1px solid #e0e0e0 !important;}
536
+ .gr-output{background:#f0f0f0 !important;border:1px solid #d0d0d0 !important;}
537
+
538
+ .highlighted-text span{
539
+ padding:2px 4px;border-radius:4px;margin:1px 2px;display:inline-block;line-height:1.6;
540
+ }
541
+
542
+ footer{display:none !important}
543
+
544
+ #live-update-scrollable-box {
545
+ max-height: 800px; /* Adjust this maximum height as needed, e.g. '300px', '50vh', etc. */
546
+ overflow-y: auto !important; /* Show a vertical scrollbar when content exceeds max-height */
547
+ display: block; /* Ensure the element is block-level so that max-height takes effect */
548
+
549
+ }
550
+ #think_btn {
551
+ background-color: #f3f4f6 !important;
552
+ border: 1px solid #d0d0d0 !important;
553
+ color: #111827 !important;
554
+ font-size: 16px !important;
555
+ font-weight: bold !important;
556
+ }
557
+ #think_btn:hover {
558
+ background-color: #e0e0e0 !important;
559
+ border: 1px solid #c0c0c0 !important;
560
+ color: #222 !important;
561
+ }
562
+ #think_btn:active {
563
+ background-color: #2563eb !important;
564
+ border: 1px solid #b0b0b0 !important;
565
+ color: white !important;
566
+ }
567
+ """
568
+
569
+
570
+ # thinking_mode_t2i = gr.State(False)
571
+ def toggle_thinking_mode_lm(current_thinking_mode):
572
+ # print(f"current_thinking_mode: {current_thinking_mode}")
573
+ new_state = not current_thinking_mode
574
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
575
+ return new_state, gr.update(value=new_label)
576
+
577
+ def toggle_thinking_mode_mmu(current_thinking_mode):
578
+ new_state = not current_thinking_mode
579
+ new_label = "Thinking Mode ✅" if new_state else "Thinking Mode ❌"
580
+ return new_state, gr.update(value=new_label)
581
+
582
+
583
+ color_map_config = {
584
+ "MASK": "lightgrey",
585
+ "GEN": "#DCABFA",
586
+ }
587
+
588
+ theme = gr.themes.Ocean(
589
+ primary_hue="fuchsia",
590
+ )
591
+ with gr.Blocks(css=css_styles, theme=theme) as demo:
592
+ # with gr.Blocks(css=css_styles, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
593
+ # with gr.Blocks() as demo:
594
+ thinking_mode_lm = gr.State(False)
595
+ thinking_mode_mmu = gr.State(False)
596
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>MMaDA </h1>")
597
+ gr.Markdown("Interactively explore the step-by-step generation process of a diffusion language model. "
598
+ "The model begins with a fully masked sequence (except for the prompt) and progressively refines it by unmasking tokens.")
599
+ gr.Markdown("### Select Model")
600
+ with gr.Row():
601
+ model_select_radio = gr.Radio(
602
+ label="Select Text Generation Model",
603
+ choices=MODEL_CHOICES,
604
+ value=MODEL_CHOICES[0]
605
+ )
606
+ model_load_status_box = gr.Textbox(
607
+ label="Model Load Status",
608
+ interactive=False,
609
+ lines=3,
610
+ max_lines=5
611
+ )
612
+
613
+ gr.Markdown("## Part 1. Text Generation")
614
+ with gr.Row():
615
+ with gr.Column(scale=2):
616
+ prompt_input_box_lm = gr.Textbox(label="Enter your prompt:", lines=3, value="A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?")
617
+ think_button_lm = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
618
+ with gr.Accordion("Generation Parameters", open=True):
619
+ with gr.Row():
620
+ gen_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
621
+ steps_slider_lm = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
622
+ with gr.Row():
623
+ block_length_slider_lm = gr.Slider(minimum=8, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
624
+ remasking_dropdown_lm = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
625
+ with gr.Row():
626
+ cfg_scale_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
627
+ temperature_slider_lm = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
628
+
629
+
630
+ with gr.Row():
631
+ run_button_ui_lm = gr.Button("Generate Sequence", variant="primary", scale=3)
632
+ clear_button_ui_lm = gr.Button("Clear Outputs", scale=1)
633
+
634
+ with gr.Column(scale=3):
635
+ # gr.Markdown("## Live Generation Process")
636
+ output_visualization_box_lm = gr.HighlightedText(
637
+ label="Live Generation Process",
638
+ show_legend=True,
639
+ color_map=color_map_config,
640
+ combine_adjacent=False,
641
+ interactive=False,
642
+ elem_id="live-update-scrollable-box",
643
+ )
644
+ # gr.Markdown("## Final Generated Text")
645
+ output_final_text_box_lm = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
646
+
647
+
648
+
649
+ gr.Examples(
650
+ examples=[
651
+ ["A rectangular prism has a length of 5 units, a width of 4 units, and a height of 3 units. What is the volume of the prism?", 256, 512, 128, 1, 0, "low_confidence"],
652
+ ["Lily can run 12 kilometers per hour for 4 hours. After that, she can run 6 kilometers per hour. How many kilometers can she run in 8 hours?", 256, 512, 64, 1, 0, "low_confidence"]
653
+ ],
654
+ inputs=[prompt_input_box_lm, steps_slider_lm, gen_length_slider_lm, block_length_slider_lm, temperature_slider_lm, cfg_scale_slider_lm, remasking_dropdown_lm],
655
+ outputs=[output_visualization_box_lm, output_final_text_box_lm],
656
+ fn=generate_viz_wrapper_lm,
657
+ )
658
+
659
+ gr.Markdown("---")
660
+ gr.Markdown("## Part 2. Multimodal Understanding")
661
+ with gr.Row():
662
+ with gr.Column(scale=2):
663
+ prompt_input_box_mmu = gr.Textbox(
664
+ label="Enter your prompt:",
665
+ lines=3,
666
+ value="Please describe this image in detail."
667
+ )
668
+ think_button_mmu = gr.Button("🧠 Enable Thinking Mode", elem_id="think_btn")
669
+ with gr.Accordion("Generation Parameters", open=True):
670
+ with gr.Row():
671
+ gen_length_slider_mmu = gr.Slider(minimum=64, maximum=1024, value=512, step=64, label="Generation Length", info="Number of tokens to generate.")
672
+ steps_slider_mmu = gr.Slider(minimum=1, maximum=512, value=256, step=32, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
673
+ with gr.Row():
674
+ block_length_slider_mmu = gr.Slider(minimum=32, maximum=1024, value=128, step=32, label="Block Length", info="gen_length must be divisible by this.")
675
+ remasking_dropdown_mmu = gr.Dropdown(choices=['low_confidence', 'random'], value='low_confidence', label="Remasking Strategy")
676
+ with gr.Row():
677
+ cfg_scale_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="CFG Scale", info="Classifier-Free Guidance. 0 disables it.")
678
+ temperature_slider_mmu = gr.Slider(minimum=0.0, maximum=2.0, value=1, step=0.05, label="Temperature", info="Controls randomness via Gumbel noise. 0 is deterministic.")
679
+
680
+ with gr.Row():
681
+ image_upload_box = gr.Image(type="pil", label="Upload Image")
682
+
683
+ with gr.Row():
684
+ run_button_ui_mmu = gr.Button("Generate Description", variant="primary", scale=3)
685
+ clear_button_ui_mmu = gr.Button("Clear Outputs", scale=1)
686
+
687
+ with gr.Column(scale=3):
688
+ gr.Markdown("## Live Generation Process")
689
+ output_visualization_box_mmu = gr.HighlightedText(
690
+ label="Token Sequence (Live Update)",
691
+ show_legend=True,
692
+ color_map=color_map_config,
693
+ combine_adjacent=False,
694
+ interactive=False,
695
+ elem_id="live-update-scrollable-box",
696
+ )
697
+ gr.Markdown("## Final Generated Text")
698
+ output_final_text_box_mmu = gr.Textbox(label="Final Output", lines=8, interactive=False, show_copy_button=True)
699
+
700
+
701
+ gr.Examples(
702
+ examples=[
703
+ [
704
+ "mmu_validation_2/sunflower.jpg",
705
+ "Please describe this image in detail.",
706
+ 256,
707
+ 512,
708
+ 128,
709
+ 1,
710
+ 0,
711
+ "low_confidence"
712
+ ],
713
+ [
714
+ "mmu_validation_2/woman.jpg",
715
+ "Please describe this image in detail.",
716
+ 256,
717
+ 512,
718
+ 128,
719
+ 1,
720
+ 0,
721
+ "low_confidence"
722
+ ]
723
+ ],
724
+ inputs=[
725
+ image_upload_box,
726
+ prompt_input_box_mmu,
727
+ steps_slider_mmu,
728
+ gen_length_slider_mmu,
729
+ block_length_slider_mmu,
730
+ temperature_slider_mmu,
731
+ cfg_scale_slider_mmu,
732
+ remasking_dropdown_mmu
733
+ ],
734
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu],
735
+ fn=generate_viz_wrapper,
736
+ )
737
+
738
+ gr.Markdown("---")
739
+ gr.Markdown("## Part 3. Text-to-Image Generation")
740
+ with gr.Row():
741
+ with gr.Column(scale=2):
742
+ prompt_input_box_t2i = gr.Textbox(label="Enter your prompt:", lines=3, value="A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.")
743
+
744
+ with gr.Accordion("Generation Parameters", open=True):
745
+ with gr.Row():
746
+ steps_slider_t2i = gr.Slider(minimum=5, maximum=100, value=15, step=5, label="Total Sampling Steps", info="Must be divisible by (gen_length / block_length).")
747
+ guidance_scale_slider_t2i = gr.Slider(minimum=0.0, maximum=7.0, value=3.5, step=0.5, label="Guidance Scale", info="Classifier-Free Guidance. 0 disables it.")
748
+
749
+
750
+ with gr.Row():
751
+ scheduler_radio_t2i = gr.Radio(
752
+ choices=["cosine", "sigmoid", "linear"],
753
+ value="cosine",
754
+ label="Scheduler",
755
+ )
756
+
757
+ with gr.Row():
758
+ run_button_ui_t2i = gr.Button("Generate Image", variant="primary", scale=3)
759
+ clear_button_ui_t2i = gr.Button("Clear Outputs", scale=1)
760
+
761
+
762
+ with gr.Column(scale=3):
763
+ # gr.Markdown("## Live Generation Process")
764
+ output_image_t2i = gr.Image(label="Generated Image", interactive=False, type="pil")
765
+ output_status_t2i = gr.Textbox(label="Generation Status", interactive=False)
766
+
767
+ gr.Examples(
768
+ examples=[
769
+ ["A sea turtle swimming near a coral reef in the ocean, with a clear blue sky and water in the background.", 15, 3.5, "cosine"],
770
+ ["A beautiful sunset over a calm ocean, with a few clouds in the sky.", 15, 3.5, "cosine"]
771
+ ],
772
+ inputs=[prompt_input_box_t2i, steps_slider_t2i, guidance_scale_slider_t2i, scheduler_radio_t2i],
773
+ outputs=[output_image_t2i, output_status_t2i],
774
+ fn=generate_viz_wrapper_t2i,
775
+ )
776
+
777
+ run_button_ui_t2i.click(
778
+ fn=generate_viz_wrapper_t2i,
779
+ inputs=[
780
+ prompt_input_box_t2i,
781
+ steps_slider_t2i,
782
+ guidance_scale_slider_t2i,
783
+ scheduler_radio_t2i
784
+ ],
785
+ outputs=[output_image_t2i, output_status_t2i]
786
+ )
787
+
788
+ clear_button_ui_t2i.click(
789
+ fn=lambda: (None, ""),
790
+ inputs=None,
791
+ outputs=[output_image_t2i, output_status_t2i],
792
+ queue=False
793
+ )
794
+
795
+ think_button_lm.click(
796
+ fn=toggle_thinking_mode_lm,
797
+ inputs=[thinking_mode_lm],
798
+ outputs=[thinking_mode_lm, think_button_lm]
799
+ )
800
+
801
+ think_button_mmu.click(
802
+ fn=toggle_thinking_mode_mmu,
803
+ inputs=[thinking_mode_mmu],
804
+ outputs=[thinking_mode_mmu, think_button_mmu]
805
+ )
806
+
807
+
808
+
809
+ def initialize_default_model():
810
+ default_model = "MMaDA-8B-Base"
811
+ result = handle_model_selection_change(default_model)
812
+ return default_model, result
813
+
814
+ demo.load(
815
+ fn=initialize_default_model,
816
+ inputs=None,
817
+ outputs=[model_select_radio, model_load_status_box],
818
+ queue=True
819
+ )
820
+
821
+ def clear_outputs():
822
+ return None, None, None # Clear image, visualization, and final text
823
+
824
+ clear_button_ui_lm.click(
825
+ fn=clear_outputs,
826
+ inputs=None,
827
+ outputs=[image_upload_box, output_visualization_box_lm, output_final_text_box_lm],
828
+ queue=False
829
+ )
830
+ clear_button_ui_mmu.click(
831
+ fn=clear_outputs,
832
+ inputs=None,
833
+ outputs=[image_upload_box, output_visualization_box_mmu, output_final_text_box_mmu],
834
+ queue=False
835
+ )
836
+
837
+ run_button_ui_lm.click(
838
+ fn=generate_viz_wrapper_lm,
839
+ inputs=[
840
+ prompt_input_box_lm,
841
+ steps_slider_lm,
842
+ gen_length_slider_lm,
843
+ block_length_slider_lm,
844
+ temperature_slider_lm,
845
+ cfg_scale_slider_lm,
846
+ remasking_dropdown_lm,
847
+ thinking_mode_lm
848
+ ],
849
+ outputs=[output_visualization_box_lm, output_final_text_box_lm]
850
+ )
851
+
852
+ run_button_ui_mmu.click(
853
+ fn=generate_viz_wrapper,
854
+ inputs=[
855
+ image_upload_box,
856
+ prompt_input_box_mmu,
857
+ steps_slider_mmu,
858
+ gen_length_slider_mmu,
859
+ block_length_slider_mmu,
860
+ temperature_slider_mmu,
861
+ cfg_scale_slider_mmu,
862
+ remasking_dropdown_mmu,
863
+ thinking_mode_mmu
864
+ ],
865
+ outputs=[output_visualization_box_mmu, output_final_text_box_mmu]
866
+ )
867
+
868
+
869
+ if __name__ == "__main__":
870
+ print(f"Starting Gradio App. Attempting to use device: {DEVICE}")
871
+ demo.launch(share=True)
models/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .modeling_magvitv2 import VQGANEncoder, VQGANDecoder, LFQuantizer, MAGVITv2
2
+ from .sampling import *
3
+ from .modeling_mmada import MMadaModelLM, MMadaConfig
models/common_modules.py ADDED
@@ -0,0 +1,357 @@
1
+ """
2
+ Modified from https://github.com/CompVis/taming-transformers/blob/master/taming/modules/diffusionmodules/model.py#L34
3
+ """
4
+
5
+ import math
6
+ from typing import Tuple, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange, repeat
13
+ from einops.layers.torch import Rearrange
14
+
15
+
16
+ def nonlinearity(x):
17
+ # swish
18
+ return x * torch.sigmoid(x)
19
+
20
+
21
+ def Normalize(in_channels):
22
+ return torch.nn.GroupNorm(
23
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
24
+ )
25
+
26
+
27
+ class Upsample(nn.Module):
28
+ def __init__(self, in_channels, with_conv):
29
+ super().__init__()
30
+ self.with_conv = with_conv
31
+ if self.with_conv:
32
+ self.conv = torch.nn.Conv2d(
33
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
34
+ )
35
+
36
+ def forward(self, x):
37
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
38
+ if self.with_conv:
39
+ x = self.conv(x)
40
+ return x
41
+
42
+
43
+ class DepthToSpaceUpsample(nn.Module):
44
+ def __init__(
45
+ self,
46
+ in_channels,
47
+ ):
48
+ super().__init__()
49
+ conv = nn.Conv2d(in_channels, in_channels * 4, 1)
50
+
51
+ self.net = nn.Sequential(
52
+ conv,
53
+ nn.SiLU(),
54
+ Rearrange("b (c p1 p2) h w -> b c (h p1) (w p2)", p1=2, p2=2),
55
+ )
56
+
57
+ self.init_conv_(conv)
58
+
59
+ def init_conv_(self, conv):
60
+ o, i, h, w = conv.weight.shape
61
+ conv_weight = torch.empty(o // 4, i, h, w)
62
+ nn.init.kaiming_uniform_(conv_weight)
63
+ conv_weight = repeat(conv_weight, "o ... -> (o 4) ...")
64
+
65
+ conv.weight.data.copy_(conv_weight)
66
+ nn.init.zeros_(conv.bias.data)
67
+
68
+ def forward(self, x):
69
+ out = self.net(x)
70
+ return out
71
+
72
+
73
+ class Downsample(nn.Module):
74
+ def __init__(self, in_channels, with_conv):
75
+ super().__init__()
76
+ self.with_conv = with_conv
77
+ if self.with_conv:
78
+ # no asymmetric padding in torch conv, must do it ourselves
79
+ self.conv = torch.nn.Conv2d(
80
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
81
+ )
82
+
83
+ def forward(self, x):
84
+ if self.with_conv:
85
+ pad = (0, 1, 0, 1)
86
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
87
+ x = self.conv(x)
88
+ else:
89
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
90
+ return x
91
+
92
+
93
+ def unpack_time(t, batch):
94
+ _, c, w, h = t.size()
95
+ out = torch.reshape(t, [batch, -1, c, w, h])
96
+ out = rearrange(out, "b t c h w -> b c t h w")
97
+ return out
98
+
99
+
100
+ def pack_time(t):
101
+ out = rearrange(t, "b c t h w -> b t c h w")
102
+ _, _, c, w, h = out.size()
103
+ return torch.reshape(out, [-1, c, w, h])
104
+
105
+
106
+ class TimeDownsample2x(nn.Module):
107
+ def __init__(
108
+ self,
109
+ dim,
110
+ dim_out=None,
111
+ kernel_size=3,
112
+ ):
113
+ super().__init__()
114
+ if dim_out is None:
115
+ dim_out = dim
116
+ self.time_causal_padding = (kernel_size - 1, 0)
117
+ self.conv = nn.Conv1d(dim, dim_out, kernel_size, stride=2)
118
+
119
+ def forward(self, x):
120
+ x = rearrange(x, "b c t h w -> b h w c t")
121
+ b, h, w, c, t = x.size()
122
+ x = torch.reshape(x, [-1, c, t])
123
+
124
+ x = F.pad(x, self.time_causal_padding)
125
+ out = self.conv(x)
126
+
127
+ out = torch.reshape(out, [b, h, w, c, t])
128
+ out = rearrange(out, "b h w c t -> b c t h w")
130
+ return out
131
+
132
+
133
+ class TimeUpsample2x(nn.Module):
134
+ def __init__(self, dim, dim_out=None):
135
+ super().__init__()
136
+ if dim_out is None:
137
+ dim_out = dim
138
+ conv = nn.Conv1d(dim, dim_out * 2, 1)
139
+
140
+ self.net = nn.Sequential(
141
+ nn.SiLU(), conv, Rearrange("b (c p) t -> b c (t p)", p=2)
142
+ )
143
+
144
+ self.init_conv_(conv)
145
+
146
+ def init_conv_(self, conv):
147
+ o, i, t = conv.weight.shape
148
+ conv_weight = torch.empty(o // 2, i, t)
149
+ nn.init.kaiming_uniform_(conv_weight)
150
+ conv_weight = repeat(conv_weight, "o ... -> (o 2) ...")
151
+
152
+ conv.weight.data.copy_(conv_weight)
153
+ nn.init.zeros_(conv.bias.data)
154
+
155
+ def forward(self, x):
156
+ x = rearrange(x, "b c t h w -> b h w c t")
157
+ b, h, w, c, t = x.size()
158
+ x = torch.reshape(x, [-1, c, t])
159
+
160
+ out = self.net(x)
161
+ out = out[:, :, 1:].contiguous()
162
+
163
+ out = torch.reshape(out, [b, h, w, c, t])
164
+ out = rearrange(out, "b h w c t -> b c t h w")
165
+ return out
166
+
167
+
168
+ class AttnBlock(nn.Module):
169
+ def __init__(self, in_channels):
170
+ super().__init__()
171
+ self.in_channels = in_channels
172
+
173
+ self.norm = Normalize(in_channels)
174
+ self.q = torch.nn.Conv2d(
175
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
176
+ )
177
+ self.k = torch.nn.Conv2d(
178
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
179
+ )
180
+ self.v = torch.nn.Conv2d(
181
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
182
+ )
183
+ self.proj_out = torch.nn.Conv2d(
184
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
185
+ )
186
+
187
+ def forward(self, x):
188
+ h_ = x
189
+ h_ = self.norm(h_)
190
+ q = self.q(h_)
191
+ k = self.k(h_)
192
+ v = self.v(h_)
193
+
194
+ # compute attention
195
+ b, c, h, w = q.shape
196
+ q = q.reshape(b, c, h * w)
197
+ q = q.permute(0, 2, 1) # b,hw,c
198
+ k = k.reshape(b, c, h * w) # b,c,hw
199
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
200
+ w_ = w_ * (int(c) ** (-0.5))
201
+ w_ = torch.nn.functional.softmax(w_, dim=2)
202
+
203
+ # attend to values
204
+ v = v.reshape(b, c, h * w)
205
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
206
+ h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
207
+ h_ = h_.reshape(b, c, h, w)
208
+
209
+ h_ = self.proj_out(h_)
210
+
211
+ return x + h_
212
+
213
+
214
+ class TimeAttention(AttnBlock):
215
+ def forward(self, x, *args, **kwargs):
216
+ x = rearrange(x, "b c t h w -> b h w t c")
217
+ b, h, w, t, c = x.size()
218
+ x = torch.reshape(x, (-1, t, c))
219
+
220
+ x = super().forward(x, *args, **kwargs)
221
+
222
+ x = torch.reshape(x, [b, h, w, t, c])
223
+ return rearrange(x, "b h w t c -> b c t h w")
224
+
225
+
226
+ class Residual(nn.Module):
227
+ def __init__(self, fn: nn.Module):
228
+ super().__init__()
229
+ self.fn = fn
230
+
231
+ def forward(self, x, **kwargs):
232
+ return self.fn(x, **kwargs) + x
233
+
234
+
235
+ def cast_tuple(t, length=1):
236
+ return t if isinstance(t, tuple) else ((t,) * length)
237
+
238
+
239
+ class CausalConv3d(nn.Module):
240
+ def __init__(
241
+ self,
242
+ chan_in,
243
+ chan_out,
244
+ kernel_size: Union[int, Tuple[int, int, int]],
245
+ pad_mode="constant",
246
+ **kwargs
247
+ ):
248
+ super().__init__()
249
+ kernel_size = cast_tuple(kernel_size, 3)
250
+
251
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
252
+
253
+ dilation = kwargs.pop("dilation", 1)
254
+ stride = kwargs.pop("stride", 1)
255
+
256
+ self.pad_mode = pad_mode
257
+ time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
258
+ height_pad = height_kernel_size // 2
259
+ width_pad = width_kernel_size // 2
260
+
261
+ self.time_pad = time_pad
262
+ self.time_causal_padding = (
263
+ width_pad,
264
+ width_pad,
265
+ height_pad,
266
+ height_pad,
267
+ time_pad,
268
+ 0,
269
+ )
270
+
271
+ stride = (stride, 1, 1)
272
+ dilation = (dilation, 1, 1)
273
+ self.conv = nn.Conv3d(
274
+ chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs
275
+ )
276
+
277
+ def forward(self, x):
278
+ pad_mode = self.pad_mode if self.time_pad < x.shape[2] else "constant"
279
+
280
+ x = F.pad(x, self.time_causal_padding, mode=pad_mode)
281
+ return self.conv(x)
282
+
283
+
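A minimal shape sketch (assuming the imports at the top of this module): with stride 1 the causal padding preserves the temporal length and only pads on the past side, so frame t never sees future frames.

    x = torch.randn(1, 8, 5, 16, 16)          # (B, C, T, H, W)
    conv = CausalConv3d(8, 8, kernel_size=3)
    print(conv(x).shape)                      # torch.Size([1, 8, 5, 16, 16])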
284
+ def ResnetBlockCausal3D(
285
+ dim, kernel_size: Union[int, Tuple[int, int, int]], pad_mode: str = "constant"
286
+ ):
287
+ net = nn.Sequential(
288
+ Normalize(dim),
289
+ nn.SiLU(),
290
+ CausalConv3d(dim, dim, kernel_size, pad_mode),
291
+ Normalize(dim),
292
+ nn.SiLU(),
293
+ CausalConv3d(dim, dim, kernel_size, pad_mode),
294
+ )
295
+ return Residual(net)
296
+
297
+
298
+ class ResnetBlock(nn.Module):
299
+ def __init__(
300
+ self,
301
+ *,
302
+ in_channels,
303
+ out_channels=None,
304
+ conv_shortcut=False,
305
+ dropout,
306
+ temb_channels=512
307
+ ):
308
+ super().__init__()
309
+ self.in_channels = in_channels
310
+ out_channels = in_channels if out_channels is None else out_channels
311
+ self.out_channels = out_channels
312
+ self.use_conv_shortcut = conv_shortcut
313
+
314
+ self.norm1 = Normalize(in_channels)
315
+ self.conv1 = torch.nn.Conv2d(
316
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
317
+ )
318
+ if temb_channels > 0:
319
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
320
+ else:
321
+ self.temb_proj = None
322
+ self.norm2 = Normalize(out_channels)
323
+ self.dropout = torch.nn.Dropout(dropout)
324
+ self.conv2 = torch.nn.Conv2d(
325
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
326
+ )
327
+ if self.in_channels != self.out_channels:
328
+ if self.use_conv_shortcut:
329
+ self.conv_shortcut = torch.nn.Conv2d(
330
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
331
+ )
332
+ else:
333
+ self.nin_shortcut = torch.nn.Conv2d(
334
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
335
+ )
336
+
337
+ def forward(self, x, temb):
338
+ h = x
339
+ h = self.norm1(h)
340
+ h = nonlinearity(h)
341
+ h = self.conv1(h)
342
+
343
+ if temb is not None:
344
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
345
+
346
+ h = self.norm2(h)
347
+ h = nonlinearity(h)
348
+ h = self.dropout(h)
349
+ h = self.conv2(h)
350
+
351
+ if self.in_channels != self.out_channels:
352
+ if self.use_conv_shortcut:
353
+ x = self.conv_shortcut(x)
354
+ else:
355
+ x = self.nin_shortcut(x)
356
+
357
+ return x + h
models/configuration_llada.py ADDED
@@ -0,0 +1,463 @@
1
+ """
2
+ LLaDA configuration
3
+ """
4
+ from transformers import AutoConfig, PretrainedConfig
5
+
6
+ from enum import Enum
7
+ from os import PathLike
8
+ from typing import Union
9
+ from dataclasses import asdict, dataclass, field
10
+ from glob import glob
11
+ from pathlib import Path
12
+ from typing import (
13
+ Any,
14
+ Dict,
15
+ Iterable,
16
+ List,
17
+ Optional,
18
+ Tuple,
19
+ Type,
20
+ TypeVar,
21
+ Union,
22
+ cast,
23
+ )
24
+
25
+
26
+ __all__ = [
27
+ "ActivationType",
28
+ "ActivationCheckpointingStrategy",
29
+ "BlockType",
30
+ "LayerNormType",
31
+ "InitFnType",
32
+ "ModelConfig",
33
+ ]
34
+
35
+ PathOrStr = Union[str, PathLike]
36
+
37
+
38
+ class StrEnum(str, Enum):
39
+ """
40
+ This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
41
+ We include this here for compatibility with older versions of Python.
42
+ """
43
+
44
+ def __str__(self) -> str:
45
+ return self.value
46
+
47
+ def __repr__(self) -> str:
48
+ return f"'{str(self)}'"
49
+
50
+
51
+ class LayerNormType(StrEnum):
52
+ default = "default"
53
+ """
54
+ The default LayerNorm implementation, equivalent to PyTorch's built-in version.
55
+ """
56
+
57
+ low_precision = "low_precision"
58
+ """
59
+ A low-precision version of the default LayerNorm.
60
+ """
61
+
62
+ rms = "rms"
63
+ """
64
+ An RMSNorm implementation. When using ``torch.compile`` this is
65
+ probably the fastest implementation.
66
+ """
67
+
68
+ gemma_rms = "gemma_rms"
69
+ """
70
+ An RMSNorm implementation from Gemma. When using ``torch.compile`` this is
71
+ probably the fastest implementation.
72
+ """
73
+
74
+ amd_compatible = "amd_compatible"
75
+ """
76
+ LayerNorm implemented manually to work around an issue with ROCm.
77
+ """
78
+
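A small sketch of how these enums behave: members compare and serialize as plain strings, which keeps config round-trips simple.

    assert LayerNormType.rms == "rms"
    assert repr(LayerNormType.default) == "'default'"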
79
+
80
+ class ActivationType(StrEnum):
81
+ gelu = "gelu"
82
+ relu = "relu"
83
+ silu = "silu"
84
+ swiglu = "swiglu"
85
+
86
+
87
+ class BlockType(StrEnum):
88
+ sequential = "sequential"
89
+ parallel = "parallel"
90
+
91
+ llama = "llama"
92
+ """
93
+ A block similar to the sequential block with slightly different
94
+ implementations of operations like attention to imitate the behavior of Llama.
95
+ """
96
+
97
+
98
+ class InitFnType(StrEnum):
99
+ mitchell = "mitchell"
100
+ """
101
+ The strategy suggested to us by Mitchell Wortsman from UW.
102
+ This uses a truncated normal distribution with an adaptive standard deviation that depends
103
+ on the size of the weights as well as the depth of the layer.
104
+ """
105
+
106
+ normal = "normal"
107
+ """
108
+ All weights are initialized from the same normal distribution.
109
+ """
110
+
111
+ kaiming_normal = "kaiming_normal"
112
+ """
113
+ All weights are initialized with the Kaiming method from a normal distribution.
114
+ Note this currently won't work with FSDP.
115
+ """
116
+
117
+ fan_in = "fan_in"
118
+ """
119
+ "Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
120
+ is the input dimensionality of the kernel.
121
+ """
122
+
123
+ full_megatron = "full_megatron"
124
+ """
125
+ This is what metaseq calls "full megatron init". It is the init used for Llama 2.
126
+ """
127
+
128
+
129
+ @dataclass
130
+ class ModelConfig():
131
+ """
132
+ LLaDA (model) configuration.
133
+ """
134
+
135
+ # Note that the defaults for these attributes are equivalent to the base GPT2 model.
136
+
137
+ d_model: int = 768
138
+ """
139
+ The hidden size of the model.
140
+ """
141
+
142
+ n_heads: int = 12
143
+ """
144
+ The number of self-attention heads.
145
+ """
146
+
147
+ n_kv_heads: Optional[int] = None
148
+ """
149
+ The number of heads to use for keys and values. Defaults to `n_heads`.
150
+ Set this to ``None`` or ``n_heads`` for normal multi-head attention.
151
+ Set this to 1 for multi-query attention.
152
+ Set it to some in-between value for Llama2-style grouped query attention.
153
+ """
154
+
155
+ n_layers: int = 12
156
+ """
157
+ The number of layers/blocks.
158
+ """
159
+
160
+ mlp_ratio: int = 4
161
+ """
162
+ The ratio of the inner MLP dimensionality to ``d_model``.
163
+ This is only used when ``mlp_hidden_size`` is not set.
164
+ """
165
+
166
+ mlp_hidden_size: Optional[int] = None
167
+ """
168
+ Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
169
+ """
170
+
171
+ activation_type: ActivationType = ActivationType.swiglu
172
+ """
173
+ The activation function to use within the MLP layers.
174
+ """
175
+
176
+ block_type: BlockType = BlockType.sequential
177
+ """
178
+ The transformer block implementation.
179
+ """
180
+
181
+ block_group_size: int = 1
182
+ """
183
+ The number of blocks to group together into a single parent block.
184
+ This has no effect on the number of parameters in the model and is only used to wrap groups
185
+ of blocks together with a single FSDP wrapper during training.
186
+ """
187
+
188
+ alibi: bool = False
189
+ """
190
+ If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
191
+ """
192
+
193
+ alibi_bias_max: float = 8.0
194
+ """
195
+ Maximum absolute value of ALiBi bias.
196
+ """
197
+
198
+ rope: bool = False
199
+ """
200
+ Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
201
+ """
202
+
203
+ rope_full_precision: bool = True
204
+ """
205
+ If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
206
+ apply RoPE at the precision of the input.
207
+ """
208
+
209
+ flash_attention: bool = False
210
+ """
211
+ If ``True``, use ``FlashAttention``.
212
+ """
213
+
214
+ attention_dropout: float = 0.1
215
+ """
216
+ The dropout probability within the attention modules.
217
+ """
218
+
219
+ multi_query_attention: Optional[bool] = None
220
+ """
221
+ Use the Multi-Query formulation of attention used in PaLM. This reduces the number of parameters
222
+ and is more efficient during inference.
223
+ """
224
+
225
+ attention_layer_norm: bool = False
226
+ """
227
+ Apply layer norm to the keys and queries within the attention mechanism.
228
+ This can help stabilize training.
229
+ """
230
+
231
+ residual_dropout: float = 0.1
232
+ """
233
+ The dropout probability for the MLP and attention output within each block.
234
+ """
235
+
236
+ embedding_dropout: float = 0.1
237
+ """
238
+ The dropout probability for embeddings.
239
+ """
240
+
241
+ input_emb_norm: bool = False
242
+ """
243
+ An input hidden_states normalization as used by Gemma.
244
+ """
245
+
246
+ layer_norm_type: LayerNormType = LayerNormType.default
247
+ """
248
+ The layernorm implementation to use.
249
+ """
250
+
251
+ layer_norm_with_affine: bool = True
252
+ """
253
+ Whether to include bias and weight parameters for the layer norms.
254
+ This only affects layer norms that are immediately followed by a linear layer in the forward pass,
255
+ so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
256
+ to ``False``.
257
+ """
258
+
259
+ rms_norm_eps: float = 1e-05
260
+ """
261
+ The rms layernorm eps param.
262
+ """
263
+
264
+ attention_layer_norm_with_affine: bool = True
265
+ """
266
+ Toggle affine transform for the QK norms.
267
+ """
268
+
269
+ max_sequence_length: int = 1024
270
+ """
271
+ The maximum input sequence length supported by the model.
272
+ """
273
+
274
+ rope_theta: float = 10000.0
275
+ """
276
+ The rope base param.
277
+ """
278
+
279
+ include_qkv_bias: Optional[bool] = False
280
+ """
281
+ Whether or not to include bias parameters in qkv linear layers.
282
+ """
283
+
284
+ include_bias: bool = False
285
+ """
286
+ Whether or not to include bias parameters in linear layers.
287
+ In PaLM, they got rid of all bias terms because they found that large
288
+ models tend to have near 0 bias terms anyway.
289
+ """
290
+
291
+ bias_for_layer_norm: Optional[bool] = None
292
+ """
293
+ Whether or not to include bias parameters in layer norm.
294
+ This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
295
+ layer norm.
296
+ When this is None (the default), it inherits the setting from include_bias.
297
+ """
298
+
299
+ scale_logits: bool = False
300
+ """
301
+ If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
302
+ """
303
+
304
+ vocab_size: int = 50257
305
+ """
306
+ Vocabulary size of the model.
307
+ """
308
+
309
+ embedding_size: Optional[int] = 50304
310
+ """
311
+ The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
312
+ to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
313
+ next multiple of 128 that's greater than ``vocab_size`` can improve throughput
314
+ substantially.
315
+ """
316
+
317
+ weight_tying: bool = True
318
+ """
319
+ Whether to tie output linear weights to the input embedding.
320
+ """
321
+
322
+ eos_token_id: int = 50256
323
+ """
324
+ The ID of the end-of-sentence special token.
325
+ """
326
+
327
+ pad_token_id: int = 50256
328
+ """
329
+ The ID of the token to use for padding. Defaults to the ID of the EOS token.
330
+ """
331
+
332
+ mask_token_id: Optional[int] = 50256
333
+ """
334
+ The ID of the token to use for mask token. Defaults to the ID of the EOS token.
335
+ """
336
+
337
+ init_device: Optional[str] = None
338
+ """
339
+ The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
340
+ """
341
+
342
+ init_fn: InitFnType = InitFnType.normal
343
+ """
344
+ The weight initialization strategy.
345
+ """
346
+
347
+ init_std: float = 0.02
348
+ """
349
+ The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
350
+ as "normal".
351
+ """
352
+
353
+ init_cutoff_factor: Optional[float] = None
354
+ """
355
+ A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
356
+ as "normal". Setting this to None means values are not cut off.
357
+ """
358
+
359
+ precision: Optional[str] = None
360
+ """
361
+ Precision used to train/evaluate with. You shouldn't set this directly.
362
+ See :data:`TrainConfig.precision` instead.
363
+ """
364
+
365
+ @property
366
+ def effective_n_kv_heads(self) -> int:
367
+ if self.n_kv_heads is None:
368
+ if self.multi_query_attention is True:
369
+ return 1
370
+ else:
371
+ return self.n_heads
372
+ else:
373
+ if self.multi_query_attention is None:
374
+ return self.n_kv_heads
375
+ if self.multi_query_attention:
376
+ n_kv_heads_should_be = 1
377
+ else:
378
+ n_kv_heads_should_be = self.n_heads
379
+ if self.n_kv_heads == n_kv_heads_should_be:
380
+ return n_kv_heads_should_be
381
+ else:
382
+ raise Exception(
383
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
384
+ )
385
+
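+ # Illustrative sketch (not part of the original file): how ``effective_n_kv_heads`` resolves
+ # the attention variant from the fields above.
+ # ModelConfig(n_heads=12) -> 12 KV heads (standard multi-head attention)
+ # ModelConfig(n_heads=12, multi_query_attention=True) -> 1 KV head (multi-query attention)
+ # ModelConfig(n_heads=12, n_kv_heads=4) -> 4 KV heads (grouped-query attention)
+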
386
+ class ActivationCheckpointingStrategy(StrEnum):
387
+ whole_layer = "whole_layer"
388
+ """
389
+ Checkpoint every transformer layer.
390
+ """
391
+
392
+ one_in_two = "one_in_two"
393
+ """
394
+ Checkpoint one in two transformer layers.
395
+ """
396
+
397
+ one_in_three = "one_in_three"
398
+ """
399
+ Checkpoint one in three transformer layers.
400
+ """
401
+
402
+ one_in_four = "one_in_four"
403
+ """
404
+ Checkpoint one in four transformer layers.
405
+ """
406
+
407
+ two_in_three = "two_in_three"
408
+ """
409
+ Checkpoint two out of every three transformer layers.
410
+ """
411
+
412
+ three_in_four = "three_in_four"
413
+ """
414
+ Checkpoint three out of every four transformer layers.
415
+ """
416
+
417
+ four_in_five = "four_in_five"
418
+ """
419
+ Checkpoint four out of every five transformer layers.
420
+ """
421
+
422
+ nine_in_ten = "nine_in_ten"
423
+ """
424
+ Checkpoint nine out of every ten transformer layers.
425
+ """
426
+
427
+ fine_grained = "fine_grained"
428
+ """
429
+ Focus checkpointing where recomputation is cheap and the memory savings are largest.
430
+ """
431
+
432
+
433
+ class LLaDAConfig(PretrainedConfig):
434
+ model_type = "llada"
435
+ keys_to_ignore_at_inference = ["past_key_values"] # TODO: confirm
436
+
437
+ def __init__(self, use_cache: bool = False, **kwargs):
438
+ model_config = ModelConfig()
439
+ all_kwargs = model_config.__dict__
440
+ all_kwargs.update(kwargs)
441
+ all_kwargs.update({"use_cache": use_cache})
442
+ all_kwargs.update(
443
+ {
444
+ "architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])
445
+ }
446
+ )
447
+ super().__init__(**all_kwargs)
448
+
449
+ @property
450
+ def num_attention_heads(self):
451
+ return self.n_heads
452
+
453
+ @property
454
+ def num_hidden_layers(self):
455
+ return self.n_layers
456
+
457
+ @property
458
+ def hidden_size(self):
459
+ return self.d_model
460
+
461
+
462
+ # Register the config class so that it is available for transformer pipelines, auto-loading etc.
463
+ AutoConfig.register("llada", LLaDAConfig)
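+
+ # Minimal usage sketch (values are illustrative, not part of the original file):
+ # config = LLaDAConfig(n_layers=22, d_model=2048, n_heads=32)
+ # assert config.hidden_size == config.d_model and config.num_hidden_layers == config.n_layers
+ # After the registration above, transformers.AutoConfig can also resolve model_type="llada".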
models/logging.py ADDED
@@ -0,0 +1,338 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Optuna, Hugging Face
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Logging utilities."""
16
+
17
+ import logging
18
+ import os
19
+ import sys
20
+ import threading
21
+ from logging import CRITICAL # NOQA
22
+ from logging import DEBUG # NOQA
23
+ from logging import ERROR # NOQA
24
+ from logging import FATAL # NOQA
25
+ from logging import INFO # NOQA
26
+ from logging import NOTSET # NOQA
27
+ from logging import WARN # NOQA
28
+ from logging import WARNING # NOQA
29
+ from typing import Optional
30
+
31
+ from tqdm import auto as tqdm_lib
32
+
33
+ _lock = threading.Lock()
34
+ _default_handler: Optional[logging.Handler] = None
35
+
36
+ log_levels = {
37
+ "debug": logging.DEBUG,
38
+ "info": logging.INFO,
39
+ "warning": logging.WARNING,
40
+ "error": logging.ERROR,
41
+ "critical": logging.CRITICAL,
42
+ }
43
+
44
+ _default_log_level = logging.WARNING
45
+
46
+ _tqdm_active = True
47
+
48
+
49
+ def _get_default_logging_level():
50
+ """
51
+ If muse_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is
52
+ not - fall back to `_default_log_level`
53
+ """
54
+ env_level_str = os.getenv("muse_VERBOSITY", None)
55
+ if env_level_str:
56
+ if env_level_str in log_levels:
57
+ return log_levels[env_level_str]
58
+ else:
59
+ logging.getLogger().warning(
60
+ f"Unknown option muse_VERBOSITY={env_level_str}, has to be one of: { ', '.join(log_levels.keys()) }"
61
+ )
62
+ return _default_log_level
63
+
64
+
65
+ def _get_library_name() -> str:
66
+ return __name__.split(".")[0]
67
+
68
+
69
+ def _get_library_root_logger() -> logging.Logger:
70
+ return logging.getLogger(_get_library_name())
71
+
72
+
73
+ def _configure_library_root_logger() -> None:
74
+ global _default_handler
75
+
76
+ with _lock:
77
+ if _default_handler:
78
+ # This library has already configured the library root logger.
79
+ return
80
+ _default_handler = logging.StreamHandler() # Set sys.stderr as stream.
81
+ _default_handler.flush = sys.stderr.flush
82
+
83
+ # Apply our default configuration to the library root logger.
84
+ library_root_logger = _get_library_root_logger()
85
+ library_root_logger.addHandler(_default_handler)
86
+ library_root_logger.setLevel(_get_default_logging_level())
87
+ library_root_logger.propagate = False
88
+
89
+
90
+ def _reset_library_root_logger() -> None:
91
+ global _default_handler
92
+
93
+ with _lock:
94
+ if not _default_handler:
95
+ return
96
+
97
+ library_root_logger = _get_library_root_logger()
98
+ library_root_logger.removeHandler(_default_handler)
99
+ library_root_logger.setLevel(logging.NOTSET)
100
+ _default_handler = None
101
+
102
+
103
+ def get_log_levels_dict():
104
+ return log_levels
105
+
106
+
107
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
108
+ """
109
+ Return a logger with the specified name.
110
+
111
+ This function is not supposed to be directly accessed unless you are writing a custom muse module.
112
+ """
113
+
114
+ if name is None:
115
+ name = _get_library_name()
116
+
117
+ _configure_library_root_logger()
118
+ return logging.getLogger(name)
119
+
120
+
121
+ def get_verbosity() -> int:
122
+ """
123
+ Return the current level for the 🤗 muse' root logger as an int.
124
+
125
+ Returns:
126
+ `int`: The logging level.
127
+
128
+ <Tip>
129
+
130
+ 🤗 muse has the following logging levels:
131
+
132
+ - 50: `muse.logging.CRITICAL` or `muse.logging.FATAL`
133
+ - 40: `muse.logging.ERROR`
134
+ - 30: `muse.logging.WARNING` or `muse.logging.WARN`
135
+ - 20: `muse.logging.INFO`
136
+ - 10: `muse.logging.DEBUG`
137
+
138
+ </Tip>"""
139
+
140
+ _configure_library_root_logger()
141
+ return _get_library_root_logger().getEffectiveLevel()
142
+
143
+
144
+ def set_verbosity(verbosity: int) -> None:
145
+ """
146
+ Set the verbosity level for the 🤗 muse' root logger.
147
+
148
+ Args:
149
+ verbosity (`int`):
150
+ Logging level, e.g., one of:
151
+
152
+ - `muse.logging.CRITICAL` or `muse.logging.FATAL`
153
+ - `muse.logging.ERROR`
154
+ - `muse.logging.WARNING` or `muse.logging.WARN`
155
+ - `muse.logging.INFO`
156
+ - `muse.logging.DEBUG`
157
+ """
158
+
159
+ _configure_library_root_logger()
160
+ _get_library_root_logger().setLevel(verbosity)
161
+
162
+
163
+ def set_verbosity_info():
164
+ """Set the verbosity to the `INFO` level."""
165
+ return set_verbosity(INFO)
166
+
167
+
168
+ def set_verbosity_warning():
169
+ """Set the verbosity to the `WARNING` level."""
170
+ return set_verbosity(WARNING)
171
+
172
+
173
+ def set_verbosity_debug():
174
+ """Set the verbosity to the `DEBUG` level."""
175
+ return set_verbosity(DEBUG)
176
+
177
+
178
+ def set_verbosity_error():
179
+ """Set the verbosity to the `ERROR` level."""
180
+ return set_verbosity(ERROR)
181
+
182
+
183
+ def disable_default_handler() -> None:
184
+ """Disable the default handler of the HuggingFace muse' root logger."""
185
+
186
+ _configure_library_root_logger()
187
+
188
+ assert _default_handler is not None
189
+ _get_library_root_logger().removeHandler(_default_handler)
190
+
191
+
192
+ def enable_default_handler() -> None:
193
+ """Enable the default handler of the HuggingFace muse' root logger."""
194
+
195
+ _configure_library_root_logger()
196
+
197
+ assert _default_handler is not None
198
+ _get_library_root_logger().addHandler(_default_handler)
199
+
200
+
201
+ def add_handler(handler: logging.Handler) -> None:
202
+ """adds a handler to the HuggingFace muse' root logger."""
203
+
204
+ _configure_library_root_logger()
205
+
206
+ assert handler is not None
207
+ _get_library_root_logger().addHandler(handler)
208
+
209
+
210
+ def remove_handler(handler: logging.Handler) -> None:
211
+ """removes given handler from the HuggingFace muse' root logger."""
212
+
213
+ _configure_library_root_logger()
214
+
215
+ assert handler is not None and handler not in _get_library_root_logger().handlers
216
+ _get_library_root_logger().removeHandler(handler)
217
+
218
+
219
+ def disable_propagation() -> None:
220
+ """
221
+ Disable propagation of the library log outputs. Note that log propagation is disabled by default.
222
+ """
223
+
224
+ _configure_library_root_logger()
225
+ _get_library_root_logger().propagate = False
226
+
227
+
228
+ def enable_propagation() -> None:
229
+ """
230
+ Enable propagation of the library log outputs. Please disable the HuggingFace muse' default handler to prevent
231
+ double logging if the root logger has been configured.
232
+ """
233
+
234
+ _configure_library_root_logger()
235
+ _get_library_root_logger().propagate = True
236
+
237
+
238
+ def enable_explicit_format() -> None:
239
+ """
240
+ Enable explicit formatting for every HuggingFace muse' logger. The explicit formatter is as follows:
241
+ ```
242
+ [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
243
+ ```
244
+ All handlers currently bound to the root logger are affected by this method.
245
+ """
246
+ handlers = _get_library_root_logger().handlers
247
+
248
+ for handler in handlers:
249
+ formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
250
+ handler.setFormatter(formatter)
251
+
252
+
253
+ def reset_format() -> None:
254
+ """
255
+ Resets the formatting for HuggingFace muse' loggers.
256
+
257
+ All handlers currently bound to the root logger are affected by this method.
258
+ """
259
+ handlers = _get_library_root_logger().handlers
260
+
261
+ for handler in handlers:
262
+ handler.setFormatter(None)
263
+
264
+
265
+ def warning_advice(self, *args, **kwargs):
266
+ """
267
+ This method is identical to `logger.warning()`, but if env var muse_NO_ADVISORY_WARNINGS=1 is set, this
268
+ warning will not be printed
269
+ """
270
+ no_advisory_warnings = os.getenv("muse_NO_ADVISORY_WARNINGS", False)
271
+ if no_advisory_warnings:
272
+ return
273
+ self.warning(*args, **kwargs)
274
+
275
+
276
+ logging.Logger.warning_advice = warning_advice
277
+
278
+
279
+ class EmptyTqdm:
280
+ """Dummy tqdm which doesn't do anything."""
281
+
282
+ def __init__(self, *args, **kwargs): # pylint: disable=unused-argument
283
+ self._iterator = args[0] if args else None
284
+
285
+ def __iter__(self):
286
+ return iter(self._iterator)
287
+
288
+ def __getattr__(self, _):
289
+ """Return empty function."""
290
+
291
+ def empty_fn(*args, **kwargs): # pylint: disable=unused-argument
292
+ return
293
+
294
+ return empty_fn
295
+
296
+ def __enter__(self):
297
+ return self
298
+
299
+ def __exit__(self, type_, value, traceback):
300
+ return
301
+
302
+
303
+ class _tqdm_cls:
304
+ def __call__(self, *args, **kwargs):
305
+ if _tqdm_active:
306
+ return tqdm_lib.tqdm(*args, **kwargs)
307
+ else:
308
+ return EmptyTqdm(*args, **kwargs)
309
+
310
+ def set_lock(self, *args, **kwargs):
311
+ self._lock = None
312
+ if _tqdm_active:
313
+ return tqdm_lib.tqdm.set_lock(*args, **kwargs)
314
+
315
+ def get_lock(self):
316
+ if _tqdm_active:
317
+ return tqdm_lib.tqdm.get_lock()
318
+
319
+
320
+ tqdm = _tqdm_cls()
321
+
322
+
323
+ def is_progress_bar_enabled() -> bool:
324
+ """Return a boolean indicating whether tqdm progress bars are enabled."""
325
+ global _tqdm_active
326
+ return bool(_tqdm_active)
327
+
328
+
329
+ def enable_progress_bar():
330
+ """Enable tqdm progress bar."""
331
+ global _tqdm_active
332
+ _tqdm_active = True
333
+
334
+
335
+ def disable_progress_bar():
336
+ """Disable tqdm progress bar."""
337
+ global _tqdm_active
338
+ _tqdm_active = False
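+
+ # Usage sketch (assumes this module is importable as ``models.logging``):
+ # from models import logging as mlogging
+ # logger = mlogging.get_logger(__name__)
+ # mlogging.set_verbosity_info()
+ # logger.info("progress bars enabled: %s", mlogging.is_progress_bar_enabled())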
models/lr_schedulers.py ADDED
@@ -0,0 +1,302 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch optimization for diffusion models."""
16
+
17
+ import math
18
+ from enum import Enum
19
+ from typing import Optional, Union
20
+
21
+ from torch.optim import Optimizer
22
+ from torch.optim.lr_scheduler import LambdaLR
23
+
24
+ from .logging import get_logger
25
+
26
+ logger = get_logger(__name__)
27
+
28
+
29
+ class SchedulerType(Enum):
30
+ LINEAR = "linear"
31
+ COSINE = "cosine"
32
+ COSINE_WITH_RESTARTS = "cosine_with_restarts"
33
+ POLYNOMIAL = "polynomial"
34
+ CONSTANT = "constant"
35
+ CONSTANT_WITH_WARMUP = "constant_with_warmup"
36
+
37
+
38
+ def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
39
+ """
40
+ Create a schedule with a constant learning rate, using the learning rate set in optimizer.
41
+
42
+ Args:
43
+ optimizer ([`~torch.optim.Optimizer`]):
44
+ The optimizer for which to schedule the learning rate.
45
+ last_epoch (`int`, *optional*, defaults to -1):
46
+ The index of the last epoch when resuming training.
47
+
48
+ Return:
49
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
50
+ """
51
+ return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
52
+
53
+
54
+ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
55
+ """
56
+ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
57
+ increases linearly between 0 and the initial lr set in the optimizer.
58
+
59
+ Args:
60
+ optimizer ([`~torch.optim.Optimizer`]):
61
+ The optimizer for which to schedule the learning rate.
62
+ num_warmup_steps (`int`):
63
+ The number of steps for the warmup phase.
64
+ last_epoch (`int`, *optional*, defaults to -1):
65
+ The index of the last epoch when resuming training.
66
+
67
+ Return:
68
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
69
+ """
70
+
71
+ def lr_lambda(current_step: int):
72
+ if current_step < num_warmup_steps:
73
+ return float(current_step) / float(max(1.0, num_warmup_steps))
74
+ return 1.0
75
+
76
+ return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
77
+
78
+
79
+ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
80
+ """
81
+ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
82
+ a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
83
+
84
+ Args:
85
+ optimizer ([`~torch.optim.Optimizer`]):
86
+ The optimizer for which to schedule the learning rate.
87
+ num_warmup_steps (`int`):
88
+ The number of steps for the warmup phase.
89
+ num_training_steps (`int`):
90
+ The total number of training steps.
91
+ last_epoch (`int`, *optional*, defaults to -1):
92
+ The index of the last epoch when resuming training.
93
+
94
+ Return:
95
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
96
+ """
97
+
98
+ def lr_lambda(current_step: int):
99
+ if current_step < num_warmup_steps:
100
+ return float(current_step) / float(max(1, num_warmup_steps))
101
+ return max(
102
+ 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
103
+ )
104
+
105
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
106
+
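+ # Worked example: with num_warmup_steps=100 and num_training_steps=1000, the lr multiplier is
+ # 0.5 at step 50, peaks at 1.0 at step 100, then falls linearly to 0.0 at step 1000 (e.g. 0.5 at step 550).
+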
107
+
108
+ def get_cosine_schedule_with_warmup(
109
+ optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1, min_lr_scale: float = 0.0
110
+ ):
111
+ """
112
+ Create a schedule with a learning rate that decreases following the values of the cosine function between the
113
+ initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
114
+ initial lr set in the optimizer.
115
+
116
+ Args:
117
+ optimizer ([`~torch.optim.Optimizer`]):
118
+ The optimizer for which to schedule the learning rate.
119
+ num_warmup_steps (`int`):
120
+ The number of steps for the warmup phase.
121
+ num_training_steps (`int`):
122
+ The total number of training steps.
123
+ num_cycles (`float`, *optional*, defaults to 0.5):
124
+ The number of periods of the cosine function in a schedule (the default is to just decrease from the max
125
+ value to 0 following a half-cosine).
126
+ last_epoch (`int`, *optional*, defaults to -1):
127
+ The index of the last epoch when resuming training.
+ min_lr_scale (`float`, *optional*, defaults to 0.0):
+ The floor of the decayed multiplier, as a fraction of the initial lr; the schedule decays to this value instead of 0.
128
+
129
+ Return:
130
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
131
+ """
132
+
133
+ # def lr_lambda(current_step):
134
+ # if current_step < num_warmup_steps:
135
+ # return float(current_step) / float(max(1, num_warmup_steps))
136
+ # progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
137
+ # return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
138
+
139
+ # return LambdaLR(optimizer, lr_lambda, last_epoch)
140
+
141
+ def lr_lambda(current_step):
142
+ if current_step < num_warmup_steps:
143
+ return float(current_step) / float(max(1, num_warmup_steps))
144
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
145
+ cosine_decay = 0.5 * (1.0 + math.cos(math.pi * 2.0 * num_cycles * progress))
146
+ return min_lr_scale + (1.0 - min_lr_scale) * cosine_decay
147
+
148
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
149
+
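+ # Worked example of the ``min_lr_scale`` floor (an addition relative to the upstream HF schedule):
+ # with the default num_cycles=0.5 and min_lr_scale=0.1, the multiplier warms up to 1.0 and then
+ # decays as 0.1 + 0.9 * cosine_decay, settling at 10% of the peak lr at the end of training
+ # instead of 0.
+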
150
+
151
+ def get_cosine_with_hard_restarts_schedule_with_warmup(
152
+ optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
153
+ ):
154
+ """
155
+ Create a schedule with a learning rate that decreases following the values of the cosine function between the
156
+ initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
157
+ linearly between 0 and the initial lr set in the optimizer.
158
+
159
+ Args:
160
+ optimizer ([`~torch.optim.Optimizer`]):
161
+ The optimizer for which to schedule the learning rate.
162
+ num_warmup_steps (`int`):
163
+ The number of steps for the warmup phase.
164
+ num_training_steps (`int`):
165
+ The total number of training steps.
166
+ num_cycles (`int`, *optional*, defaults to 1):
167
+ The number of hard restarts to use.
168
+ last_epoch (`int`, *optional*, defaults to -1):
169
+ The index of the last epoch when resuming training.
170
+
171
+ Return:
172
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
173
+ """
174
+
175
+ def lr_lambda(current_step):
176
+ if current_step < num_warmup_steps:
177
+ return float(current_step) / float(max(1, num_warmup_steps))
178
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
179
+ if progress >= 1.0:
180
+ return 0.0
181
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
182
+
183
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
184
+
185
+
186
+ def get_polynomial_decay_schedule_with_warmup(
187
+ optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
188
+ ):
189
+ """
190
+ Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
191
+ optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
192
+ initial lr set in the optimizer.
193
+
194
+ Args:
195
+ optimizer ([`~torch.optim.Optimizer`]):
196
+ The optimizer for which to schedule the learning rate.
197
+ num_warmup_steps (`int`):
198
+ The number of steps for the warmup phase.
199
+ num_training_steps (`int`):
200
+ The total number of training steps.
201
+ lr_end (`float`, *optional*, defaults to 1e-7):
202
+ The end LR.
203
+ power (`float`, *optional*, defaults to 1.0):
204
+ Power factor.
205
+ last_epoch (`int`, *optional*, defaults to -1):
206
+ The index of the last epoch when resuming training.
207
+
208
+ Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
209
+ implementation at
210
+ https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
211
+
212
+ Return:
213
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
214
+
215
+ """
216
+
217
+ lr_init = optimizer.defaults["lr"]
218
+ if not (lr_init > lr_end):
219
+ raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
220
+
221
+ def lr_lambda(current_step: int):
222
+ if current_step < num_warmup_steps:
223
+ return float(current_step) / float(max(1, num_warmup_steps))
224
+ elif current_step > num_training_steps:
225
+ return lr_end / lr_init # as LambdaLR multiplies by lr_init
226
+ else:
227
+ lr_range = lr_init - lr_end
228
+ decay_steps = num_training_steps - num_warmup_steps
229
+ pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
230
+ decay = lr_range * pct_remaining**power + lr_end
231
+ return decay / lr_init # as LambdaLR multiplies by lr_init
232
+
233
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
234
+
235
+
236
+ TYPE_TO_SCHEDULER_FUNCTION = {
237
+ SchedulerType.LINEAR: get_linear_schedule_with_warmup,
238
+ SchedulerType.COSINE: get_cosine_schedule_with_warmup,
239
+ SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
240
+ SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
241
+ SchedulerType.CONSTANT: get_constant_schedule,
242
+ SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
243
+ }
244
+
245
+
246
+ def get_scheduler(
247
+ name: Union[str, SchedulerType],
248
+ optimizer: Optimizer,
249
+ num_warmup_steps: Optional[int] = None,
250
+ num_training_steps: Optional[int] = None,
251
+ num_cycles: int = 1,
252
+ power: float = 1.0,
253
+ min_lr_scale: float = 0.0
254
+ ):
255
+ """
256
+ Unified API to get any scheduler from its name.
257
+
258
+ Args:
259
+ name (`str` or `SchedulerType`):
260
+ The name of the scheduler to use.
261
+ optimizer (`torch.optim.Optimizer`):
262
+ The optimizer that will be used during training.
263
+ num_warmup_steps (`int`, *optional*):
264
+ The number of warmup steps to do. This is not required by all schedulers (hence the argument being
265
+ optional), the function will raise an error if it's unset and the scheduler type requires it.
266
+ num_training_steps (`int`, *optional*):
267
+ The number of training steps to do. This is not required by all schedulers (hence the argument being
268
+ optional), the function will raise an error if it's unset and the scheduler type requires it.
269
+ num_cycles (`int`, *optional*):
270
+ The number of hard restarts used in `COSINE_WITH_RESTARTS` scheduler.
271
+ power (`float`, *optional*, defaults to 1.0):
272
+ Power factor. See the `POLYNOMIAL` scheduler.
273
+ min_lr_scale (`float`, *optional*, defaults to 0.0):
274
+ The final lr floor (as a fraction of the initial lr), forwarded to the `COSINE` scheduler.
275
+ """
276
+ name = SchedulerType(name)
277
+ schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
278
+ if name == SchedulerType.CONSTANT:
279
+ return schedule_func(optimizer)
280
+
281
+ # All other schedulers require `num_warmup_steps`
282
+ if num_warmup_steps is None:
283
+ raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
284
+
285
+ if name == SchedulerType.CONSTANT_WITH_WARMUP:
286
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
287
+
288
+ # All other schedulers require `num_training_steps`
289
+ if num_training_steps is None:
290
+ raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
291
+
292
+ if name == SchedulerType.COSINE_WITH_RESTARTS:
293
+ return schedule_func(
294
+ optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_cycles=num_cycles
295
+ )
296
+
297
+ if name == SchedulerType.POLYNOMIAL:
298
+ return schedule_func(
299
+ optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power
300
+ )
301
+
302
+ if name == SchedulerType.COSINE:
+ return schedule_func(
+ optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, min_lr_scale=min_lr_scale
+ )
+
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
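+
+ # Usage sketch (the optimizer, model, and step counts are illustrative assumptions):
+ # import torch
+ # optim = torch.optim.AdamW(model.parameters(), lr=1e-4)
+ # sched = get_scheduler("cosine", optim, num_warmup_steps=1000, num_training_steps=100000, min_lr_scale=0.1)
+ # for _ in range(100000):
+ #     ...  # forward / backward
+ #     optim.step()
+ #     sched.step()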
models/misc.py ADDED
@@ -0,0 +1,53 @@
1
+ from omegaconf import OmegaConf
2
+ import torch
3
+ from typing import (
4
+ Any,
5
+ Callable,
6
+ Dict,
7
+ Iterable,
8
+ List,
9
+ NamedTuple,
10
+ NewType,
11
+ Optional,
12
+ Sized,
13
+ Tuple,
14
+ Type,
15
+ TypeVar,
16
+ Union,
17
+ )
18
+ try:
19
+ from typing import Literal
20
+ except ImportError:
21
+ from typing_extensions import Literal
22
+
23
+ # Tensor dtype
24
+ # for jaxtyping usage, see https://github.com/google/jaxtyping/blob/main/API.md
25
+ from jaxtyping import Bool, Complex, Float, Inexact, Int, Integer, Num, Shaped, UInt
26
+
27
+ # Config type
28
+ from omegaconf import DictConfig
29
+
30
+ # PyTorch Tensor type
31
+ from torch import Tensor
32
+
33
+ # Runtime type checking decorator
34
+ from typeguard import typechecked as typechecker
35
+
36
+
37
+ def broadcast(tensor, src=0):
38
+ if not _distributed_available():
39
+ return tensor
40
+ else:
41
+ torch.distributed.broadcast(tensor, src=src)
42
+ return tensor
43
+
44
+ def _distributed_available():
45
+ return torch.distributed.is_available() and torch.distributed.is_initialized()
46
+
47
+ def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
48
+ # Drop the stray '--local-rank' key injected during multi-node training (comment originally by Xavier; root cause unknown).
49
+ if cfg is not None and '--local-rank' in cfg:
50
+ del cfg['--local-rank']
52
+ scfg = OmegaConf.structured(fields(**cfg))
53
+ return scfg
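+
+ # Usage sketch (the dataclass and YAML path are hypothetical):
+ # from dataclasses import dataclass
+ # @dataclass
+ # class TrainCfg:
+ #     lr: float = 1e-4
+ # cfg = parse_structured(TrainCfg, OmegaConf.to_container(OmegaConf.load("config.yaml")))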
models/modeling_llada.py ADDED
@@ -0,0 +1,1500 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import math
5
+ import sys
6
+ from abc import abstractmethod
7
+ from collections import defaultdict
8
+ from functools import partial
9
+ from typing import (
10
+ Callable,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ NamedTuple,
15
+ Optional,
16
+ Sequence,
17
+ Set,
18
+ Tuple,
19
+ cast,
20
+ )
21
+ from dataclasses import fields
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.backends.cuda
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ from torch import einsum
29
+ from transformers import PreTrainedModel
30
+ from transformers.modeling_outputs import CausalLMOutputWithPast
31
+ from transformers.models.auto import AutoModel
32
+ from transformers.cache_utils import Cache
33
+
34
+ from .configuration_llada import (
35
+ LLaDAConfig,
36
+ StrEnum,
37
+ InitFnType,
38
+ ActivationType,
39
+ BlockType,
40
+ LayerNormType,
41
+ ModelConfig,
42
+ ActivationCheckpointingStrategy,
43
+ )
44
+
45
+ if sys.version_info.minor > 8:
46
+ from collections.abc import MutableMapping
47
+ elif sys.version_info.minor == 8:
48
+ from typing import MutableMapping
49
+ else:
50
+ raise SystemExit("This script supports Python 3.8 or higher")
51
+
52
+ __all__ = [
53
+ "LayerNormBase",
54
+ "LayerNorm",
55
+ "RMSLayerNorm",
56
+ "GemmaRMSLayerNorm",
57
+ "RotaryEmbedding",
58
+ "Activation",
59
+ "GELU",
60
+ "ReLU",
61
+ "SwiGLU",
62
+ "LLaDABlock",
63
+ "LLaDASequentialBlock",
64
+ "LLaDAModel",
65
+ "LLaDAOutput",
66
+ "LLaDAGenerateOutput",
67
+ ]
68
+
69
+
70
+ log = logging.getLogger(__name__)
71
+
72
+
73
+ class ModuleType(StrEnum):
74
+ in_module = "in"
75
+ out_module = "out"
76
+ emb = "emb"
77
+ final_out = "final_out"
78
+
79
+
80
+ def init_weights(
81
+ config: ModelConfig,
82
+ module: Union[nn.Linear, nn.Embedding],
83
+ d: Optional[int] = None,
84
+ layer_id: Optional[int] = None,
85
+ std_factor: float = 1.0,
86
+ type_of_module: Optional[ModuleType] = None,
87
+ ) -> None:
88
+ """
89
+ Initialize weights of a linear or embedding module.
90
+
91
+ :param config: The model config.
92
+ :param module: The linear or embedding submodule to initialize.
93
+ :param d: The effective input dimensionality of the weights. This could be smaller than the actual dimensions
94
+ for fused layers.
95
+ :param layer_id: When set, the standard deviation for the "mitchell" method will be adjusted by
96
+ ``1 / sqrt(2 * (layer_id + 1))``.
97
+ """
98
+ d = d if d is not None else config.d_model
99
+ if config.init_fn == InitFnType.normal:
100
+ std = config.init_std * std_factor
101
+ if config.init_cutoff_factor is not None:
102
+ cutoff_value = config.init_cutoff_factor * std
103
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
104
+ else:
105
+ nn.init.normal_(module.weight, mean=0.0, std=std)
106
+ elif config.init_fn == InitFnType.mitchell:
107
+ std = std_factor / math.sqrt(d)
108
+ if layer_id is not None:
109
+ std = std / math.sqrt(2 * (layer_id + 1))
110
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-3 * std, b=3 * std)
111
+ elif config.init_fn == InitFnType.kaiming_normal:
112
+ nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
113
+ elif config.init_fn == InitFnType.fan_in:
114
+ std = std_factor / math.sqrt(d)
115
+ nn.init.normal_(module.weight, mean=0.0, std=std)
116
+ elif config.init_fn == InitFnType.full_megatron:
117
+ if type_of_module is None:
118
+ raise RuntimeError(f"When using the {InitFnType.full_megatron} init, every module must have a type.")
119
+
120
+ cutoff_factor = config.init_cutoff_factor
121
+ if cutoff_factor is None:
122
+ cutoff_factor = 3
123
+
124
+ if type_of_module == ModuleType.in_module:
125
+ # for att_proj (same as QKV), ff_proj
126
+ std = config.init_std
127
+ elif type_of_module == ModuleType.out_module:
128
+ # for attn_out, ff_out
129
+ std = config.init_std / math.sqrt(2.0 * config.n_layers)
130
+ elif type_of_module == ModuleType.emb:
131
+ # positional embeddings (wpe)
132
+ # token embeddings (wte)
133
+ std = config.init_std
134
+ elif type_of_module == ModuleType.final_out:
135
+ # final output (ff_out)
136
+ std = config.d_model**-0.5
137
+ else:
138
+ raise RuntimeError(f"Unknown module type '{type_of_module}'")
139
+ nn.init.trunc_normal_(
140
+ module.weight,
141
+ mean=0.0,
142
+ std=std,
143
+ a=-cutoff_factor * std,
144
+ b=cutoff_factor * std,
145
+ )
146
+ else:
147
+ raise NotImplementedError(config.init_fn)
148
+
149
+ if isinstance(module, nn.Linear):
150
+ if module.bias is not None:
151
+ nn.init.zeros_(module.bias)
152
+
153
+ if config.init_fn == InitFnType.normal and getattr(module, "_is_residual", False):
154
+ with torch.no_grad():
155
+ module.weight.div_(math.sqrt(2 * config.n_layers))
156
+
157
+
158
+ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
159
+ """
160
+ Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
161
+ is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
162
+ """
163
+ if check_neg_inf:
164
+ x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
165
+ if check_pos_inf:
166
+ x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
167
+
168
+
169
+ def activation_checkpoint_function(cfg: ModelConfig):
170
+ preserve_rng_state = (
171
+ (cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and (cfg.residual_dropout == 0.0)
172
+ )
173
+ from torch.utils.checkpoint import checkpoint
174
+
175
+ return partial(
176
+ checkpoint,
177
+ preserve_rng_state=preserve_rng_state,
178
+ use_reentrant=False,
179
+ )
180
+
181
+
182
+ class BufferCache(dict, MutableMapping[str, torch.Tensor]):
183
+ """
184
+ Cache for attention biases and other things that would normally be stored as buffers.
185
+ We avoid using buffers because we've run into various issues doing so with FSDP.
186
+ In general it appears the way FSDP handles buffers is not well-defined.
187
+ It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
188
+ since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
189
+ NaNs when they're synchronized due to casting or some other issue.
190
+ """
191
+
192
+
193
+ def _non_meta_init_device(config: ModelConfig) -> torch.device:
194
+ if config.init_device is not None and config.init_device != "meta":
195
+ return torch.device(config.init_device)
196
+ else:
197
+ return torch.device("cuda" if torch.cuda.is_available() else "cpu")
198
+
199
+
200
+ class Dropout(nn.Dropout):
201
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
202
+ if self.p == 0.0:
203
+ return input
204
+ else:
205
+ return F.dropout(input, self.p, self.training, self.inplace)
206
+
207
+
208
+ class LayerNormBase(nn.Module):
209
+ def __init__(
210
+ self,
211
+ config: ModelConfig,
212
+ *,
213
+ size: Optional[int] = None,
214
+ elementwise_affine: Optional[bool] = True,
215
+ eps: float = 1e-05,
216
+ ):
217
+ super().__init__()
218
+ self.config = config
219
+ self.eps = eps
220
+ self.normalized_shape = (size or config.d_model,)
221
+ if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
222
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device))
223
+ use_bias = self.config.bias_for_layer_norm
224
+ if use_bias is None:
225
+ use_bias = self.config.include_bias
226
+ if use_bias:
227
+ self.bias = nn.Parameter(torch.zeros(self.normalized_shape, device=config.init_device))
228
+ else:
229
+ self.register_parameter("bias", None)
230
+ else:
231
+ self.register_parameter("bias", None)
232
+ self.register_parameter("weight", None)
233
+
234
+ @abstractmethod
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ raise NotImplementedError
237
+
238
+ @classmethod
239
+ def build(cls, config: ModelConfig, size: Optional[int] = None, **kwargs) -> LayerNormBase:
240
+ if config.layer_norm_type == LayerNormType.default:
241
+ return LayerNorm(config, size=size, low_precision=False, **kwargs)
242
+ elif config.layer_norm_type == LayerNormType.low_precision:
243
+ return LayerNorm(config, size=size, low_precision=True, **kwargs)
244
+ elif config.layer_norm_type == LayerNormType.rms:
245
+ return RMSLayerNorm(config, size=size, **kwargs)
246
+ elif config.layer_norm_type == LayerNormType.gemma_rms:
247
+ return GemmaRMSLayerNorm(config, size=size, **kwargs)
248
+ else:
249
+ raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
250
+
251
+ def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
252
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
253
+ # `is_autocast_cpu_enabled()` for CPU autocast.
254
+ # See https://github.com/pytorch/pytorch/issues/110966.
255
+ if tensor.device.type == "cuda" and torch.is_autocast_enabled():
256
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_gpu_dtype())
257
+ elif tensor.device.type == "cpu" and torch.is_autocast_cpu_enabled():
258
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
259
+ else:
260
+ return tensor
261
+
262
+ def reset_parameters(self):
263
+ if self.weight is not None:
264
+ torch.nn.init.ones_(self.weight) # type: ignore
265
+ if self.bias is not None:
266
+ torch.nn.init.zeros_(self.bias) # type: ignore
267
+
268
+
269
+ class LayerNorm(LayerNormBase):
270
+ """
271
+ The default :class:`LayerNorm` implementation which can optionally run in low precision.
272
+ """
273
+
274
+ def __init__(
275
+ self,
276
+ config: ModelConfig,
277
+ size: Optional[int] = None,
278
+ low_precision: bool = False,
279
+ elementwise_affine: Optional[bool] = None,
280
+ eps: float = 1e-05,
281
+ ):
282
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
283
+ self.low_precision = low_precision
284
+
285
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
286
+ if self.low_precision:
287
+ module_device = x.device
288
+ downcast_x = self._cast_if_autocast_enabled(x)
289
+ downcast_weight = (
290
+ self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
291
+ )
292
+ downcast_bias = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
293
+ with torch.autocast(enabled=False, device_type=module_device.type):
294
+ return F.layer_norm(
295
+ downcast_x, self.normalized_shape, weight=downcast_weight, bias=downcast_bias, eps=self.eps
296
+ )
297
+ else:
298
+ return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
299
+
300
+
301
+ class RMSLayerNorm(LayerNormBase):
302
+ """
303
+ RMS layer norm, a simplified :class:`LayerNorm` implementation
304
+ """
305
+
306
+ def __init__(
307
+ self,
308
+ config: ModelConfig,
309
+ size: Optional[int] = None,
310
+ elementwise_affine: Optional[bool] = None,
311
+ eps: float = 1e-5,
312
+ ):
313
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)
314
+
315
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
316
+ with torch.autocast(enabled=False, device_type=x.device.type):
317
+ og_dtype = x.dtype
318
+ x = x.to(torch.float32)
319
+ variance = x.pow(2).mean(-1, keepdim=True)
320
+ x = x * torch.rsqrt(variance + self.eps)
321
+ x = x.to(og_dtype)
322
+
323
+ if self.weight is not None:
324
+ if self.bias is not None:
325
+ return self.weight * x + self.bias
326
+ else:
327
+ return self.weight * x
328
+ else:
329
+ return x
330
+
331
+
332
+ class GemmaRMSLayerNorm(LayerNormBase):
333
+ """
334
+ Gemma RMS layer norm, a simplified :class:`LayerNorm` implementation
335
+ """
336
+
337
+ def __init__(
338
+ self,
339
+ config: ModelConfig,
340
+ size: Optional[int] = None,
341
+ elementwise_affine: Optional[bool] = None,
342
+ eps: float = 1e-5,
343
+ ):
344
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)
345
+
346
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
347
+ with torch.autocast(enabled=False, device_type=x.device.type):
348
+ og_dtype = x.dtype
349
+ x = x.to(torch.float32)
350
+ variance = x.pow(2).mean(-1, keepdim=True)
351
+ x = x * torch.rsqrt(variance + self.eps)
352
+ x = x.to(og_dtype)
353
+
354
+ if self.weight is not None:
355
+ if self.bias is not None:
356
+ return x * (1 + self.weight) + self.bias
357
+ else:
358
+ return x * (1 + self.weight)
359
+ else:
360
+ return x
361
+
362
+
363
+ class RotaryEmbedding(nn.Module):
364
+ """
365
+ [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
366
+ """
367
+
368
+ def __init__(self, config: ModelConfig, cache: BufferCache):
369
+ super().__init__()
370
+ self.config = config
371
+ self.__cache = cache
372
+ # Warm up cache.
373
+ self.rope_theta = config.rope_theta
374
+ self.get_rotary_embedding(config.max_sequence_length, _non_meta_init_device(config))
375
+
376
+ def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
377
+ if (
378
+ (pos_sin := self.__cache.get("rope_pos_sin")) is not None
379
+ and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
380
+ and pos_sin.shape[-2] >= seq_len
381
+ and pos_cos.shape[-2] >= seq_len
382
+ ):
383
+ if pos_sin.device != device:
384
+ pos_sin = pos_sin.to(device)
385
+ self.__cache["rope_pos_sin"] = pos_sin
386
+ if pos_cos.device != device:
387
+ pos_cos = pos_cos.to(device)
388
+ self.__cache["rope_pos_cos"] = pos_cos
389
+ return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
390
+
391
+ with torch.autocast(device.type, enabled=False):
392
+ dim = self.config.d_model // self.config.n_heads
393
+ inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
394
+ seq = torch.arange(seq_len, device=device, dtype=torch.float)
395
+ freqs = einsum("i , j -> i j", seq, inv_freq)
396
+ positions = torch.cat((freqs, freqs), dim=-1)
397
+ pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
398
+ self.__cache["rope_pos_sin"] = pos_sin
399
+ self.__cache["rope_pos_cos"] = pos_cos
400
+ return pos_sin, pos_cos
401
+
402
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
403
+ B, nh, T, hs = x.size()
404
+ x = x.view(B, nh, T, 2, hs // 2)
405
+ x1, x2 = x.unbind(dim=-2)
406
+ return torch.cat((-x2, x1), dim=-1)
407
+
408
+ def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
409
+ return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
410
+
411
+ def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
412
+ if self.config.rope_full_precision:
413
+ q_, k_ = q.float(), k.float()
414
+ else:
415
+ q_, k_ = q, k
416
+
417
+ with torch.autocast(q.device.type, enabled=False):
418
+ query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
419
+ pos_sin, pos_cos = self.get_rotary_embedding(key_len, q_.device)
420
+ pos_sin = pos_sin.type_as(q_)
421
+ pos_cos = pos_cos.type_as(q_)
422
+ q_ = self.apply_rotary_pos_emb(
423
+ pos_sin[:, :, key_len - query_len : key_len, :],
424
+ pos_cos[:, :, key_len - query_len : key_len, :],
425
+ q_,
426
+ )
427
+ k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
428
+ return q_.type_as(q), k_.type_as(k)
429
+
430
+
431
+ class Activation(nn.Module):
432
+ def __init__(self, config: ModelConfig):
433
+ super().__init__()
434
+ self.config = config
435
+
436
+ @abstractmethod
437
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
438
+ raise NotImplementedError
439
+
440
+ @property
441
+ @abstractmethod
442
+ def output_multiplier(self) -> float:
443
+ raise NotImplementedError
444
+
445
+ @classmethod
446
+ def build(cls, config: ModelConfig) -> Activation:
447
+ if config.activation_type == ActivationType.gelu:
448
+ return cast(Activation, GELU(approximate="none"))
449
+ elif config.activation_type == ActivationType.relu:
450
+ return cast(Activation, ReLU(inplace=False))
451
+ elif config.activation_type == ActivationType.silu:
452
+ return cast(Activation, SiLU(inplace=False))
453
+ elif config.activation_type == ActivationType.swiglu:
454
+ return SwiGLU(config)
455
+ else:
456
+ raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
457
+
458
+
459
+ class GELU(nn.GELU):
460
+ @property
461
+ def output_multiplier(self) -> float:
462
+ return 1.0
463
+
464
+
465
+ class ReLU(nn.ReLU):
466
+ @property
467
+ def output_multiplier(self) -> float:
468
+ return 1.0
469
+
470
+ class SiLU(nn.SiLU):
471
+ @property
472
+ def output_multiplier(self) -> float:
473
+ return 1.0
474
+
475
+ class SwiGLU(Activation):
476
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
477
+ x, gate = x.chunk(2, dim=-1)
478
+ return F.silu(gate) * x
479
+
480
+ @property
481
+ def output_multiplier(self) -> float:
482
+ return 0.5
483
+
484
+
485
+ def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
486
+ att_bias = torch.triu(
487
+ torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
488
+ diagonal=1,
489
+ )
490
+ att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
491
+ return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
492
+
493
+
494
+ def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
495
+ if (causal_bias := cache.get("causal_attention_bias")) is not None and causal_bias.shape[-1] >= seq_len:
496
+ if causal_bias.device != device:
497
+ causal_bias = causal_bias.to(device)
498
+ cache["causal_attention_bias"] = causal_bias
499
+ return causal_bias
500
+ with torch.autocast(device.type, enabled=False):
501
+ causal_bias = causal_attention_bias(seq_len, device)
502
+ cache["causal_attention_bias"] = causal_bias
503
+ return causal_bias
504
+
505
+
506
+ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device) -> torch.FloatTensor:
507
+ alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len)
508
+
509
+ # shape: (1, 1, seq_len, seq_len)
510
+ alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1)
511
+ alibi_bias.abs_().mul_(-1)
512
+
513
+ # shape: (n_heads,)
514
+ m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device)
515
+ m.mul_(config.alibi_bias_max / config.n_heads)
516
+
517
+ # shape: (1, n_heads, seq_len, seq_len)
518
+ return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore
519
+
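+ # NOTE: the returned bias is -|i - j| scaled per head by 2 ** -(h * alibi_bias_max / n_heads)
+ # for head index h = 1..n_heads, so later heads receive a geometrically smaller distance
+ # penalty and can attend further back.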
520
+
521
+ class LLaDABlock(nn.Module):
522
+ """
523
+ A base class for transformer block implementations.
524
+ """
525
+
526
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
527
+ super().__init__()
528
+ self.layer_id = layer_id
529
+ self.config = config
530
+ self.hidden_size = (
531
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
532
+ )
533
+ self.__cache = cache
534
+ assert config.d_model % config.n_heads == 0
535
+
536
+ self._activation_checkpoint_fn = None
537
+
538
+ # Dropout.
539
+ self.dropout = Dropout(config.residual_dropout)
540
+
541
+ # Layer norms.
542
+ self.k_norm: Optional[LayerNormBase] = None
543
+ self.q_norm: Optional[LayerNormBase] = None
544
+ if config.attention_layer_norm:
545
+ self.k_norm = LayerNormBase.build(
546
+ config,
547
+ size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
548
+ elementwise_affine=config.attention_layer_norm_with_affine,
549
+ )
550
+ self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
551
+
552
+ # Activation function.
553
+ self.act = Activation.build(config)
554
+ assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
555
+
556
+ # Attention output projection.
557
+ self.attn_out = nn.Linear(
558
+ config.d_model, config.d_model, bias=config.include_bias, device=config.init_device
559
+ )
560
+
561
+ # Feed-forward output projection.
562
+ self.ff_out = nn.Linear(
563
+ int(self.act.output_multiplier * self.hidden_size),
564
+ config.d_model,
565
+ bias=config.include_bias,
566
+ device=config.init_device,
567
+ )
568
+ self.ff_out._is_residual = True # type: ignore
569
+
570
+ # Rotary embeddings.
571
+ if self.config.rope:
572
+ self.rotary_emb = RotaryEmbedding(config, self.__cache)
573
+
574
+ self.flash_attn_func = None
575
+ if config.flash_attention:
576
+ try:
577
+ from flash_attn import flash_attn_func # type: ignore
578
+
579
+ self.flash_attn_func = flash_attn_func
580
+ except ModuleNotFoundError:
581
+ pass
582
+
583
+ def reset_parameters(self):
584
+ if self.k_norm is not None:
585
+ self.k_norm.reset_parameters()
586
+ if self.q_norm is not None:
587
+ self.q_norm.reset_parameters()
588
+ init_weights(
589
+ self.config,
590
+ self.attn_out,
591
+ d=self.config.d_model,
592
+ layer_id=self.layer_id,
593
+ type_of_module=ModuleType.out_module,
594
+ )
595
+ init_weights(
596
+ self.config,
597
+ self.ff_out,
598
+ d=self.ff_out.in_features,
599
+ layer_id=self.layer_id,
600
+ type_of_module=ModuleType.out_module,
601
+ )
602
+
603
+ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
604
+ if strategy == ActivationCheckpointingStrategy.fine_grained:
605
+ self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
606
+ else:
607
+ self._activation_checkpoint_fn = None
608
+
609
+ @classmethod
610
+ def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
611
+ target_dtype = input_dtype
612
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
613
+ # `is_autocast_cpu_enabled()` for CPU autocast.
614
+ # See https://github.com/pytorch/pytorch/issues/110966.
615
+ if bias.device.type == "cuda" and torch.is_autocast_enabled():
616
+ target_dtype = torch.get_autocast_gpu_dtype()
617
+ elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
618
+ target_dtype = torch.get_autocast_cpu_dtype()
619
+ if bias.dtype != target_dtype:
620
+ bias = bias.to(target_dtype)
621
+ ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
622
+ return bias
623
+
624
+ def _scaled_dot_product_attention(
625
+ self,
626
+ q: torch.Tensor,
627
+ k: torch.Tensor,
628
+ v: torch.Tensor,
629
+ attn_mask: Optional[torch.Tensor] = None,
630
+ dropout_p: float = 0.0,
631
+ is_causal: bool = False,
632
+ ) -> torch.Tensor:
633
+ """
634
+ Computes scaled dot product attention on query, key and value tensors, using an optional
635
+ attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
636
+ """
637
+ if self.flash_attn_func is not None and attn_mask is None:
638
+ r = self.flash_attn_func(
639
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=False
640
+ )
641
+ return r.transpose(1, 2)
642
+ else:
643
+ # torch's sdpa doesn't support GQA, so we're doing this
644
+ assert k.size(1) == v.size(1)
645
+ num_kv_heads = k.size(1)
646
+ num_q_heads = q.size(1)
647
+ if num_q_heads != num_kv_heads:
648
+ assert num_q_heads % num_kv_heads == 0
649
+ k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
650
+ v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
651
+
652
+ # Modify: MDM set causal to False, and with no attn_mask.
653
+ return F.scaled_dot_product_attention(
654
+ q,
655
+ k,
656
+ v,
657
+ attn_mask=None,
658
+ dropout_p=dropout_p,
659
+ is_causal=False,
660
+ )
661
+
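+ # NOTE: the repeat_interleave above expands grouped KV heads so that
+ # F.scaled_dot_product_attention sees matching head counts; e.g. 32 query heads with
+ # 8 KV heads repeats each key/value head 4 times along the head dimension.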
662
+ def attention(
663
+ self,
664
+ q: torch.Tensor,
665
+ k: torch.Tensor,
666
+ v: torch.Tensor,
667
+ attention_bias: Optional[torch.Tensor] = None,
668
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
669
+ use_cache: bool = False,
670
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
671
+ B, T, C = q.size() # batch size, sequence length, d_model
672
+ dtype = k.dtype
673
+
674
+ # Optionally apply layer norm to keys and queries.
675
+ if self.q_norm is not None and self.k_norm is not None:
676
+ q = self.q_norm(q).to(dtype=dtype)
677
+ k = self.k_norm(k).to(dtype=dtype)
678
+
679
+ # Move head forward to be next to the batch dim.
680
+ # shape: (B, nh, T, hs)
681
+ q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
682
+ # shape: (B, n_kv_h, T, hs)
683
+ k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
684
+ # shape: (B, n_kv_h, T, hs)
685
+ v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
686
+
687
+ if layer_past is not None:
688
+ past_key, past_value = layer_past
689
+ k = torch.cat((past_key, k), dim=-2)
690
+ v = torch.cat((past_value, v), dim=-2)
691
+
692
+ present = (k, v) if use_cache else None
693
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
694
+
695
+ if self.config.rope:
696
+ # Apply rotary embeddings.
697
+ q, k = self.rotary_emb(q, k)
698
+
699
+ if attention_bias is not None:
700
+ # Resize and cast attention bias.
701
+ # The current dtype of the attention bias might not match the dtype that the SDP attn function will
702
+ # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
703
+ # as down-casting the attention bias to the autocast precision will result in -infs, which will
704
+ # cause the SDP attn function to produce NaNs.
705
+ attention_bias = self._cast_attn_bias(
706
+ attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
707
+ )
708
+
709
+ # Get the attention scores.
710
+ # shape: (B, nh, T, hs)
711
+ att = self._scaled_dot_product_attention(
712
+ q,
713
+ k,
714
+ v,
715
+ attn_mask=None,
716
+ dropout_p=0.0 if not self.training else self.config.attention_dropout,
717
+ is_causal=False,
718
+ )
719
+
720
+ # Re-assemble all head outputs side-by-side.
721
+ att = att.transpose(1, 2).contiguous().view(B, T, C)
722
+
723
+ # Apply output projection.
724
+ return self.attn_out(att), present
725
+
726
+ @abstractmethod
727
+ def forward(
728
+ self,
729
+ x: torch.Tensor,
730
+ attention_bias: Optional[torch.FloatTensor] = None,
731
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
732
+ use_cache: bool = False,
733
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
734
+ raise NotImplementedError
735
+
736
+ @classmethod
737
+ def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> LLaDABlock:
738
+ if config.block_type == BlockType.sequential:
739
+ return LLaDASequentialBlock(layer_id, config, cache)
740
+ elif config.block_type == BlockType.llama:
741
+ return LLaDALlamaBlock(layer_id, config, cache)
742
+ else:
743
+ raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
744
+
745
+
746
+ class LLaDASequentialBlock(LLaDABlock):
747
+ """
748
+ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
749
+ (plus another skip connection).
750
+ """
751
+
752
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
753
+ super().__init__(layer_id, config, cache)
754
+ # Layer norms.
755
+ self.attn_norm = LayerNorm.build(config)
756
+ self.ff_norm = LayerNorm.build(config)
757
+ # Attention input projection. Projects x -> (q, k, v)
758
+ head_dim = config.d_model // config.n_heads
759
+ self.fused_dims = (
760
+ config.d_model,
761
+ config.effective_n_kv_heads * head_dim,
762
+ config.effective_n_kv_heads * head_dim,
763
+ )
764
+ self.att_proj = nn.Linear(
765
+ config.d_model, sum(self.fused_dims), bias=config.include_bias | config.include_qkv_bias, device=config.init_device
766
+ )
767
+ # Feed-forward input projection.
768
+ self.ff_proj = nn.Linear(
769
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
770
+ )
771
+
772
+ def reset_parameters(self):
773
+ super().reset_parameters()
774
+ self.attn_norm.reset_parameters()
775
+ self.ff_norm.reset_parameters()
776
+ # NOTE: the standard deviation for these weights does not depend on the layer.
777
+ init_weights(
778
+ self.config, self.att_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
779
+ )
780
+ init_weights(
781
+ self.config, self.ff_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
782
+ )
783
+
784
+ def forward(
785
+ self,
786
+ x: torch.Tensor,
787
+ attention_bias: Optional[torch.Tensor] = None,
788
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
789
+ use_cache: bool = False,
790
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
791
+ # Get query, key, value projections.
792
+ # shape:
793
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
794
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
795
+ # k, v: (batch_size, seq_len, d_model // n_heads)
796
+ # - for group query attn q: (batch_size, seq_len, d_model)
797
+ # k, v: (batch_size, seq_len, d_model // n_kv_heads)
798
+ if self._activation_checkpoint_fn is not None:
799
+ q, k, v = self.att_proj(self._activation_checkpoint_fn(self.attn_norm, x)).split(
800
+ self.fused_dims, dim=-1
801
+ )
802
+ else:
803
+ q, k, v = self.att_proj(self.attn_norm(x)).split(self.fused_dims, dim=-1)
804
+
805
+ # Get attention scores.
806
+ if self._activation_checkpoint_fn is not None:
807
+ att, cache = self._activation_checkpoint_fn( # type: ignore
808
+ self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
809
+ )
810
+ else:
811
+ att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
812
+
813
+ # Add attention scores.
814
+ # shape: (B, T, C)
815
+ x = x + self.dropout(att)
816
+
817
+ # Add feed-forward projection.
818
+ # shape: (batch_size, seq_len, d_model)
819
+ og_x = x
820
+ if self._activation_checkpoint_fn is not None:
821
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
822
+ else:
823
+ x = self.ff_norm(x)
824
+ x = self.ff_proj(x)
825
+ if self._activation_checkpoint_fn is not None:
826
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
827
+ else:
828
+ x = self.act(x)
829
+ x = self.ff_out(x)
830
+ x = self.dropout(x)
831
+ x = og_x + x
832
+
833
+ return x, cache
834
+
835
+
836
+ class LLaDALlamaBlock(LLaDABlock):
837
+ """
838
+ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
839
+ (plus another skip connection). This block is similar to `LLaDASequentialBlock`
840
+ but some operations have slightly different implementations to imitate the
841
+ behavior of Llama.
842
+ """
843
+
844
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
845
+ super().__init__(layer_id, config, cache)
846
+ # Layer norms.
847
+ self.attn_norm = LayerNorm.build(config)
848
+ self.ff_norm = LayerNorm.build(config)
849
+ self.__cache = cache
850
+
851
+ # Attention input projection. Projects x -> (q, k, v)
852
+ head_dim = config.d_model // config.n_heads
853
+ q_proj_out_dim = config.d_model
854
+ k_proj_out_dim = config.effective_n_kv_heads * head_dim
855
+ v_proj_out_dim = config.effective_n_kv_heads * head_dim
856
+ self.q_proj = nn.Linear(
857
+ config.d_model, q_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
858
+ )
859
+ self.k_proj = nn.Linear(
860
+ config.d_model, k_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
861
+ )
862
+ self.v_proj = nn.Linear(
863
+ config.d_model, v_proj_out_dim, bias=config.include_bias | config.include_qkv_bias, device=config.init_device
864
+ )
865
+
866
+ # Feed-forward input projection.
867
+ self.ff_proj = nn.Linear(
868
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
869
+ )
870
+ # newly added: up projection for the LLaMA-style gated MLP
871
+ self.up_proj = nn.Linear(
872
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
873
+ )
874
+
875
+ def reset_parameters(self):
876
+ super().reset_parameters()
877
+ self.attn_norm.reset_parameters()
878
+ self.ff_norm.reset_parameters()
879
+ # NOTE: the standard deviation for these weights does not depend on the layer.
880
+ init_weights(self.config, self.q_proj, d=self.config.d_model, layer_id=None)
881
+ init_weights(self.config, self.k_proj, d=self.config.d_model, layer_id=None)
882
+ init_weights(self.config, self.v_proj, d=self.config.d_model, layer_id=None)
883
+ init_weights(self.config, self.ff_proj, d=self.config.d_model, layer_id=None)
884
+ init_weights(self.config, self.up_proj, d=self.config.d_model, layer_id=None) # newly added
885
+
886
+ def forward(
887
+ self,
888
+ x: torch.Tensor,
889
+ attention_bias: Optional[torch.Tensor] = None,
890
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
891
+ use_cache: bool = False,
892
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
893
+ # Get query, key, value projections.
894
+ # shape:
895
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
896
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
897
+ # k, v: (batch_size, seq_len, d_model // n_heads)
898
+ # - for group query attn q: (batch_size, seq_len, d_model)
899
+ # k, v: (batch_size, seq_len, d_model // n_kv_heads)
900
+ x_normed = self.attn_norm(x)
901
+ q = self.q_proj(x_normed)
902
+ k = self.k_proj(x_normed)
903
+ v = self.v_proj(x_normed)
904
+
905
+ # Get attention scores.
906
+ if self._activation_checkpoint_fn is not None:
907
+ att, cache = self._activation_checkpoint_fn( # type: ignore
908
+ self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
909
+ )
910
+ else:
911
+ att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
912
+
913
+ # Add attention scores.
914
+ # shape: (B, T, C)
915
+ x = x + self.dropout(att)
916
+
917
+ # Add feed-forward projection.
918
+ # shape: (batch_size, seq_len, d_model)
919
+ og_x = x
920
+ if self._activation_checkpoint_fn is not None:
921
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
922
+ else:
923
+ x = self.ff_norm(x)
924
+ x, x_up = self.ff_proj(x), self.up_proj(x) # newly added: gate and up projections
925
+ if self._activation_checkpoint_fn is not None:
926
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
927
+ else:
928
+ x = self.act(x)
929
+ x = x * x_up # newly added: SwiGLU-style gating
930
+ x = self.ff_out(x)
931
+ x = self.dropout(x)
932
+ x = og_x + x
933
+
934
+ return x, cache
935
+
936
+
937
+ class LLaDAOutput(NamedTuple):
938
+ logits: torch.FloatTensor
939
+ """
940
+ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities
941
+ for the next token *before* normalization via (log) softmax.
942
+ """
943
+
944
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]
945
+ """
946
+ Attention keys and values from each block.
947
+ """
948
+
949
+ hidden_states: Optional[Tuple[torch.Tensor]]
950
+ """
951
+ Hidden states from each block.
952
+ """
953
+
954
+
955
+ class LLaDAGenerateOutput(NamedTuple):
956
+ token_ids: torch.LongTensor
957
+ """
958
+ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`.
959
+ These do *not* include the original input IDs.
960
+ """
961
+
962
+ scores: torch.FloatTensor
963
+ """
964
+ The scores of the generated sequences, a tensor of shape `(batch_size, beam_size)`.
965
+ """
966
+
967
+
968
+ class LLaDABlockGroup(nn.ModuleList):
969
+ def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None):
970
+ super().__init__(modules)
971
+ self.config = config
972
+ self.layer_offset = layer_offset
973
+ self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
974
+ self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
975
+
976
+ def forward(
977
+ self,
978
+ x: torch.Tensor,
979
+ attention_bias: Optional[torch.FloatTensor] = None,
980
+ layers_past: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
981
+ use_cache: bool = False,
982
+ ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:
983
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
984
+ for block_idx, block in enumerate(self):
985
+ layer_past = None if layers_past is None else layers_past[block_idx]
986
+ block_idx += self.layer_offset
987
+ if (
988
+ (self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.whole_layer)
989
+ or (
990
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_two
991
+ and block_idx % 2 == 0
992
+ )
993
+ or (
994
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_three
995
+ and block_idx % 3 == 0
996
+ )
997
+ or (
998
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_four
999
+ and block_idx % 4 == 0
1000
+ )
1001
+ ):
1002
+ # shape: (batch_size, seq_len, d_model)
1003
+ x, cache = self._activation_checkpoint_fn( # type: ignore
1004
+ block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
1005
+ )
1006
+ else:
1007
+ # shape: (batch_size, seq_len, d_model)
1008
+ x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
1009
+ if attn_key_values is not None:
1010
+ assert cache is not None
1011
+ attn_key_values.append(cache)
1012
+ return x, attn_key_values
1013
+
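+ # NOTE: with `one_in_two` / `one_in_three` / `one_in_four`, a block is checkpointed when its
+ # global index (block_idx + layer_offset) is divisible by 2 / 3 / 4, while `whole_layer`
+ # checkpoints every block in the group.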
1014
+ def reset_parameters(self):
1015
+ for block in self:
1016
+ block.reset_parameters()
1017
+
1018
+ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
1019
+ self.activation_checkpointing_strategy = strategy
1020
+ for block in self:
1021
+ block.set_activation_checkpointing(strategy)
1022
+
1023
+
1024
+ class LLaDAModel(nn.Module):
1025
+ def __init__(self, config: ModelConfig, init_params: bool = True):
1026
+ super().__init__()
1027
+ self.config = config
1028
+ self.__cache = BufferCache()
1029
+
1030
+ # Validate config.
1031
+ if self.config.alibi and self.config.flash_attention:
1032
+ raise Exception("ALiBi is currently not supported with FlashAttention")
1033
+
1034
+ if self.config.alibi and self.config.rope:
1035
+ raise Exception("ALiBi and RoPE are mutually exclusive")
1036
+
1037
+ if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
1038
+ if self.config.embedding_size < self.config.vocab_size:
1039
+ raise Exception("embedding size should be at least as big as vocab size")
1040
+ elif self.config.embedding_size % 128 != 0:
1041
+ import warnings
1042
+
1043
+ warnings.warn(
1044
+ "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
1045
+ )
1046
+
1047
+ self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
1048
+ self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)
1049
+
1050
+ if not (
1051
+ 0 < self.config.block_group_size <= self.config.n_layers
1052
+ and self.config.n_layers % self.config.block_group_size == 0
1053
+ ):
1054
+ raise Exception("n layers must be divisible by block group size")
1055
+
1056
+ torch.backends.cuda.enable_flash_sdp(True)
1057
+ torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it
1058
+
1059
+ self.transformer = nn.ModuleDict(
1060
+ dict(
1061
+ wte=nn.Embedding(
1062
+ config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1063
+ ),
1064
+ emb_drop=Dropout(config.embedding_dropout),
1065
+ ln_f=LayerNorm.build(config),
1066
+ )
1067
+ )
1068
+
1069
+ blocks = [LLaDABlock.build(i, config, self.__cache) for i in range(config.n_layers)]
1070
+ if self.config.block_group_size > 1:
1071
+ block_groups = [
1072
+ LLaDABlockGroup(config, i, blocks[i : i + config.block_group_size])
1073
+ for i in range(0, config.n_layers, config.block_group_size)
1074
+ ]
1075
+ self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
1076
+ else:
1077
+ self.transformer.update({"blocks": nn.ModuleList(blocks)})
1078
+
1079
+ if not (self.config.alibi or self.config.rope):
1080
+ self.transformer.update(
1081
+ {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
1082
+ )
1083
+ if not config.weight_tying:
1084
+ self.transformer.update(
1085
+ {
1086
+ "ff_out": nn.Linear(
1087
+ config.d_model,
1088
+ config.embedding_size or config.vocab_size,
1089
+ bias=config.include_bias,
1090
+ device=config.init_device,
1091
+ )
1092
+ }
1093
+ )
1094
+ # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
1095
+ if init_params and self.config.init_device != "meta":
1096
+ self.reset_parameters()
1097
+ self.__num_fwd_flops: Optional[int] = None
1098
+
1099
+ # Warm up cache.
1100
+ if self.config.alibi:
1101
+ get_causal_attention_bias(self.__cache, config.max_sequence_length, _non_meta_init_device(config))
1102
+ self.get_alibi_attention_bias(config.max_sequence_length, _non_meta_init_device(config))
1103
+
1104
+ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
1105
+ self.activation_checkpointing_strategy = strategy
1106
+ if self.config.block_group_size != 1:
1107
+ for block_group in self.transformer.block_groups:
1108
+ block_group.set_activation_checkpointing(strategy)
1109
+ else:
1110
+ for block in self.transformer.blocks:
1111
+ block.set_activation_checkpointing(strategy)
1112
+
1113
+ @property
1114
+ def device(self) -> torch.device:
1115
+ device: torch.device = self.transformer.wte.weight.device # type: ignore
1116
+ if device.type == "meta":
1117
+ return _non_meta_init_device(self.config)
1118
+ else:
1119
+ return device
1120
+
1121
+ def reset_parameters(self):
1122
+ log.info("Initializing model parameters...")
1123
+ # Top-level embeddings / linear layers.
1124
+ init_weights(
1125
+ self.config,
1126
+ self.transformer.wte, # type: ignore
1127
+ std_factor=(0.5 * math.sqrt(self.config.d_model)) if self.config.scale_logits else 1.0,
1128
+ type_of_module=ModuleType.emb,
1129
+ )
1130
+ if hasattr(self.transformer, "wpe"):
1131
+ init_weights(self.config, self.transformer.wpe, type_of_module=ModuleType.emb) # type: ignore
1132
+
1133
+ # Top-level layer norm.
1134
+ self.transformer.ln_f.reset_parameters() # type: ignore
1135
+
1136
+ # Output weights.
1137
+ if hasattr(self.transformer, "ff_out"):
1138
+ init_weights(self.config, self.transformer.ff_out, type_of_module=ModuleType.final_out) # type: ignore
1139
+
1140
+ # Let the blocks handle themselves.
1141
+ if self.config.block_group_size == 1:
1142
+ for block in self.transformer.blocks:
1143
+ block.reset_parameters()
1144
+ else:
1145
+ for block_group in self.transformer.block_groups:
1146
+ block_group.reset_parameters()
1147
+
1148
+ def get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor:
1149
+ if (alibi_bias := self.__cache.get("alibi_attention_bias")) is not None and alibi_bias.shape[
1150
+ -1
1151
+ ] >= seq_len:
1152
+ if alibi_bias.device != device:
1153
+ alibi_bias = alibi_bias.to(device)
1154
+ self.__cache["alibi_attention_bias"] = alibi_bias
1155
+ return alibi_bias
1156
+ with torch.autocast(device.type, enabled=False):
1157
+ alibi_bias = alibi_attention_bias(seq_len, self.config, device)
1158
+ self.__cache["alibi_attention_bias"] = alibi_bias
1159
+ return alibi_bias
1160
+
1161
+ def forward(
1162
+ self,
1163
+ input_ids: torch.LongTensor,
1164
+ input_embeddings: Optional[torch.FloatTensor] = None,
1165
+ attention_mask: Optional[torch.Tensor] = None,
1166
+ attention_bias: Optional[torch.Tensor] = None,
1167
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
1168
+ use_cache: bool = False,
1169
+ last_logits_only: bool = False,
1170
+ output_hidden_states: Optional[bool] = None,
1171
+ ) -> LLaDAOutput:
1172
+ """
1173
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1174
+ :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
1175
+ embeddings. When provided, it is treated as the output of the input embedding layer.
1176
+ :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1177
+ which input IDs are masked. A `1` value in the mask means that
1178
+ the corresponding input ID should *not* be ignored. A `0` means
1179
+ that the corresponding input ID is masked.
1180
+
1181
+ This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
1182
+ library.
1183
+ :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
1184
+ `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
1185
+ to introduce causal or other biases.
1186
+
1187
+ If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
1188
+ indicates that the i-th element in the sequence is allowed to attend to the j-th
1189
+ element in the sequence.
1190
+
1191
+ If the tensor is a float tensor, it will just be added to the attention
1192
+ scores before the softmax.
1193
+
1194
+ The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
1195
+ :param past_key_values: Pre-computed keys and values for each attention block.
1196
+ Can be used to speed up sequential decoding. The `input_ids` which have
1197
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
1198
+ :param use_cache: If `True`, return key and value tensors for each block.
1199
+ :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
1200
+ This can speed up decoding when you only care about the next token.
1201
+ """
1202
+ # Add Basic MDM Model config check
1203
+ assert not self.config.alibi, "ALiBi length extrapolation is not supported for MDM."
1204
+ assert self.config.rope, "RoPE must be used in the LLaMA-style encoder for MDM."
1205
+ assert (past_key_values is None and not use_cache), "The KV cache is not supported for MDM."
1206
+
1207
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
1208
+
1209
+ if past_key_values:
1210
+ assert len(past_key_values) == self.config.n_layers
1211
+
1212
+ batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1213
+ if past_key_values is None:
1214
+ past_length = 0
1215
+ else:
1216
+ past_length = past_key_values[0][0].size(-2)
1217
+
1218
+ # Get embeddings of input.
1219
+ # shape: (batch_size, seq_len, d_model)
1220
+ # print(f"input_ids: {input_ids}, input_ids.shape: {input_ids.shape}")
1221
+ # print(f"transformer wte weight shape: {self.transformer.wte.weight.shape}")
1222
+ x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
1223
+
1224
+ # print(f"xshape: {x.shape}")
1225
+
1226
+ if self.config.input_emb_norm:
1227
+ x = x * (self.config.d_model**0.5)
1228
+
1229
+ if not (self.config.alibi or self.config.rope):
1230
+ # Get positional embeddings.
1231
+ # shape: (1, seq_len)
1232
+ pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
1233
+ # shape: (1, seq_len, d_model)
1234
+ pos_emb = self.transformer.wpe(pos) # type: ignore
1235
+ x = pos_emb + x
1236
+
1237
+ # Add input + positional embeddings and apply dropout.
1238
+ # shape: (batch_size, seq_len, d_model)
1239
+ x = self.transformer.emb_drop(x) # type: ignore
1240
+
1241
+ # Transform the attention mask into what the blocks expect.
1242
+ if attention_mask is not None and 0.0 in attention_mask:
1243
+ # shape: (batch_size, 1, 1, seq_len)
1244
+ attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1245
+ attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
1246
+ else:
1247
+ attention_mask = None
1248
+
1249
+ # Merge attention mask with attention bias.
1250
+ if (
1251
+ attention_bias is not None
1252
+ or attention_mask is not None
1253
+ or self.config.alibi
1254
+ # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
1255
+ # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
1256
+ # scores correctly.
1257
+ or past_key_values is not None
1258
+ ):
1259
+ if attention_bias is None and self.config.alibi:
1260
+ # print(f"get_causal_attention_bias")
1261
+ attention_bias = get_causal_attention_bias(
1262
+ self.__cache, past_length + seq_len, x.device
1263
+ ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
1264
+ elif attention_bias is None:
1265
+ # print(f"get_causal_attention_bias")
1266
+ attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
1267
+ elif attention_bias.dtype in (torch.int8, torch.bool):
1268
+ # print(f"attention_bias.dtype in (torch.int8, torch.bool)")
1269
+ attention_bias = attention_bias.to(dtype=torch.float)
1270
+ attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
1271
+
1272
+ # Transform to the right shape and data type.
1273
+ mask_len = seq_len
1274
+ if attention_mask is not None:
1275
+ mask_len = attention_mask.shape[-1]
1276
+ elif past_key_values is not None:
1277
+ mask_len = past_key_values[0][0].shape[-2] + seq_len
1278
+ attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
1279
+
1280
+ # Add in the masking bias.
1281
+ if attention_mask is not None:
1282
+ attention_bias = attention_bias + attention_mask
1283
+ # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
1284
+ # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
1285
+ # it can produce NaNs.
1286
+ ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
1287
+
1288
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1289
+
1290
+ # decoder layers
1291
+ all_hidden_states = []
1292
+
1293
+ # Apply blocks one-by-one.
1294
+ if self.config.block_group_size == 1:
1295
+ for block_idx, block in enumerate(self.transformer.blocks):
1296
+ if output_hidden_states:
1297
+ # add hidden states
1298
+ all_hidden_states.append(x)
1299
+
1300
+ layer_past = None if past_key_values is None else past_key_values[block_idx]
1301
+ if (
1302
+ (self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.whole_layer)
1303
+ or (
1304
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_two
1305
+ and block_idx % 2 == 0
1306
+ )
1307
+ or (
1308
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_three
1309
+ and block_idx % 3 == 0
1310
+ )
1311
+ or (
1312
+ self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_four
1313
+ and block_idx % 4 == 0
1314
+ )
1315
+ ):
1316
+ # shape: (batch_size, seq_len, d_model)
1317
+ x, cache = self._activation_checkpoint_fn(
1318
+ block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
1319
+ )
1320
+ else:
1321
+ # shape: (batch_size, seq_len, d_model)
1322
+ x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
1323
+ if attn_key_values is not None:
1324
+ assert cache is not None
1325
+ attn_key_values.append(cache)
1326
+ else:
1327
+ for group_idx, block_group in enumerate(self.transformer.block_groups):
1328
+ if output_hidden_states:
1329
+ # add hidden states
1330
+ all_hidden_states.append(x)
1331
+
1332
+ layers_past = (
1333
+ None
1334
+ if past_key_values is None
1335
+ else past_key_values[
1336
+ group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
1337
+ ]
1338
+ )
1339
+ x, cache = block_group(
1340
+ x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache
1341
+ )
1342
+ if attn_key_values is not None:
1343
+ assert cache is not None
1344
+ attn_key_values.extend(cache)
1345
+
1346
+ if last_logits_only:
1347
+ # shape: (batch_size, 1, d_model)
1348
+ x = x[:, -1, :].unsqueeze(1)
1349
+
1350
+ # Apply final layer norm.
1351
+ # shape: (batch_size, seq_len or 1, d_model)
1352
+ x = self.transformer.ln_f(x) # type: ignore
1353
+ if output_hidden_states:
1354
+ # add final hidden state post-final-layernorm, following HuggingFace's convention
1355
+ all_hidden_states.append(x)
1356
+
1357
+ # Get logits.
1358
+ # shape: (batch_size, seq_len or 1, vocab_size)
1359
+ if self.config.weight_tying:
1360
+ logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
1361
+ else:
1362
+ logits = self.transformer.ff_out(x) # type: ignore
1363
+ if self.config.scale_logits:
1364
+ logits.mul_(1 / math.sqrt(self.config.d_model))
1365
+
1366
+ return LLaDAOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
1367
+
1368
+
1369
+ def create_model_config_from_pretrained_config(config: LLaDAConfig):
1370
+ """
1371
+ Utility function
1372
+ """
1373
+
1374
+ kwargs = {}
1375
+ for field in fields(ModelConfig):
1376
+ kwargs[field.name] = getattr(config, field.name)
1377
+
1378
+ model_config = ModelConfig(**kwargs)
1379
+ return model_config
1380
+
1381
+
1382
+ class LLaDAModelLM(PreTrainedModel):
1383
+ """
1384
+ Extremely barebones HF model wrapper.
1385
+ """
1386
+
1387
+ config_class = LLaDAConfig
1388
+ base_model_prefix = "model"
1389
+ _no_split_modules = ["LLaDABlock", "LLaDASequentialBlock", "LLaDALlamaBlock"]
1390
+
1391
+ def __init__(self, config: LLaDAConfig, model: Optional[LLaDAModel] = None, init_params: bool = False):
1392
+ super().__init__(config)
1393
+
1394
+ if not model:
1395
+ model_config = create_model_config_from_pretrained_config(config)
1396
+ # Initialize model (always on CPU to start with so we don't run out of GPU memory).
1397
+ model_config.init_device = "cpu"
1398
+ self.model = LLaDAModel(model_config, init_params=init_params)
1399
+ else:
1400
+ self.model = model
1401
+
1402
+ def forward(
1403
+ self,
1404
+ input_ids: torch.LongTensor = None,
1405
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1406
+ attention_mask: Optional[torch.Tensor] = None,
1407
+ attention_bias: Optional[torch.Tensor] = None,
1408
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1409
+ labels: Optional[torch.LongTensor] = None,
1410
+ use_cache: Optional[bool] = None,
1411
+ output_attentions: Optional[bool] = None,
1412
+ output_hidden_states: Optional[bool] = None,
1413
+ return_dict: Optional[bool] = None,
1414
+ cache_position: Optional[Cache] = None, # This is a hack mitigation of an issue in transformers `4.39.x`
1415
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1416
+ if use_cache is None:
1417
+ use_cache = self.config.use_cache
1418
+
1419
+ if output_attentions:
1420
+ raise ValueError("output_attentions is not yet supported in LLaDA")
1421
+
1422
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1423
+
1424
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1425
+ outputs = self.model.forward(
1426
+ input_ids=input_ids,
1427
+ input_embeddings=inputs_embeds,
1428
+ attention_mask=attention_mask,
1429
+ attention_bias=attention_bias,
1430
+ past_key_values=None,
1431
+ use_cache=False,
1432
+ output_hidden_states=output_hidden_states,
1433
+ )
1434
+
1435
+ logits = outputs.logits
1436
+ hidden_states = outputs.hidden_states
1437
+
1438
+ loss = None
1439
+ if labels is not None:
1440
+ import warnings
1441
+ warnings.warn("Note that for LLaDA, you cannot calculate the loss here.", UserWarning)
1442
+ if not return_dict:
1443
+ output = (logits,) + outputs[1:]
1444
+ return (loss,) + output if loss is not None else output
1445
+
1446
+ return CausalLMOutputWithPast(
1447
+ logits=logits,
1448
+ past_key_values=outputs.attn_key_values,
1449
+ hidden_states=hidden_states,
1450
+ )
1451
+
1452
+ def can_generate(self) -> bool:
1453
+ return True
1454
+
1455
+ def prepare_inputs_for_generation(
1456
+ self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
1457
+ ):
1458
+ if past_key_values:
1459
+ # This is because we want the model to only process the last generated token.
1460
+ input_ids = input_ids[:, -1:]
1461
+ model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
1462
+
1463
+ model_inputs.update(kwargs)
1464
+ model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
1465
+ return model_inputs
1466
+
1467
+ # TODO: these are required to make the implementation complete.
1468
+ # def resize_position_embeddings(self, new_num_position_embeddings: int):
1469
+ # pass
1470
+ #
1471
+ # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
1472
+ # pass
1473
+ #
1474
+ # def _reorder_cache(self, past_key_values, beam_idx):
1475
+ # pass
1476
+
1477
+ def get_input_embeddings(self) -> torch.nn.Module:
1478
+ return self.model.transformer.wte
1479
+
1480
+ def set_input_embeddings(self, value: torch.nn.Module):
1481
+ self.model.transformer.wte = value
1482
+
1483
+ def get_output_embeddings(self):
1484
+ if self.config.weight_tying:
1485
+ return self.model.transformer.wte
1486
+ else:
1487
+ return self.model.transformer.ff_out
1488
+
1489
+ def set_output_embeddings(self, value: torch.nn.Module):
1490
+ if self.config.weight_tying:
1491
+ self.model.transformer.wte = value
1492
+ else:
1493
+ self.model.transformer.ff_out = value
1494
+
1495
+ def tie_weights(self):
1496
+ if self.config.weight_tying:
1497
+ self.model.transformer.ff_out = self.model.transformer.wte
1498
+
1499
+ # Register the model so that it is available for transformer pipelines, auto-loading, etc.
1500
+ AutoModel.register(LLaDAConfig, LLaDAModelLM)
models/modeling_magvitv2.py ADDED
@@ -0,0 +1,440 @@
1
+ from dataclasses import dataclass, field
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from .common_modules import *
6
+ from .modeling_utils import ConfigMixin, ModelMixin, register_to_config
7
+ from .misc import *
8
+ import math
9
+
10
+ class Updateable:
11
+ def do_update_step(
12
+ self, epoch: int, global_step: int, on_load_weights: bool = False
13
+ ):
14
+ for attr in self.__dir__():
15
+ if attr.startswith("_"):
16
+ continue
17
+ try:
18
+ module = getattr(self, attr)
19
+ except:
20
+ continue # ignore attributes such as properties, which can't be retrieved with getattr
21
+ if isinstance(module, Updateable):
22
+ module.do_update_step(
23
+ epoch, global_step, on_load_weights=on_load_weights
24
+ )
25
+ self.update_step(epoch, global_step, on_load_weights=on_load_weights)
26
+
27
+ def do_update_step_end(self, epoch: int, global_step: int):
28
+ for attr in self.__dir__():
29
+ if attr.startswith("_"):
30
+ continue
31
+ try:
32
+ module = getattr(self, attr)
33
+ except:
34
+ continue # ignore attributes such as properties, which can't be retrieved with getattr
35
+ if isinstance(module, Updateable):
36
+ module.do_update_step_end(epoch, global_step)
37
+ self.update_step_end(epoch, global_step)
38
+
39
+ def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
40
+ # override this method to implement custom update logic
41
+ # if on_load_weights is True, you should be careful doing things related to model evaluations,
42
+ # as the models and tensors are not guaranteed to be on the same device
43
+ pass
44
+
45
+ def update_step_end(self, epoch: int, global_step: int):
46
+ pass
47
+
48
+ class VQGANEncoder(ModelMixin, ConfigMixin):
49
+ @dataclass
50
+ class Config:
51
+ ch: int = 128
52
+ ch_mult: List[int] = field(default_factory=lambda: [1, 2, 2, 4, 4])
53
+ num_res_blocks: List[int] = field(default_factory=lambda: [4, 3, 4, 3, 4])
54
+ attn_resolutions: List[int] = field(default_factory=lambda: [5])
55
+ dropout: float = 0.0
56
+ in_ch: int = 3
57
+ out_ch: int = 3
58
+ resolution: int = 256
59
+ z_channels: int = 13
60
+ double_z: bool = False
61
+
62
+ def __init__(self,
63
+ ch: int = 128,
64
+ ch_mult: List[int] = [1, 2, 2, 4, 4],
65
+ num_res_blocks: List[int] = [4, 3, 4, 3, 4],
66
+ attn_resolutions: List[int] = [5],
67
+ dropout: float = 0.0,
68
+ in_ch: int = 3,
69
+ out_ch: int = 3,
70
+ resolution: int = 256,
71
+ z_channels: int = 13,
72
+ double_z: bool = False):
73
+ super().__init__()
74
+ self.ch = ch
75
+ self.temb_ch = 0
76
+ self.num_resolutions = len(ch_mult)
77
+ self.num_res_blocks = num_res_blocks
78
+ self.resolution = resolution
79
+ self.in_ch = in_ch
80
+ # downsampling
81
+ self.conv_in = torch.nn.Conv2d(
82
+ self.in_ch, self.ch, kernel_size=3, stride=1, padding=1
83
+ )
84
+
85
+ curr_res = self.resolution
86
+ in_ch_mult = (1,) + tuple(ch_mult)
87
+ self.down = nn.ModuleList()
88
+ for i_level in range(self.num_resolutions):
89
+ block = nn.ModuleList()
90
+ attn = nn.ModuleList()
91
+ block_in = self.ch * in_ch_mult[i_level]
92
+ block_out = self.ch * ch_mult[i_level]
93
+ for i_block in range(self.num_res_blocks[i_level]):
94
+ block.append(
95
+ ResnetBlock(
96
+ in_channels=block_in,
97
+ out_channels=block_out,
98
+ temb_channels=self.temb_ch,
99
+ dropout=dropout,
100
+ )
101
+ )
102
+ block_in = block_out
103
+ if curr_res in attn_resolutions:
104
+ attn.append(AttnBlock(block_in))
105
+ down = nn.Module()
106
+ down.block = block
107
+ down.attn = attn
108
+ if i_level != self.num_resolutions - 1:
109
+ down.downsample = Downsample(block_in, True)
110
+ curr_res = curr_res // 2
111
+ self.down.append(down)
112
+
113
+ # middle
114
+ self.mid = nn.Module()
115
+ self.mid.block_1 = ResnetBlock(
116
+ in_channels=block_in,
117
+ out_channels=block_in,
118
+ temb_channels=self.temb_ch,
119
+ dropout=dropout,
120
+ )
121
+ self.mid.attn_1 = AttnBlock(block_in)
122
+ self.mid.block_2 = ResnetBlock(
123
+ in_channels=block_in,
124
+ out_channels=block_in,
125
+ temb_channels=self.temb_ch,
126
+ dropout=dropout,
127
+ )
128
+
129
+
130
+ self.norm_out = Normalize(block_in)
131
+ self.conv_out = torch.nn.Conv2d(
132
+ block_in,
133
+ 2 * z_channels if double_z else z_channels,
134
+ kernel_size=3,
135
+ stride=1,
136
+ padding=1,
137
+ )
138
+
139
+ self.quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
140
+ # for param in self.parameters():
141
+ # broadcast(param, src=0)
142
+
143
+ def forward(self, x):
144
+ # timestep embedding
145
+ temb = None
146
+
147
+ # downsampling
148
+ hs = [self.conv_in(x)]
149
+ for i_level in range(self.num_resolutions):
150
+ for i_block in range(self.num_res_blocks[i_level]):
151
+ h = self.down[i_level].block[i_block](hs[-1], temb)
152
+ if len(self.down[i_level].attn) > 0:
153
+ h = self.down[i_level].attn[i_block](h)
154
+ hs.append(h)
155
+ if i_level != self.num_resolutions - 1:
156
+ hs.append(self.down[i_level].downsample(hs[-1]))
157
+
158
+ # middle
159
+ h = hs[-1]
160
+ h = self.mid.block_1(h, temb)
161
+ h = self.mid.attn_1(h)
162
+ h = self.mid.block_2(h, temb)
163
+
164
+ # end
165
+ h = self.norm_out(h)
166
+ h = nonlinearity(h)
167
+ h = self.conv_out(h)
168
+ h = self.quant_conv(h)
169
+ return h
170
+
171
+
172
+ class LFQuantizer(nn.Module):
173
+ def __init__(self, num_codebook_entry: int = -1,
174
+ codebook_dim: int = 13,
175
+ beta: float = 0.25,
176
+ entropy_multiplier: float = 0.1,
177
+ commit_loss_multiplier: float = 0.1, ):
178
+ super().__init__()
179
+ self.codebook_size = 2 ** codebook_dim
180
+ print(
181
+ f"Look-up free quantizer with codebook size: {self.codebook_size}"
182
+ )
183
+ self.e_dim = codebook_dim
184
+ self.beta = beta
185
+
186
+ indices = torch.arange(self.codebook_size)
187
+
188
+ binary = (
189
+ indices.unsqueeze(1)
190
+ >> torch.arange(codebook_dim - 1, -1, -1, dtype=torch.long)
191
+ ) & 1
192
+
193
+ embedding = binary.float() * 2 - 1
194
+ self.register_buffer("embedding", embedding)
195
+ self.register_buffer(
196
+ "power_vals", 2 ** torch.arange(codebook_dim - 1, -1, -1)
197
+ )
198
+ self.commit_loss_multiplier = commit_loss_multiplier
199
+ self.entropy_multiplier = entropy_multiplier
200
+
201
+ def get_indices(self, z_q):
202
+ return (
203
+ (self.power_vals.reshape(1, -1, 1, 1) * (z_q > 0).float())
204
+ .sum(1, keepdim=True)
205
+ .long()
206
+ )
207
+
208
+ def get_codebook_entry(self, indices, shape=None):
209
+ if shape is None:
210
+ h, w = int(math.sqrt(indices.shape[-1])), int(math.sqrt(indices.shape[-1]))
211
+ else:
212
+ h, w = shape
213
+ b, _ = indices.shape
214
+ indices = indices.reshape(-1)
215
+ z_q = self.embedding[indices]
216
+ z_q = z_q.view(b, h, w, -1)
217
+
218
+ # reshape back to match original input shape
219
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
220
+
221
+ return z_q
222
+
223
+ def forward(self, z, get_code=False):
224
+ """
225
+ Inputs the output of the encoder network z and maps it to a discrete
226
+ one-hot vector that is the index of the closest embedding vector e_j
227
+ z (continuous) -> z_q (discrete)
228
+ z.shape = (batch, channel, height, width)
229
+ quantization pipeline:
230
+ 1. get encoder input (B,C,H,W)
231
+ 2. flatten input to (B*H*W,C)
232
+ """
233
+ if get_code:
234
+ return self.get_codebook_entry(z)
235
+
236
+ # reshape z -> (batch, height, width, channel) and flatten
237
+ z = z.permute(0, 2, 3, 1).contiguous()
238
+ z_flattened = z.view(-1, self.e_dim)
239
+ ge_zero = (z_flattened > 0).float()
240
+ ones = torch.ones_like(z_flattened)
241
+ z_q = ones * ge_zero + -ones * (1 - ge_zero)
242
+
243
+ # preserve gradients
244
+ z_q = z_flattened + (z_q - z_flattened).detach()
245
+
246
+ # compute entropy loss
247
+ CatDist = torch.distributions.categorical.Categorical
248
+ logit = torch.stack(
249
+ [
250
+ -(z_flattened - torch.ones_like(z_q)).pow(2),
251
+ -(z_flattened - torch.ones_like(z_q) * -1).pow(2),
252
+ ],
253
+ dim=-1,
254
+ )
255
+ cat_dist = CatDist(logits=logit)
256
+ entropy = cat_dist.entropy().mean()
257
+ mean_prob = cat_dist.probs.mean(0)
258
+ mean_entropy = CatDist(probs=mean_prob).entropy().mean()
259
+
260
+ # compute loss for embedding
261
+ commit_loss = torch.mean(
262
+ (z_q.detach() - z_flattened) ** 2
263
+ ) + self.beta * torch.mean((z_q - z_flattened.detach()) ** 2)
264
+
265
+ # reshape back to match original input shape
266
+ z_q = z_q.view(z.shape)
267
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
268
+
269
+ return {
270
+ "z": z_q,
271
+ "quantizer_loss": commit_loss * self.commit_loss_multiplier,
272
+ "entropy_loss": (entropy - mean_entropy) * self.entropy_multiplier,
273
+ "indices": self.get_indices(z_q),
274
+ }
275
+
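+ # NOTE: this look-up-free quantizer maps each of the `codebook_dim` latent channels to +/-1
+ # by its sign, which implicitly defines 2 ** codebook_dim codes (8192 for the default
+ # codebook_dim=13); `get_indices` packs the positive channels as bits via `power_vals`, and
+ # z_q = z + (sign(z) - z).detach() is the straight-through estimator that keeps gradients.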
276
+
277
+ class VQGANDecoder(ModelMixin, ConfigMixin):
278
+ def __init__(self, ch: int = 128,
279
+ ch_mult: List[int] = [1, 1, 2, 2, 4],
280
+ num_res_blocks: List[int] = [4, 4, 3, 4, 3],
281
+ attn_resolutions: List[int] = [5],
282
+ dropout: float = 0.0,
283
+ in_ch: int = 3,
284
+ out_ch: int = 3,
285
+ resolution: int = 256,
286
+ z_channels: int = 13,
287
+ double_z: bool = False):
288
+ super().__init__()
289
+ self.ch = ch
290
+ self.temb_ch = 0
291
+ self.num_resolutions = len(ch_mult)
292
+ self.num_res_blocks = num_res_blocks
293
+ self.resolution = resolution
294
+ self.in_ch = in_ch
295
+ self.give_pre_end = False
296
+
297
+ self.z_channels = z_channels
298
+ # compute in_ch_mult, block_in and curr_res at lowest res
299
+ in_ch_mult = (1,) + tuple(ch_mult)
300
+ block_in = ch * ch_mult[self.num_resolutions - 1]
301
+ curr_res = self.resolution // 2 ** (self.num_resolutions - 1)
302
+ self.z_shape = (1, z_channels, curr_res, curr_res)
303
+ print(
304
+ "Working with z of shape {} = {} dimensions.".format(
305
+ self.z_shape, np.prod(self.z_shape)
306
+ )
307
+ )
308
+
309
+ # z to block_in
310
+ self.conv_in = torch.nn.Conv2d(
311
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
312
+ )
313
+
314
+ # middle
315
+ self.mid = nn.Module()
316
+ self.mid.block_1 = ResnetBlock(
317
+ in_channels=block_in,
318
+ out_channels=block_in,
319
+ temb_channels=self.temb_ch,
320
+ dropout=dropout,
321
+ )
322
+ self.mid.attn_1 = AttnBlock(block_in)
323
+ self.mid.block_2 = ResnetBlock(
324
+ in_channels=block_in,
325
+ out_channels=block_in,
326
+ temb_channels=self.temb_ch,
327
+ dropout=dropout,
328
+ )
329
+
330
+ # upsampling
331
+ self.up = nn.ModuleList()
332
+ for i_level in reversed(range(self.num_resolutions)):
333
+ block = nn.ModuleList()
334
+ attn = nn.ModuleList()
335
+ block_out = ch * ch_mult[i_level]
336
+ for i_block in range(self.num_res_blocks[i_level]):
337
+ block.append(
338
+ ResnetBlock(
339
+ in_channels=block_in,
340
+ out_channels=block_out,
341
+ temb_channels=self.temb_ch,
342
+ dropout=dropout,
343
+ )
344
+ )
345
+ block_in = block_out
346
+ if curr_res in attn_resolutions:
347
+ attn.append(AttnBlock(block_in))
348
+ up = nn.Module()
349
+ up.block = block
350
+ up.attn = attn
351
+ if i_level != 0:
352
+ up.upsample = Upsample(block_in, True)
353
+ curr_res = curr_res * 2
354
+ self.up.insert(0, up) # prepend to get consistent order
355
+
356
+ self.norm_out = Normalize(block_in)
357
+ self.conv_out = torch.nn.Conv2d(
358
+ block_in, out_ch, kernel_size=3, stride=1, padding=1
359
+ )
360
+ self.post_quant_conv = torch.nn.Conv2d(
361
+ z_channels, z_channels, 1
362
+ )
363
+
364
+
365
+ def forward(self, z):
366
+ # assert z.shape[1:] == self.z_shape[1:]
367
+ self.last_z_shape = z.shape
368
+ # timestep embedding
369
+ temb = None
370
+ output = dict()
371
+ z = self.post_quant_conv(z)
372
+
373
+ # z to block_in
374
+ h = self.conv_in(z)
375
+
376
+ # middle
377
+ h = self.mid.block_1(h, temb)
378
+ h = self.mid.attn_1(h)
379
+ h = self.mid.block_2(h, temb)
380
+
381
+ # upsampling
382
+ for i_level in reversed(range(self.num_resolutions)):
383
+ for i_block in range(self.num_res_blocks[i_level]):
384
+ h = self.up[i_level].block[i_block](h, temb)
385
+ if len(self.up[i_level].attn) > 0:
386
+ h = self.up[i_level].attn[i_block](h)
387
+ if i_level != 0:
388
+ h = self.up[i_level].upsample(h)
389
+
390
+ # end
391
+ output["output"] = h
392
+ if self.give_pre_end:
393
+ return output
394
+
395
+ h = self.norm_out(h)
396
+ h = nonlinearity(h)
397
+ h = self.conv_out(h)
398
+ output["output"] = h
399
+ return output
400
+
401
+
402
+ class MAGVITv2(ModelMixin, ConfigMixin):
403
+ @register_to_config
404
+ def __init__(
405
+ self,
406
+ ):
407
+ super().__init__()
408
+
409
+ self.encoder = VQGANEncoder()
410
+ self.decoder = VQGANDecoder()
411
+ self.quantize = LFQuantizer()
412
+
413
+ def forward(self, pixel_values, return_loss=False):
414
+ pass
415
+
416
+ def encode(self, pixel_values, return_loss=False):
417
+ hidden_states = self.encoder(pixel_values)
418
+ quantized_states = self.quantize(hidden_states)['z']
419
+ codebook_indices = self.quantize.get_indices(quantized_states).reshape(pixel_values.shape[0], -1)
420
+ output = (quantized_states, codebook_indices)
421
+ return output
422
+
423
+ def get_code(self, pixel_values):
424
+ hidden_states = self.encoder(pixel_values)
425
+ codebook_indices = self.quantize.get_indices(self.quantize(hidden_states)['z']).reshape(pixel_values.shape[0], -1)
426
+
427
+ return codebook_indices
428
+
429
+ def decode_code(self, codebook_indices, shape=None):
430
+ z_q = self.quantize.get_codebook_entry(codebook_indices, shape=shape)
431
+
432
+ reconstructed_pixel_values = self.decoder(z_q)["output"]
433
+ return reconstructed_pixel_values
434
+
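+ # Illustrative round-trip sketch (not from the original file; shapes assume the default
+ # 256x256 configuration with four downsampling stages and a 2**13 codebook):
+ # vq = MAGVITv2()
+ # pixel_values = torch.randn(1, 3, 256, 256) # normalized RGB batch
+ # quantized, indices = vq.encode(pixel_values) # indices: roughly (1, 256), values in [0, 8191]
+ # recon = vq.decode_code(indices) # (1, 3, 256, 256) reconstruction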
435
+
436
+ if __name__ == '__main__':
437
+ encoder = VQGANEncoder()
438
+ import ipdb
439
+ ipdb.set_trace()
440
+ print()
models/modeling_mmada.py ADDED
@@ -0,0 +1,668 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import math
5
+ import sys
6
+ from abc import abstractmethod
7
+ from collections import defaultdict
8
+ from functools import partial
9
+ from typing import (
10
+ Callable,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ NamedTuple,
15
+ Optional,
16
+ Sequence,
17
+ Set,
18
+ Tuple,
19
+ cast,
20
+ )
21
+ from dataclasses import fields
22
+ from typing import List, Optional, Tuple, Union
23
+ import numpy as np
24
+ import torch
25
+ import torch.backends.cuda
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ from torch import einsum
29
+ from transformers import PreTrainedModel
30
+ from transformers.modeling_outputs import CausalLMOutputWithPast
31
+ from transformers.models.auto import AutoModel, AutoConfig, AutoModelForCausalLM
32
+ from transformers.cache_utils import Cache
33
+ from PIL import Image
34
+ from .configuration_llada import (
35
+ LLaDAConfig,
36
+ StrEnum,
37
+ InitFnType,
38
+ ActivationType,
39
+ BlockType,
40
+ LayerNormType,
41
+ ModelConfig,
42
+ ActivationCheckpointingStrategy,
43
+ )
44
+
45
+ from .modeling_llada import LLaDAModelLM
46
+ from .sampling import cosine_schedule, mask_by_random_topk
47
+ from transformers import PretrainedConfig
48
+
49
+ def add_gumbel_noise(logits, temperature):
50
+ '''
51
+ The Gumbel max is a method for sampling categorical distributions.
52
+ According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
53
+ Thus, we use float64.
54
+ '''
55
+ if temperature == 0:
56
+ return logits
57
+ logits = logits.to(torch.float64)
58
+ noise = torch.rand_like(logits, dtype=torch.float64)
59
+ gumbel_noise = (- torch.log(noise)) ** temperature
60
+ return logits.exp() / gumbel_noise
61
+
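A quick sanity-check sketch (not part of the commit) of why this exp/divide form samples like the standard Gumbel-max trick at temperature 1: taking the log of `exp(logits) / (-log U)` gives `logits - log(-log U)`, i.e. logits plus standard Gumbel noise, so the argmax is unchanged.

```python
import torch

torch.manual_seed(0)
logits = torch.randn(4, 10, dtype=torch.float64)
u = torch.rand_like(logits)

llada_form = logits.exp() / (-torch.log(u))        # temperature == 1 case of add_gumbel_noise
standard_form = logits - torch.log(-torch.log(u))  # logits + Gumbel(0, 1) noise

# log is monotonic, so both forms select the same token
assert torch.equal(llada_form.argmax(dim=-1), standard_form.argmax(dim=-1))
```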
62
+
63
+ def get_num_transfer_tokens(mask_index, steps):
64
+ '''
65
+ In the reverse process, the interval [0, 1] is uniformly discretized into `steps` intervals.
66
+ Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)),
67
+ the expected number of tokens transitioned at each step should be consistent.
68
+
69
+ This function is designed to precompute the number of tokens that need to be transitioned at each step.
70
+ '''
71
+ mask_num = mask_index.sum(dim=1, keepdim=True)
72
+
73
+ base = mask_num // steps
74
+ remainder = mask_num % steps
75
+
76
+ num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
77
+
78
+ for i in range(mask_num.size(0)):
79
+ num_transfer_tokens[i, :remainder[i]] += 1
80
+
81
+ return num_transfer_tokens
82
+
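For intuition, a small illustrative call (values made up): 10 masked positions spread over 4 steps gives the remainder to the earliest steps.

```python
import torch

mask_index = torch.tensor([[True] * 10 + [False] * 6])
print(get_num_transfer_tokens(mask_index, steps=4))
# expected: tensor([[3, 3, 2, 2]])
```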
83
+ class MMadaConfig(PretrainedConfig):
84
+ model_type = "mmada"
85
+
86
+ def __init__(self, **kwargs):
87
+ super().__init__(**kwargs)
88
+
89
+ allowed_keys = [
90
+ "vocab_size",
91
+ "llm_vocab_size",
92
+ "llm_model_path",
93
+ "codebook_size",
94
+ "num_vq_tokens",
95
+ "num_new_special_tokens",
96
+ "gradient_checkpointing",
97
+ "new_vocab_size",
98
+ ]
99
+
100
+ for key in allowed_keys:
101
+ if key in kwargs:
102
+ setattr(self, key, kwargs[key])
103
+
104
+
105
+
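A hedged construction example: only keys listed in `allowed_keys` are copied onto the config, and every value below is illustrative rather than a shipped default.

```python
cfg = MMadaConfig(
    llm_model_path="path/to/llada-base",  # hypothetical path
    codebook_size=8192,                   # illustrative values
    num_vq_tokens=1024,
    num_new_special_tokens=0,
)
print(cfg.model_type)      # "mmada"
print(cfg.codebook_size)   # 8192
```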
106
+ class MMadaModelLM(LLaDAModelLM):
107
+ config_class = MMadaConfig
108
+ base_model_prefix = "model"
109
+ def __init__(self, config: MMadaConfig, *args, **kwargs):
110
+ print(f"Initializing MMadaModelLM with config: {config}")
111
+ super().__init__(config, *args, **kwargs)
112
+
113
+ # # resize token embeddings
114
+ # print(f"Resizing token embeddings to {config.new_vocab_size}")
115
+ # self.resize_token_embeddings(config.new_vocab_size)
116
+
117
+ @torch.no_grad()
118
+ def t2i_generate(
119
+ self,
120
+ input_ids: torch.LongTensor = None,
121
+ uncond_input_ids: torch.LongTensor = None,
122
+ attention_mask=None,
123
+ uncond_attention_mask=None,
124
+ temperature=1.0,
125
+ timesteps=18, # ideal number of steps is 18 in maskgit paper
126
+ guidance_scale=0,
127
+ noise_schedule=cosine_schedule,
128
+ generator: torch.Generator = None,
129
+ config=None,
130
+ seq_len=1024,
131
+ mask_token_id = 126336,
132
+ resolution = 512,
133
+ codebook_size = 8192,
134
+ **kwargs,
135
+ ):
136
+ """
137
+ Generation loop kept close to 1:1 with the original MaskGit repo:
138
+ https://github.com/google-research/maskgit/blob/main/maskgit/libml/parallel_decode.py#L79
139
+ """
140
+
141
+ # begin with all image token ids masked
142
+ # count how many tokens are currently masked
143
+ mask_count = (input_ids == mask_token_id).sum().item()
144
+ num_vq_tokens = seq_len
145
+ num_new_special_tokens = 0
146
+ uni_prompting = kwargs.get("uni_prompting", None)
147
+ # print(f"config.model.mmada.llm_vocab_size: {config.model.mmada.llm_vocab_size}, {len(uni_prompting.text_tokenizer)}")
148
+ input_ids_minus_lm_vocab_size = input_ids[:, -(num_vq_tokens + 1):-1].clone()
149
+ input_ids_minus_lm_vocab_size = torch.where(input_ids_minus_lm_vocab_size == mask_token_id, mask_token_id, input_ids_minus_lm_vocab_size - len(uni_prompting.text_tokenizer) - num_new_special_tokens)
150
+
151
+ # for classifier-free guidance
152
+ if uncond_input_ids is not None:
153
+ uncond_prefix = uncond_input_ids[:, :resolution + 1]
154
+
155
+ for step in range(timesteps):
156
+ if uncond_input_ids is not None and guidance_scale > 0:
157
+ uncond_input_ids = torch.cat(
158
+ [uncond_prefix, input_ids[:, resolution + 1:]], dim=1)
159
+ model_input = torch.cat([input_ids, uncond_input_ids])
160
+ attention_mask = torch.cat([attention_mask, uncond_attention_mask], dim=0)
161
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
162
+ logits = self(model_input, attention_bias=attention_bias).logits
163
+ # print(f"logits.shape: {logits.shape}")
164
+ cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
165
+ # logits = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
166
+ # it seems that muse has a different cfg setting
167
+ logits = (1 + guidance_scale) * cond_logits - guidance_scale * uncond_logits
168
+ logits = logits[:, -(num_vq_tokens + 1):-1, len(uni_prompting.text_tokenizer) + num_new_special_tokens: len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]
169
+ else:
170
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
171
+ logits = self(input_ids, attention_bias=attention_bias).logits
172
+ logits = logits[:, -(num_vq_tokens + 1):-1, len(uni_prompting.text_tokenizer) + num_new_special_tokens: len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]
173
+
174
+ # logits: 1, 1024, 8192
175
+ # print(f"logits.shape: {logits.shape}")
176
+ probs = logits.softmax(dim=-1)
177
+ sampled = probs.reshape(-1, logits.size(-1))
178
+ # print(f"probs: {probs}, probs.shape: {probs.shape}, sampled: {sampled}, sampled.shape: {sampled.shape}")
179
+ sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*logits.shape[:-1]) # 1, 1024
180
+
181
+ unknown_map = input_ids_minus_lm_vocab_size == mask_token_id
182
+ # print(f"unknown_map.sum(dim=-1, keepdim=True): {unknown_map.sum(dim=-1, keepdim=True)}")
183
+ sampled_ids = torch.where(unknown_map, sampled_ids, input_ids_minus_lm_vocab_size)
184
+ # Defines the mask ratio for the next round. The number to mask out is
185
+ # determined by mask_ratio * unknown_number_in_the_beginning.
186
+ ratio = 1.0 * (step + 1) / timesteps
187
+ mask_ratio = noise_schedule(torch.tensor(ratio))
188
+ # Computes the probability of each selected token.
189
+ selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None])
190
+ selected_probs = selected_probs.squeeze(-1)
191
+
192
+ # Ignores the tokens given in the input by overwriting their confidence.
193
+ selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)
194
+ # Gets mask lens for each sample in the batch according to the mask ratio.
195
+ mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(logits.device)
196
+ # Keeps at least one prediction in this round and masks out at least
197
+ # one token for the next iteration.
198
+ mask_len = torch.max(
199
+ torch.tensor([1], device=logits.device), torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
200
+ )
201
+ # print(f"mask_len: {mask_len}, mask_len.shape: {mask_len.shape}")
202
+ # Adds noise for randomness
203
+ temperature = temperature * (1.0 - ratio)
204
+ masking = mask_by_random_topk(mask_len, selected_probs, temperature, generator=generator)
205
+ # Masks tokens with lower confidence.
206
+ input_ids[:, -(num_vq_tokens + 1):-1] = torch.where(masking, mask_token_id,
207
+ sampled_ids + len(uni_prompting.text_tokenizer)
208
+ + num_new_special_tokens)
209
+ input_ids_minus_lm_vocab_size = torch.where(masking, mask_token_id, sampled_ids)
210
+
211
+ return sampled_ids
212
+
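A small sketch of the schedule that drives the loop above (purely illustrative): at each of the 18 steps, the cosine schedule decides how many of the 1024 image tokens stay masked for the next round.

```python
import torch

num_vq_tokens, timesteps = 1024, 18
for step in range(timesteps):
    ratio = (step + 1) / timesteps
    mask_ratio = cosine_schedule(torch.tensor(ratio))   # imported from .sampling above
    still_masked = int((num_vq_tokens * mask_ratio).floor())
    print(f"step {step + 1:2d}: keep {still_masked} tokens masked")
```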
213
+ def forward_process(
214
+ self,
215
+ input_ids,
216
+ labels,
217
+ batch_size_t2i=0,
218
+ batch_size_lm=0,
219
+ batch_size_mmu=0,
220
+ max_seq_length=128,
221
+ p_mask_lm=None,
222
+ p_mask_mmu=None,
223
+ answer_lengths=None,
224
+ t2i_masks=None,
225
+ answer_lengths_lm=None
226
+ ):
227
+ # attention bias, True for batch_size, 1, seq_len, seq_len
228
+ attention_bias = torch.ones(input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])
229
+ attention_bias_t2i = (t2i_masks[:, :, None] & t2i_masks[:, None, :]).bool().unsqueeze(1)
230
+ attention_bias[:batch_size_t2i] = attention_bias_t2i
231
+ logits = self(input_ids, attention_bias=attention_bias).logits
232
+ # logits = self(input_ids).logits
233
+ self.output_size = logits.shape[-1]
234
+
235
+ # print(f"logits shape: {logits.shape}") B, 359, vocab_size
236
+
237
+ if batch_size_t2i == 0:
238
+ loss_t2i = torch.tensor(0.0, device=input_ids.device)
239
+ else:
240
+ # t2i loss
241
+ loss_t2i = F.cross_entropy(
242
+ logits[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1, self.output_size),
243
+ labels[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1), ignore_index=-100,
244
+ )
245
+
246
+ # llada loss
247
+ masked_indices = input_ids == self.config.mask_token_id
248
+ masked_indices_lm = masked_indices[batch_size_t2i:batch_size_t2i + batch_size_lm]
249
+ # debug code: count the number of masked tokens per row
250
+ # if masked_indices_lm.numel() > 0:
251
+ # mask_counts = torch.sum(masked_indices_lm, dim=1)
252
+ # logging.info(f"[LM mask nums]: {mask_counts.cpu()}.")
253
+ # else:
254
+ # logging.info("[LM mask nums] no LM sample.")
255
+ masked_indices_mmu = masked_indices[-batch_size_mmu:]
256
+ p_mask_lm = p_mask_lm.to(masked_indices_lm.device)
257
+ p_mask_mmu = p_mask_mmu.to(masked_indices_mmu.device)
258
+ answer_lengths = answer_lengths.to(masked_indices_mmu.device)
259
+ loss_lm = F.cross_entropy(
260
+ logits[batch_size_t2i:batch_size_t2i + batch_size_lm][masked_indices_lm].contiguous().view(-1, self.output_size),
261
+ labels[batch_size_t2i:batch_size_t2i + batch_size_lm][masked_indices_lm].contiguous().view(-1), ignore_index=-100, reduction='none'
262
+ )/p_mask_lm[masked_indices_lm]
263
+ # print(f"logits lm shape: {logits[batch_size_t2i:batch_size_t2i + batch_size_lm].shape}")
264
+ loss_lm = loss_lm.sum() / (logits[batch_size_t2i:batch_size_t2i + batch_size_lm].shape[0] * logits[batch_size_t2i:batch_size_t2i + batch_size_lm].shape[1])
265
+
266
+ # llm loss
267
+ answer_lengths_lm = answer_lengths_lm.to(masked_indices_lm.device)
268
+ loss_lm = torch.sum(loss_lm / answer_lengths_lm[masked_indices_lm]) / (logits[batch_size_t2i:batch_size_t2i + batch_size_lm].shape[0])
269
+
270
+ loss_mmu = F.cross_entropy(
271
+ logits[-batch_size_mmu:][masked_indices_mmu].contiguous().view(-1, self.output_size),
272
+ labels[-batch_size_mmu:][masked_indices_mmu].contiguous().view(-1), ignore_index=-100, reduction='none'
273
+ )/p_mask_mmu[masked_indices_mmu]
274
+ loss_mmu = torch.sum(loss_mmu/answer_lengths[masked_indices_mmu]) / (logits[-batch_size_mmu:].shape[0])
275
+
276
+ return logits, loss_t2i, loss_lm, loss_mmu
277
+
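For reference, a schematic (all names invented) of the masked-token reweighting used for the LM/MMU branches above: each token's cross entropy is divided by `p_mask` (the probability that the token was masked at its sampled noise level) and by the answer length, then averaged over the batch.

```python
import torch.nn.functional as F

# Schematic only; logits_masked, labels_masked, p_mask_masked, answer_len_masked
# and batch_size are hypothetical stand-ins for the gathered tensors above.
ce = F.cross_entropy(logits_masked, labels_masked, reduction='none')  # (N_masked,)
weighted = ce / p_mask_masked / answer_len_masked                     # per-token reweighting
loss = weighted.sum() / batch_size
```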
278
+ def forward_process_with_r2i(
279
+ self,
280
+ input_ids,
281
+ labels,
282
+ t2i_masks=None,
283
+ max_seq_length=128,
284
+ batch_size_t2i=0,
285
+ batch_size_lm=0,
286
+ batch_size_mmu=0,
287
+ batch_size_r2i=0,
288
+ p_mask_lm=None,
289
+ p_mask_mmu=None,
290
+ p_mask_r2i=None,
291
+ answer_lengths=None,
292
+ answer_lengths_lm=None,
293
+ answer_lengths_r2i=None,
294
+ ):
295
+ # attention bias, True for batch_size, 1, seq_len, seq_len
296
+ attention_bias = torch.ones(input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])
297
+ attention_bias_t2i = (t2i_masks[:, :, None] & t2i_masks[:, None, :]).bool().unsqueeze(1)
298
+ attention_bias[:batch_size_t2i] = attention_bias_t2i
299
+ logits = self(input_ids, attention_bias=attention_bias).logits
300
+ # logits = self(input_ids).logits
301
+ self.output_size = logits.shape[-1]
302
+
303
+ # print(f"logits shape: {logits.shape}") B, 359, vocab_size
304
+
305
+ if batch_size_t2i == 0:
306
+ loss_t2i = torch.tensor(0.0, device=input_ids.device)
307
+ else:
308
+ # t2i loss
309
+ loss_t2i = F.cross_entropy(
310
+ logits[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1, self.output_size),
311
+ labels[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1), ignore_index=-100,
312
+ )
313
+
314
+ # llada loss
315
+
316
+ start_lm = batch_size_t2i
317
+ end_lm = start_lm + batch_size_lm
318
+ start_mmu = end_lm
319
+ end_mmu = start_mmu + batch_size_mmu
320
+ start_r2i = end_mmu
321
+ end_r2i = start_r2i + batch_size_r2i
322
+
323
+ masked_indices = input_ids == self.config.mask_token_id
324
+ masked_indices_lm = masked_indices[start_lm:end_lm]
325
+ masked_indices_mmu = masked_indices[start_mmu:end_mmu]
326
+ masked_indices_r2i = masked_indices[start_r2i:end_r2i]
327
+
328
+ p_mask_lm = p_mask_lm.to(masked_indices_lm.device)
329
+ p_mask_mmu = p_mask_mmu.to(masked_indices_mmu.device)
330
+ p_mask_r2i = p_mask_r2i.to(masked_indices_r2i.device)
331
+
332
+ answer_lengths = answer_lengths.to(masked_indices_mmu.device)
333
+ answer_lengths_lm = answer_lengths_lm.to(masked_indices_lm.device)
334
+ answer_lengths_r2i = answer_lengths_r2i.to(masked_indices_r2i.device)
335
+
336
+ loss_lm = F.cross_entropy(
337
+ logits[start_lm:end_lm][masked_indices_lm].contiguous().view(-1, self.output_size),
338
+ labels[start_lm:end_lm][masked_indices_lm].contiguous().view(-1), ignore_index=-100, reduction='none'
339
+ )/p_mask_lm[masked_indices_lm]
340
+ # print(f"logits lm shape: {logits[batch_size_t2i:batch_size_t2i + batch_size_lm].shape}")
341
+ loss_lm = loss_lm.sum() / (logits[start_lm:end_lm].shape[0] * logits[start_lm:end_lm].shape[1])
342
+ loss_lm = torch.sum(loss_lm / answer_lengths_lm[masked_indices_lm]) / (logits[start_lm:end_lm].shape[0])
343
+
344
+ loss_mmu = F.cross_entropy(
345
+ logits[start_mmu:end_mmu][masked_indices_mmu].contiguous().view(-1, self.output_size),
346
+ labels[start_mmu:end_mmu][masked_indices_mmu].contiguous().view(-1), ignore_index=-100, reduction='none'
347
+ )/p_mask_mmu[masked_indices_mmu]
348
+ loss_mmu = torch.sum(loss_mmu/answer_lengths[masked_indices_mmu]) / (logits[start_mmu:end_mmu].shape[0])
349
+
350
+ loss_r2i = F.cross_entropy(
351
+ logits[start_r2i:end_r2i][masked_indices_r2i].contiguous().view(-1, self.output_size),
352
+ labels[start_r2i:end_r2i][masked_indices_r2i].contiguous().view(-1), ignore_index=-100, reduction='none'
353
+ )/p_mask_r2i[masked_indices_r2i]
354
+ loss_r2i = torch.sum(loss_r2i/answer_lengths_r2i[masked_indices_r2i]) / (logits[start_r2i:end_r2i].shape[0])
355
+
356
+ return logits, loss_t2i, loss_lm, loss_mmu, loss_r2i
357
+
358
+
359
+ def forward_t2i(
360
+ self,
361
+ input_ids,
362
+ labels,
363
+ batch_size_t2i=0,
364
+ max_seq_length=128,
365
+ t2i_masks=None
366
+ ):
367
+ # attention bias, True for batch_size, 1, seq_len, seq_len
368
+ attention_bias = torch.ones(input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])
369
+ attention_bias_t2i = (t2i_masks[:, :, None] & t2i_masks[:, None, :]).bool().unsqueeze(1)
370
+ attention_bias[:batch_size_t2i] = attention_bias_t2i
371
+ logits = self(input_ids, attention_bias=attention_bias).logits
372
+ # logits = self(input_ids).logits
373
+ self.output_size = logits.shape[-1]
374
+
375
+ # print(f"logits shape: {logits.shape}") B, 359, vocab_size
376
+
377
+ loss_t2i = F.cross_entropy(
378
+ logits[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1, self.output_size),
379
+ labels[:batch_size_t2i, max_seq_length + 1:].contiguous().view(-1), ignore_index=-100,
380
+ )
381
+
382
+ return loss_t2i
383
+
384
+
385
+
386
+
387
+
388
+ @torch.no_grad()
389
+ def mmu_generate(self, idx=None, input_embeddings=None, max_new_tokens=128, steps=128,block_length=128, temperature=0.0, top_k=None, eot_token=None, cfg_scale=0.0, remasking='low_confidence', mask_id=126336, attention_mask=None):
390
+ """
391
+ Take a conditioning sequence of indices idx (LongTensor of shape (b, t)), append
392
+ max_new_tokens masked positions, and fill them in block by block with iterative remasking.
393
+ Make sure the model is in model.eval() mode when calling this.
394
+ """
395
+
396
+ if attention_mask is not None and 0.0 in attention_mask:
397
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
398
+ # print(f"attention_bias: {attention_bias}")
399
+ else:
400
+ attention_bias = None
401
+ try:
402
+ device = idx.device
403
+ except AttributeError:
404
+ device = input_embeddings.device
405
+
406
+ result = []
407
+ batch_size = idx.shape[0]
408
+ x = torch.full((batch_size, idx.shape[1] + max_new_tokens), mask_id, dtype=torch.long).to(self.device)
409
+ x[:, :idx.shape[1]] = idx.clone()
410
+ prompt_index = (x != mask_id)
411
+
412
+
413
+ assert max_new_tokens % block_length == 0
414
+ num_blocks = max_new_tokens // block_length
415
+
416
+ assert steps % num_blocks == 0
417
+ steps = steps // num_blocks
418
+
419
+ # print(f"num_blocks: {num_blocks}, steps: {steps}")
420
+ # num_transfer_tokens = get_num_transfer_tokens(prompt_index, steps)
421
+ for num_block in range(num_blocks):
422
+ block_mask_index = (x[:, idx.shape[1] + num_block * block_length: idx.shape[1] + (num_block + 1) * block_length:] == mask_id)
423
+ num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
424
+ # num_transfer_tokens = get_num_transfer_tokens(prompt_index, steps)
425
+ # print(f"num_transfer_tokens: {num_transfer_tokens}, num_transfer_tokens.shape: {num_transfer_tokens.shape}")
426
+ for i in range(steps):
427
+ mask_index = (x == mask_id)
428
+ if cfg_scale > 0.0:
429
+ un_x = x.clone()
430
+ un_x[prompt_index] = mask_id
431
+ x_ = torch.cat([x, un_x], dim=0)
432
+ logits = self(x_).logits
433
+ logits, un_logits = torch.chunk(logits, 2, dim=0)
434
+ logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
435
+ else:
436
+ logits = self(x, attention_bias=attention_bias).logits
437
+
438
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
439
+ x0 = torch.argmax(logits_with_noise, dim=-1) # b, l
440
+ if remasking == 'low_confidence':
441
+ p = F.softmax(logits.to(torch.float64), dim=-1)
442
+ x0_p = torch.squeeze(
443
+ torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
444
+ elif remasking == 'random':
445
+ x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
446
+ else:
447
+ raise NotImplementedError(remasking)
448
+
449
+ x0_p[:, idx.shape[1] + (num_block + 1) * block_length:] = -np.inf
450
+
451
+ x0 = torch.where(mask_index, x0, x)
452
+ confidence = torch.where(mask_index, x0_p, -np.inf)
453
+
454
+ transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
455
+ for j in range(confidence.shape[0]):
456
+ _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
457
+ transfer_index[j, select_index] = True
458
+ x[transfer_index] = x0[transfer_index]
459
+
460
+
461
+ # logits = logits[:, -1, :] / temperature
462
+ # # optionally crop the logits to only the top k options
463
+ # if top_k is not None:
464
+ # v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
465
+ # logits[logits < v[:, [-1]]] = -float('Inf')
466
+ # # apply softmax to convert logits to (normalized) probabilities
467
+ # probs = F.softmax(logits, dim=-1)
468
+ # # sample from the distribution
469
+ # idx_next = torch.multinomial(probs, num_samples=1)
470
+ # result.append(idx_next[0][0])
471
+ # # append sampled index to the running sequence and continue
472
+ # if self.config.w_clip_vit:
473
+ # idx_next_embeddings = self.mmada.model.embed_tokens(idx_next)
474
+ # input_embeddings = torch.cat([input_embeddings, idx_next_embeddings], dim=1)
475
+ # else:
476
+ # idx = torch.cat((idx, idx_next), dim=1)
477
+
478
+ # if eot_token is not None and idx_next.cpu() == eot_token:
479
+ # break
480
+
481
+ return x
482
+
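A hedged usage sketch for the block-wise text generation above; `tokenizer`, `model`, and `prompt` are assumptions rather than objects defined in this repo, and the hyperparameters simply have to satisfy the asserts (max_new_tokens divisible by block_length, steps divisible by the number of blocks).

```python
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
out = model.mmu_generate(
    prompt_ids,
    max_new_tokens=128,   # 4 blocks of 32 tokens
    steps=128,            # 32 denoising steps per block
    block_length=32,
    temperature=0.0,      # greedy: no Gumbel noise added
)
answer = tokenizer.batch_decode(out[:, prompt_ids.shape[1]:], skip_special_tokens=True)
```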
483
+ @torch.no_grad()
484
+ def mmu_generate_fast(self, idx=None, input_embeddings=None, max_new_tokens=128, steps=128,block_length=128, temperature=0.0, top_k=None, eot_token=None, cfg_scale=0.0, remasking='low_confidence', mask_id=126336, attention_mask=None):
485
+ """
486
+ Take a conditioning sequence of indices idx (LongTensor of shape (b, t)), append
487
+ max_new_tokens masked positions, and fill them in block by block with iterative remasking,
488
+ stopping early once every sequence ends the current block with eot_token. Use model.eval() mode.
489
+ """
490
+
491
+ if attention_mask is not None and 0.0 in attention_mask:
492
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
493
+ # print(f"attention_bias: {attention_bias}")
494
+ else:
495
+ attention_bias = None
496
+ try:
497
+ device = idx.device
498
+ except AttributeError:
499
+ device = input_embeddings.device
500
+
501
+ result = []
502
+ batch_size = idx.shape[0]
503
+ x = torch.full((batch_size, idx.shape[1] + max_new_tokens), mask_id, dtype=torch.long).to(self.device)
504
+ x[:, :idx.shape[1]] = idx.clone()
505
+ prompt_index = (x != mask_id)
506
+
507
+
508
+ assert max_new_tokens % block_length == 0
509
+ num_blocks = max_new_tokens // block_length
510
+
511
+ assert steps % num_blocks == 0
512
+ steps = steps // num_blocks
513
+
514
+ for num_block in range(num_blocks):
515
+ block_mask_index = (x[:, idx.shape[1] + num_block * block_length: idx.shape[1] + (num_block + 1) * block_length:] == mask_id)
516
+ num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps)
517
+ for i in range(steps):
518
+ mask_index = (x == mask_id)
519
+ if cfg_scale > 0.0:
520
+ un_x = x.clone()
521
+ un_x[prompt_index] = mask_id
522
+ x_ = torch.cat([x, un_x], dim=0)
523
+ logits = self(x_).logits
524
+ logits, un_logits = torch.chunk(logits, 2, dim=0)
525
+ logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
526
+ else:
527
+ logits = self(x, attention_bias=attention_bias).logits
528
+
529
+ logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
530
+ x0 = torch.argmax(logits_with_noise, dim=-1) # b, l
531
+ if remasking == 'low_confidence':
532
+ p = F.softmax(logits.to(torch.float64), dim=-1)
533
+ x0_p = torch.squeeze(
534
+ torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
535
+ elif remasking == 'random':
536
+ x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
537
+ else:
538
+ raise NotImplementedError(remasking)
539
+
540
+ x0_p[:, idx.shape[1] + (num_block + 1) * block_length:] = -np.inf
541
+
542
+ x0 = torch.where(mask_index, x0, x)
543
+ confidence = torch.where(mask_index, x0_p, -np.inf)
544
+
545
+ transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
546
+ for j in range(confidence.shape[0]):
547
+ _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
548
+ transfer_index[j, select_index] = True
549
+ x[transfer_index] = x0[transfer_index]
550
+ if eot_token is not None:
551
+ last_token_index_in_current_block = idx.shape[1] + (num_block + 1) * block_length - 1
552
+ if last_token_index_in_current_block < x.shape[1]:
553
+ tokens_at_block_end = x[:, last_token_index_in_current_block]
554
+ if torch.all(tokens_at_block_end == eot_token):
555
+ break
556
+ return x
557
+
558
+ @torch.no_grad()
559
+ def t2i_generate_decoding_stepwise(
560
+ self,
561
+ input_ids: torch.LongTensor = None,
562
+ uncond_input_ids: torch.LongTensor = None,
563
+ attention_mask=None,
564
+ uncond_attention_mask=None,
565
+ temperature=1.0,
566
+ timesteps=18, # ideal number of steps is 18 in maskgit paper
567
+ guidance_scale=0,
568
+ noise_schedule=cosine_schedule,
569
+ generator: torch.Generator = None,
570
+ config=None,
571
+ seq_len=1024,
572
+ mask_token_id = 126336,
573
+ resolution = 512,
574
+ codebook_size = 8192,
575
+ vq_model = None,
576
+ **kwargs,
577
+ ):
578
+ """
579
+ Generation loop kept close to 1:1 with the original MaskGit repo:
580
+ https://github.com/google-research/maskgit/blob/main/maskgit/libml/parallel_decode.py#L79
581
+ """
582
+
583
+ # begin with all image token ids masked
584
+ # count how many tokens are currently masked
585
+ mask_count = (input_ids == mask_token_id).sum().item()
586
+ num_vq_tokens = seq_len
587
+ num_new_special_tokens = 0
588
+ uni_prompting = kwargs.get("uni_prompting", None)
589
+ # print(f"config.model.mmada.llm_vocab_size: {config.model.mmada.llm_vocab_size}, {len(uni_prompting.text_tokenizer)}")
590
+ input_ids_minus_lm_vocab_size = input_ids[:, -(num_vq_tokens + 1):-1].clone()
591
+ input_ids_minus_lm_vocab_size = torch.where(input_ids_minus_lm_vocab_size == mask_token_id, mask_token_id, input_ids_minus_lm_vocab_size - len(uni_prompting.text_tokenizer) - num_new_special_tokens)
592
+
593
+ # for classifier-free guidance
594
+ if uncond_input_ids is not None:
595
+ uncond_prefix = uncond_input_ids[:, :resolution + 1]
596
+
597
+ for step in range(timesteps):
598
+ if uncond_input_ids is not None and guidance_scale > 0:
599
+ uncond_input_ids = torch.cat(
600
+ [uncond_prefix, input_ids[:, resolution + 1:]], dim=1)
601
+ model_input = torch.cat([input_ids, uncond_input_ids])
602
+ attention_mask = torch.cat([attention_mask, uncond_attention_mask], dim=0)
603
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
604
+ logits = self(model_input, attention_bias=attention_bias).logits
605
+ # print(f"logits.shape: {logits.shape}")
606
+ cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
607
+ # logits = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
608
+ # it seems that muse has a different cfg setting
609
+ logits = (1 + guidance_scale) * cond_logits - guidance_scale * uncond_logits
610
+ logits = logits[:, -(num_vq_tokens + 1):-1, len(uni_prompting.text_tokenizer) + num_new_special_tokens: len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]
611
+ else:
612
+ attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
613
+ logits = self(input_ids, attention_bias=attention_bias).logits
614
+ logits = logits[:, -(num_vq_tokens + 1):-1, len(uni_prompting.text_tokenizer) + num_new_special_tokens: len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]
615
+
616
+ # logits: 1, 1024, 8192
617
+ # print(f"logits.shape: {logits.shape}")
618
+ probs = logits.softmax(dim=-1)
619
+ sampled = probs.reshape(-1, logits.size(-1))
620
+ # print(f"probs: {probs}, probs.shape: {probs.shape}, sampled: {sampled}, sampled.shape: {sampled.shape}")
621
+ sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*logits.shape[:-1]) # 1, 1024
622
+
623
+ unknown_map = input_ids_minus_lm_vocab_size == mask_token_id
624
+ # print(f"unknown_map.sum(dim=-1, keepdim=True): {unknown_map.sum(dim=-1, keepdim=True)}")
625
+ sampled_ids = torch.where(unknown_map, sampled_ids, input_ids_minus_lm_vocab_size)
626
+ # Defines the mask ratio for the next round. The number to mask out is
627
+ current_image_vq_indices = sampled_ids.clone()
628
+ # print(f"current_image_vq_indices: {current_image_vq_indices}")
629
+ current_image_vq_indices = torch.clamp(current_image_vq_indices, 0, 8192 - 1)
630
+ current_image = vq_model.decode_code(current_image_vq_indices)
631
+ images = torch.clamp((current_image + 1.0) / 2.0, min=0.0, max=1.0)
632
+ images *= 255.0
633
+ images = images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
634
+ pil_images = Image.fromarray(images[0])
635
+ yield pil_images, f"Step {step + 1}/{timesteps}"
636
+ # determined by mask_ratio * unknown_number_in_the_beginning.
637
+ ratio = 1.0 * (step + 1) / timesteps
638
+ mask_ratio = noise_schedule(torch.tensor(ratio))
639
+ # Computes the probability of each selected token.
640
+ selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None])
641
+ selected_probs = selected_probs.squeeze(-1)
642
+
643
+ # Ignores the tokens given in the input by overwriting their confidence.
644
+ selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)
645
+ # Gets mask lens for each sample in the batch according to the mask ratio.
646
+ mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(logits.device)
647
+ # Keeps at least one prediction in this round and masks out at least
648
+ # one token for the next iteration.
649
+ mask_len = torch.max(
650
+ torch.tensor([1], device=logits.device), torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
651
+ )
652
+ # print(f"mask_len: {mask_len}, mask_len.shape: {mask_len.shape}")
653
+ # Adds noise for randomness
654
+ temperature = temperature * (1.0 - ratio)
655
+ masking = mask_by_random_topk(mask_len, selected_probs, temperature, generator=generator)
656
+ # Masks tokens with lower confidence.
657
+ input_ids[:, -(num_vq_tokens + 1):-1] = torch.where(masking, mask_token_id,
658
+ sampled_ids + len(uni_prompting.text_tokenizer)
659
+ + num_new_special_tokens)
660
+ input_ids_minus_lm_vocab_size = torch.where(masking, mask_token_id, sampled_ids)
661
+
662
+
663
+ return sampled_ids
664
+
665
+
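Because `t2i_generate_decoding_stepwise` yields, callers iterate over it to stream intermediate previews. A hedged sketch follows; the surrounding objects such as `uni_prompting` and `vq_model` are assumed to be constructed elsewhere (as in app.py), and the guidance scale is illustrative.

```python
for preview, status in model.t2i_generate_decoding_stepwise(
        input_ids=input_ids,
        attention_mask=attention_mask,
        guidance_scale=3.5,        # illustrative CFG scale
        timesteps=18,
        vq_model=vq_model,
        uni_prompting=uni_prompting):
    print(status)                  # e.g. "Step 1/18"; `preview` is a PIL.Image
```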
666
+ AutoConfig.register("mmada", MMadaConfig)
667
+ AutoModelForCausalLM.register(MMadaConfig, MMadaModelLM)
668
+ AutoModel.register(MMadaConfig, MMadaModelLM)
models/modeling_utils.py ADDED
@@ -0,0 +1,1207 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import inspect
18
+ import itertools
19
+ import json
20
+ import os
21
+ import re
22
+ from collections import OrderedDict
23
+ from functools import partial
24
+ from pathlib import Path
25
+ from typing import Any, Callable, List, Optional, Tuple, Union
26
+
27
+ import safetensors
28
+ import torch
29
+ from huggingface_hub import create_repo, split_torch_state_dict_into_shards
30
+ from huggingface_hub.utils import validate_hf_hub_args
31
+ from torch import Tensor, nn
32
+
33
+ from diffusers import __version__
34
+ from diffusers.utils import (
35
+ FLAX_WEIGHTS_NAME,
36
+ SAFE_WEIGHTS_INDEX_NAME,
37
+ WEIGHTS_INDEX_NAME,
38
+ _add_variant,
39
+ _get_checkpoint_shard_files,
40
+ _get_model_file,
41
+ deprecate,
42
+ is_accelerate_available,
43
+ is_torch_version,
44
+ logging,
45
+ )
46
+
47
+ CONFIG_NAME = "config.json"
48
+ WEIGHTS_NAME = "pytorch_model.bin"
49
+ SAFETENSORS_WEIGHTS_NAME = "pytorch_model.safetensors"
50
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
51
+
52
+ from diffusers.utils.hub_utils import (
53
+ PushToHubMixin,
54
+ load_or_create_model_card,
55
+ populate_model_card,
56
+ )
57
+ from diffusers.models.model_loading_utils import (
58
+ _determine_device_map,
59
+ _fetch_index_file,
60
+ _load_state_dict_into_model,
61
+ load_model_dict_into_meta,
62
+ load_state_dict,
63
+ )
64
+
65
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
66
+
67
+ logger = logging.get_logger(__name__)
68
+
69
+ _REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}")
70
+
71
+
72
+ if is_torch_version(">=", "1.9.0"):
73
+ _LOW_CPU_MEM_USAGE_DEFAULT = True
74
+ else:
75
+ _LOW_CPU_MEM_USAGE_DEFAULT = False
76
+
77
+
78
+ if is_accelerate_available():
79
+ import accelerate
80
+
81
+
82
+ def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
83
+ try:
84
+ parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
85
+ return next(parameters_and_buffers).device
86
+ except StopIteration:
87
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
88
+
89
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
90
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
91
+ return tuples
92
+
93
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
94
+ first_tuple = next(gen)
95
+ return first_tuple[1].device
96
+
97
+
98
+ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
99
+ try:
100
+ params = tuple(parameter.parameters())
101
+ if len(params) > 0:
102
+ return params[0].dtype
103
+
104
+ buffers = tuple(parameter.buffers())
105
+ if len(buffers) > 0:
106
+ return buffers[0].dtype
107
+
108
+ except StopIteration:
109
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
110
+
111
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
112
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
113
+ return tuples
114
+
115
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
116
+ first_tuple = next(gen)
117
+ return first_tuple[1].dtype
118
+
119
+
120
+ class ModelMixin(torch.nn.Module, PushToHubMixin):
121
+ r"""
122
+ Base class for all models.
123
+
124
+ [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
125
+ saving models.
126
+
127
+ - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`].
128
+ """
129
+
130
+ config_name = CONFIG_NAME
131
+ _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
132
+ _supports_gradient_checkpointing = False
133
+ _keys_to_ignore_on_load_unexpected = None
134
+ _no_split_modules = None
135
+
136
+ def __init__(self):
137
+ super().__init__()
138
+
139
+ def __getattr__(self, name: str) -> Any:
140
+ """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
141
+ config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
142
+ __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__':
143
+ https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
144
+ """
145
+
146
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
147
+ is_attribute = name in self.__dict__
148
+
149
+ if is_in_config and not is_attribute:
150
+ deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
151
+ deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
152
+ return self._internal_dict[name]
153
+
154
+ # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
155
+ return super().__getattr__(name)
156
+
157
+ @property
158
+ def is_gradient_checkpointing(self) -> bool:
159
+ """
160
+ Whether gradient checkpointing is activated for this model or not.
161
+ """
162
+ return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
163
+
164
+ def enable_gradient_checkpointing(self) -> None:
165
+ """
166
+ Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
167
+ *checkpoint activations* in other frameworks).
168
+ """
169
+ if not self._supports_gradient_checkpointing:
170
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
171
+ self.apply(partial(self._set_gradient_checkpointing, value=True))
172
+
173
+ def disable_gradient_checkpointing(self) -> None:
174
+ """
175
+ Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
176
+ *checkpoint activations* in other frameworks).
177
+ """
178
+ if self._supports_gradient_checkpointing:
179
+ self.apply(partial(self._set_gradient_checkpointing, value=False))
180
+
181
+ def set_use_npu_flash_attention(self, valid: bool) -> None:
182
+ r"""
183
+ Set the switch for the npu flash attention.
184
+ """
185
+
186
+ def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
187
+ if hasattr(module, "set_use_npu_flash_attention"):
188
+ module.set_use_npu_flash_attention(valid)
189
+
190
+ for child in module.children():
191
+ fn_recursive_set_npu_flash_attention(child)
192
+
193
+ for module in self.children():
194
+ if isinstance(module, torch.nn.Module):
195
+ fn_recursive_set_npu_flash_attention(module)
196
+
197
+ def enable_npu_flash_attention(self) -> None:
198
+ r"""
199
+ Enable npu flash attention from torch_npu
200
+
201
+ """
202
+ self.set_use_npu_flash_attention(True)
203
+
204
+ def disable_npu_flash_attention(self) -> None:
205
+ r"""
206
+ Disable npu flash attention from torch_npu
207
+
208
+ """
209
+ self.set_use_npu_flash_attention(False)
210
+
211
+ def set_use_memory_efficient_attention_xformers(
212
+ self, valid: bool, attention_op: Optional[Callable] = None
213
+ ) -> None:
214
+ # Recursively walk through all the children.
215
+ # Any child that exposes the set_use_memory_efficient_attention_xformers method
216
+ # gets the message
217
+ def fn_recursive_set_mem_eff(module: torch.nn.Module):
218
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
219
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
220
+
221
+ for child in module.children():
222
+ fn_recursive_set_mem_eff(child)
223
+
224
+ for module in self.children():
225
+ if isinstance(module, torch.nn.Module):
226
+ fn_recursive_set_mem_eff(module)
227
+
228
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None:
229
+ r"""
230
+ Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
231
+
232
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
233
+ inference. Speed up during training is not guaranteed.
234
+
235
+ <Tip warning={true}>
236
+
237
+ ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
238
+ precedent.
239
+
240
+ </Tip>
241
+
242
+ Parameters:
243
+ attention_op (`Callable`, *optional*):
244
+ Override the default `None` operator for use as `op` argument to the
245
+ [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
246
+ function of xFormers.
247
+
248
+ Examples:
249
+
250
+ ```py
251
+ >>> import torch
252
+ >>> from diffusers import UNet2DConditionModel
253
+ >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
254
+
255
+ >>> model = UNet2DConditionModel.from_pretrained(
256
+ ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
257
+ ... )
258
+ >>> model = model.to("cuda")
259
+ >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
260
+ ```
261
+ """
262
+ self.set_use_memory_efficient_attention_xformers(True, attention_op)
263
+
264
+ def disable_xformers_memory_efficient_attention(self) -> None:
265
+ r"""
266
+ Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
267
+ """
268
+ self.set_use_memory_efficient_attention_xformers(False)
269
+
270
+ def save_pretrained(
271
+ self,
272
+ save_directory: Union[str, os.PathLike],
273
+ is_main_process: bool = True,
274
+ save_function: Optional[Callable] = None,
275
+ safe_serialization: bool = True,
276
+ variant: Optional[str] = None,
277
+ max_shard_size: Union[int, str] = "10GB",
278
+ push_to_hub: bool = False,
279
+ **kwargs,
280
+ ):
281
+ """
282
+ Save a model and its configuration file to a directory so that it can be reloaded using the
283
+ [`~models.ModelMixin.from_pretrained`] class method.
284
+
285
+ Arguments:
286
+ save_directory (`str` or `os.PathLike`):
287
+ Directory to save a model and its configuration file to. Will be created if it doesn't exist.
288
+ is_main_process (`bool`, *optional*, defaults to `True`):
289
+ Whether the process calling this is the main process or not. Useful during distributed training when you
290
+ need to call this function on all processes. In this case, set `is_main_process=True` only on the main
291
+ process to avoid race conditions.
292
+ save_function (`Callable`):
293
+ The function to use to save the state dictionary. Useful during distributed training when you need to
294
+ replace `torch.save` with another method. Can be configured with the environment variable
295
+ `DIFFUSERS_SAVE_MODE`.
296
+ safe_serialization (`bool`, *optional*, defaults to `True`):
297
+ Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
298
+ variant (`str`, *optional*):
299
+ If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
300
+ max_shard_size (`int` or `str`, defaults to `"10GB"`):
301
+ The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be
302
+ smaller than this size. If expressed as a string, it needs to be digits followed by a unit (like `"5GB"`).
303
+ If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain
304
+ period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`.
305
+ This is to establish a common default size for this argument across different libraries in the Hugging
306
+ Face ecosystem (`transformers`, and `accelerate`, for example).
307
+ push_to_hub (`bool`, *optional*, defaults to `False`):
308
+ Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
309
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
310
+ namespace).
311
+ kwargs (`Dict[str, Any]`, *optional*):
312
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
313
+ """
314
+ if os.path.isfile(save_directory):
315
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
316
+ return
317
+
318
+ weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
319
+ weights_name = _add_variant(weights_name, variant)
320
+ weight_name_split = weights_name.split(".")
321
+ if len(weight_name_split) in [2, 3]:
322
+ weights_name_pattern = weight_name_split[0] + "{suffix}." + ".".join(weight_name_split[1:])
323
+ else:
324
+ raise ValueError(f"Invalid {weights_name} provided.")
325
+
326
+ os.makedirs(save_directory, exist_ok=True)
327
+
328
+ if push_to_hub:
329
+ commit_message = kwargs.pop("commit_message", None)
330
+ private = kwargs.pop("private", False)
331
+ create_pr = kwargs.pop("create_pr", False)
332
+ token = kwargs.pop("token", None)
333
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
334
+ repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
335
+
336
+ # Only save the model itself if we are using distributed training
337
+ model_to_save = self
338
+
339
+ # Attach architecture to the config
340
+ # Save the config
341
+ if is_main_process:
342
+ model_to_save.save_config(save_directory)
343
+
344
+ # Save the model
345
+ state_dict = model_to_save.state_dict()
346
+
347
+ # Save the model
348
+ state_dict_split = split_torch_state_dict_into_shards(
349
+ state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern
350
+ )
351
+
352
+ # Clean the folder from a previous save
353
+ if is_main_process:
354
+ for filename in os.listdir(save_directory):
355
+ if filename in state_dict_split.filename_to_tensors.keys():
356
+ continue
357
+ full_filename = os.path.join(save_directory, filename)
358
+ if not os.path.isfile(full_filename):
359
+ continue
360
+ weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "")
361
+ weights_without_ext = weights_without_ext.replace("{suffix}", "")
362
+ filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "")
363
+ # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
364
+ if (
365
+ filename.startswith(weights_without_ext)
366
+ and _REGEX_SHARD.fullmatch(filename_without_ext) is not None
367
+ ):
368
+ os.remove(full_filename)
369
+
370
+ for filename, tensors in state_dict_split.filename_to_tensors.items():
371
+ shard = {tensor: state_dict[tensor] for tensor in tensors}
372
+ filepath = os.path.join(save_directory, filename)
373
+ if safe_serialization:
374
+ # At some point we will need to deal better with save_function (used for TPU and other distributed
375
+ # joyfulness), but for now this is enough.
376
+ safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
377
+ else:
378
+ torch.save(shard, filepath)
379
+
380
+ if state_dict_split.is_sharded:
381
+ index = {
382
+ "metadata": state_dict_split.metadata,
383
+ "weight_map": state_dict_split.tensor_to_filename,
384
+ }
385
+ save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
386
+ save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
387
+ # Save the index as well
388
+ with open(save_index_file, "w", encoding="utf-8") as f:
389
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
390
+ f.write(content)
391
+ logger.info(
392
+ f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
393
+ f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
394
+ f"index located at {save_index_file}."
395
+ )
396
+ else:
397
+ path_to_weights = os.path.join(save_directory, weights_name)
398
+ logger.info(f"Model weights saved in {path_to_weights}")
399
+
400
+ if push_to_hub:
401
+ # Create a new empty model card and eventually tag it
402
+ model_card = load_or_create_model_card(repo_id, token=token)
403
+ model_card = populate_model_card(model_card)
404
+ model_card.save(Path(save_directory, "README.md").as_posix())
405
+
406
+ self._upload_folder(
407
+ save_directory,
408
+ repo_id,
409
+ token=token,
410
+ commit_message=commit_message,
411
+ create_pr=create_pr,
412
+ )
413
+
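A hedged usage sketch for the method above; the output directory and the shard limit are illustrative, not defaults from this repo. With `safe_serialization=True` the weights are written as `pytorch_model.safetensors` (sharded, plus an index JSON, once they exceed `max_shard_size`).

```python
model.save_pretrained(
    "./mmada-checkpoint",      # hypothetical output directory
    safe_serialization=True,   # write *.safetensors instead of pickle
    max_shard_size="5GB",      # shard above this size and emit an index file
)
```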
414
+ @classmethod
415
+ @validate_hf_hub_args
416
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
417
+ r"""
418
+ Instantiate a pretrained PyTorch model from a pretrained model configuration.
419
+
420
+ The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
421
+ train the model, set it back in training mode with `model.train()`.
422
+
423
+ Parameters:
424
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
425
+ Can be either:
426
+
427
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
428
+ the Hub.
429
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
430
+ with [`~ModelMixin.save_pretrained`].
431
+
432
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
433
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
434
+ is not used.
435
+ torch_dtype (`str` or `torch.dtype`, *optional*):
436
+ Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
437
+ dtype is automatically derived from the model's weights.
438
+ force_download (`bool`, *optional*, defaults to `False`):
439
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
440
+ cached versions if they exist.
441
+ proxies (`Dict[str, str]`, *optional*):
442
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
443
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
444
+ output_loading_info (`bool`, *optional*, defaults to `False`):
445
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
446
+ local_files_only(`bool`, *optional*, defaults to `False`):
447
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
448
+ won't be downloaded from the Hub.
449
+ token (`str` or *bool*, *optional*):
450
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
451
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
452
+ revision (`str`, *optional*, defaults to `"main"`):
453
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
454
+ allowed by Git.
455
+ from_flax (`bool`, *optional*, defaults to `False`):
456
+ Load the model weights from a Flax checkpoint save file.
457
+ subfolder (`str`, *optional*, defaults to `""`):
458
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
459
+ mirror (`str`, *optional*):
460
+ Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
461
+ guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
462
+ information.
463
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
464
+ A map that specifies where each submodule should go. It doesn't need to be defined for each
465
+ parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
466
+ same device. Defaults to `None`, meaning that the model will be loaded on CPU.
467
+
468
+ Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
469
+ more information about each option see [designing a device
470
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
471
+ max_memory (`Dict`, *optional*):
472
+ A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
473
+ each GPU and the available CPU RAM if unset.
474
+ offload_folder (`str` or `os.PathLike`, *optional*):
475
+ The path to offload weights if `device_map` contains the value `"disk"`.
476
+ offload_state_dict (`bool`, *optional*):
477
+ If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
478
+ the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
479
+ when there is some disk offload.
480
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
481
+ Speed up model loading by only loading the pretrained weights and not initializing the weights. This also
482
+ tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
483
+ Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
484
+ argument to `True` will raise an error.
485
+ variant (`str`, *optional*):
486
+ Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
487
+ loading `from_flax`.
488
+ use_safetensors (`bool`, *optional*, defaults to `None`):
489
+ If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
490
+ `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
491
+ weights. If set to `False`, `safetensors` weights are not loaded.
492
+
493
+ <Tip>
494
+
495
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
496
+ `huggingface-cli login`. You can also activate the special
497
+ ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
498
+ firewalled environment.
499
+
500
+ </Tip>
501
+
502
+ Example:
503
+
504
+ ```py
505
+ from diffusers import UNet2DConditionModel
506
+
507
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
508
+ ```
509
+
510
+ If you get the error message below, you need to finetune the weights for your downstream task:
511
+
512
+ ```bash
513
+ Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
514
+ - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
515
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
516
+ ```
517
+ """
518
+ cache_dir = kwargs.pop("cache_dir", None)
519
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
520
+ force_download = kwargs.pop("force_download", False)
521
+ from_flax = kwargs.pop("from_flax", False)
522
+ proxies = kwargs.pop("proxies", None)
523
+ output_loading_info = kwargs.pop("output_loading_info", False)
524
+ local_files_only = kwargs.pop("local_files_only", None)
525
+ token = kwargs.pop("token", None)
526
+ revision = kwargs.pop("revision", None)
527
+ torch_dtype = kwargs.pop("torch_dtype", None)
528
+ subfolder = kwargs.pop("subfolder", None)
529
+ device_map = kwargs.pop("device_map", None)
530
+ max_memory = kwargs.pop("max_memory", None)
531
+ offload_folder = kwargs.pop("offload_folder", None)
532
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
533
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
534
+ variant = kwargs.pop("variant", None)
535
+ use_safetensors = kwargs.pop("use_safetensors", None)
536
+
537
+ allow_pickle = False
538
+ if use_safetensors is None:
539
+ use_safetensors = True
540
+ allow_pickle = True
541
+
542
+ if low_cpu_mem_usage and not is_accelerate_available():
543
+ low_cpu_mem_usage = False
544
+ logger.warning(
545
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
546
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
547
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
548
+ " install accelerate\n```\n."
549
+ )
550
+
551
+ if device_map is not None and not is_accelerate_available():
552
+ raise NotImplementedError(
553
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
554
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
555
+ )
556
+
557
+ # Check if we can handle device_map and dispatching the weights
558
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
559
+ raise NotImplementedError(
560
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
561
+ " `device_map=None`."
562
+ )
563
+
564
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
565
+ raise NotImplementedError(
566
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
567
+ " `low_cpu_mem_usage=False`."
568
+ )
569
+
570
+ if low_cpu_mem_usage is False and device_map is not None:
571
+ raise ValueError(
572
+ f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
573
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
574
+ )
575
+
576
+ # change device_map into a map if we passed an int, a str or a torch.device
577
+ if isinstance(device_map, torch.device):
578
+ device_map = {"": device_map}
579
+ elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
580
+ try:
581
+ device_map = {"": torch.device(device_map)}
582
+ except RuntimeError:
583
+ raise ValueError(
584
+ "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or "
585
+ f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}."
586
+ )
587
+ elif isinstance(device_map, int):
588
+ if device_map < 0:
589
+ raise ValueError(
590
+ "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' "
591
+ )
592
+ else:
593
+ device_map = {"": device_map}
594
+
595
+ if device_map is not None:
596
+ if low_cpu_mem_usage is None:
597
+ low_cpu_mem_usage = True
598
+ elif not low_cpu_mem_usage:
599
+ raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")
600
+
601
+ if low_cpu_mem_usage:
602
+ if device_map is not None and not is_torch_version(">=", "1.10"):
603
+ # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
604
+ raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.")
605
+
606
+ # Load config if we don't provide a configuration
607
+ config_path = pretrained_model_name_or_path
608
+
609
+ user_agent = {
610
+ "diffusers": __version__,
611
+ "file_type": "model",
612
+ "framework": "pytorch",
613
+ }
614
+
615
+ # load config
616
+ config, unused_kwargs, commit_hash = cls.load_config(
617
+ config_path,
618
+ cache_dir=cache_dir,
619
+ return_unused_kwargs=True,
620
+ return_commit_hash=True,
621
+ force_download=force_download,
622
+ proxies=proxies,
623
+ local_files_only=local_files_only,
624
+ token=token,
625
+ revision=revision,
626
+ subfolder=subfolder,
627
+ user_agent=user_agent,
628
+ **kwargs,
629
+ )
630
+
631
+ # Determine if we're loading from a directory of sharded checkpoints.
632
+ is_sharded = False
633
+ index_file = None
634
+ is_local = os.path.isdir(pretrained_model_name_or_path)
635
+ index_file = _fetch_index_file(
636
+ is_local=is_local,
637
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
638
+ subfolder=subfolder or "",
639
+ use_safetensors=use_safetensors,
640
+ cache_dir=cache_dir,
641
+ variant=variant,
642
+ force_download=force_download,
643
+ proxies=proxies,
644
+ local_files_only=local_files_only,
645
+ token=token,
646
+ revision=revision,
647
+ user_agent=user_agent,
648
+ commit_hash=commit_hash,
649
+ )
650
+ if index_file is not None and index_file.is_file():
651
+ is_sharded = True
652
+
653
+ if is_sharded and from_flax:
654
+ raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.")
655
+
656
+ # load model
657
+ model_file = None
658
+ if from_flax:
659
+ model_file = _get_model_file(
660
+ pretrained_model_name_or_path,
661
+ weights_name=FLAX_WEIGHTS_NAME,
662
+ cache_dir=cache_dir,
663
+ force_download=force_download,
664
+ proxies=proxies,
665
+ local_files_only=local_files_only,
666
+ token=token,
667
+ revision=revision,
668
+ subfolder=subfolder,
669
+ user_agent=user_agent,
670
+ commit_hash=commit_hash,
671
+ )
672
+ model = cls.from_config(config, **unused_kwargs)
673
+
674
+ # Convert the weights
675
+ from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
676
+
677
+ model = load_flax_checkpoint_in_pytorch_model(model, model_file)
678
+ else:
679
+ if is_sharded:
680
+ sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files(
681
+ pretrained_model_name_or_path,
682
+ index_file,
683
+ cache_dir=cache_dir,
684
+ proxies=proxies,
685
+ local_files_only=local_files_only,
686
+ token=token,
687
+ user_agent=user_agent,
688
+ revision=revision,
689
+ subfolder=subfolder or "",
690
+ )
691
+
692
+ elif use_safetensors and not is_sharded:
693
+ try:
694
+ model_file = _get_model_file(
695
+ pretrained_model_name_or_path,
696
+ weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
697
+ cache_dir=cache_dir,
698
+ force_download=force_download,
699
+ proxies=proxies,
700
+ local_files_only=local_files_only,
701
+ token=token,
702
+ revision=revision,
703
+ subfolder=subfolder,
704
+ user_agent=user_agent,
705
+ commit_hash=commit_hash,
706
+ )
707
+
708
+ except IOError as e:
709
+ logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
710
+ if not allow_pickle:
711
+ raise
712
+ logger.warning(
713
+ "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
714
+ )
715
+
716
+ if model_file is None and not is_sharded:
717
+ model_file = _get_model_file(
718
+ pretrained_model_name_or_path,
719
+ weights_name=_add_variant(WEIGHTS_NAME, variant),
720
+ cache_dir=cache_dir,
721
+ force_download=force_download,
722
+ proxies=proxies,
723
+ local_files_only=local_files_only,
724
+ token=token,
725
+ revision=revision,
726
+ subfolder=subfolder,
727
+ user_agent=user_agent,
728
+ commit_hash=commit_hash,
729
+ )
730
+
731
+ if low_cpu_mem_usage:
732
+ # Instantiate model with empty weights
733
+ with accelerate.init_empty_weights():
734
+ model = cls.from_config(config, **unused_kwargs)
735
+
736
+ # if device_map is None, load the state dict and move the params from meta device to the cpu
737
+ if device_map is None and not is_sharded:
738
+ param_device = "cpu"
739
+ state_dict = load_state_dict(model_file, variant=variant)
740
+ model._convert_deprecated_attention_blocks(state_dict)
741
+ # move the params from meta device to cpu
742
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
743
+ if len(missing_keys) > 0:
744
+ raise ValueError(
745
+ f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
746
+ f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
747
+ " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
748
+ " those weights or else make sure your checkpoint file is correct."
749
+ )
750
+
751
+ unexpected_keys = load_model_dict_into_meta(
752
+ model,
753
+ state_dict,
754
+ device=param_device,
755
+ dtype=torch_dtype,
756
+ model_name_or_path=pretrained_model_name_or_path,
757
+ )
758
+
759
+ if cls._keys_to_ignore_on_load_unexpected is not None:
760
+ for pat in cls._keys_to_ignore_on_load_unexpected:
761
+ unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
762
+
763
+ if len(unexpected_keys) > 0:
764
+ logger.warning(
765
+ f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
766
+ )
767
+
768
+ else: # else let accelerate handle loading and dispatching.
769
+ # Load weights and dispatch according to the device_map
770
+ # by default the device_map is None and the weights are loaded on the CPU
771
+ force_hook = True
772
+ device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
773
+ if device_map is None and is_sharded:
774
+ # we load the parameters on the cpu
775
+ device_map = {"": "cpu"}
776
+ force_hook = False
777
+ try:
778
+ accelerate.load_checkpoint_and_dispatch(
779
+ model,
780
+ model_file if not is_sharded else index_file,
781
+ device_map,
782
+ max_memory=max_memory,
783
+ offload_folder=offload_folder,
784
+ offload_state_dict=offload_state_dict,
785
+ dtype=torch_dtype,
786
+ force_hooks=force_hook,
787
+ strict=True,
788
+ )
789
+ except AttributeError as e:
790
+ # When using accelerate loading, we do not have the ability to load the state
791
+ # dict and rename the weight names manually. Additionally, accelerate skips
792
+ # torch loading conventions and directly writes into `module.{_buffers, _parameters}`
793
+ # (which look like they should be private variables?), so we can't use the standard hooks
794
+ # to rename parameters on load. We need to mimic the original weight names so the correct
795
+ # attributes are available. After we have loaded the weights, we convert the deprecated
796
+ # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert
797
+ # the weights so we don't have to do this again.
798
+
799
+ if "'Attention' object has no attribute" in str(e):
800
+ logger.warning(
801
+ f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}"
802
+ " was saved with deprecated attention block weight names. We will load it with the deprecated attention block"
803
+ " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion,"
804
+ " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint,"
805
+ " please also re-upload it or open a PR on the original repository."
806
+ )
807
+ model._temp_convert_self_to_deprecated_attention_blocks()
808
+ accelerate.load_checkpoint_and_dispatch(
809
+ model,
810
+ model_file if not is_sharded else index_file,
811
+ device_map,
812
+ max_memory=max_memory,
813
+ offload_folder=offload_folder,
814
+ offload_state_dict=offload_state_dict,
815
+ dtype=torch_dtype,
816
+ force_hooks=force_hook,
817
+ strict=True,
818
+ )
819
+ model._undo_temp_convert_self_to_deprecated_attention_blocks()
820
+ else:
821
+ raise e
822
+
823
+ loading_info = {
824
+ "missing_keys": [],
825
+ "unexpected_keys": [],
826
+ "mismatched_keys": [],
827
+ "error_msgs": [],
828
+ }
829
+ else:
830
+ model = cls.from_config(config, **unused_kwargs)
831
+
832
+ state_dict = load_state_dict(model_file, variant=variant)
833
+ model._convert_deprecated_attention_blocks(state_dict)
834
+
835
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
836
+ model,
837
+ state_dict,
838
+ model_file,
839
+ pretrained_model_name_or_path,
840
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
841
+ )
842
+
843
+ loading_info = {
844
+ "missing_keys": missing_keys,
845
+ "unexpected_keys": unexpected_keys,
846
+ "mismatched_keys": mismatched_keys,
847
+ "error_msgs": error_msgs,
848
+ }
849
+
850
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
851
+ raise ValueError(
852
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
853
+ )
854
+ elif torch_dtype is not None:
855
+ model = model.to(torch_dtype)
856
+
857
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
858
+
859
+ # Set model in evaluation mode to deactivate DropOut modules by default
860
+ model.eval()
861
+ if output_loading_info:
862
+ return model, loading_info
863
+
864
+ return model
865
+
866
+ @classmethod
867
+ def _load_pretrained_model(
868
+ cls,
869
+ model,
870
+ state_dict: OrderedDict,
871
+ resolved_archive_file,
872
+ pretrained_model_name_or_path: Union[str, os.PathLike],
873
+ ignore_mismatched_sizes: bool = False,
874
+ ):
875
+ # Retrieve missing & unexpected_keys
876
+ model_state_dict = model.state_dict()
877
+ loaded_keys = list(state_dict.keys())
878
+
879
+ expected_keys = list(model_state_dict.keys())
880
+
881
+ original_loaded_keys = loaded_keys
882
+
883
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
884
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
885
+
886
+ # Make sure we are able to load base models as well as derived models (with heads)
887
+ model_to_load = model
888
+
889
+ def _find_mismatched_keys(
890
+ state_dict,
891
+ model_state_dict,
892
+ loaded_keys,
893
+ ignore_mismatched_sizes,
894
+ ):
895
+ mismatched_keys = []
896
+ if ignore_mismatched_sizes:
897
+ for checkpoint_key in loaded_keys:
898
+ model_key = checkpoint_key
899
+
900
+ if (
901
+ model_key in model_state_dict
902
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
903
+ ):
904
+ mismatched_keys.append(
905
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
906
+ )
907
+ del state_dict[checkpoint_key]
908
+ return mismatched_keys
909
+
910
+ if state_dict is not None:
911
+ # Whole checkpoint
912
+ mismatched_keys = _find_mismatched_keys(
913
+ state_dict,
914
+ model_state_dict,
915
+ original_loaded_keys,
916
+ ignore_mismatched_sizes,
917
+ )
918
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
919
+
920
+ if len(error_msgs) > 0:
921
+ error_msg = "\n\t".join(error_msgs)
922
+ if "size mismatch" in error_msg:
923
+ error_msg += (
924
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
925
+ )
926
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
927
+
928
+ if len(unexpected_keys) > 0:
929
+ logger.warning(
930
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
931
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
932
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
933
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
934
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
935
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
936
+ " identical (initializing a BertForSequenceClassification model from a"
937
+ " BertForSequenceClassification model)."
938
+ )
939
+ else:
940
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
941
+ if len(missing_keys) > 0:
942
+ logger.warning(
943
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
944
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
945
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
946
+ )
947
+ elif len(mismatched_keys) == 0:
948
+ logger.info(
949
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
950
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
951
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
952
+ " without further training."
953
+ )
954
+ if len(mismatched_keys) > 0:
955
+ mismatched_warning = "\n".join(
956
+ [
957
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
958
+ for key, shape1, shape2 in mismatched_keys
959
+ ]
960
+ )
961
+ logger.warning(
962
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
963
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
964
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
965
+ " able to use it for predictions and inference."
966
+ )
967
+
968
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
969
+
970
+ @classmethod
971
+ def _get_signature_keys(cls, obj):
972
+ parameters = inspect.signature(obj.__init__).parameters
973
+ required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
974
+ optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
975
+ expected_modules = set(required_parameters.keys()) - {"self"}
976
+
977
+ return expected_modules, optional_parameters
978
+
979
+ # Adapted from `transformers` modeling_utils.py
980
+ def _get_no_split_modules(self, device_map: str):
981
+ """
982
+ Get the modules of the model that should not be split when using device_map. We iterate through the modules to
983
+ get the underlying `_no_split_modules`.
984
+
985
+ Args:
986
+ device_map (`str`):
987
+ The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
988
+
989
+ Returns:
990
+ `List[str]`: List of modules that should not be split
991
+ """
992
+ _no_split_modules = set()
993
+ modules_to_check = [self]
994
+ while len(modules_to_check) > 0:
995
+ module = modules_to_check.pop(-1)
996
+ # if the module does not appear in _no_split_modules, we also check the children
997
+ if module.__class__.__name__ not in _no_split_modules:
998
+ if isinstance(module, ModelMixin):
999
+ if module._no_split_modules is None:
1000
+ raise ValueError(
1001
+ f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model "
1002
+ "class needs to implement the `_no_split_modules` attribute."
1003
+ )
1004
+ else:
1005
+ _no_split_modules = _no_split_modules | set(module._no_split_modules)
1006
+ modules_to_check += list(module.children())
1007
+ return list(_no_split_modules)
1008
+
1009
+ @property
1010
+ def device(self) -> torch.device:
1011
+ """
1012
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
1013
+ device).
1014
+ """
1015
+ return get_parameter_device(self)
1016
+
1017
+ @property
1018
+ def dtype(self) -> torch.dtype:
1019
+ """
1020
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
1021
+ """
1022
+ return get_parameter_dtype(self)
1023
+
1024
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
1025
+ """
1026
+ Get number of (trainable or non-embedding) parameters in the module.
1027
+
1028
+ Args:
1029
+ only_trainable (`bool`, *optional*, defaults to `False`):
1030
+ Whether or not to return only the number of trainable parameters.
1031
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
1032
+ Whether or not to return only the number of non-embedding parameters.
1033
+
1034
+ Returns:
1035
+ `int`: The number of parameters.
1036
+
1037
+ Example:
1038
+
1039
+ ```py
1040
+ from diffusers import UNet2DConditionModel
1041
+
1042
+ model_id = "runwayml/stable-diffusion-v1-5"
1043
+ unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
1044
+ unet.num_parameters(only_trainable=True)
1045
+ 859520964
1046
+ ```
1047
+ """
1048
+
1049
+ if exclude_embeddings:
1050
+ embedding_param_names = [
1051
+ f"{name}.weight"
1052
+ for name, module_type in self.named_modules()
1053
+ if isinstance(module_type, torch.nn.Embedding)
1054
+ ]
1055
+ non_embedding_parameters = [
1056
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
1057
+ ]
1058
+ return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
1059
+ else:
1060
+ return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
1061
+
1062
+ def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None:
1063
+ deprecated_attention_block_paths = []
1064
+
1065
+ def recursive_find_attn_block(name, module):
1066
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1067
+ deprecated_attention_block_paths.append(name)
1068
+
1069
+ for sub_name, sub_module in module.named_children():
1070
+ sub_name = sub_name if name == "" else f"{name}.{sub_name}"
1071
+ recursive_find_attn_block(sub_name, sub_module)
1072
+
1073
+ recursive_find_attn_block("", self)
1074
+
1075
+ # NOTE: we have to check if the deprecated parameters are in the state dict
1076
+ # because it is possible we are loading from a state dict that was already
1077
+ # converted
1078
+
1079
+ for path in deprecated_attention_block_paths:
1080
+ # group_norm path stays the same
1081
+
1082
+ # query -> to_q
1083
+ if f"{path}.query.weight" in state_dict:
1084
+ state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
1085
+ if f"{path}.query.bias" in state_dict:
1086
+ state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
1087
+
1088
+ # key -> to_k
1089
+ if f"{path}.key.weight" in state_dict:
1090
+ state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
1091
+ if f"{path}.key.bias" in state_dict:
1092
+ state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
1093
+
1094
+ # value -> to_v
1095
+ if f"{path}.value.weight" in state_dict:
1096
+ state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
1097
+ if f"{path}.value.bias" in state_dict:
1098
+ state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
1099
+
1100
+ # proj_attn -> to_out.0
1101
+ if f"{path}.proj_attn.weight" in state_dict:
1102
+ state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
1103
+ if f"{path}.proj_attn.bias" in state_dict:
1104
+ state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
1105
+
1106
+ def _temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1107
+ deprecated_attention_block_modules = []
1108
+
1109
+ def recursive_find_attn_block(module):
1110
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1111
+ deprecated_attention_block_modules.append(module)
1112
+
1113
+ for sub_module in module.children():
1114
+ recursive_find_attn_block(sub_module)
1115
+
1116
+ recursive_find_attn_block(self)
1117
+
1118
+ for module in deprecated_attention_block_modules:
1119
+ module.query = module.to_q
1120
+ module.key = module.to_k
1121
+ module.value = module.to_v
1122
+ module.proj_attn = module.to_out[0]
1123
+
1124
+ # We don't _have_ to delete the old attributes, but it's helpful to ensure
1125
+ # that _all_ the weights are loaded into the new attributes and we're not
1126
+ # making an incorrect assumption that this model should be converted when
1127
+ # it really shouldn't be.
1128
+ del module.to_q
1129
+ del module.to_k
1130
+ del module.to_v
1131
+ del module.to_out
1132
+
1133
+ def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1134
+ deprecated_attention_block_modules = []
1135
+
1136
+ def recursive_find_attn_block(module) -> None:
1137
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1138
+ deprecated_attention_block_modules.append(module)
1139
+
1140
+ for sub_module in module.children():
1141
+ recursive_find_attn_block(sub_module)
1142
+
1143
+ recursive_find_attn_block(self)
1144
+
1145
+ for module in deprecated_attention_block_modules:
1146
+ module.to_q = module.query
1147
+ module.to_k = module.key
1148
+ module.to_v = module.value
1149
+ module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)])
1150
+
1151
+ del module.query
1152
+ del module.key
1153
+ del module.value
1154
+ del module.proj_attn
1155
+
1156
+
1157
+ class LegacyModelMixin(ModelMixin):
1158
+ r"""
1159
+ A subclass of `ModelMixin` to resolve class mapping from legacy classes (like `Transformer2DModel`) to more
1160
+ pipeline-specific classes (like `DiTTransformer2DModel`).
1161
+ """
1162
+
1163
+ @classmethod
1164
+ @validate_hf_hub_args
1165
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
1166
+ # To prevent dependency import problem.
1167
+ from diffusers.models.model_loading_utils import _fetch_remapped_cls_from_config
1168
+
1169
+ # Create a copy of the kwargs so that we don't mess with the keyword arguments in the downstream calls.
1170
+ kwargs_copy = kwargs.copy()
1171
+
1172
+ cache_dir = kwargs.pop("cache_dir", None)
1173
+ force_download = kwargs.pop("force_download", False)
1174
+ proxies = kwargs.pop("proxies", None)
1175
+ local_files_only = kwargs.pop("local_files_only", None)
1176
+ token = kwargs.pop("token", None)
1177
+ revision = kwargs.pop("revision", None)
1178
+ subfolder = kwargs.pop("subfolder", None)
1179
+
1180
+ # Load config if we don't provide a configuration
1181
+ config_path = pretrained_model_name_or_path
1182
+
1183
+ user_agent = {
1184
+ "diffusers": __version__,
1185
+ "file_type": "model",
1186
+ "framework": "pytorch",
1187
+ }
1188
+
1189
+ # load config
1190
+ config, _, _ = cls.load_config(
1191
+ config_path,
1192
+ cache_dir=cache_dir,
1193
+ return_unused_kwargs=True,
1194
+ return_commit_hash=True,
1195
+ force_download=force_download,
1196
+ proxies=proxies,
1197
+ local_files_only=local_files_only,
1198
+ token=token,
1199
+ revision=revision,
1200
+ subfolder=subfolder,
1201
+ user_agent=user_agent,
1202
+ **kwargs,
1203
+ )
1204
+ # resolve remapping
1205
+ remapped_class = _fetch_remapped_cls_from_config(config, cls)
1206
+
1207
+ return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)
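A minimal caller-side sketch of the loading path implemented above, assuming `diffusers` and `accelerate` are installed and the checkpoint from the docstring example is reachable; passing `device_map="auto"` goes through the accelerate dispatch branch and requires `low_cpu_mem_usage=True`, matching the checks in `from_pretrained`.

import torch
from diffusers import UNet2DConditionModel

# Illustrative only: repo id and dtype follow the docstring example above.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="unet",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,   # required (and defaulted) when a device_map is given
    device_map="auto",        # dispatched via accelerate.load_checkpoint_and_dispatch
)
print(unet.device, unet.dtype, unet.num_parameters())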
models/sampling.py ADDED
@@ -0,0 +1,118 @@
1
+ # Adapted from https://github.com/lucidrains/muse-maskgit-pytorch
2
+
3
+ import math
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+
10
+ def log(t, eps=1e-20):
11
+ return torch.log(t.clamp(min=eps))
12
+
13
+
14
+ def gumbel_noise(t, generator=None):
15
+ noise = torch.zeros_like(t).uniform_(0, 1, generator=generator)
16
+ return -log(-log(noise))
17
+
18
+
19
+ def gumbel_sample(t, temperature=1.0, dim=-1, generator=None):
20
+ return ((t / max(temperature, 1e-10)) + gumbel_noise(t, generator=generator)).argmax(dim=dim)
21
+
22
+
23
+ def top_k(logits, thres=0.9):
24
+ k = math.ceil((1 - thres) * logits.shape[-1])
25
+ val, ind = logits.topk(k, dim=-1)
26
+ probs = torch.full_like(logits, float("-inf"))
27
+ probs.scatter_(2, ind, val)
28
+ return probs
29
+
30
+
31
+ def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
32
+ confidence = log(probs) + temperature * gumbel_noise(probs, generator=generator)
33
+ sorted_confidence = torch.sort(confidence, dim=-1).values
34
+ cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
35
+ masking = confidence < cut_off
36
+ return masking
37
+
38
+
39
+ def cosine_schedule(t):
40
+ return torch.cos(t * math.pi * 0.5)
41
+
42
+
43
+ def linear_schedule(t):
44
+ mask_ratio = 1 - t
45
+ mask_ratio = mask_ratio.clamp(min=1e-6, max=1.0)
46
+ return mask_ratio
47
+
48
+
49
+ def pow(t, method):
50
+ exponent = float(method.replace("pow", ""))
51
+ mask_ratio = 1.0 - t**exponent
52
+ mask_ratio = mask_ratio.clamp(min=1e-6, max=1.0)
53
+ return mask_ratio
54
+
55
+
56
+ def sigmoid_schedule(t, start=-3, end=3, tau=1.0, clip_min=1e-6):
57
+ if not torch.is_tensor(t):
58
+ t = torch.tensor(t)
59
+
60
+ # A gamma function based on sigmoid function.
61
+ v_start = torch.sigmoid(torch.tensor(start / tau))
62
+ v_end = torch.sigmoid(torch.tensor(end / tau))
63
+ output = torch.sigmoid((t * (end - start) + start) / tau)
64
+ output = (v_end - output) / (v_end - v_start)
65
+ return torch.clip(output, clip_min, 1.0)
66
+
67
+
68
+ def get_mask_schedule(method, **schedule_kwargs):
69
+ if method == "cosine":
70
+ return cosine_schedule
71
+ elif method == "linear":
72
+ return linear_schedule
73
+ elif "pow" in method:
74
+ return partial(pow, method=method)
75
+ elif method == "sigmoid":
76
+ return partial(sigmoid_schedule, **schedule_kwargs)
77
+ else:
78
+ raise ValueError("Unknown schedule method: {}".format(method))
79
+
80
+ def top_k_top_p_filtering(
81
+ logits: torch.Tensor,
82
+ top_k: int = 0,
83
+ top_p: float = 1.0,
84
+ filter_value: float = -float("Inf"),
85
+ min_tokens_to_keep: int = 1,
86
+ ) -> torch.Tensor:
87
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
88
+ Args:
89
+ logits: logits distribution shape (batch size, vocabulary size)
90
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
91
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
92
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
93
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
94
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
95
+ """
96
+ if top_k > 0:
97
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
98
+ # Remove all tokens with a probability less than the last token of the top-k
99
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
100
+ logits[indices_to_remove] = filter_value
101
+
102
+ if top_p < 1.0:
103
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
104
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
105
+
106
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
107
+ sorted_indices_to_remove = cumulative_probs > top_p
108
+ if min_tokens_to_keep > 1:
109
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
110
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
111
+ # Shift the indices to the right to keep also the first token above the threshold
112
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
113
+ sorted_indices_to_remove[..., 0] = 0
114
+
115
+ # scatter sorted tensors to original indexing
116
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
117
+ logits[indices_to_remove] = filter_value
118
+ return logits
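A short usage sketch for the helpers in this file (shapes and values are illustrative): `get_mask_schedule` resolves a schedule name to a mask-ratio callable, while `top_k_top_p_filtering` and `gumbel_sample` implement filtered stochastic sampling over a batch of logits.

import torch

schedule = get_mask_schedule("cosine")       # -> cosine_schedule
t = torch.linspace(0, 1, 5)
print(schedule(t))                           # mask ratio decays from 1.0 to 0.0

logits = torch.randn(2, 128)                 # (batch, vocab); sizes are illustrative
# top_k_top_p_filtering edits logits in place, so filter a copy
filtered = top_k_top_p_filtering(logits.clone(), top_k=20, top_p=0.9)
tokens = gumbel_sample(filtered, temperature=1.0)   # one sampled token id per row
print(tokens.shape)                          # torch.Size([2])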
models/training_utils.py ADDED
@@ -0,0 +1,455 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ import os
18
+ import random
19
+ from typing import Any, Dict, Iterable, Optional, Union
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import torch
24
+ import torch.nn.functional as F
25
+
26
+
27
+ def enable_full_determinism(seed: int):
28
+ """
29
+ Helper function for reproducible behavior during distributed training. See
30
+ - https://pytorch.org/docs/stable/notes/randomness.html for pytorch
31
+ """
32
+ # set seed first
33
+ set_seed(seed)
34
+
35
+ # Enable PyTorch deterministic mode. This potentially requires either the environment
36
+ # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
37
+ # depending on the CUDA version, so we set them both here
38
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
39
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
40
+ torch.use_deterministic_algorithms(True)
41
+
42
+ # Enable CUDNN deterministic mode
43
+ torch.backends.cudnn.deterministic = True
44
+ torch.backends.cudnn.benchmark = False
45
+
46
+
47
+ def set_seed(seed: int):
48
+ """
49
+ Args:
50
+ Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
51
+ seed (`int`): The seed to set.
52
+ """
53
+ random.seed(seed)
54
+ np.random.seed(seed)
55
+ torch.manual_seed(seed)
56
+ torch.cuda.manual_seed_all(seed)
57
+ # ^^ safe to call this function even if cuda is not available
58
+
59
+
60
+ # Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14
61
+ class EMA:
62
+ """
63
+ Exponential Moving Average of models weights
64
+ """
65
+
66
+ def __init__(
67
+ self,
68
+ parameters: Iterable[torch.nn.Parameter],
69
+ decay: float = 0.9999,
70
+ min_decay: float = 0.0,
71
+ update_after_step: int = 0,
72
+ use_ema_warmup: bool = False,
73
+ inv_gamma: Union[float, int] = 1.0,
74
+ power: Union[float, int] = 2 / 3,
75
+ model_cls: Optional[Any] = None,
76
+ model_config: Dict[str, Any] = None,
77
+ **kwargs,
78
+ ):
79
+ """
80
+ Args:
81
+ parameters (Iterable[torch.nn.Parameter]): The parameters to track.
82
+ decay (float): The decay factor for the exponential moving average.
83
+ min_decay (float): The minimum decay factor for the exponential moving average.
84
+ update_after_step (int): The number of steps to wait before starting to update the EMA weights.
85
+ use_ema_warmup (bool): Whether to use EMA warmup.
86
+ inv_gamma (float):
87
+ Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
88
+ power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
89
+ device (Optional[Union[str, torch.device]]): The device to store the EMA weights on. If None, the EMA
90
+ weights will be stored on CPU.
91
+
92
+ @crowsonkb's notes on EMA Warmup:
93
+ If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
94
+ to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
95
+ gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
96
+ at 215.4k steps).
97
+ """
98
+
99
+ parameters = list(parameters)
100
+ self.shadow_params = [p.clone().detach() for p in parameters]
101
+
102
+ self.temp_stored_params = None
103
+
104
+ self.decay = decay
105
+ self.min_decay = min_decay
106
+ self.update_after_step = update_after_step
107
+ self.use_ema_warmup = use_ema_warmup
108
+ self.inv_gamma = inv_gamma
109
+ self.power = power
110
+ self.optimization_step = 0
111
+ self.cur_decay_value = None # set in `step()`
112
+
113
+ self.model_cls = model_cls
114
+ self.model_config = model_config
115
+
116
+ @classmethod
117
+ def from_pretrained(cls, path, model_cls) -> "EMA":
118
+ _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True)
119
+ model = model_cls.from_pretrained(path)
120
+
121
+ ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config)
122
+
123
+ ema_model.load_state_dict(ema_kwargs)
124
+ return ema_model
125
+
126
+ def save_pretrained(self, path):
127
+ if self.model_cls is None:
128
+ raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.")
129
+
130
+ if self.model_config is None:
131
+ raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.")
132
+
133
+ model = self.model_cls.from_config(self.model_config)
134
+ state_dict = self.state_dict()
135
+ state_dict.pop("shadow_params", None)
136
+
137
+ model.register_to_config(**state_dict)
138
+ self.copy_to(model.parameters())
139
+ model.save_pretrained(path)
140
+
141
+ def get_decay(self, optimization_step: int) -> float:
142
+ """
143
+ Compute the decay factor for the exponential moving average.
144
+ """
145
+ step = max(0, optimization_step - self.update_after_step - 1)
146
+
147
+ if step <= 0:
148
+ return 0.0
149
+
150
+ if self.use_ema_warmup:
151
+ cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power
152
+ else:
153
+ cur_decay_value = (1 + step) / (10 + step)
154
+
155
+ cur_decay_value = min(cur_decay_value, self.decay)
156
+ # make sure decay is not smaller than min_decay
157
+ cur_decay_value = max(cur_decay_value, self.min_decay)
158
+ return cur_decay_value
159
+
160
+ @torch.no_grad()
161
+ def step(self, parameters: Iterable[torch.nn.Parameter]):
162
+ parameters = list(parameters)
163
+
164
+ self.optimization_step += 1
165
+
166
+ # Compute the decay factor for the exponential moving average.
167
+ decay = self.get_decay(self.optimization_step)
168
+ self.cur_decay_value = decay
169
+ one_minus_decay = 1 - decay
170
+
171
+ for s_param, param in zip(self.shadow_params, parameters):
172
+ if param.requires_grad:
173
+ s_param.sub_(one_minus_decay * (s_param - param))
174
+ else:
175
+ s_param.copy_(param)
176
+
177
+ torch.cuda.empty_cache()
178
+
179
+ def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
180
+ """
181
+ Copy current averaged parameters into given collection of parameters.
182
+
183
+ Args:
184
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
185
+ updated with the stored moving averages. If `None`, the parameters with which this
186
+ `ExponentialMovingAverage` was initialized will be used.
187
+ """
188
+ parameters = list(parameters)
189
+ for s_param, param in zip(self.shadow_params, parameters):
190
+ param.data.copy_(s_param.to(param.device).data)
191
+
192
+ def to(self, device=None, dtype=None) -> None:
193
+ r"""Move internal buffers of the ExponentialMovingAverage to `device`.
194
+
195
+ Args:
196
+ device: like `device` argument to `torch.Tensor.to`
197
+ """
198
+ # .to() on the tensors handles None correctly
199
+ self.shadow_params = [
200
+ p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
201
+ for p in self.shadow_params
202
+ ]
203
+
204
+ def state_dict(self) -> dict:
205
+ r"""
206
+ Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
207
+ checkpointing to save the ema state dict.
208
+ """
209
+ # Following PyTorch conventions, references to tensors are returned:
210
+ # "returns a reference to the state and not its copy!" -
211
+ # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict
212
+ return {
213
+ "decay": self.decay,
214
+ "min_decay": self.min_decay,
215
+ "optimization_step": self.optimization_step,
216
+ "update_after_step": self.update_after_step,
217
+ "use_ema_warmup": self.use_ema_warmup,
218
+ "inv_gamma": self.inv_gamma,
219
+ "power": self.power,
220
+ "shadow_params": self.shadow_params,
221
+ }
222
+
223
+ def store(self, parameters: Iterable[torch.nn.Parameter]) -> None:
224
+ r"""
225
+ Save the current parameters for restoring later.
226
+ Args:
227
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
228
+ temporarily stored.
229
+ """
230
+ self.temp_stored_params = [param.detach().cpu().clone() for param in parameters]
231
+
232
+ def restore(self, parameters: Iterable[torch.nn.Parameter]) -> None:
233
+ r"""
234
+ Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters
235
+ without affecting the original optimization process. Store the parameters before the `copy_to()` method.
236
+ After validation (or model saving), use this to restore the former parameters.
237
+ Args:
238
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
239
+ updated with the stored parameters. If `None`, the parameters with which this
240
+ `ExponentialMovingAverage` was initialized will be used.
241
+ """
242
+ if self.temp_stored_params is None:
243
+ raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights to `restore()`")
244
+ for c_param, param in zip(self.temp_stored_params, parameters):
245
+ param.data.copy_(c_param.data)
246
+
247
+ # Better memory-wise.
248
+ self.temp_stored_params = None
249
+
250
+ def load_state_dict(self, state_dict: dict) -> None:
251
+ r"""
252
+ Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to
253
+ restore the ema state dict.
254
+ Args:
255
+ state_dict (dict): EMA state. Should be an object returned
256
+ from a call to :meth:`state_dict`.
257
+ """
258
+ # deepcopy, to be consistent with module API
259
+ state_dict = copy.deepcopy(state_dict)
260
+
261
+ self.decay = state_dict.get("decay", self.decay)
262
+ if self.decay < 0.0 or self.decay > 1.0:
263
+ raise ValueError("Decay must be between 0 and 1")
264
+
265
+ self.min_decay = state_dict.get("min_decay", self.min_decay)
266
+ if not isinstance(self.min_decay, float):
267
+ raise ValueError("Invalid min_decay")
268
+
269
+ self.optimization_step = state_dict.get("optimization_step", self.optimization_step)
270
+ if not isinstance(self.optimization_step, int):
271
+ raise ValueError("Invalid optimization_step")
272
+
273
+ self.update_after_step = state_dict.get("update_after_step", self.update_after_step)
274
+ if not isinstance(self.update_after_step, int):
275
+ raise ValueError("Invalid update_after_step")
276
+
277
+ self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup)
278
+ if not isinstance(self.use_ema_warmup, bool):
279
+ raise ValueError("Invalid use_ema_warmup")
280
+
281
+ self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma)
282
+ if not isinstance(self.inv_gamma, (float, int)):
283
+ raise ValueError("Invalid inv_gamma")
284
+
285
+ self.power = state_dict.get("power", self.power)
286
+ if not isinstance(self.power, (float, int)):
287
+ raise ValueError("Invalid power")
288
+
289
+ shadow_params = state_dict.get("shadow_params", None)
290
+ if shadow_params is not None:
291
+ self.shadow_params = shadow_params
292
+ if not isinstance(self.shadow_params, list):
293
+ raise ValueError("shadow_params must be a list")
294
+ if not all(isinstance(p, torch.Tensor) for p in self.shadow_params):
295
+ raise ValueError("shadow_params must all be Tensors")
296
+
297
+
298
+ # calculates entropy over each pixel distribution
299
+ def pixel_entropy_per_percent_masked_bucket(logits, input_ids, mask_id):
300
+ # only calculated entropy over image tokens that were masked in the original image
301
+ masked_tokens = input_ids == mask_id
302
+ num_masked_pixels = masked_tokens.sum(-1)
303
+
304
+ probs = F.softmax(logits, dim=-1)
305
+ log_probs = F.log_softmax(logits, dim=-1)
306
+
307
+ entropy_per_pixel = -((probs * log_probs).sum(-1))
308
+
309
+ # the predictions for non-masked aren't used, so set their entropies to zero
310
+ entropy_per_pixel[~masked_tokens] = 0
311
+
312
+ entropy_per_image_numerator = entropy_per_pixel.sum(-1)
313
+ entropy_per_image = entropy_per_image_numerator / num_masked_pixels
314
+
315
+ total_buckets = 10
316
+ masked_buckets = input_ids_to_masked_buckets(input_ids, mask_id, total_buckets)
317
+
318
+ entropy_by_masked_bucket = average_by_buckets(entropy_per_image, masked_buckets, total_buckets)
319
+
320
+ return entropy_by_masked_bucket
321
+
322
+
323
+ # calculates entropy over the averaged distribution of pixels for the whole image
324
+ def image_entropy_per_percent_masked_bucket(logits, input_ids, mask_id):
325
+ # only calculated entropy over image tokens that were masked in the original image
326
+ masked_tokens = input_ids == mask_id
327
+ num_masked_pixels = masked_tokens.sum(-1, keepdim=True)
328
+
329
+ pixel_probs = F.softmax(logits, dim=-1)
330
+ pixel_probs[~masked_tokens] = 0
331
+ image_probs_numerator = pixel_probs.sum(-2)
332
+ image_probs = image_probs_numerator / num_masked_pixels
333
+
334
+ image_log_probs = image_probs.log()
335
+
336
+ entropy_per_image = -((image_probs * image_log_probs).sum(-1))
337
+
338
+ total_buckets = 10
339
+ masked_buckets = input_ids_to_masked_buckets(input_ids, mask_id, total_buckets)
340
+
341
+ entropy_by_masked_bucket = average_by_buckets(entropy_per_image, masked_buckets, total_buckets)
342
+
343
+ return entropy_by_masked_bucket
344
+
345
+
346
+ def cross_entropy_per_percent_masked_bucket(logits, labels, input_ids, mask_id, output_size, label_smoothing):
347
+ cross_entropy_per_image = F.cross_entropy(
348
+ logits.view(-1, output_size),
349
+ labels.view(-1),
350
+ ignore_index=-100,
351
+ label_smoothing=label_smoothing,
352
+ reduction="none",
353
+ )
354
+
355
+ total_buckets = 10
356
+ masked_buckets = input_ids_to_masked_buckets(input_ids, mask_id, total_buckets)
357
+
358
+ cross_entropy_by_percent_masked_bucket = average_by_buckets(cross_entropy_per_image, masked_buckets, total_buckets)
359
+
360
+ return cross_entropy_by_percent_masked_bucket
361
+
362
+
363
+ def token_probability_distributions_per_percent_masked_bucket(logits, input_ids, mask_id):
364
+ probs = F.softmax(logits, dim=-1)
365
+
366
+ total_buckets = 10
367
+ masked_buckets = input_ids_to_masked_buckets(input_ids, mask_id, total_buckets)
368
+
369
+ data = []
370
+
371
+ for bucket_idx in range(total_buckets):
372
+ indices_for_bucket = masked_buckets[masked_buckets == bucket_idx]
373
+
374
+ # It's ok if none were noised in the range of this bucket. This
375
+ # function will be called for a later training step where it's likely
376
+ # there will be an element noised in the range.
377
+ if indices_for_bucket.shape[0] == 0:
378
+ continue
379
+
380
+ index_for_bucket = indices_for_bucket[0]
381
+
382
+ image_probs = probs[index_for_bucket]
383
+
384
+ # find the index of a masked pixel for the image
385
+ input_ids_for_image = input_ids[index_for_bucket]
386
+ masked_pixels_probs = image_probs[input_ids_for_image == mask_id]
387
+
388
+ masked_pixel_probs = masked_pixels_probs[0]
389
+
390
+ masked_pixel_probs = masked_pixel_probs.cpu().numpy()
391
+
392
+ for masked_pixel_prob in masked_pixel_probs:
393
+ data.append({"bucket": bucket_idx, "masked_pixel_prob": masked_pixel_prob})
394
+
395
+ df = pd.DataFrame(data)
396
+
397
+ return df
398
+
399
+
400
+ def average_by_buckets(values, masked_buckets, total_buckets):
401
+ unique_buckets, bucket_counts = masked_buckets.unique(dim=0, return_counts=True)
402
+
403
+ numerator = torch.zeros(total_buckets, device=values.device)
404
+
405
+ numerator.scatter_add_(0, masked_buckets, values)
406
+
407
+ # default value is one because the buckets for which there aren't
408
+ # any values will have a numerator of zero. So we just need to not divide
409
+ # by zero.
410
+ denominator = torch.ones(total_buckets, device=values.device, dtype=torch.long)
411
+ denominator[unique_buckets] = bucket_counts
412
+
413
+ averaged_by_buckets = numerator / denominator
414
+
415
+ return averaged_by_buckets
416
+
417
+
418
+ def input_ids_to_masked_buckets(input_ids, mask_id, total_buckets=10):
419
+ assert total_buckets == 10
420
+
421
+ masked_percent = (input_ids == mask_id).sum(-1) / input_ids.shape[-1]
422
+
423
+ # we do not formally use timesteps to noise images. Instead, we mask a percent
424
+ # of the pixels. We don't want to log entropy for every mask percent between 0 and 1,
425
+ # and we also want to track how the entropy evolves over time w/in a range of mask
426
+ # percents that should have similar entropy. So we bucket the masked percents into a
427
+ # fixed number of buckets
428
+
429
+ # we could generalize this later if needed but for now, let's just assume a fixed
430
+ # number of 10 buckets.
431
+
432
+ # How this maps to a bucket index:
433
+ # (mask) * bucket_index +
434
+ # (mask_1) * bucket_index_1
435
+ #
436
+ # -> Where the mask is true will be set to the expected bucket index,
437
+ # where the mask is false will be set to 0.
438
+ #
439
+ # Given the probabilities are between 0 and 1, each masked_percent will get mapped
440
+ # to a timestep by one and only one of the masks.
441
+
442
+ masked_buckets = (
443
+ ((0 < masked_percent) & (masked_percent <= 0.1)) * 0
444
+ + ((0.1 < masked_percent) & (masked_percent <= 0.2)) * 1
445
+ + ((0.2 < masked_percent) & (masked_percent <= 0.3)) * 2
446
+ + ((0.3 < masked_percent) & (masked_percent <= 0.4)) * 3
447
+ + ((0.4 < masked_percent) & (masked_percent <= 0.5)) * 4
448
+ + ((0.5 < masked_percent) & (masked_percent <= 0.6)) * 5
449
+ + ((0.6 < masked_percent) & (masked_percent <= 0.7)) * 6
450
+ + ((0.7 < masked_percent) & (masked_percent <= 0.8)) * 7
451
+ + ((0.8 < masked_percent) & (masked_percent <= 0.9)) * 8
452
+ + ((0.9 < masked_percent) & (masked_percent <= 1.0)) * 9
453
+ )
454
+
455
+ return masked_buckets
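A small sketch of how the bucket utilities above fit together (the mask id and tensor sizes are hypothetical): each sequence is assigned to a bucket by its masked fraction, and a per-image metric is then averaged within each bucket.

import torch

mask_id = 126336                              # hypothetical mask token id
input_ids = torch.randint(0, 1000, (4, 256))  # (batch, seq) of token ids
input_ids[:, :100] = mask_id                  # ~39% masked -> bucket index 3

per_image_loss = torch.rand(4)                # e.g. per-image cross entropy
buckets = input_ids_to_masked_buckets(input_ids, mask_id)      # shape (4,), values in 0..9
print(average_by_buckets(per_image_loss, buckets, total_buckets=10))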
training/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # from .mmada_grpo_trainer import DiffusionGRPOTrainer
training/prompting_utils.py ADDED
@@ -0,0 +1,475 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 MMaDA team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ reserved_token_mapping = {
18
+ '<|soi|>': 126084,
19
+ '<|eoi|>': 126085,
20
+ '<|sov|>': 126086,
21
+ '<|eov|>': 126087,
22
+ '<|t2i|>': 126088,
23
+ '<|mmu|>': 126089,
24
+ '<|t2v|>': 126090,
25
+ '<|v2v|>': 126091,
26
+ '<|lvg|>': 126092,
27
+ '[iPAD]': 126093,
28
+ '<|r2i|>': 126094,
29
+ }
30
+
31
+
32
+ import torch
33
+ class UniversalPrompting():
34
+ def __init__(self, text_tokenizer,
35
+ special_tokens=("<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>", "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>"),
36
+ max_text_len=8000, max_seq_len=377, ignore_id=-100, cond_dropout_prob=0.1, use_reserved_token=False):
37
+ """
38
+ :param text_tokenizer: original text tokenizer
39
+ """
40
+ if not use_reserved_token:
41
+ self.text_tokenizer = text_tokenizer
42
+ self.text_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
43
+ self.text_tokenizer.add_tokens(list(special_tokens))
44
+ self.sptids_dict = {token: torch.tensor(self.text_tokenizer.convert_tokens_to_ids([token])) for token in
45
+ special_tokens}
46
+ self.sptids_dict['<|sot|>'] = torch.tensor([self.text_tokenizer.bos_token_id])
47
+ self.sptids_dict['<|eot|>'] = torch.tensor([self.text_tokenizer.eos_token_id])
48
+ self.sptids_dict['<|pad|>'] = torch.tensor([self.text_tokenizer.pad_token_id])
49
+ else:
50
+ self.text_tokenizer = text_tokenizer
51
+ self.sptids_dict = {}
52
+ for token, token_id in reserved_token_mapping.items():
53
+ self.sptids_dict[token] = torch.tensor([token_id])
54
+ self.sptids_dict['<|sot|>'] = torch.tensor([self.text_tokenizer.bos_token_id])
55
+ self.sptids_dict['<|eot|>'] = torch.tensor([self.text_tokenizer.eos_token_id])
56
+ end_header_tokens = self.text_tokenizer.convert_tokens_to_ids(['<|end_header_id|>'])
57
+ if end_header_tokens and len(end_header_tokens) > 0 and end_header_tokens[0]:
58
+ self.sptids_dict['<|end_header_id|>'] = torch.tensor(end_header_tokens)
59
+ self.sptids_dict['<|eot_id|>'] = torch.tensor(self.text_tokenizer.convert_tokens_to_ids(['<|eot_id|>']))
60
+ self.sptids_dict['<|start_header_id|>'] = torch.tensor(self.text_tokenizer.convert_tokens_to_ids(['<|start_header_id|>']))
61
+ else:
62
+ special_tokens_dict = {
63
+ 'additional_special_tokens': [
64
+ '<|start_header_id|>',
65
+ '<|end_header_id|>',
66
+ '<|eot_id|>'
67
+ ]
68
+ }
69
+ num_added = self.text_tokenizer.add_special_tokens(special_tokens_dict)
70
+ new_token_id = self.text_tokenizer.convert_tokens_to_ids(['<|end_header_id|>'])
71
+ self.sptids_dict['<|end_header_id|>'] = torch.tensor(new_token_id)
72
+ self.sptids_dict['<|eot_id|>'] = torch.tensor(self.text_tokenizer.convert_tokens_to_ids(['<|eot_id|>']))
73
+ self.sptids_dict['<|start_header_id|>'] = torch.tensor(self.text_tokenizer.convert_tokens_to_ids(['<|start_header_id|>']))
74
+ # plus 1 because at this time we add a task token before
75
+ print(f"self.sptids_dict: {self.sptids_dict}")
76
+ self.max_text_len = max_text_len + 1
77
+ self.pad_id = reserved_token_mapping['[iPAD]']
78
+ self.ignore_id = ignore_id
79
+ self.cond_dropout_prob = cond_dropout_prob
80
+
81
+ def t2i_prompt(self, text_ids, image_ids, labels):
82
+
83
+ device = image_ids.device
84
+ sequence_ids = []
85
+ attention_masks = []
86
+ label_ids = []
87
+ probs = torch.rand(len(text_ids))
88
+ for i in range(len(text_ids)):
89
+
90
+ if len(text_ids[i]) == 0:
91
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
92
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
93
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
94
+
95
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
96
+
97
+ # randomly dropout text condition
98
+ if probs[i] < self.cond_dropout_prob:
99
+ temp_ids = [int(self.sptids_dict['<|t2i|>']), self.text_tokenizer.bos_token_id, self.text_tokenizer.eos_token_id]
100
+
101
+ if self.max_text_len >= len(temp_ids):
102
+ old_len = len(temp_ids)
103
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
104
+ temp_masks = [0] * (self.max_text_len - old_len) + [1] * (old_len + image_ids.shape[-1] + 2)
105
+ else:
106
+ # should add the eos token
107
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
108
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 2) # +2 for two special tokens
109
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
110
+ temp_label_ids = torch.cat([
111
+ # should we predict text tokens when doing image reconstruction?
112
+ torch.tensor(temp_ids).to(device),
113
+ self.sptids_dict['<|soi|>'].to(device),
114
+ labels[i],
115
+ self.sptids_dict['<|eoi|>'].to(device)
116
+ ], dim=0)
117
+
118
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
119
+
120
+ temp_ids = torch.cat([
121
+ torch.tensor(temp_ids).to(device),
122
+ self.sptids_dict['<|soi|>'].to(device),
123
+ image_ids[i],
124
+ self.sptids_dict['<|eoi|>'].to(device)
125
+ ], dim=0)
126
+
127
+ # sequence_ids: [pad]...[pad] <|t2i|> <bos> text_1 ... text_n <eos> <|soi|> image_1 ... image_m <|eoi|>
128
+ temp_masks = torch.tensor(temp_masks).to(device)
129
+ sequence_ids.append(temp_ids.unsqueeze(0))
130
+ attention_masks.append(temp_masks.unsqueeze(0))
131
+ label_ids.append(temp_label_ids.unsqueeze(0))
132
+
133
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
134
+
135
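+ # Illustrative sketch (not part of the original commit): with toy ids, assuming
+ # [iPAD]=0, <|t2i|>=5, bos=1, eos=2, <|soi|>=3, <|eoi|>=4, max_text_len=6, a
+ # caption tokenized as [1, 7, 8, 9] plus a 4-token image [11, 12, 13, 14] yields
+ #
+ # sequence_ids : [5, 1, 7, 8, 9, 2, 3, 11, 12, 13, 14, 4]
+ # attention : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+ #
+ # a shorter caption is left-padded with [iPAD] and those positions get mask 0.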
+ def t2i_gen_prompt(self, text_ids, image_ids):
136
+
137
+ device = image_ids.device
138
+ sequence_ids = []
139
+ attention_masks = []
140
+ for i in range(len(text_ids)):
141
+ if len(text_ids[i]) == 0:
142
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
143
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
144
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
145
+ # note that the llama3 tokenizer automatically adds the begin-of-text (bos) token but not the end-of-text (eos) token
146
+ temp_ids = [int(self.sptids_dict['<|t2i|>'])] + text_ids[i] + [self.text_tokenizer.eos_token_id]
147
+ if self.max_text_len >= len(temp_ids):
148
+ old_len = len(temp_ids)
149
+ temp_ids = [self.pad_id] * (self.max_text_len - len(temp_ids)) + temp_ids
150
+ temp_masks = [0] * (self.max_text_len - old_len) + [1] * (old_len + image_ids.shape[-1] + 2)
151
+ else:
152
+ # should add the eos token
153
+ temp_ids = temp_ids[:self.max_text_len - 1] + [self.text_tokenizer.eos_token_id]
154
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 2) # +2 for two special tokens
155
+
156
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
157
+ temp_ids = torch.cat([
158
+ torch.tensor(temp_ids).to(device),
159
+ self.sptids_dict['<|soi|>'].to(device),
160
+ image_ids[i],
161
+ self.sptids_dict['<|eoi|>'].to(device)
162
+ ], dim=0)
163
+
164
+ temp_masks = torch.tensor(temp_masks).to(device)
165
+ sequence_ids.append(temp_ids.unsqueeze(0))
166
+ attention_masks.append(temp_masks.unsqueeze(0))
167
+
168
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0)
169
+
170
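+ # Illustrative note (not part of the original commit): at generation time the
+ # image_ids handed to t2i_gen_prompt are typically placeholders (e.g. a grid of
+ # mask-token ids) that the sampler fills in. A hedged usage sketch, assuming a
+ # hypothetical mask_token_id, a 1024-token image grid, and `uni_prompting` as an
+ # instance of this class:
+ #
+ # blank_image = torch.full((len(prompts), 1024), mask_token_id, dtype=torch.long)
+ # input_ids, attention_mask = uni_prompting((prompts, blank_image), 't2i_gen')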
+ # language modeling
171
+ def lm_prompt(self, text_ids, max_seq_len):
172
+ sequence_ids = []
173
+ attention_masks = []
174
+ label_ids = []
175
+ for i in range(len(text_ids)):
176
+ if len(text_ids[i]) == 0:
177
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
178
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
179
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
180
+
181
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
182
+
183
+ if max_seq_len >= len(temp_ids):
184
+ temp_labels_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_seq_len - len(temp_ids))
185
+ temp_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_seq_len - len(temp_ids))
186
+ temp_masks = [1] * len(temp_ids) + [0] * (max_seq_len - len(temp_ids))
187
+ else:
188
+ # In language modeling, we only process text tokens. We do not add the eos token if the text length
189
+ # exceeds the max sequence length
190
+ temp_labels_ids = temp_ids[:max_seq_len]
191
+ temp_ids = temp_ids[:max_seq_len]
192
+ temp_masks = [1] * len(temp_ids) # already at max_seq_len, no padding to mask
193
+
194
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
195
+ temp_ids = torch.tensor(temp_ids)
196
+ temp_masks = torch.tensor(temp_masks)
197
+ temp_labels_ids = torch.tensor(temp_labels_ids)
198
+ sequence_ids.append(temp_ids.unsqueeze(0))
199
+ attention_masks.append(temp_masks.unsqueeze(0))
200
+ label_ids.append(temp_labels_ids.unsqueeze(0))
201
+
202
+ # input_ids, masks, labels
203
+ return torch.cat(sequence_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(label_ids, dim=0)
204
+
205
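+ # Illustrative note (not part of the original commit): lm_prompt right-pads both
+ # the input ids and the labels with eos rather than [iPAD], so the model is also
+ # trained to emit eos in the padded region. Toy example with bos=1, eos=2 and
+ # max_seq_len=6:
+ #
+ # text [1, 7, 8] -> ids [1, 7, 8, 2, 2, 2]
+ # labels [1, 7, 8, 2, 2, 2]
+ # mask [1, 1, 1, 1, 1, 1] (computed after padding, hence all ones)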
+ # language modeling
206
+ def lm_chat_prompt(self, text_ids, max_seq_len):
207
+ sequence_ids = []
208
+ prompt_masks = []
209
+ label_ids = []
210
+
211
+ for i in range(len(text_ids)):
212
+ if len(text_ids[i]) == 0:
213
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
214
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
215
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
216
+
217
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
218
+
219
+ if max_seq_len >= len(temp_ids):
220
+ temp_labels_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_seq_len - len(temp_ids))
221
+ temp_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_seq_len - len(temp_ids))
222
+ else:
223
+ # In language modeling, we only process text tokens. We do not add the eos token if the text length
224
+ # exceeds the max sequence length
225
+ temp_labels_ids = temp_ids[:max_seq_len]
226
+ temp_ids = temp_ids[:max_seq_len]
227
+
228
+ end_header_id = int(self.sptids_dict['<|end_header_id|>'])
229
+ end_header_pos = -1
230
+ for pos in range(len(temp_ids) - 1, -1, -1): # search the text sequence backwards for <|end_header_id|>
231
+ if temp_ids[pos] == end_header_id:
232
+ end_header_pos = pos
233
+ break
234
+ if end_header_pos != -1:
235
+ prompt_length = end_header_pos + 1
236
+ else:
237
+ prompt_length = 0
238
+ temp_masks = [1] * prompt_length + [0] * (len(temp_ids) - prompt_length)
239
+
240
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
241
+ temp_ids = torch.tensor(temp_ids)
242
+ temp_masks = torch.tensor(temp_masks)
243
+ temp_labels_ids = torch.tensor(temp_labels_ids)
244
+ sequence_ids.append(temp_ids.unsqueeze(0))
245
+ prompt_masks.append(temp_masks.unsqueeze(0))
246
+ label_ids.append(temp_labels_ids.unsqueeze(0))
247
+
248
+ # input_ids, masks, labels
249
+ return torch.cat(sequence_ids, dim=0), torch.cat(prompt_masks, dim=0), torch.cat(label_ids, dim=0)
250
+
251
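+ # Illustrative note (not part of the original commit): unlike lm_prompt,
+ # lm_chat_prompt returns a *prompt* mask rather than an attention mask: positions
+ # up to and including the last <|end_header_id|> (the chat prompt plus the
+ # assistant header) are 1 and the assistant reply positions are 0, so the caller
+ # can restrict the loss or the masking schedule to the response. A toy
+ # reproduction of this logic is sketched under the __main__ guard at the end of
+ # this file.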
+ def mmu_prompt(self, image_ids, text_ids):
252
+ device = image_ids.device
253
+ sequence_ids = []
254
+ prompt_masks = []
255
+ label_ids = []
256
+ max_text_len = self.max_text_len - 1
257
+ for i in range(len(text_ids)):
258
+ # note that the llama3 tokenizer automatically adds the begin-of-text (bos) token but not the end-of-text (eos) token
259
+ # for empty list []
260
+
261
+ if len(text_ids[i]) == 0:
262
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
263
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
264
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
265
+
266
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
267
+
268
+ if max_text_len >= len(temp_ids):
269
+ # minus 1 because the task token is prepended before the image tokens
270
+ temp_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_text_len - len(temp_ids))
271
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) + [0] * (max_text_len - len(temp_ids))
272
+ else:
273
+ # should add the eos token
274
+ temp_ids = temp_ids[:max_text_len - 1] + [self.text_tokenizer.eos_token_id]
275
+ temp_masks = [1] * (len(temp_ids) + image_ids.shape[-1] + 3) # +3 for the task token, <|soi|> and <|eoi|>
276
+
277
+ # prompting -- [task token] [sot] [text tokens] [eot] [soi] [image tokens] [eoi]
278
+ temp_label_ids = torch.cat([
279
+ torch.tensor([self.ignore_id]).to(device),
280
+ torch.tensor([self.ignore_id]).to(device),
281
+ torch.ones_like(image_ids[i]) * self.ignore_id,
282
+ torch.tensor([self.ignore_id]).to(device),
283
+ torch.tensor(temp_ids).to(device),
284
+ ], dim=0)
285
+
286
+ temp_label_ids = torch.where(temp_label_ids == self.pad_id, self.ignore_id, temp_label_ids)
287
+
288
+ return_temp_ids = torch.cat([
289
+ self.sptids_dict['<|mmu|>'].to(device), # task token
290
+ self.sptids_dict['<|soi|>'].to(device),
291
+ image_ids[i],
292
+ self.sptids_dict['<|eoi|>'].to(device),
293
+ torch.tensor(temp_ids).to(device),
294
+ ], dim=0)
295
+ end_header_id = int(self.sptids_dict['<|end_header_id|>'])
296
+ end_header_pos = -1
297
+ for pos in range(len(temp_ids) - 1, -1, -1):
298
+ if temp_ids[pos] == end_header_id:
299
+ end_header_pos = pos
300
+ break
301
+ if end_header_pos != -1:
302
+ prompt_length = len(return_temp_ids) - len(temp_ids) + end_header_pos + 1
303
+ else:
304
+ prompt_length = len(return_temp_ids) - len(temp_ids)
305
+ predict_length = len(return_temp_ids) - prompt_length
306
+ prompt_mask = [1] * prompt_length + [0] * predict_length
307
+ prompt_mask = torch.tensor(prompt_mask).to(device)
308
+ sequence_ids.append(return_temp_ids.unsqueeze(0))
309
+ prompt_masks.append(prompt_mask.unsqueeze(0))
310
+ label_ids.append(temp_label_ids.unsqueeze(0))
311
+
312
+ return torch.cat(sequence_ids, dim=0), torch.cat(prompt_masks, dim=0), torch.cat(label_ids, dim=0)
313
+
314
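+ # Illustrative sketch (not part of the original commit): mmu_prompt produces
+ #
+ # sequence : <|mmu|> <|soi|> image_1 ... image_m <|eoi|> text_1 ... text_n
+ # labels : ignore ignore ignore ... ignore ignore text_1 ... text_n
+ #
+ # so only the text answer contributes to the loss, while the prompt mask keeps
+ # the task token, the image block and the chat header fixed (1) and leaves the
+ # reply positions free (0).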
+ def mmu_gen_prompt(self, image_ids, text_ids):
315
+ device = image_ids.device
316
+ sequence_ids = []
317
+ prompt_masks = []
318
+ max_text_len = self.max_text_len - 1
319
+ for i in range(len(text_ids)):
320
+
321
+ if len(text_ids[i]) == 0:
322
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
323
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
324
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
325
+
326
+ temp_ids = text_ids[i] + [self.text_tokenizer.eos_token_id]
327
+
328
+ if max_text_len >= len(temp_ids):
329
+ # minus 1 because the task token is prepended before the image tokens
330
+ temp_ids = temp_ids + [self.text_tokenizer.eos_token_id] * (max_text_len - len(temp_ids))
331
+ else:
332
+ # should add the eos token
333
+ temp_ids = temp_ids[:max_text_len - 1] + [self.text_tokenizer.eos_token_id]
334
+
335
+ # print(f"mmu temp_ids: {temp_ids}")
336
+ return_temp_ids = torch.cat([
337
+ self.sptids_dict['<|mmu|>'].to(device), # task token
338
+ self.sptids_dict['<|soi|>'].to(device),
339
+ image_ids[i],
340
+ self.sptids_dict['<|eoi|>'].to(device),
341
+ torch.tensor(temp_ids).to(device),
342
+ ], dim=0)
343
+
344
+ end_header_id = int(self.sptids_dict['<|end_header_id|>'])
345
+ end_header_pos = -1
346
+ for pos in range(len(temp_ids) - 1, -1, -1):
347
+ if temp_ids[pos] == end_header_id:
348
+ end_header_pos = pos
349
+ break
350
+ if end_header_pos != -1:
351
+ prompt_length = len(return_temp_ids) - len(temp_ids) + end_header_pos + 1
352
+ else:
353
+ prompt_length = len(return_temp_ids) - len(temp_ids)
354
+ predict_length = len(return_temp_ids) - prompt_length # mask must span the full sequence, as in mmu_prompt
355
+ print(f"prompt_length: {prompt_length}, predict_length: {predict_length}, all length: {len(return_temp_ids)}, {return_temp_ids[-predict_length:]}")
356
+ prompt_mask = [1] * prompt_length + [0] * predict_length
357
+ prompt_mask = torch.tensor(prompt_mask).to(device)
358
+ sequence_ids.append(return_temp_ids.unsqueeze(0))
359
+ prompt_masks.append(prompt_mask.unsqueeze(0))
360
+ return torch.cat(sequence_ids, dim=0), torch.cat(prompt_masks, dim=0)
361
+
362
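+ # Illustrative note (not part of the original commit): mmu_gen_prompt is the
+ # inference-time counterpart of mmu_prompt (same layout, no labels). It is not
+ # wired into __call__ below, so it would be invoked directly on tokenized
+ # questions. A hedged sketch (`uni_prompting` and `image_tokens` are assumed names):
+ #
+ # q_ids = uni_prompting.text_tokenizer(questions)['input_ids']
+ # input_ids, prompt_masks = uni_prompting.mmu_gen_prompt(image_tokens, q_ids)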
+ def r2i_prompt(self, image_ids, text_ids):
363
+ device = image_ids.device
364
+ sequence_ids = []
365
+ prompt_masks = []
366
+ label_ids = []
367
+ r2i_id = int(self.sptids_dict['<|r2i|>'])
368
+ soi_id = int(self.sptids_dict['<|soi|>'])
369
+ eoi_id = int(self.sptids_dict['<|eoi|>'])
370
+ max_text_len = self.max_text_len - 1 # e.g. 512; covers bos, the text tokens and eos
371
+ for i in range(len(text_ids)):
372
+ # note that the llama3 tokenizer automatically adds the begin-of-text (bos) token but not the end-of-text (eos) token
373
+ # for empty list []
374
+ if len(text_ids[i]) == 0:
375
+ text_ids[i] = [self.text_tokenizer.bos_token_id]
376
+ elif text_ids[i][0] != self.text_tokenizer.bos_token_id:
377
+ text_ids[i] = [self.text_tokenizer.bos_token_id] + text_ids[i]
378
+ text_ids_with_bos_eos = text_ids[i] + [self.text_tokenizer.eos_token_id]
379
+ if max_text_len >= len(text_ids_with_bos_eos):
380
+ # minus 1 because the task token is prepended before the image tokens
381
+ text_ids_full_len = text_ids_with_bos_eos + [self.text_tokenizer.eos_token_id] * (max_text_len - len(text_ids_with_bos_eos))
382
+ else:
383
+ # should add the eos token
384
+ text_ids_full_len = text_ids_with_bos_eos[:max_text_len - 1] + [self.text_tokenizer.eos_token_id]
385
+
386
+ sequence_ids.append(torch.cat([
387
+ torch.tensor([r2i_id]).to(device), # task token
388
+ torch.tensor(text_ids_full_len).to(device),
389
+ torch.tensor([soi_id]).to(device),
390
+ image_ids[i],
391
+ torch.tensor([eoi_id]).to(device),
392
+ ], dim=0).unsqueeze(0))
393
+
394
+ end_header_id = int(self.sptids_dict['<|end_header_id|>'])
395
+ end_header_pos = -1
396
+ for pos in range(len(text_ids_full_len) - 1, -1, -1):
397
+ if text_ids_full_len[pos] == end_header_id:
398
+ end_header_pos = pos
399
+ break
400
+ prompt_mask = torch.zeros(sequence_ids[i].size(1)).to(device)
401
+ prompt_mask[0] = 1 # task_id
402
+ if end_header_pos != -1:
403
+ prompt_mask[1:end_header_pos+2] = 1
404
+ else:
405
+ prompt_mask[1:len(text_ids_full_len)+1] = 1
406
+ prompt_mask[len(text_ids_full_len)+1] = 1
407
+ prompt_mask[len(text_ids_full_len)+2+len(image_ids[i])] = 1
408
+ prompt_masks.append(prompt_mask.unsqueeze(0))
409
+
410
+ return torch.cat(sequence_ids, dim=0), torch.cat(prompt_masks, dim=0), torch.cat(sequence_ids, dim=0)
411
+
412
+
413
+
414
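+ # Illustrative sketch (not part of the original commit): r2i_prompt lays out
+ #
+ # <|r2i|> bos reasoning_text ... eos (eos-padded) <|soi|> image_1 ... image_m <|eoi|>
+ #
+ # and its prompt mask marks the task token, the text up to the last
+ # <|end_header_id|> (or the whole text when none is found), <|soi|> and <|eoi|>
+ # as fixed context; the labels returned are the sequence ids themselves.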
+ def mask_prompt(self):
415
+ pass
416
+
417
+ def __call__(self, input, task, padding=True, config=None):
418
+ """
419
+ input (tuple) : data pairs containing text (str), image tokens (tensor), or video tokens (tensor).
420
+ task (str) : a flag indicating the current task.
421
+ """
422
+ if task == "t2i":
423
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
424
+ image_ids = input[1] # (B, #tokens)
425
+ sequence_ids_with_masks = self.t2i_prompt(text_ids, image_ids, input[2])
426
+
427
+ elif task == "t2v":
428
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
429
+ image_ids = input[1] # (B, #tokens)
430
+ sequence_ids_with_masks = self.t2v_prompt(text_ids, image_ids, input[2])
431
+
432
+ elif task == "t2i_plus_lm":
433
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
434
+ image_ids = input[1] # (B, #tokens)
435
+ sequence_ids_with_masks = self.t2i_prompt(text_ids[:config.training.batch_size], image_ids,
436
+ input[2])
437
+ sequence_ids_with_masks_lm = self.lm_prompt(text_ids[config.training.batch_size:], input[3])
438
+ return sequence_ids_with_masks, sequence_ids_with_masks_lm
439
+
440
+ elif task == "t2i_gen":
441
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
442
+ image_ids = input[1] # (B, #tokens)
443
+ sequence_ids_with_masks = self.t2i_gen_prompt(text_ids, image_ids)
444
+
445
+ elif task == "t2v_gen":
446
+ text_ids = self.text_tokenizer(input[0])['input_ids'] # (B, max_len)
447
+ image_ids = input[1] # (B, #tokens)
448
+ sequence_ids_with_masks = self.t2v_gen_prompt(text_ids, image_ids)
449
+
450
+ elif task == "lm":
451
+ text_ids = self.text_tokenizer(input[0], truncation=True)['input_ids'] # (B, max_len)
452
+ sequence_ids_with_masks = self.lm_prompt(text_ids, input[1])
453
+
454
+ elif task == "lm_chat":
455
+ text_ids = self.text_tokenizer(input[0], truncation=True)['input_ids'] # (B, max_len)
456
+ sequence_ids_with_masks = self.lm_chat_prompt(text_ids, input[1])
457
+
458
+ elif task == "mmu":
459
+ image_ids = input[0]
460
+ text_ids = self.text_tokenizer(input[1])['input_ids']
461
+ sequence_ids_with_masks = self.mmu_prompt(image_ids, text_ids)
462
+
463
+ elif task == "r2i":
464
+ image_ids = input[0]
465
+ text_ids = self.text_tokenizer(input[1])['input_ids']
466
+ sequence_ids_with_masks = self.r2i_prompt(image_ids, text_ids)
467
+
468
+ else:
469
+ raise NotImplementedError
470
+
471
+ return sequence_ids_with_masks
472
+
473
+
474
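+ # Illustrative usage sketch (not part of the original commit). The constructor
+ # arguments are assumptions inferred from the attributes used above, not a
+ # definitive signature:
+ #
+ # tokenizer = AutoTokenizer.from_pretrained(...) # model path elided
+ # uni_prompting = UniversalPrompting(tokenizer, max_text_len=512, ignore_id=-100,
+ # cond_dropout_prob=0.1, reserved_token_mapping={...})
+ # ids, attn_mask, labels = uni_prompting((captions, image_tokens, image_tokens), 't2i')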
+ if __name__ == '__main__':
475
+ pass
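+ # Illustrative, non-authoritative smoke test (not part of the original commit):
+ # it reproduces the prompt-mask logic of lm_chat_prompt with toy ids only, so no
+ # tokenizer is needed; 999 stands in for the <|end_header_id|> id.
+ toy_ids = [1, 10, 11, 999, 20, 21, 22, 2]
+ end_header_pos = -1
+ for pos in range(len(toy_ids) - 1, -1, -1):
+ if toy_ids[pos] == 999:
+ end_header_pos = pos
+ break
+ prompt_length = end_header_pos + 1 if end_header_pos != -1 else 0
+ toy_mask = torch.tensor([1] * prompt_length + [0] * (len(toy_ids) - prompt_length))
+ print(toy_mask) # expected: tensor([1, 1, 1, 1, 0, 0, 0, 0])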