// Upload folder using huggingface_hub
//
// 1d30d42 verified 4 months ago
//
// 46.4 kB

	#ifndef __MMDIT_HPP__
	#define __MMDIT_HPP__

	#include "ggml_extend.hpp"
	#include "model.h"

	#define MMDIT_GRAPH_SIZE 10240

	struct Mlp : public GGMLBlock {
	public:
	Mlp(int64_t in_features,
	int64_t hidden_features = -1,
	int64_t out_features = -1,
	bool bias = true) {
	// act_layer is always lambda: nn.GELU(approximate="tanh")
	// norm_layer is always None
	// use_conv is always False
	if (hidden_features == -1) {
	hidden_features = in_features;
	}
	if (out_features == -1) {
	out_features = in_features;
	}
	blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
	blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	// x: [N, n_token, in_features]
	auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
	auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

	x = fc1->forward(ctx, x);
	x = ggml_gelu_inplace(ctx, x);
	x = fc2->forward(ctx, x);
	return x;
	}
	};

	struct PatchEmbed : public GGMLBlock {
	    // 2D Image to Patch Embedding
	protected:
	    bool flatten;          // when true, forward() merges the two spatial axes into one token axis
	    bool dynamic_img_pad;  // when true, forward() pads W and H up to a multiple of patch_size
	    int patch_size;        // kernel size and stride of the projection convolution

	public:
	    // img_size is accepted for signature compatibility but is not used.
	    // The "proj" block is a Conv2d with kernel == stride == patch_size,
	    // padding {0, 0} and dilation {1, 1}.
	    PatchEmbed(int64_t img_size     = 224,
	               int patch_size       = 16,
	               int64_t in_chans     = 3,
	               int64_t embed_dim    = 1536,
	               bool bias            = true,
	               bool flatten         = true,
	               bool dynamic_img_pad = true)
	        // Initializers are listed in member declaration order, which is the
	        // order C++ actually runs them in.
	        : flatten(flatten),
	          dynamic_img_pad(dynamic_img_pad),
	          patch_size(patch_size) {
	        // img_size is always None
	        // patch_size is always 2
	        // in_chans is always 16
	        // norm_layer is always False
	        // strict_img_size is always true, but not used
	        (void)img_size;

	        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_chans,
	                                                               embed_dim,
	                                                               {patch_size, patch_size},
	                                                               {patch_size, patch_size},
	                                                               {0, 0},
	                                                               {1, 1},
	                                                               bias));
	    }

	    // Optionally pads the input, projects it with the patch convolution and,
	    // when `flatten` is set, reshapes/permutes the result into a token sequence.
	    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	        // x: [N, C, H, W]
	        // return: [N, H*W, embed_dim]
	        auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);

	        if (dynamic_img_pad) {
	            const int64_t W = x->ne[0];
	            const int64_t H = x->ne[1];
	            // The remainder is always in [0, patch_size), so the narrowing cast is safe.
	            const int pad_h = static_cast<int>((patch_size - H % patch_size) % patch_size);
	            const int pad_w = static_cast<int>((patch_size - W % patch_size) % patch_size);
	            x               = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
	        }
	        x = proj->forward(ctx, x);

	        if (flatten) {
	            // Merge ne[0] and ne[1] into one axis, then swap it with the channel axis.
	            x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
	            x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
	        }
	        return x;
	    }
	};

	struct TimestepEmbedder : public GGMLBlock {
	// Embeds scalar timesteps into vector representations.
	protected:
	int64_t frequency_embedding_size;

	public:
	TimestepEmbedder(int64_t hidden_size,
	int64_t frequency_embedding_size = 256)
	: frequency_embedding_size(frequency_embedding_size) {
	blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
	blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
	// t: [N, ]
	// return: [N, hidden_size]
	auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
	auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

	auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]

	auto t_emb = mlp_0->forward(ctx, t_freq);
	t_emb = ggml_silu_inplace(ctx, t_emb);
	t_emb = mlp_2->forward(ctx, t_emb);
	return t_emb;
	}
	};

	struct VectorEmbedder : public GGMLBlock {
	// Embeds a flat vector of dimension input_dim
	public:
	VectorEmbedder(int64_t input_dim,
	int64_t hidden_size) {
	blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
	blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	// x: [N, input_dim]
	// return: [N, hidden_size]
	auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
	auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

	x = mlp_0->forward(ctx, x);
	x = ggml_silu_inplace(ctx, x);
	x = mlp_2->forward(ctx, x);
	return x;
	}
	};

	class RMSNorm : public UnaryBlock {
	    // RMS normalization followed by an element-wise multiply with a learned weight.
	protected:
	    int64_t hidden_size;  // number of elements in the weight vector
	    float eps;            // epsilon handed to ggml_rms_norm

	    // Allocates the 1-D "weight" parameter. The tensor is always created as F32;
	    // tensor_types and prefix are not consulted.
	    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
	        params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
	    }

	public:
	    RMSNorm(int64_t hidden_size,
	            float eps = 1e-06f)
	        : hidden_size(hidden_size),
	          eps(eps) {}

	    // Returns ggml_rms_norm(x, eps) * weight as a new graph node.
	    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	        struct ggml_tensor* weight = params["weight"];
	        struct ggml_tensor* normed = ggml_rms_norm(ctx, x, eps);
	        return ggml_mul(ctx, normed, weight);
	    }
	};

	class SelfAttention : public GGMLBlock {
	public:
	int64_t num_heads;
	bool pre_only;
	std::string qk_norm;

	public:
	SelfAttention(int64_t dim,
	int64_t num_heads = 8,
	std::string qk_norm = "",
	bool qkv_bias = false,
	bool pre_only = false)
	: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
	int64_t d_head = dim / num_heads;
	blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
	if (!pre_only) {
	blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
	}
	if (qk_norm == "rms") {
	blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6));
	blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6));
	} else if (qk_norm == "ln") {
	blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6));
	blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6));
	}
	}

	std::vector<struct ggml_tensor> pre_attention(struct ggml_context ctx, struct ggml_tensor* x) {
	auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);

	auto qkv = qkv_proj->forward(ctx, x);
	auto qkv_vec = split_qkv(ctx, qkv);
	int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
	auto q = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
	auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
	auto v = qkv_vec[2]; // [N, n_token, n_head*d_head]

	if (qk_norm == "rms" \|\| qk_norm == "ln") {
	auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
	auto ln_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_k"]);
	q = ln_q->forward(ctx, q);
	k = ln_k->forward(ctx, k);
	}

	q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
	k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]

	return {q, k, v};
	}

	struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
	GGML_ASSERT(!pre_only);

	auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

	x = proj->forward(ctx, x); // [N, n_token, dim]
	return x;
	}

	// x: [N, n_token, dim]
	struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
	auto qkv = pre_attention(ctx, x);
	x = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
	x = post_attention(ctx, x); // [N, n_token, dim]
	return x;
	}
	};

	// Builds x + x * scale + shift, with scale and shift given a singleton
	// middle axis before being combined with x.
	__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
	                                               struct ggml_tensor* x,
	                                               struct ggml_tensor* shift,
	                                               struct ggml_tensor* scale) {
	    // x: [N, L, C]
	    // scale: [N, C]
	    // shift: [N, C]
	    struct ggml_tensor* scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [N, 1, C]
	    struct ggml_tensor* shift_3d = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]);  // [N, 1, C]

	    struct ggml_tensor* scaled = ggml_add(ctx, x, ggml_mul(ctx, x, scale_3d));
	    return ggml_add(ctx, scaled, shift_3d);
	}

	struct DismantledBlock : public GGMLBlock {
	    // A DiT block with gated adaptive layer norm (adaLN) conditioning.
	    //
	    // The block is split into pre_attention*/post_attention* halves so that the
	    // attention itself can be run by the caller (see forward() for the
	    // stand-alone composition).
	public:
	    int64_t num_heads;
	    bool pre_only;   // no norm2/mlp/proj are created; only q, k, v are produced
	    bool self_attn;  // a second attention block "attn2" is created and 9 modulation chunks are used

	protected:
	    // Runs adaLN_modulation.1 on silu(c) and splits the result into `n_mods`
	    // views, returned in order (view i starts at chunk i of the projection).
	    // c: [N, hidden_size]; each returned view: [N, hidden_size]
	    std::vector<struct ggml_tensor*> split_modulation(struct ggml_context* ctx,
	                                                      struct ggml_tensor* c,
	                                                      int64_t n_mods) {
	        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

	        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, n_mods * hidden_size]
	        m      = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
	        m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [n_mods, N, hidden_size]

	        // Byte size of one [N, hidden_size] chunk in the contiguous tensor.
	        int64_t offset = m->nb[1] * m->ne[1];

	        std::vector<struct ggml_tensor*> chunks;
	        chunks.reserve(n_mods);
	        for (int64_t i = 0; i < n_mods; i++) {
	            chunks.push_back(ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * i));  // [N, hidden_size]
	        }
	        return chunks;
	    }

	public:
	    // Registers norm1, attn, optionally attn2 (self_attn), optionally norm2 + mlp
	    // (!pre_only), and adaLN_modulation.1 whose output width is n_mods * hidden_size
	    // with n_mods = 9 (self_attn), else 2 (pre_only), else 6.
	    DismantledBlock(int64_t hidden_size,
	                    int64_t num_heads,
	                    float mlp_ratio     = 4.0,
	                    std::string qk_norm = "",
	                    bool qkv_bias       = false,
	                    bool pre_only       = false,
	                    bool self_attn      = false)
	        : num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
	        // rmsnorm is always False
	        // scale_mod_only is always False
	        // swiglu is always False
	        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
	        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));

	        if (self_attn) {
	            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
	        }

	        if (!pre_only) {
	            blocks["norm2"]        = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
	            int64_t mlp_hidden_dim = (int64_t)(hidden_size * mlp_ratio);
	            blocks["mlp"]          = std::shared_ptr<GGMLBlock>(new Mlp(hidden_size, mlp_hidden_dim));
	        }

	        int64_t n_mods = 6;
	        if (pre_only) {
	            n_mods = 2;
	        }
	        if (self_attn) {
	            n_mods = 9;
	        }
	        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
	    }

	    // Pre-attention half for blocks built with self_attn == true.
	    // Returns {qkv of attn, qkv of attn2,
	    //          {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
	    // the last vector is exactly the argument list post_attention_x() expects after
	    // the two attention outputs.
	    std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx,
	                                                                                                                                      struct ggml_tensor* x,
	                                                                                                                                      struct ggml_tensor* c) {
	        GGML_ASSERT(self_attn);
	        // x: [N, n_token, hidden_size]
	        // c: [N, hidden_size]
	        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
	        auto attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
	        auto attn2 = std::dynamic_pointer_cast<SelfAttention>(blocks["attn2"]);

	        auto mods = split_modulation(ctx, c, 9);

	        auto shift_msa = mods[0];
	        auto scale_msa = mods[1];
	        auto gate_msa  = mods[2];

	        auto shift_mlp = mods[3];
	        auto scale_mlp = mods[4];
	        auto gate_mlp  = mods[5];

	        auto shift_msa2 = mods[6];
	        auto scale_msa2 = mods[7];
	        auto gate_msa2  = mods[8];

	        // Both attention branches share the same normalized input.
	        auto x_norm = norm1->forward(ctx, x);

	        auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa);
	        auto qkv     = attn->pre_attention(ctx, attn_in);

	        auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2);
	        auto qkv2     = attn2->pre_attention(ctx, attn2_in);

	        std::vector<struct ggml_tensor*> intermediates = {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2};
	        return std::make_tuple(qkv, qkv2, intermediates);
	    }

	    // Pre-attention half for blocks without the second attention branch.
	    // Returns {qkv, intermediates}. When !pre_only the intermediates are
	    // {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}; when pre_only they are
	    // five null pointers.
	    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
	                                                                                                struct ggml_tensor* x,
	                                                                                                struct ggml_tensor* c) {
	        // x: [N, n_token, hidden_size]
	        // c: [N, hidden_size]
	        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
	        auto attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);

	        const int64_t n_mods = pre_only ? 2 : 6;
	        auto mods            = split_modulation(ctx, c, n_mods);

	        // mods[0] = shift_msa, mods[1] = scale_msa in both configurations.
	        auto attn_in = modulate(ctx, norm1->forward(ctx, x), mods[0], mods[1]);
	        auto qkv     = attn->pre_attention(ctx, attn_in);

	        std::vector<struct ggml_tensor*> intermediates;
	        if (pre_only) {
	            intermediates.assign(5, nullptr);
	        } else {
	            // {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}
	            intermediates = {x, mods[2], mods[3], mods[4], mods[5]};
	        }
	        return std::make_pair(qkv, intermediates);
	    }

	    // Post-attention half for self_attn blocks: projects both attention outputs,
	    // adds them to x through their gates, then adds the gated, modulated MLP branch.
	    struct ggml_tensor* post_attention_x(struct ggml_context* ctx,
	                                         struct ggml_tensor* attn_out,
	                                         struct ggml_tensor* attn2_out,
	                                         struct ggml_tensor* x,
	                                         struct ggml_tensor* gate_msa,
	                                         struct ggml_tensor* shift_mlp,
	                                         struct ggml_tensor* scale_mlp,
	                                         struct ggml_tensor* gate_mlp,
	                                         struct ggml_tensor* gate_msa2) {
	        // attn_out: [N, n_token, hidden_size]
	        // attn2_out: [N, n_token, hidden_size]
	        // x: [N, n_token, hidden_size]
	        // gate_msa: [N, hidden_size]
	        // shift_mlp: [N, hidden_size]
	        // scale_mlp: [N, hidden_size]
	        // gate_mlp: [N, hidden_size]
	        // gate_msa2: [N, hidden_size]
	        // return: [N, n_token, hidden_size]
	        GGML_ASSERT(!pre_only);
	        GGML_ASSERT(self_attn);  // "attn2" only exists for self_attn blocks

	        auto attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
	        auto attn2 = std::dynamic_pointer_cast<SelfAttention>(blocks["attn2"]);
	        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
	        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

	        gate_msa  = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);     // [N, 1, hidden_size]
	        gate_mlp  = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);     // [N, 1, hidden_size]
	        gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]);  // [N, 1, hidden_size]

	        attn_out  = attn->post_attention(ctx, attn_out);
	        attn2_out = attn2->post_attention(ctx, attn2_out);

	        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
	        x            = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2));
	        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
	        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));

	        return x;
	    }

	    // Post-attention half: projects the attention output, adds it to x through
	    // gate_msa, then adds the gated, modulated MLP branch.
	    struct ggml_tensor* post_attention(struct ggml_context* ctx,
	                                       struct ggml_tensor* attn_out,
	                                       struct ggml_tensor* x,
	                                       struct ggml_tensor* gate_msa,
	                                       struct ggml_tensor* shift_mlp,
	                                       struct ggml_tensor* scale_mlp,
	                                       struct ggml_tensor* gate_mlp) {
	        // attn_out: [N, n_token, hidden_size]
	        // x: [N, n_token, hidden_size]
	        // gate_msa: [N, hidden_size]
	        // shift_mlp: [N, hidden_size]
	        // scale_mlp: [N, hidden_size]
	        // gate_mlp: [N, hidden_size]
	        // return: [N, n_token, hidden_size]
	        GGML_ASSERT(!pre_only);

	        auto attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
	        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
	        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

	        gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
	        gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]

	        attn_out = attn->post_attention(ctx, attn_out);

	        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
	        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
	        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));

	        return x;
	    }

	    // Stand-alone forward pass: pre-attention, attention on this block's own
	    // q/k/v, then post-attention. Picks the *_x variants when self_attn is set.
	    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* c) {
	        // x: [N, n_token, hidden_size]
	        // c: [N, hidden_size]
	        // return: [N, n_token, hidden_size]
	        if (self_attn) {
	            auto qkv_intermediates = pre_attention_x(ctx, x, c);
	            auto qkv               = std::get<0>(qkv_intermediates);
	            auto qkv2              = std::get<1>(qkv_intermediates);
	            auto intermediates     = std::get<2>(qkv_intermediates);

	            auto attn_out  = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);     // [N, n_token, dim]
	            auto attn2_out = ggml_nn_attention_ext(ctx, qkv2[0], qkv2[1], qkv2[2], num_heads);  // [N, n_token, dim]
	            x              = post_attention_x(ctx,
	                                              attn_out,
	                                              attn2_out,
	                                              intermediates[0],
	                                              intermediates[1],
	                                              intermediates[2],
	                                              intermediates[3],
	                                              intermediates[4],
	                                              intermediates[5]);
	            return x;  // [N, n_token, dim]
	        } else {
	            auto qkv_intermediates = pre_attention(ctx, x, c);
	            auto qkv               = qkv_intermediates.first;
	            auto intermediates     = qkv_intermediates.second;

	            auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
	            x             = post_attention(ctx,
	                                           attn_out,
	                                           intermediates[0],
	                                           intermediates[1],
	                                           intermediates[2],
	                                           intermediates[3],
	                                           intermediates[4]);
	            return x;  // [N, n_token, dim]
	        }
	    }
	};

	// Joint attention over a context stream and an x stream.
	//
	// Each stream is pre-processed by its own DismantledBlock; q, k and v of both
	// streams are concatenated (context first) and attended to together, and the
	// result is split back and handed to each block's post-attention half.
	//
	// Returns {context, x}. The returned context is null when context_block is
	// pre_only. `context` must be non-null on entry: its ne[1] is used to split
	// the joint attention output.
	__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
	block_mixing(struct ggml_context* ctx,
	             struct ggml_tensor* context,
	             struct ggml_tensor* x,
	             struct ggml_tensor* c,
	             std::shared_ptr<DismantledBlock> context_block,
	             std::shared_ptr<DismantledBlock> x_block) {
	    // context: [N, n_context, hidden_size]
	    // x: [N, n_token, hidden_size]
	    // c: [N, hidden_size]
	    auto context_qkv_intermediates = context_block->pre_attention(ctx, context, c);
	    auto context_qkv               = context_qkv_intermediates.first;
	    auto context_intermediates     = context_qkv_intermediates.second;

	    std::vector<struct ggml_tensor*> x_qkv, x_qkv2, x_intermediates;

	    if (x_block->self_attn) {
	        auto x_qkv_intermediates = x_block->pre_attention_x(ctx, x, c);
	        x_qkv                    = std::get<0>(x_qkv_intermediates);
	        x_qkv2                   = std::get<1>(x_qkv_intermediates);
	        x_intermediates          = std::get<2>(x_qkv_intermediates);
	    } else {
	        auto x_qkv_intermediates = x_block->pre_attention(ctx, x, c);
	        x_qkv                    = x_qkv_intermediates.first;
	        x_intermediates          = x_qkv_intermediates.second;
	    }

	    // Concatenate context and x along dimension 1 for each of q, k, v.
	    std::vector<struct ggml_tensor*> qkv;
	    qkv.reserve(3);
	    for (int i = 0; i < 3; i++) {
	        qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
	    }

	    auto attn = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], x_block->num_heads);  // [N, n_context + n_token, hidden_size]
	    // Move the token axis outermost so each stream's tokens form one contiguous slab.
	    attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));  // [n_context + n_token, N, hidden_size]

	    // The first context->ne[1] slabs belong to the context stream.
	    auto context_attn = ggml_view_3d(ctx,
	                                     attn,
	                                     attn->ne[0],
	                                     attn->ne[1],
	                                     context->ne[1],
	                                     attn->nb[1],
	                                     attn->nb[2],
	                                     0);                                         // [n_context, N, hidden_size]
	    context_attn      = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3));  // [N, n_context, hidden_size]

	    // The following x->ne[1] slabs belong to the x stream.
	    auto x_attn = ggml_view_3d(ctx,
	                               attn,
	                               attn->ne[0],
	                               attn->ne[1],
	                               x->ne[1],
	                               attn->nb[1],
	                               attn->nb[2],
	                               attn->nb[2] * context->ne[1]);            // [n_token, N, hidden_size]
	    x_attn      = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3));  // [N, n_token, hidden_size]

	    if (!context_block->pre_only) {
	        context = context_block->post_attention(ctx,
	                                                context_attn,
	                                                context_intermediates[0],
	                                                context_intermediates[1],
	                                                context_intermediates[2],
	                                                context_intermediates[3],
	                                                context_intermediates[4]);
	    } else {
	        // A pre_only context block has no post-attention half.
	        context = nullptr;
	    }

	    if (x_block->self_attn) {
	        // The second attention branch attends over the x stream only.
	        auto attn2 = ggml_nn_attention_ext(ctx, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads);  // [N, n_token, hidden_size]

	        x = x_block->post_attention_x(ctx,
	                                      x_attn,
	                                      attn2,
	                                      x_intermediates[0],
	                                      x_intermediates[1],
	                                      x_intermediates[2],
	                                      x_intermediates[3],
	                                      x_intermediates[4],
	                                      x_intermediates[5]);
	    } else {
	        x = x_block->post_attention(ctx,
	                                    x_attn,
	                                    x_intermediates[0],
	                                    x_intermediates[1],
	                                    x_intermediates[2],
	                                    x_intermediates[3],
	                                    x_intermediates[4]);
	    }

	    return {context, x};
	}

	struct JointBlock : public GGMLBlock {
	    // Pairs a context-stream DismantledBlock with an x-stream DismantledBlock
	    // and runs them through block_mixing().
	public:
	    // pre_only applies to the context block only; the x block is always built
	    // with pre_only == false. self_attn_x applies to the x block only.
	    JointBlock(int64_t hidden_size,
	               int64_t num_heads,
	               float mlp_ratio     = 4.0,
	               std::string qk_norm = "",
	               bool qkv_bias       = false,
	               bool pre_only       = false,
	               bool self_attn_x    = false) {
	        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only));
	        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
	    }

	    // Returns {context, x} exactly as produced by block_mixing().
	    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
	                                                                struct ggml_tensor* context,
	                                                                struct ggml_tensor* x,
	                                                                struct ggml_tensor* c) {
	        auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
	        auto x_block       = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);

	        return block_mixing(ctx, context, x, c, context_block, x_block);
	    }
	};

	struct FinalLayer : public GGMLBlock {
	// The final layer of DiT.
	public:
	FinalLayer(int64_t hidden_size,
	int64_t patch_size,
	int64_t out_channels) {
	// total_out_channels is always None
	blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
	blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
	blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
	}

	struct ggml_tensor* forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* c) {
	// x: [N, n_token, hidden_size]
	// c: [N, hidden_size]
	// return: [N, n_token, patch_size * patch_size * out_channels]
	auto norm_final = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
	auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
	auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

	auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, 2 * hidden_size]
	m = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
	m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]

	int64_t offset = m->nb[1] * m->ne[1];
	auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
	auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]

	x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
	x = linear->forward(ctx, x);

	return x;
	}
	};

	struct MMDiT : public GGMLBlock {
	// Diffusion model with a Transformer backbone.
	protected:
	int64_t input_size = -1;
	int64_t patch_size = 2;
	int64_t in_channels = 16;
	int64_t d_self = -1; // >=0 for MMdiT-X
	int64_t depth = 24;
	float mlp_ratio = 4.0f;
	int64_t adm_in_channels = 2048;
	int64_t out_channels = 16;
	int64_t pos_embed_max_size = 192;
	int64_t num_patchs = 36864; // 192 * 192
	int64_t context_size = 4096;
	int64_t context_embedder_out_dim = 1536;
	int64_t hidden_size;
	std::string qk_norm;

	void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
	enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "pos_embed") != tensor_types.end()) ? tensor_types[prefix + "pos_embed"] : GGML_TYPE_F32;
	params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
	}

	public:
	MMDiT(std::map<std::string, enum ggml_type>& tensor_types) {
	// input_size is always None
	// learn_sigma is always False
	// register_length is alwalys 0
	// rmsnorm is alwalys False
	// scale_mod_only is alwalys False
	// swiglu is alwalys False
	// qkv_bias is always True
	// context_processor_layers is always None
	// pos_embed_scaling_factor is not used
	// pos_embed_offset is not used
	// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}

	// read tensors from tensor_types
	for (auto pair : tensor_types) {
	std::string tensor_name = pair.first;
	if (tensor_name.find("model.diffusion_model.") == std::string::npos)
	continue;
	size_t jb = tensor_name.find("joint_blocks.");
	if (jb != std::string::npos) {
	tensor_name = tensor_name.substr(jb); // remove prefix
	int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13)).c_str());
	if (block_depth + 1 > depth) {
	depth = block_depth + 1;
	}
	if (tensor_name.find("attn.ln") != std::string::npos) {
	if (tensor_name.find(".bias") != std::string::npos) {
	qk_norm = "ln";
	} else {
	qk_norm = "rms";
	}
	}
	if (tensor_name.find("attn2") != std::string::npos) {
	if (block_depth > d_self) {
	d_self = block_depth;
	}
	}
	}
	}

	if (d_self >= 0) {
	pos_embed_max_size *= 2;
	num_patchs *= 4;
	}

	LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);

	int64_t default_out_channels = in_channels;
	hidden_size = 64 * depth;
	context_embedder_out_dim = 64 * depth;
	int64_t num_heads = depth;

	blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
	blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));

	if (adm_in_channels != -1) {
	blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
	}

	blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, context_embedder_out_dim, true, true));

	for (int i = 0; i < depth; i++) {
	blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
	num_heads,
	mlp_ratio,
	qk_norm,
	true,
	i == depth - 1,
	i <= d_self));
	}

	blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
	}

	struct ggml_tensor*
	cropped_pos_embed(struct ggml_context* ctx,
	int64_t h,
	int64_t w) {
	auto pos_embed = params["pos_embed"];

	h = (h + 1) / patch_size;
	w = (w + 1) / patch_size;

	GGML_ASSERT(h <= pos_embed_max_size && h > 0);
	GGML_ASSERT(w <= pos_embed_max_size && w > 0);

	int64_t top = (pos_embed_max_size - h) / 2;
	int64_t left = (pos_embed_max_size - w) / 2;

	auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);

	// spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
	spatial_pos_embed = ggml_view_3d(ctx,
	spatial_pos_embed,
	hidden_size,
	pos_embed_max_size,
	h,
	spatial_pos_embed->nb[1],
	spatial_pos_embed->nb[2],
	spatial_pos_embed->nb[2] * top); // [h, pos_embed_max_size, hidden_size]
	spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [pos_embed_max_size, h, hidden_size]
	spatial_pos_embed = ggml_view_3d(ctx,
	spatial_pos_embed,
	hidden_size,
	h,
	w,
	spatial_pos_embed->nb[1],
	spatial_pos_embed->nb[2],
	spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
	spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
	spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1); // [1, h*w, hidden_size]
	return spatial_pos_embed;
	}

	struct ggml_tensor* unpatchify(struct ggml_context* ctx,
	struct ggml_tensor* x,
	int64_t h,
	int64_t w) {
	// x: [N, HW, patch_size patch_size * C]
	// return: [N, C, H, W]
	int64_t n = x->ne[2];
	int64_t c = out_channels;
	int64_t p = patch_size;
	h = (h + 1) / p;
	w = (w + 1) / p;

	GGML_ASSERT(h * w == x->ne[1]);

	x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, HW, PP, C]
	x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, HW, PP]
	x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [NCH, W, P, P]
	x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [NCH, P, W, P]
	x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, HP, WP]
	return x;
	}

	struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* c_mod,
	struct ggml_tensor* context,
	std::vector<int> skip_layers = std::vector<int>()) {
	// x: [N, H*W, hidden_size]
	// context: [N, n_context, d_context]
	// c: [N, hidden_size]
	// return: [N, NW, patch_size patch_size * out_channels]
	auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);

	for (int i = 0; i < depth; i++) {
	// skip iteration if i is in skip_layers
	if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
	continue;
	}

	auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);

	auto context_x = block->forward(ctx, context, x, c_mod);
	context = context_x.first;
	x = context_x.second;
	}

	x = final_layer->forward(ctx, x, c_mod); // (N, T, patch_size ** 2 * out_channels)

	return x;
	}

	struct ggml_tensor* forward(struct ggml_context* ctx,
	struct ggml_tensor* x,
	struct ggml_tensor* t,
	struct ggml_tensor* y = NULL,
	struct ggml_tensor* context = NULL,
	std::vector<int> skip_layers = std::vector<int>()) {
	// Forward pass of DiT.
	// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
	// t: (N,) tensor of diffusion timesteps
	// y: (N, adm_in_channels) tensor of class labels
	// context: (N, L, D)
	// return: (N, C, H, W)
	auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
	auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);

	int64_t w = x->ne[0];
	int64_t h = x->ne[1];

	auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
	auto pos_embed = cropped_pos_embed(ctx, h, w); // [1, H*W, hidden_size]
	x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]

	auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
	if (y != NULL && adm_in_channels != -1) {
	auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);

	y = y_embedder->forward(ctx, y); // [N, hidden_size]
	c = ggml_add(ctx, c, y);
	}

	if (context != NULL) {
	auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);

	context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
	}

	x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, HW, patch_size * 2 * out_channels)

	x = unpatchify(ctx, x, h, w); // [N, C, H, W]

	return x;
	}
	};
	struct MMDiTRunner : public GGMLRunner {
	MMDiT mmdit;

	static std::map<std::string, enum ggml_type> empty_tensor_types;

	MMDiTRunner(ggml_backend_t backend,
	std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
	const std::string prefix = "")
	: GGMLRunner(backend), mmdit(tensor_types) {
	mmdit.init(params_ctx, tensor_types, prefix);
	}

	std::string get_desc() {
	return "mmdit";
	}

	void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
	mmdit.get_param_tensors(tensors, prefix);
	}

	struct ggml_cgraph* build_graph(struct ggml_tensor* x,
	struct ggml_tensor* timesteps,
	struct ggml_tensor* context,
	struct ggml_tensor* y,
	std::vector<int> skip_layers = std::vector<int>()) {
	struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);

	x = to_backend(x);
	context = to_backend(context);
	y = to_backend(y);
	timesteps = to_backend(timesteps);

	struct ggml_tensor* out = mmdit.forward(compute_ctx,
	x,
	timesteps,
	y,
	context,
	skip_layers);

	ggml_build_forward_expand(gf, out);

	return gf;
	}

	void compute(int n_threads,
	struct ggml_tensor* x,
	struct ggml_tensor* timesteps,
	struct ggml_tensor* context,
	struct ggml_tensor* y,
	struct ggml_tensor** output = NULL,
	struct ggml_context* output_ctx = NULL,
	std::vector<int> skip_layers = std::vector<int>()) {
	// x: [N, in_channels, h, w]
	// timesteps: [N, ]
	// context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
	// y: [N, adm_in_channels] or [1, adm_in_channels]
	auto get_graph = [&]() -> struct ggml_cgraph* {
	return build_graph(x, timesteps, context, y, skip_layers);
	};

	GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
	}

	void test() {
	struct ggml_init_params params;
	params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
	params.mem_buffer = NULL;
	params.no_alloc = false;

	struct ggml_context* work_ctx = ggml_init(params);
	GGML_ASSERT(work_ctx != NULL);

	{
	// cpu f16: pass
	// cpu f32: pass
	// cuda f16: pass
	// cuda f32: pass
	auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1);
	std::vector<float> timesteps_vec(1, 999.f);
	auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
	ggml_set_f32(x, 0.01f);
	// print_ggml_tensor(x);

	auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1);
	ggml_set_f32(context, 0.01f);
	// print_ggml_tensor(context);

	auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1);
	ggml_set_f32(y, 0.01f);
	// print_ggml_tensor(y);

	struct ggml_tensor* out = NULL;

	int t0 = ggml_time_ms();
	compute(8, x, timesteps, context, y, &out, work_ctx);
	int t1 = ggml_time_ms();

	print_ggml_tensor(out);
	LOG_DEBUG("mmdit test done in %dms", t1 - t0);
	}
	}

	static void load_from_file_and_test(const std::string& file_path) {
	// ggml_backend_t backend = ggml_backend_cuda_init(0);
	ggml_backend_t backend = ggml_backend_cpu_init();
	ggml_type model_data_type = GGML_TYPE_F16;
	std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend));
	{
	LOG_INFO("loading from '%s'", file_path.c_str());

	mmdit->alloc_params_buffer();
	std::map<std::string, ggml_tensor*> tensors;
	mmdit->get_param_tensors(tensors, "model.diffusion_model");

	ModelLoader model_loader;
	if (!model_loader.init_from_file(file_path)) {
	LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
	return;
	}

	bool success = model_loader.load_tensors(tensors, backend);

	if (!success) {
	LOG_ERROR("load tensors from model loader failed");
	return;
	}

	LOG_INFO("mmdit model loaded");
	}
	mmdit->test();
	}
	};

	#endif