root committed on
Commit
310ac15
·
1 Parent(s): 98dcaae

first upload

Browse files
README.md CHANGED
@@ -1,3 +1,30 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: text-to-image
3
+ license: apache-2.0
4
+ tags:
5
+ - Non-Autoregressive
6
+ ---
7
+
8
+ # Monetico: An Efficient Reproduction of Meissonic for Text-to-Image Synthesis
9
+
10
+ ## Introduction
11
+ Like Meissonic, Monetico is a non-autoregressive text-to-image synthesis model based on masked image modeling, capable of generating high-resolution images. It is designed to run efficiently on consumer-grade graphics cards.
12
+
13
+ Monetico is an efficient reproduction of Meissonic. Trained on 8 H100 GPUs for approximately one week, Monetico can generate high-quality 512x512 images that are comparable to those produced by Meissonic and SDXL.
14
+
15
+ Monetico was developed by Collov Labs. We extend our gratitude to @MeissonFlow and @viiika for their valuable advice on efficient training.
16
+
17
+ ## Usage
18
+
19
+ For detailed usage instructions, please refer to the [GitHub repository](https://github.com/viiika/Meissonic).
20
+
21
+ ## Citation
22
+ If you find this work helpful, please consider citing:
23
+ ```bibtex
24
+ @article{bai2024meissonic,
25
+ title={Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis},
26
+ author={Bai, Jinbin and Ye, Tian and Chow, Wei and Song, Enxin and Chen, Qing-Guo and Li, Xiangtai and Dong, Zhen and Zhu, Lei and Yan, Shuicheng},
27
+ journal={arXiv preprint arXiv:2410.08261},
28
+ year={2024}
29
+ }
30
+ ```
model_index.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Pipeline",
3
+ "_diffusers_version": "0.30.2",
4
+ "scheduler": [
5
+ "scheduler",
6
+ "Scheduler"
7
+ ],
8
+ "text_encoder": [
9
+ "transformers",
10
+ "CLIPTextModelWithProjection"
11
+ ],
12
+ "tokenizer": [
13
+ "transformers",
14
+ "CLIPTokenizer"
15
+ ],
16
+ "transformer": [
17
+ "transformer",
18
+ "Transformer2DModel"
19
+ ],
20
+ "vqvae": [
21
+ "diffusers",
22
+ "VQModel"
23
+ ]
24
+ }
scheduler/scheduler.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ from dataclasses import dataclass
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+ from diffusers.utils import BaseOutput
22
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
23
+
24
+
25
def gumbel_noise(t, generator=None):
    """Draw Gumbel(0, 1) noise with the same shape and dtype as `t`.

    Uniform samples are drawn on the generator's device when a generator is
    supplied (so a CPU generator can be used with a tensor on another device)
    and then moved to `t`'s device. Both logarithms are clamped at 1e-20.
    """
    if generator is None:
        sampling_device = t.device
    else:
        sampling_device = generator.device

    uniform = torch.zeros_like(t, device=sampling_device)
    uniform.uniform_(0, 1, generator=generator)
    uniform = uniform.to(t.device)

    neg_log_uniform = -torch.log(uniform.clamp(1e-20))
    return -torch.log(neg_log_uniform.clamp(1e-20))
29
+
30
+
31
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
    """Return a boolean mask selecting the lowest-confidence entries of each row.

    Confidence is `log(probs)` perturbed by temperature-scaled Gumbel noise.
    For every row, the confidence value found at sorted position `mask_len`
    (ascending order) is used as a cut-off; entries strictly below it are
    marked `True`.
    """
    log_probs = torch.log(probs.clamp(1e-20))
    perturbation = temperature * gumbel_noise(probs, generator=generator)
    confidence = log_probs + perturbation

    ascending, _ = torch.sort(confidence, dim=-1)
    threshold = torch.gather(ascending, 1, mask_len.long())
    return confidence < threshold
37
+
38
+
39
@dataclass
class SchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.Tensor` of token ids, shaped like the `sample` given to `step`, i.e.
            `(batch_size, height, width)` or `(batch_size, seq_len)`):
            Token grid for the next iteration: the sampled tokens with the least confident positions set back to
            the mask token id. `prev_sample` should be used as the next model input in the decoding loop.
        pred_original_sample (`torch.Tensor` with the same shape as `prev_sample`, *optional*):
            The fully unmasked prediction sampled from the model output at the current timestep.
            `pred_original_sample` can be used to preview progress or for guidance.
    """

    prev_sample: torch.Tensor
    # Defaults to None, so the annotation must allow it.
    pred_original_sample: Optional[torch.Tensor] = None
55
+
56
+
57
class Scheduler(SchedulerMixin, ConfigMixin):
    """
    Scheduler for iterative decoding of masked discrete tokens.

    At every step, tokens are sampled for all masked positions from the model's categorical distribution, and the
    least confident of them are masked again so that later steps can re-predict them. The fraction of tokens that
    stays masked follows `masking_schedule`.

    Args:
        mask_token_id (`int`):
            Token id that marks a position as masked.
        masking_schedule (`str`, defaults to `"cosine"`):
            Either `"cosine"` or `"linear"`; any other value makes `step` / `add_noise` raise `ValueError`.
    """

    order = 1

    temperatures: torch.Tensor

    @register_to_config
    def __init__(
        self,
        mask_token_id: int,
        masking_schedule: str = "cosine",
    ):
        # Both are populated by `set_timesteps`.
        self.temperatures = None
        self.timesteps = None

    def set_timesteps(
        self,
        num_inference_steps: int,
        temperature: Union[float, Tuple[float, float], List[float]] = (2, 0),
        device: Optional[Union[str, torch.device]] = None,
    ):
        """
        Create the descending timestep sequence and the per-step Gumbel temperatures.

        Args:
            num_inference_steps (`int`):
                Number of decoding steps; timesteps run from `num_inference_steps - 1` down to `0`.
            temperature (number or pair of numbers, defaults to `(2, 0)`):
                A `(start, end)` pair is linearly interpolated over the steps. A single number is interpolated
                from that value down to `0.01`.
            device (`str` or `torch.device`, *optional*):
                Device on which the timestep and temperature tensors are created.
        """
        self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)

        if isinstance(temperature, (tuple, list)):
            start, end = temperature[0], temperature[1]
        else:
            start, end = temperature, 0.01

        self.temperatures = torch.linspace(start, end, num_inference_steps, device=device)

    def _mask_ratio(self, timestep):
        """
        Locate `timestep` in the schedule and return `(step_idx, mask_ratio)`.

        `step_idx` is the `nonzero()` result (position of `timestep` inside `self.timesteps`) and `mask_ratio`
        is the schedule value for that position, with the same shape as `step_idx`.

        Raises:
            ValueError: if `set_timesteps` has not been called, if `timestep` is not in the schedule, or if the
                configured masking schedule is unknown.
        """
        if self.timesteps is None:
            raise ValueError("`set_timesteps` must be called before `step` or `add_noise`")

        step_idx = (self.timesteps == timestep).nonzero()
        if step_idx.numel() == 0:
            # Without this check an unknown timestep yields empty tensors or an opaque broadcasting error.
            raise ValueError(f"timestep {timestep} is not part of the current schedule")

        ratio = (step_idx + 1) / len(self.timesteps)

        if self.config.masking_schedule == "cosine":
            mask_ratio = torch.cos(ratio * math.pi / 2)
        elif self.config.masking_schedule == "linear":
            mask_ratio = 1 - ratio
        else:
            raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")

        return step_idx, mask_ratio

    def step(
        self,
        model_output: torch.Tensor,
        timestep: Union[int, torch.Tensor],
        sample: torch.LongTensor,
        starting_mask_ratio: float = 1,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Sample tokens for the masked positions and re-mask the least confident ones.

        Args:
            model_output (`torch.Tensor`):
                Logits over the codebook. Either `(batch_size, codebook_size, height, width)` together with a
                3-D `sample`, or a tensor whose last dimension is the codebook together with a
                `(batch_size, seq_len)` `sample`.
            timestep (`int` or `torch.Tensor`):
                Current timestep, one of the values in `self.timesteps`. At timestep `0` nothing is re-masked.
            sample (`torch.LongTensor`):
                Current token ids; positions equal to `mask_token_id` are the ones being predicted.
            starting_mask_ratio (`float`, defaults to `1`):
                Multiplier applied to the schedule's mask ratio.
            generator (`torch.Generator`, *optional*):
                Random generator used for token sampling and for the Gumbel noise.
            return_dict (`bool`, defaults to `True`):
                Whether to return a `SchedulerOutput` instead of a plain tuple.

        Returns:
            `SchedulerOutput` or `tuple`: `(prev_sample, pred_original_sample)`, shaped like `sample`.

        Raises:
            ValueError: for a non-zero `timestep` when `set_timesteps` has not been called, when `timestep` is
                not in the schedule, or when the masking schedule is unknown.
        """
        two_dim_input = sample.ndim == 3 and model_output.ndim == 4

        if two_dim_input:
            # Flatten the spatial grid so that the codebook logits sit on the last axis.
            batch_size, codebook_size, height, width = model_output.shape
            sample = sample.reshape(batch_size, height * width)
            model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1)

        unknown_map = sample == self.config.mask_token_id

        probs = model_output.softmax(dim=-1)

        device = probs.device
        probs_ = probs.to(generator.device) if generator is not None else probs  # handles when generator is on CPU
        if probs_.device.type == "cpu" and probs_.dtype != torch.float32:
            probs_ = probs_.float()  # multinomial is not implemented for cpu half precision
        probs_ = probs_.reshape(-1, probs.size(-1))
        pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device)
        pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1])
        # Positions that were already decoded keep their token.
        pred_original_sample = torch.where(unknown_map, pred_original_sample, sample)

        if timestep == 0:
            prev_sample = pred_original_sample
        else:
            seq_len = sample.shape[1]
            step_idx, mask_ratio = self._mask_ratio(timestep)

            mask_ratio = starting_mask_ratio * mask_ratio

            mask_len = (seq_len * mask_ratio).floor()
            # do not mask more than amount previously masked
            mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
            # mask at least one
            mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len)

            selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0]
            # Ignores the tokens given in the input by overwriting their confidence.
            selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)

            masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator)

            # Masks tokens with lower confidence.
            prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample)

        if two_dim_input:
            prev_sample = prev_sample.reshape(batch_size, height, width)
            pred_original_sample = pred_original_sample.reshape(batch_size, height, width)

        if not return_dict:
            return (prev_sample, pred_original_sample)

        return SchedulerOutput(prev_sample, pred_original_sample)

    def add_noise(self, sample, timesteps, generator=None):
        """
        Return a copy of `sample` in which each token is independently replaced by the mask token.

        The replacement probability is the masking-schedule value at `timesteps`. The input tensor is not
        modified.

        Args:
            sample (`torch.LongTensor`):
                Token ids to corrupt.
            timesteps (`int` or `torch.Tensor`):
                A timestep present in `self.timesteps`.
            generator (`torch.Generator`, *optional*):
                Random generator; the random draw happens on the generator's device when one is given.

        Raises:
            ValueError: if `set_timesteps` has not been called, if `timesteps` is not in the schedule, or if the
                masking schedule is unknown.
        """
        _, mask_ratio = self._mask_ratio(timesteps)

        mask_indices = (
            torch.rand(
                sample.shape, device=generator.device if generator is not None else sample.device, generator=generator
            ).to(sample.device)
            < mask_ratio
        )

        masked_sample = sample.clone()

        masked_sample[mask_indices] = self.config.mask_token_id

        return masked_sample
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Scheduler",
3
+ "_diffusers_version": "0.30.2",
4
+ "mask_token_id": 8255,
5
+ "masking_schedule": "cosine"
6
+ }
7
+
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPTextModelWithProjection"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 0,
7
+ "dropout": 0.0,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_size": 1024,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 77,
16
+ "model_type": "clip_text_model",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "pad_token_id": 1,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.44.2",
23
+ "vocab_size": 49408
24
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ed02ba1546554a152c5e1f4920ba14466e3749e7feb42d8111857a8ed510574
3
+ size 1416177568
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "!",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "!",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49406": {
13
+ "content": "<|startoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "49407": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "bos_token": "<|startoftext|>",
30
+ "clean_up_tokenization_spaces": true,
31
+ "do_lower_case": true,
32
+ "eos_token": "<|endoftext|>",
33
+ "errors": "replace",
34
+ "model_max_length": 77,
35
+ "pad_token": "!",
36
+ "tokenizer_class": "CLIPTokenizer",
37
+ "unk_token": "<|endoftext|>"
38
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Transformer2DModel",
3
+ "_diffusers_version": "0.30.2",
4
+ "attention_head_dim": 128,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 56,
8
+ 56
9
+ ],
10
+ "codebook_size": 8192,
11
+ "downsample": true,
12
+ "guidance_embeds": false,
13
+ "in_channels": 64,
14
+ "joint_attention_dim": 1024,
15
+ "num_attention_heads": 8,
16
+ "num_layers": 14,
17
+ "num_single_layers": 28,
18
+ "patch_size": 1,
19
+ "pooled_projection_dim": 1024,
20
+ "upsample": true,
21
+ "vocab_size": 8256
22
+ }
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f6db36e88e25b7cf8f9a7c90f0084a760e81147324c3a33b079766f8d2eec9d
3
+ size 3994323336
vqvae/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VQModel",
3
+ "_diffusers_version": "0.30.2",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 256,
9
+ 512,
10
+ 768
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D",
17
+ "DownEncoderBlock2D"
18
+ ],
19
+ "in_channels": 3,
20
+ "latent_channels": 64,
21
+ "layers_per_block": 2,
22
+ "lookup_from_codebook": true,
23
+ "mid_block_add_attention": false,
24
+ "norm_num_groups": 32,
25
+ "norm_type": "group",
26
+ "num_vq_embeddings": 8192,
27
+ "out_channels": 3,
28
+ "sample_size": 32,
29
+ "scaling_factor": 0.18215,
30
+ "up_block_types": [
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D",
34
+ "UpDecoderBlock2D",
35
+ "UpDecoderBlock2D"
36
+ ],
37
+ "vq_embed_dim": null,
38
+ "force_upcast": true
39
+ }
vqvae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1241a5c88b635af4f8cfb268e388ccaa70f55a458a473d68943e5c28d7b7f762
3
+ size 585009980