Spaces: Running on Zero
modify 1024.yaml
Browse files- config/models/ace_0.6b_1024.yaml +171 -15
config/models/ace_0.6b_1024.yaml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
NAME: ACE_0.
|
2 |
-
IS_DEFAULT:
|
3 |
DEFAULT_PARAS:
|
4 |
PARAS:
|
5 |
#
|
@@ -9,14 +9,18 @@ DEFAULT_PARAS:
|
|
9 |
TASK:
|
10 |
PROMPT: ""
|
11 |
NEGATIVE_PROMPT: ""
|
12 |
-
OUTPUT_HEIGHT:
|
13 |
-
OUTPUT_WIDTH:
|
14 |
SAMPLER: ddim
|
15 |
-
SAMPLE_STEPS:
|
16 |
GUIDE_SCALE: 4.5
|
17 |
GUIDE_RESCALE: 0.5
|
18 |
SEED: -1
|
19 |
TAR_INDEX: 0
|
|
|
|
|
|
|
|
|
20 |
OUTPUT:
|
21 |
LATENT:
|
22 |
IMAGES:
|
@@ -39,12 +43,12 @@ DEFAULT_PARAS:
|
|
39 |
#
|
40 |
COND_STAGE_MODEL:
|
41 |
FUNCTION:
|
42 |
-
- NAME:
|
43 |
DTYPE: bfloat16
|
44 |
INPUT: ["PROMPT"]
|
45 |
#
|
46 |
MODEL:
|
47 |
-
NAME:
|
48 |
PRETRAINED_MODEL:
|
49 |
IGNORE_KEYS: [ ]
|
50 |
SCALE_FACTOR: 0.18215
|
@@ -55,7 +59,7 @@ MODEL:
|
|
55 |
USE_TEXT_POS_EMBEDDINGS: True
|
56 |
#
|
57 |
DIFFUSION:
|
58 |
-
NAME:
|
59 |
PREDICTION_TYPE: eps
|
60 |
MIN_SNR_GAMMA:
|
61 |
NOISE_SCHEDULER:
|
@@ -65,8 +69,8 @@ MODEL:
|
|
65 |
BETA_MAX: 0.02
|
66 |
#
|
67 |
DIFFUSION_MODEL:
|
68 |
-
NAME:
|
69 |
-
PRETRAINED_MODEL:
|
70 |
IGNORE_KEYS: [ ]
|
71 |
PATCH_SIZE: 2
|
72 |
IN_CHANNELS: 4
|
@@ -78,7 +82,7 @@ MODEL:
|
|
78 |
DROP_PATH: 0.0
|
79 |
WINDOW_DIZE: 0
|
80 |
Y_CHANNELS: 4096
|
81 |
-
MAX_SEQ_LEN:
|
82 |
QK_NORM: True
|
83 |
USE_GRAD_CHECKPOINT: True
|
84 |
ATTENTION_BACKEND: flash_attn
|
@@ -86,7 +90,7 @@ MODEL:
|
|
86 |
FIRST_STAGE_MODEL:
|
87 |
NAME: AutoencoderKL
|
88 |
EMBED_DIM: 4
|
89 |
-
PRETRAINED_MODEL:
|
90 |
IGNORE_KEYS: []
|
91 |
#
|
92 |
ENCODER:
|
@@ -117,11 +121,163 @@ MODEL:
|
|
117 |
TANH_OUT: False
|
118 |
#
|
119 |
COND_STAGE_MODEL:
|
120 |
-
NAME:
|
121 |
-
PRETRAINED_MODEL:
|
122 |
-
TOKENIZER_PATH:
|
123 |
LENGTH: 120
|
124 |
T5_DTYPE: bfloat16
|
125 |
ADDED_IDENTIFIER: [ '{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
|
126 |
CLEAN: whitespace
|
127 |
USE_GRAD: False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
NAME: ACE_0.6B_1024_REFINER
|
2 |
+
IS_DEFAULT: True
|
3 |
DEFAULT_PARAS:
|
4 |
PARAS:
|
5 |
#
|
|
|
9 |
TASK:
|
10 |
PROMPT: ""
|
11 |
NEGATIVE_PROMPT: ""
|
12 |
+
OUTPUT_HEIGHT: 1024
|
13 |
+
OUTPUT_WIDTH: 1024
|
14 |
SAMPLER: ddim
|
15 |
+
SAMPLE_STEPS: 50
|
16 |
GUIDE_SCALE: 4.5
|
17 |
GUIDE_RESCALE: 0.5
|
18 |
SEED: -1
|
19 |
TAR_INDEX: 0
|
20 |
+
REFINER_SCALE: 0.2
|
21 |
+
USE_ACE: True
|
22 |
+
#REFINER_PROMPT: "High Resolution, Sharpness, Clarity, Detail Enhancement, Noise Reduction, HD, 4k, Image Restoration, HDR"
|
23 |
+
REFINER_PROMPT: "High Resolution, Sharpness, Clarity, Detail Enhancement, Noise Reduction, HD, 4k, Image Restoration, HDR"
|
24 |
OUTPUT:
|
25 |
LATENT:
|
26 |
IMAGES:
|
|
|
43 |
#
|
44 |
COND_STAGE_MODEL:
|
45 |
FUNCTION:
|
46 |
+
- NAME: encode_list_of_list
|
47 |
DTYPE: bfloat16
|
48 |
INPUT: ["PROMPT"]
|
49 |
#
|
50 |
MODEL:
|
51 |
+
NAME: LatentDiffusionACE
|
52 |
PRETRAINED_MODEL:
|
53 |
IGNORE_KEYS: [ ]
|
54 |
SCALE_FACTOR: 0.18215
|
|
|
59 |
USE_TEXT_POS_EMBEDDINGS: True
|
60 |
#
|
61 |
DIFFUSION:
|
62 |
+
NAME: BaseDiffusion
|
63 |
PREDICTION_TYPE: eps
|
64 |
MIN_SNR_GAMMA:
|
65 |
NOISE_SCHEDULER:
|
|
|
69 |
BETA_MAX: 0.02
|
70 |
#
|
71 |
DIFFUSION_MODEL:
|
72 |
+
NAME: ACE
|
73 |
+
PRETRAINED_MODEL: ms://iic/ACE-0.6B-1024px@models/dit/ace_0.6b_1024px.pth
|
74 |
IGNORE_KEYS: [ ]
|
75 |
PATCH_SIZE: 2
|
76 |
IN_CHANNELS: 4
|
|
|
82 |
DROP_PATH: 0.0
|
83 |
WINDOW_DIZE: 0
|
84 |
Y_CHANNELS: 4096
|
85 |
+
MAX_SEQ_LEN: 4096
|
86 |
QK_NORM: True
|
87 |
USE_GRAD_CHECKPOINT: True
|
88 |
ATTENTION_BACKEND: flash_attn
|
|
|
90 |
FIRST_STAGE_MODEL:
|
91 |
NAME: AutoencoderKL
|
92 |
EMBED_DIM: 4
|
93 |
+
PRETRAINED_MODEL: ms://iic/ACE-0.6B-1024px@models/vae/vae.bin
|
94 |
IGNORE_KEYS: []
|
95 |
#
|
96 |
ENCODER:
|
|
|
121 |
TANH_OUT: False
|
122 |
#
|
123 |
COND_STAGE_MODEL:
|
124 |
+
NAME: T5EmbedderHF
|
125 |
+
PRETRAINED_MODEL: ms://iic/ACE-0.6B-1024px@models/text_encoder/t5-v1_1-xxl/
|
126 |
+
TOKENIZER_PATH: ms://iic/ACE-0.6B-1024px@models/tokenizer/t5-v1_1-xxl
|
127 |
LENGTH: 120
|
128 |
T5_DTYPE: bfloat16
|
129 |
ADDED_IDENTIFIER: [ '{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
|
130 |
CLEAN: whitespace
|
131 |
USE_GRAD: False
|
132 |
+
|
133 |
+
ACE_PROMPT: [
|
134 |
+
"A cute cartoon rabbit holding a whiteboard that says 'ACE Refiner', standing in a sunny meadow filled with flowers, with a big smile and bright colors.",
|
135 |
+
"A beautiful young woman with long flowing hair, wearing a summer dress, holding a whiteboard that reads 'ACE Refiner' while sitting on a park bench surrounded by cherry blossoms.",
|
136 |
+
"An adorable cartoon cat wearing oversized glasses, holding a whiteboard that says 'ACE Refiner', perched on a stack of colorful books in a cozy library setting.",
|
137 |
+
"A charming girl with pigtails, wearing a cute school uniform, enthusiastically holding a whiteboard that has 'ACE Refiner' written on it, in a bright and cheerful classroom full of educational posters.",
|
138 |
+
"A friendly cartoon dog with floppy ears, sitting in front of a doghouse, proudly holding a whiteboard that says 'ACE Refiner', with a playful expression and a blue sky in the background.",
|
139 |
+
"A cute anime girl with big expressive eyes, dressed in a colorful outfit, holding a whiteboard that reads 'ACE Refiner' in a fantastical landscape filled with mythical creatures.",
|
140 |
+
"A vibrant cartoon fox holding a whiteboard that says 'ACE Refiner', standing on a rock by a sparkling stream, surrounded by lush greenery and butterflies.",
|
141 |
+
"A stylish young woman in a business outfit, smiling as she holds a whiteboard written with 'ACE Refiner', in a modern office filled with plants and natural light.",
|
142 |
+
"A cute cartoon unicorn holding a sparkling whiteboard that says 'ACE Refiner', frolicking in a magical forest, with rainbows and stars in the background.",
|
143 |
+
"A happy family, consisting of a cute little girl and her playful puppy, holding a whiteboard that says 'ACE Refiner', together in their backyard on a sunny day."
|
144 |
+
]
|
145 |
+
REFINER_MODEL:
|
146 |
+
NAME: ""
|
147 |
+
IS_DEFAULT: False
|
148 |
+
DEFAULT_PARAS:
|
149 |
+
PARAS:
|
150 |
+
RESOLUTIONS: [ [ 1024, 1024 ] ]
|
151 |
+
INPUT:
|
152 |
+
INPUT_IMAGE:
|
153 |
+
INPUT_MASK:
|
154 |
+
TASK:
|
155 |
+
PROMPT: ""
|
156 |
+
NEGATIVE_PROMPT: ""
|
157 |
+
OUTPUT_HEIGHT: 1024
|
158 |
+
OUTPUT_WIDTH: 1024
|
159 |
+
SAMPLER: flow_euler
|
160 |
+
SAMPLE_STEPS: 30
|
161 |
+
GUIDE_SCALE: 3.5
|
162 |
+
GUIDE_RESCALE:
|
163 |
+
OUTPUT:
|
164 |
+
LATENT:
|
165 |
+
IMAGES:
|
166 |
+
SEED:
|
167 |
+
MODULES_PARAS:
|
168 |
+
FIRST_STAGE_MODEL:
|
169 |
+
FUNCTION:
|
170 |
+
- NAME: encode
|
171 |
+
DTYPE: bfloat16
|
172 |
+
INPUT: [ "IMAGE" ]
|
173 |
+
- NAME: decode
|
174 |
+
DTYPE: bfloat16
|
175 |
+
INPUT: [ "LATENT" ]
|
176 |
+
PARAS:
|
177 |
+
SCALE_FACTOR: 1.5305
|
178 |
+
SHIFT_FACTOR: 0.0609
|
179 |
+
SIZE_FACTOR: 8
|
180 |
+
DIFFUSION_MODEL:
|
181 |
+
FUNCTION:
|
182 |
+
- NAME: forward
|
183 |
+
DTYPE: bfloat16
|
184 |
+
INPUT: [ "SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE" ]
|
185 |
+
COND_STAGE_MODEL:
|
186 |
+
FUNCTION:
|
187 |
+
- NAME: encode
|
188 |
+
DTYPE: bfloat16
|
189 |
+
INPUT: [ "PROMPT" ]
|
190 |
+
|
191 |
+
MODEL:
|
192 |
+
DIFFUSION:
|
193 |
+
NAME: DiffusionFluxRF
|
194 |
+
PREDICTION_TYPE: raw
|
195 |
+
NOISE_SCHEDULER:
|
196 |
+
NAME: FlowMatchSigmaScheduler
|
197 |
+
WEIGHTING_SCHEME: logit_normal
|
198 |
+
SHIFT: 3.0
|
199 |
+
LOGIT_MEAN: 0.0
|
200 |
+
LOGIT_STD: 1.0
|
201 |
+
MODE_SCALE: 1.29
|
202 |
+
DIFFUSION_MODEL:
|
203 |
+
NAME: FluxMR
|
204 |
+
PRETRAINED_MODEL: ms://AI-ModelScope/FLUX.1-dev@flux1-dev.safetensors
|
205 |
+
IN_CHANNELS: 64
|
206 |
+
OUT_CHANNELS: 64
|
207 |
+
HIDDEN_SIZE: 3072
|
208 |
+
NUM_HEADS: 24
|
209 |
+
AXES_DIM: [ 16, 56, 56 ]
|
210 |
+
THETA: 10000
|
211 |
+
VEC_IN_DIM: 768
|
212 |
+
GUIDANCE_EMBED: True
|
213 |
+
CONTEXT_IN_DIM: 4096
|
214 |
+
MLP_RATIO: 4.0
|
215 |
+
QKV_BIAS: True
|
216 |
+
DEPTH: 19
|
217 |
+
DEPTH_SINGLE_BLOCKS: 38
|
218 |
+
USE_GRAD_CHECKPOINT: True
|
219 |
+
ATTN_BACKEND: flash_attn
|
220 |
+
#
|
221 |
+
FIRST_STAGE_MODEL:
|
222 |
+
NAME: AutoencoderKLFlux
|
223 |
+
EMBED_DIM: 16
|
224 |
+
PRETRAINED_MODEL: ms://AI-ModelScope/FLUX.1-dev@ae.safetensors
|
225 |
+
IGNORE_KEYS: [ ]
|
226 |
+
BATCH_SIZE: 8
|
227 |
+
USE_CONV: False
|
228 |
+
SCALE_FACTOR: 0.3611
|
229 |
+
SHIFT_FACTOR: 0.1159
|
230 |
+
#
|
231 |
+
ENCODER:
|
232 |
+
NAME: Encoder
|
233 |
+
USE_CHECKPOINT: False
|
234 |
+
CH: 128
|
235 |
+
OUT_CH: 3
|
236 |
+
NUM_RES_BLOCKS: 2
|
237 |
+
IN_CHANNELS: 3
|
238 |
+
ATTN_RESOLUTIONS: [ ]
|
239 |
+
CH_MULT: [ 1, 2, 4, 4 ]
|
240 |
+
Z_CHANNELS: 16
|
241 |
+
DOUBLE_Z: True
|
242 |
+
DROPOUT: 0.0
|
243 |
+
RESAMP_WITH_CONV: True
|
244 |
+
#
|
245 |
+
DECODER:
|
246 |
+
NAME: Decoder
|
247 |
+
USE_CHECKPOINT: False
|
248 |
+
CH: 128
|
249 |
+
OUT_CH: 3
|
250 |
+
NUM_RES_BLOCKS: 2
|
251 |
+
IN_CHANNELS: 3
|
252 |
+
ATTN_RESOLUTIONS: [ ]
|
253 |
+
CH_MULT: [ 1, 2, 4, 4 ]
|
254 |
+
Z_CHANNELS: 16
|
255 |
+
DROPOUT: 0.0
|
256 |
+
RESAMP_WITH_CONV: True
|
257 |
+
GIVE_PRE_END: False
|
258 |
+
TANH_OUT: False
|
259 |
+
#
|
260 |
+
COND_STAGE_MODEL:
|
261 |
+
NAME: T5PlusClipFluxEmbedder
|
262 |
+
T5_MODEL:
|
263 |
+
NAME: HFEmbedder
|
264 |
+
HF_MODEL_CLS: T5EncoderModel
|
265 |
+
MODEL_PATH: ms://AI-ModelScope/FLUX.1-dev@text_encoder_2/
|
266 |
+
HF_TOKENIZER_CLS: T5Tokenizer
|
267 |
+
TOKENIZER_PATH: ms://AI-ModelScope/FLUX.1-dev@tokenizer_2/
|
268 |
+
MAX_LENGTH: 512
|
269 |
+
OUTPUT_KEY: last_hidden_state
|
270 |
+
D_TYPE: bfloat16
|
271 |
+
BATCH_INFER: False
|
272 |
+
CLEAN: whitespace
|
273 |
+
CLIP_MODEL:
|
274 |
+
NAME: HFEmbedder
|
275 |
+
HF_MODEL_CLS: CLIPTextModel
|
276 |
+
MODEL_PATH: ms://AI-ModelScope/FLUX.1-dev@text_encoder/
|
277 |
+
HF_TOKENIZER_CLS: CLIPTokenizer
|
278 |
+
TOKENIZER_PATH: ms://AI-ModelScope/FLUX.1-dev@tokenizer/
|
279 |
+
MAX_LENGTH: 77
|
280 |
+
OUTPUT_KEY: pooler_output
|
281 |
+
D_TYPE: bfloat16
|
282 |
+
BATCH_INFER: True
|
283 |
+
CLEAN: whitespace
|