# Training data selection.
# NOTE(review): the nesting (`dataset.dataset`) is reconstructed from a
# flattened original in which a value-less `dataset:` was followed by
# `dataset: imagenet`; confirm against the config loader.
dataset:
  dataset: imagenet
  # Presumably the side length (in pixels) images are resized to - confirm.
  image_resolution: 256
# Stage-1 model settings (`type: vqgan`).
stage1:
  type: vqgan
  embed_dim: 256
  # Same value as stage2.vocab_size_img (16384); presumably the codebook
  # size that defines the stage-2 image-token vocabulary - verify.
  n_embed: 16384
  # NOTE(review): every key from `double_z` to `pdrop` is placed under
  # `hparams` because the value-less `hparams:` key precedes them and the
  # original indentation was lost; confirm against the model's schema.
  hparams:
    double_z: false
    z_channels: 256
    resolution: 256
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 1, 2, 2, 4]
    num_res_blocks: 2
    attn_resolutions: [16]
    pdrop: 0.0
# Stage-2 model settings (`type: igpt`).
stage2:
  type: igpt
  use_cls_cond: true
  # Same value as stage1.n_embed (16384) - presumably must stay in sync.
  vocab_size_img: 16384
  # NOTE(review): the original indentation was lost, so the extent of
  # `hparams` is reconstructed. `n_classes` is kept inside `hparams` because
  # nothing separates it from the preceding keys; move it up one level if
  # the loader expects `stage2.n_classes` instead.
  hparams:
    embed_dim: 1536
    n_layers: 42
    n_heads: 24
    n_dense_layers: 42
    ctx_len_img: 256
    embd_pdrop: 0.0
    resid_pdrop: 0.0
    attn_pdrop: 0.0
    mlp_bias: true
    attn_bias: true
    gelu_use_approx: false
    n_classes: 1000
# Optimizer settings.
optimizer:
  opt_type: adamW
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-4` as a string, whereas `1.0e-4` is a float everywhere.
  base_lr: 1.0e-4
  weight_decay: 0.0
  betas: [0.9, 0.95]
  grad_clip_norm: 4.0
# Run-level training settings.
# NOTE(review): total_batch_size / local_batch_size = 256; presumably the
# product of worker count and gradient-accumulation steps - confirm with
# the training script.
experiment:
  local_batch_size: 2
  total_batch_size: 512
  epochs: 8