NIRVANALAN committed on
Commit
c3a2df4
·
1 Parent(s): 592a426
Files changed (2) hide show
  1. app.py +88 -84
  2. configs/i23d_args.json +284 -0
app.py CHANGED
@@ -32,6 +32,8 @@ import numpy as np
32
  import torch as th
33
  import torch.distributed as dist
34
 
 
 
35
  def install_dependency():
36
  # install full cuda first
37
  # subprocess.run(
@@ -53,13 +55,13 @@ th.backends.cudnn.enabled = True
53
 
54
  from guided_diffusion import dist_util, logger
55
  from guided_diffusion.script_util import (
56
- NUM_CLASSES,
57
  model_and_diffusion_defaults,
58
  create_model_and_diffusion,
59
- add_dict_to_argparser,
60
  args_to_dict,
61
- continuous_diffusion_defaults,
62
- control_net_defaults,
 
 
63
  )
64
 
65
  from pathlib import Path
@@ -72,11 +74,11 @@ import nsr
72
  import nsr.lsgm
73
  from nsr.script_util import create_3DAE_model, encoder_and_nsr_defaults, loss_defaults, AE_with_Diffusion, rendering_options_defaults, eg3d_options_default, dataset_defaults
74
 
75
- from datasets.shapenet import load_eval_data
76
- from torch.utils.data import Subset
77
- from datasets.eg3d_dataset import init_dataset_kwargs
78
 
79
- from transport.train_utils import parse_transport_args
80
 
81
  from utils.infer_utils import remove_background, resize_foreground
82
 
@@ -376,81 +378,81 @@ def main(args):
376
 
377
 
378
 
379
- def create_argparser():
380
- defaults = dict(
381
- image_size_encoder=224,
382
- triplane_scaling_divider=1.0, # divide by this value
383
- diffusion_input_size=-1,
384
- trainer_name='adm',
385
- use_amp=False,
386
- # triplane_scaling_divider=1.0, # divide by this value
387
-
388
- # * sampling flags
389
- clip_denoised=False,
390
- num_samples=10,
391
- num_instances=10, # for i23d, loop different condition
392
- use_ddim=False,
393
- ddpm_model_path="",
394
- cldm_model_path="",
395
- rec_model_path="",
396
-
397
- # * eval logging flags
398
- logdir="/mnt/lustre/yslan/logs/nips23/",
399
- data_dir="",
400
- eval_data_dir="",
401
- eval_batch_size=1,
402
- num_workers=1,
403
-
404
- # * training flags for loading TrainingLoop class
405
- overfitting=False,
406
- image_size=128,
407
- iterations=150000,
408
- schedule_sampler="uniform",
409
- anneal_lr=False,
410
- lr=5e-5,
411
- weight_decay=0.0,
412
- lr_anneal_steps=0,
413
- batch_size=1,
414
- microbatch=-1, # -1 disables microbatches
415
- ema_rate="0.9999", # comma-separated list of EMA values
416
- log_interval=50,
417
- eval_interval=2500,
418
- save_interval=10000,
419
- resume_checkpoint="",
420
- resume_cldm_checkpoint="",
421
- resume_checkpoint_EG3D="",
422
- use_fp16=False,
423
- fp16_scale_growth=1e-3,
424
- load_submodule_name='', # for loading pretrained auto_encoder model
425
- ignore_resume_opt=False,
426
- freeze_ae=False,
427
- denoised_ae=True,
428
- # inference prompt
429
- prompt="a red chair",
430
- interval=1,
431
- save_img=False,
432
- use_train_trajectory=
433
- False, # use train trajectory to sample images for fid calculation
434
- unconditional_guidance_scale=1.0,
435
- use_eos_feature=False,
436
- export_mesh=False,
437
- cond_key='caption',
438
- allow_tf32=True,
439
- )
440
 
441
- defaults.update(model_and_diffusion_defaults())
442
- defaults.update(encoder_and_nsr_defaults()) # type: ignore
443
- defaults.update(loss_defaults())
444
- defaults.update(continuous_diffusion_defaults())
445
- defaults.update(control_net_defaults())
446
- defaults.update(dataset_defaults())
447
 
448
- parser = argparse.ArgumentParser()
449
- add_dict_to_argparser(parser, defaults)
450
 
451
- parse_transport_args(parser)
452
 
453
- return parser
454
 
455
 
456
  if __name__ == "__main__":
@@ -461,14 +463,16 @@ if __name__ == "__main__":
461
  os.environ[
462
  "TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" # set to DETAIL for runtime logging.
463
 
464
- args = create_argparser().parse_args()
465
 
466
  # args.local_rank = int(os.environ["LOCAL_RANK"])
467
- args.local_rank = 0
468
- args.gpus = th.cuda.device_count()
469
-
470
- args.rendering_kwargs = rendering_options_defaults(args)
471
 
 
 
 
 
472
 
473
  # main(args)
474
 
 
32
  import torch as th
33
  import torch.distributed as dist
34
 
35
+ from dnnlib.util import EasyDict
36
+
37
  def install_dependency():
38
  # install full cuda first
39
  # subprocess.run(
 
55
 
56
  from guided_diffusion import dist_util, logger
57
  from guided_diffusion.script_util import (
 
58
  model_and_diffusion_defaults,
59
  create_model_and_diffusion,
 
60
  args_to_dict,
61
+ # NUM_CLASSES,
62
+ # add_dict_to_argparser,
63
+ # continuous_diffusion_defaults,
64
+ # control_net_defaults,
65
  )
66
 
67
  from pathlib import Path
 
74
  import nsr.lsgm
75
  from nsr.script_util import create_3DAE_model, encoder_and_nsr_defaults, loss_defaults, AE_with_Diffusion, rendering_options_defaults, eg3d_options_default, dataset_defaults
76
 
77
+ # from datasets.shapenet import load_eval_data
78
+ # from torch.utils.data import Subset
79
+ # from datasets.eg3d_dataset import init_dataset_kwargs
80
 
81
+ # from transport.train_utils import parse_transport_args
82
 
83
  from utils.infer_utils import remove_background, resize_foreground
84
 
 
378
 
379
 
380
 
381
+ # def create_argparser():
382
+ # defaults = dict(
383
+ # image_size_encoder=224,
384
+ # triplane_scaling_divider=1.0, # divide by this value
385
+ # diffusion_input_size=-1,
386
+ # trainer_name='adm',
387
+ # use_amp=False,
388
+ # # triplane_scaling_divider=1.0, # divide by this value
389
+
390
+ # # * sampling flags
391
+ # clip_denoised=False,
392
+ # num_samples=10,
393
+ # num_instances=10, # for i23d, loop different condition
394
+ # use_ddim=False,
395
+ # ddpm_model_path="",
396
+ # cldm_model_path="",
397
+ # rec_model_path="",
398
+
399
+ # # * eval logging flags
400
+ # logdir="/mnt/lustre/yslan/logs/nips23/",
401
+ # data_dir="",
402
+ # eval_data_dir="",
403
+ # eval_batch_size=1,
404
+ # num_workers=1,
405
+
406
+ # # * training flags for loading TrainingLoop class
407
+ # overfitting=False,
408
+ # image_size=128,
409
+ # iterations=150000,
410
+ # schedule_sampler="uniform",
411
+ # anneal_lr=False,
412
+ # lr=5e-5,
413
+ # weight_decay=0.0,
414
+ # lr_anneal_steps=0,
415
+ # batch_size=1,
416
+ # microbatch=-1, # -1 disables microbatches
417
+ # ema_rate="0.9999", # comma-separated list of EMA values
418
+ # log_interval=50,
419
+ # eval_interval=2500,
420
+ # save_interval=10000,
421
+ # resume_checkpoint="",
422
+ # resume_cldm_checkpoint="",
423
+ # resume_checkpoint_EG3D="",
424
+ # use_fp16=False,
425
+ # fp16_scale_growth=1e-3,
426
+ # load_submodule_name='', # for loading pretrained auto_encoder model
427
+ # ignore_resume_opt=False,
428
+ # freeze_ae=False,
429
+ # denoised_ae=True,
430
+ # # inference prompt
431
+ # prompt="a red chair",
432
+ # interval=1,
433
+ # save_img=False,
434
+ # use_train_trajectory=
435
+ # False, # use train trajectory to sample images for fid calculation
436
+ # unconditional_guidance_scale=1.0,
437
+ # use_eos_feature=False,
438
+ # export_mesh=False,
439
+ # cond_key='caption',
440
+ # allow_tf32=True,
441
+ # )
442
 
443
+ # defaults.update(model_and_diffusion_defaults())
444
+ # defaults.update(encoder_and_nsr_defaults()) # type: ignore
445
+ # defaults.update(loss_defaults())
446
+ # defaults.update(continuous_diffusion_defaults())
447
+ # defaults.update(control_net_defaults())
448
+ # defaults.update(dataset_defaults())
449
 
450
+ # parser = argparse.ArgumentParser()
451
+ # add_dict_to_argparser(parser, defaults)
452
 
453
+ # parse_transport_args(parser)
454
 
455
+ # return parser
456
 
457
 
458
  if __name__ == "__main__":
 
463
  os.environ[
464
  "TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" # set to DETAIL for runtime logging.
465
 
466
+ # args = create_argparser().parse_args()
467
 
468
  # args.local_rank = int(os.environ["LOCAL_RANK"])
469
+ # args.local_rank = 0
470
+ # args.gpus = th.cuda.device_count()
 
 
471
 
472
+ # args.rendering_kwargs = rendering_options_defaults(args)
473
+ with open('configs/i23d_args.json') as f:
474
+ args = json.load(f)
475
+ args = EasyDict(args)
476
 
477
  # main(args)
478
 
configs/i23d_args.json ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_size_encoder": 256,
3
+ "triplane_scaling_divider": 0.96806,
4
+ "diffusion_input_size": 32,
5
+ "trainer_name": "flow_matching",
6
+ "use_amp": true,
7
+ "clip_denoised": false,
8
+ "num_samples": 4,
9
+ "num_instances": 10,
10
+ "use_ddim": false,
11
+ "ddpm_model_path": "",
12
+ "cldm_model_path": "",
13
+ "rec_model_path": "",
14
+ "logdir": "./logs/LSGM/inference/Objaverse/i23d/dit-L2/",
15
+ "data_dir": "NONE",
16
+ "eval_data_dir": "/cpfs01/user/lanyushi.p/Repo/eccv24/open-source/InstantMesh/test_dir",
17
+ "eval_batch_size": 1,
18
+ "num_workers": 0,
19
+ "overfitting": false,
20
+ "image_size": 256,
21
+ "iterations": 5000001,
22
+ "schedule_sampler": "uniform",
23
+ "anneal_lr": false,
24
+ "lr": 2e-05,
25
+ "weight_decay": 0.05,
26
+ "lr_anneal_steps": 0,
27
+ "batch_size": 1,
28
+ "microbatch": 1,
29
+ "ema_rate": "0.9999",
30
+ "log_interval": 50,
31
+ "eval_interval": 5000,
32
+ "save_interval": 10000,
33
+ "resume_checkpoint": "/nas/shared/V2V/yslan/logs/nips24/LSGM/t23d/FM/9cls/i23d/dit-L2-pixart-lognorm-rmsnorm-layernorm_before_pooled/gpu7-batch40-lr1e-4-bf16-qknorm-ctd3/model_joint_denoise_rec_model2990000.pt",
34
+ "resume_cldm_checkpoint": "",
35
+ "resume_checkpoint_EG3D": "",
36
+ "use_fp16": false,
37
+ "fp16_scale_growth": 0.001,
38
+ "load_submodule_name": "",
39
+ "ignore_resume_opt": false,
40
+ "freeze_ae": false,
41
+ "denoised_ae": true,
42
+ "prompt": "a red chair",
43
+ "interval": 5,
44
+ "save_img": false,
45
+ "use_train_trajectory": false,
46
+ "unconditional_guidance_scale": 6.5,
47
+ "use_eos_feature": false,
48
+ "export_mesh": false,
49
+ "cond_key": "img",
50
+ "allow_tf32": true,
51
+ "num_channels": 320,
52
+ "num_res_blocks": 2,
53
+ "num_heads": 8,
54
+ "num_heads_upsample": -1,
55
+ "num_head_channels": -1,
56
+ "attention_resolutions": "4,2,1",
57
+ "channel_mult": "",
58
+ "dropout": 0.0,
59
+ "class_cond": false,
60
+ "use_checkpoint": false,
61
+ "use_scale_shift_norm": true,
62
+ "resblock_updown": false,
63
+ "use_new_attention_order": false,
64
+ "denoise_in_channels": 4,
65
+ "denoise_out_channels": 4,
66
+ "create_controlnet": false,
67
+ "create_dit": true,
68
+ "i23d": true,
69
+ "create_unet_with_hint": false,
70
+ "dit_model_arch": "DiT-PixArt-L/2",
71
+ "use_spatial_transformer": true,
72
+ "transformer_depth": 1,
73
+ "context_dim": 1024,
74
+ "pooling_ctx_dim": 768,
75
+ "roll_out": true,
76
+ "n_embed": null,
77
+ "legacy": true,
78
+ "mixing_logit_init": -6,
79
+ "hint_channels": 3,
80
+ "learn_sigma": false,
81
+ "diffusion_steps": 1000,
82
+ "noise_schedule": "linear",
83
+ "standarization_xt": false,
84
+ "timestep_respacing": "",
85
+ "use_kl": false,
86
+ "predict_xstart": false,
87
+ "predict_v": true,
88
+ "rescale_timesteps": false,
89
+ "rescale_learned_sigmas": false,
90
+ "mixed_prediction": false,
91
+ "dino_version": "mv-sd-dit-dynaInp-trilatent",
92
+ "encoder_in_channels": 10,
93
+ "img_size": [
94
+ 256
95
+ ],
96
+ "patch_size": 14,
97
+ "in_chans": 384,
98
+ "num_classes": 0,
99
+ "embed_dim": 384,
100
+ "depth": 6,
101
+ "mlp_ratio": 4.0,
102
+ "qkv_bias": false,
103
+ "qk_scale": null,
104
+ "drop_rate": 0.1,
105
+ "attn_drop_rate": 0.0,
106
+ "drop_path_rate": 0.0,
107
+ "norm_layer": "nn.LayerNorm",
108
+ "cls_token": false,
109
+ "encoder_cls_token": false,
110
+ "decoder_cls_token": false,
111
+ "sr_kwargs": {},
112
+ "sr_ratio": 2,
113
+ "use_clip": false,
114
+ "arch_encoder": "vits",
115
+ "arch_decoder": "vitb",
116
+ "load_pretrain_encoder": false,
117
+ "encoder_lr": 1e-05,
118
+ "encoder_weight_decay": 0.001,
119
+ "no_dim_up_mlp": true,
120
+ "dim_up_mlp_as_func": false,
121
+ "decoder_load_pretrained": false,
122
+ "uvit_skip_encoder": true,
123
+ "vae_p": 2,
124
+ "ldm_z_channels": 4,
125
+ "ldm_embed_dim": 4,
126
+ "use_conf_map": false,
127
+ "sd_E_ch": 64,
128
+ "z_channels": 12,
129
+ "sd_E_num_res_blocks": 1,
130
+ "num_frames": 6,
131
+ "arch_dit_decoder": "DiT2-L/2",
132
+ "return_all_dit_layers": false,
133
+ "lrm_decoder": false,
134
+ "plane_n": 3,
135
+ "gs_rendering": false,
136
+ "decomposed": true,
137
+ "triplane_fg_bg": false,
138
+ "cfg": "objverse_tuneray_aug_resolution_64_64_auto",
139
+ "density_reg": 0.0,
140
+ "density_reg_p_dist": 0.004,
141
+ "reg_type": "l1",
142
+ "triplane_decoder_lr": 5e-05,
143
+ "super_resolution_lr": 5e-05,
144
+ "c_scale": 1,
145
+ "nsr_lr": 0.02,
146
+ "triplane_size": 224,
147
+ "decoder_in_chans": 32,
148
+ "triplane_in_chans": 32,
149
+ "decoder_output_dim": 3,
150
+ "out_chans": 96,
151
+ "c_dim": 25,
152
+ "ray_start": 0.6,
153
+ "ray_end": 1.8,
154
+ "rendering_kwargs": {
155
+ "image_resolution": 256,
156
+ "disparity_space_sampling": false,
157
+ "clamp_mode": "softplus",
158
+ "c_gen_conditioning_zero": true,
159
+ "c_scale": 1,
160
+ "superresolution_noise_mode": "none",
161
+ "density_reg": 0.0,
162
+ "density_reg_p_dist": 0.004,
163
+ "reg_type": "l1",
164
+ "decoder_lr_mul": 1,
165
+ "decoder_activation": "sigmoid",
166
+ "sr_antialias": true,
167
+ "return_triplane_features": false,
168
+ "return_sampling_details_flag": true,
169
+ "superresolution_module": "utils.torch_utils.components.NearestConvSR",
170
+ "depth_resolution": 64,
171
+ "depth_resolution_importance": 64,
172
+ "ray_start": "auto",
173
+ "ray_end": "auto",
174
+ "box_warp": 0.9,
175
+ "white_back": true,
176
+ "radius_range": [
177
+ 1.5,
178
+ 2
179
+ ],
180
+ "sampler_bbox_min": -0.45,
181
+ "sampler_bbox_max": 0.45,
182
+ "filter_out_of_bbox": true,
183
+ "PatchRaySampler": true,
184
+ "patch_rendering_resolution": 45,
185
+ "z_near": 1.05,
186
+ "z_far": 2.45
187
+ },
188
+ "sr_training": false,
189
+ "bcg_synthesis": false,
190
+ "bcg_synthesis_kwargs": {},
191
+ "patch_rendering_resolution": 45,
192
+ "vit_decoder_lr": 1e-05,
193
+ "vit_decoder_wd": 0.001,
194
+ "ae_classname": "vit.vit_triplane.RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder",
195
+ "color_criterion": "mse",
196
+ "l2_lambda": 1.0,
197
+ "lpips_lambda": 0.8,
198
+ "lpips_delay_iter": 0,
199
+ "sr_delay_iter": 0,
200
+ "kl_anneal": false,
201
+ "latent_lambda": 0.0,
202
+ "latent_criterion": "mse",
203
+ "kl_lambda": 0.0,
204
+ "ssim_lambda": 0.0,
205
+ "l1_lambda": 0.0,
206
+ "id_lambda": 0.0,
207
+ "depth_lambda": 0.0,
208
+ "alpha_lambda": 1.0,
209
+ "fg_mse": false,
210
+ "bg_lamdba": 0.01,
211
+ "density_reg_every": 4,
212
+ "shape_uniform_lambda": 0.005,
213
+ "shape_importance_lambda": 0.01,
214
+ "shape_depth_lambda": 0.0,
215
+ "rec_cvD_lambda": 0.01,
216
+ "nvs_cvD_lambda": 0.025,
217
+ "patchgan_disc_factor": 0.01,
218
+ "patchgan_disc_g_weight": 0.2,
219
+ "r1_gamma": 1.0,
220
+ "sds_lamdba": 1.0,
221
+ "nvs_D_lr_mul": 1,
222
+ "cano_D_lr_mul": 1,
223
+ "ce_balanced_kl": 1.0,
224
+ "p_eps_lambda": 1,
225
+ "symmetry_loss": false,
226
+ "depth_smoothness_lambda": 0.0,
227
+ "ce_lambda": 0.5,
228
+ "negative_entropy_lambda": 0.5,
229
+ "grad_clip": true,
230
+ "online_mask": false,
231
+ "sde_time_eps": 0.01,
232
+ "sde_beta_start": 0.1,
233
+ "sde_beta_end": 20.0,
234
+ "sde_sde_type": "vpsde",
235
+ "sde_sigma2_0": 0.0,
236
+ "iw_sample_p": "drop_sigma2t_iw",
237
+ "iw_sample_q": "ll_iw",
238
+ "iw_subvp_like_vp_sde": false,
239
+ "train_vae": false,
240
+ "pred_type": "v",
241
+ "p_rendering_loss": false,
242
+ "unfix_logit": false,
243
+ "loss_type": "eps",
244
+ "loss_weight": "simple",
245
+ "diffusion_ce_anneal": true,
246
+ "enable_mixing_normal": false,
247
+ "only_mid_control": false,
248
+ "control_key": "img",
249
+ "normalize_clip_encoding": true,
250
+ "scale_clip_encoding": 1.0,
251
+ "cfg_dropout_prob": 0.1,
252
+ "use_lmdb": false,
253
+ "use_wds": false,
254
+ "use_lmdb_compressed": false,
255
+ "compile": false,
256
+ "objv_dataset": true,
257
+ "decode_encode_img_only": false,
258
+ "load_wds_diff": true,
259
+ "load_wds_latent": false,
260
+ "eval_load_wds_instance": true,
261
+ "shards_lst": "",
262
+ "eval_shards_lst": "",
263
+ "mv_input": true,
264
+ "duplicate_sample": true,
265
+ "orthog_duplicate": false,
266
+ "split_chunk_input": false,
267
+ "load_real": true,
268
+ "four_view_for_latent": false,
269
+ "single_view_for_i23d": false,
270
+ "shuffle_across_cls": true,
271
+ "load_extra_36_view": false,
272
+ "mv_latent_dir": "",
273
+ "append_depth": false,
274
+ "plucker_embedding": true,
275
+ "gs_cam_format": false,
276
+ "split_chunk_size": 8,
277
+ "path_type": "Linear",
278
+ "prediction": "velocity",
279
+ "sample_eps": null,
280
+ "train_eps": null,
281
+ "snr_type": "lognorm",
282
+ "local_rank": 0,
283
+ "gpus": 1
284
+ }