# NVComposer: configs/dual_stream/nvcomposer.yaml
num_frames: &num_frames 16
resolution: &resolution [576, 1024]
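
# YAML anchors shared across this config: each sample packs 16 views at
# 576x1024 pixels; *num_frames is dereferenced below as the UNet's
# temporal_length and the image projector's video_length.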
model:
  base_learning_rate: 1.0e-5
  scale_lr: false
  target: core.models.diffusion.DualStreamMultiViewDiffusionModel
  params:
    use_task_embedding: false
    ray_as_image: false
    apply_condition_mask_in_training_loss: true
    separate_noise_and_condition: true
    condition_padding_with_anchor: false
    use_ray_decoder_loss_high_frequency_isolation: false
    train_with_multi_view_feature_alignment: true
    use_text_cross_attention_condition: false
    linear_start: 0.00085
    linear_end: 0.012
    num_time_steps_cond: 1
    log_every_t: 200
    time_steps: 1000
    data_key_images: combined_images
    data_key_rays: combined_rays
    data_key_text_condition: caption
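
    # Latent-space geometry (inferred from the VAE below): 576x1024 pixels
    # through an 8x-downsampling autoencoder gives a 72x128 latent grid, and
    # channels: 10 is consistent with 4 image latent channels plus the 6 ray
    # channels declared in unet_config.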
    cond_stage_trainable: false
    image_size: [72, 128]
    channels: 10
    monitor: global_step
    scale_by_std: false
    scale_factor: 0.18215
    use_dynamic_rescale: true
    base_scale: 0.3
    use_ema: false
    uncond_prob: 0.05
    uncond_type: 'empty_seq'
    use_camera_pose_query_transformer: false
    random_cond: false
    cond_concat: true
    frame_mask: false
    padding: true
    per_frame_auto_encoding: true
    parameterization: "v"
    rescale_betas_zero_snr: true
    use_noise_offset: false
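
    # Learning-rate schedule: a LambdaLR-style decay to 0.1x the base rate
    # over the first 100 steps, stepped per optimizer step and re-evaluated
    # every 100 steps (frequency: 100).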
    scheduler_config:
      target: utils.lr_scheduler.LambdaLRScheduler
      interval: 'step'
      frequency: 100
      params:
        start_step: 0
        final_decay_ratio: 0.1
        decay_steps: 100
    bd_noise: false
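
    # Denoiser: a spatial-temporal UNet. in_channels: 20 is presumably the 10
    # noisy channels concatenated with 10 condition channels (cond_concat: true
    # above); channel_mult yields 320/640/1280/1280 feature widths, and the
    # additional ray decoder head emits the 6 ray channels alongside the image
    # prediction.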
    unet_config:
      target: core.modules.networks.unet_modules.UNetModel
      params:
        in_channels: 20
        out_channels: 10
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        dropout: 0.1
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear: true
        use_checkpoint: true
        temporal_conv: true
        temporal_attention: true
        temporal_selfatt_only: true
        use_relative_position: false
        use_causal_attention: false
        temporal_length: *num_frames
        addition_attention: true
        image_cross_attention: true
        image_cross_attention_scale_learnable: true
        default_fs: 3
        fs_condition: false
        use_spatial_temporal_attention: true
        use_addition_ray_output_head: true
        ray_channels: 6
        use_lora_for_rays_in_output_blocks: false
        use_task_embedding: false
        use_ray_decoder: true
        use_ray_decoder_residual: true
        full_spatial_temporal_attention: true
        enhance_multi_view_correspondence: false
        camera_pose_condition: true
        use_feature_alignment: true
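
    # First stage: a KL-regularized autoencoder in the standard Stable
    # Diffusion VAE layout (4-channel latents, 8x downsampling); the
    # torch.nn.Identity lossconfig indicates it is used frozen, with no
    # reconstruction loss trained here.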
    first_stage_config:
      target: core.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
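
    # Image conditioning: a frozen OpenCLIP image encoder, presumably
    # supplying the tokens consumed by the UNet's image cross-attention
    # (image_cross_attention: true above).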
    cond_img_config:
      target: core.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
      params:
        freeze: true
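
    # Projector: a Resampler (IP-Adapter-style, assumed) that maps the
    # 1280-dim image-encoder tokens to 16 query tokens of width 1024 per view,
    # matching the UNet's context_dim.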
    image_proj_model_config:
      target: core.modules.encoders.resampler.Resampler
      params:
        dim: 1024
        depth: 4
        dim_head: 64
        heads: 12
        num_queries: 16
        embedding_dim: 1280
        output_dim: 1024
        ff_mult: 4
        video_length: *num_frames
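
# Note: each `target:` above is presumably resolved by an
# instantiate_from_config-style loader, as in latent-diffusion codebases, with
# the sibling `params` mapping passed as constructor kwargs.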