File size: 5,051 Bytes
a3a3ae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Top-level model section: `target` names the model class and `params`
# carries its settings (the nested *_config sections follow below).
model:
  target: vtdm.vtdm_gen_v01.VideoLDM
  base_learning_rate: 1.0e-05
  params:
    input_key: video
    log_keys: caption
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    # Both keys below were tagged frame_rate by the original author; the same
    # value (25) is repeated on the other frame_rate-tagged keys in this file.
    num_samples: 25  # frame_rate
    en_and_decode_n_samples_a_time: 25  # frame_rate
    # NOTE(review): these look like parameter-name fragments selecting what
    # is trained -- confirm how the model class matches them.
    trained_param_keys:
    - 'diffusion_model.label_emb.0.0.weight'
    - '.emb_layers.'
    - '.time_stack.'
    # Denoiser section: names the Denoiser class and, nested under its params,
    # the scaling class it is configured with.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
    # Network section: settings for the VideoUNet class named in `target`.
    # Keys are grouped by name only; the set of keys and values is unchanged.
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        # Channel-related keys.
        in_channels: 8
        out_channels: 4
        model_channels: 320
        channel_mult: [1, 2, 4, 4]
        num_res_blocks: 2
        adm_in_channels: 768
        num_classes: sequential
        # Attention- and transformer-related keys.
        attention_resolutions: [4, 2, 1]
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear_in_transformer: true
        use_spatial_context: true
        spatial_transformer_attn_type: softmax-xformers
        # Remaining keys.
        extra_ff_mix_layer: true
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]
        use_checkpoint: true
    # Conditioner section: `emb_models` lists five embedder entries, one per
    # input key. Every entry uses the same key order (input_key, is_trainable,
    # ucg_rate, target, params); the order of the list itself is unchanged.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        # Entry 1: cond_frames_without_noise.
        - input_key: cond_frames_without_noise
          is_trainable: false
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: true
                version: ckpts/open_clip_pytorch_model.bin
        # Entry 2: video (no params given).
        - input_key: video
          is_trainable: false
          ucg_rate: 0.0
          target: vtdm.encoders.AesEmbedder
        # Entry 3: elevation (no ucg_rate given).
        - input_key: elevation
          is_trainable: false
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        # Entry 4: cond_frames. Apart from `target`, the encoder_config params
        # repeat the first_stage_config params further down in this file; keep
        # the two in sync when editing either.
        - input_key: cond_frames
          is_trainable: false
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            is_ae: true
            disable_encoder_autocast: true
            n_cond_frames: 1
            n_copies: 25  # frame_rate
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                monitor: val/rec_loss
                embed_dim: 4
                lossconfig:
                  target: torch.nn.Identity
                ddconfig:
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  attn_type: vanilla-xformers
                  in_channels: 3
                  out_ch: 3
                  z_channels: 4
                  double_z: true
                  resolution: 256
                  dropout: 0.0
        # Entry 5: cond_aug (no ucg_rate given).
        - input_key: cond_aug
          is_trainable: false
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
    # First-stage autoencoder section. Apart from `target`, these params
    # repeat the encoder_config params of the cond_frames embedder earlier in
    # this file; keep the two in sync when editing either.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        monitor: val/rec_loss
        embed_dim: 4
        lossconfig:
          target: torch.nn.Identity
        ddconfig:
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          attn_type: vanilla-xformers
          in_channels: 3
          out_ch: 3
          z_channels: 4
          double_z: true
          resolution: 256
          dropout: 0.0
    # Loss section: StandardDiffusionLoss together with the sigma-sampling and
    # loss-weighting classes nested under its params.
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        # Same value (25) as the other frame_rate-tagged keys in this file.
        num_frames: 25  # frame_rate
        batch2model_keys: [num_video_frames, image_only_indicator]
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.VWeighting
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
          params:
            p_mean: 1.0
            p_std: 1.6
    # Sampler section: LinearMultistepSampler together with the discretization
    # and guider classes nested under its params.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.LinearMultistepSampler
      params:
        num_steps: 50
        # Canonical lowercase boolean, consistent with every other boolean in
        # this file (was the capitalized form `True`).
        verbose: true

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            # Same value (25) as the other frame_rate-tagged keys in this file.
            num_frames: 25  # frame_rate
            min_scale: 1.0
            max_scale: 2.5