openfree committed on
Commit 0178f77 · verified · 1 Parent(s): 581c19e

Update sonic.py

Files changed (1)
  1. sonic.py +31 -15
sonic.py CHANGED
@@ -73,31 +73,47 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
 
     ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []
 
+
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
 
-        # ------ cond_clip : (bz=1, f=1, w=10, b=5, c=384) ----------------
-        cond_clip = audio_prompts[:, start:start+10]            # (1, ≤10, 12, 384)
-        if cond_clip.shape[1] < 10:                             # pad along w
-            pad = torch.zeros_like(cond_clip[:, :10-cond_clip.shape[1]])
-            cond_clip = torch.cat([cond_clip, pad], dim=1)
-        cond_clip = cond_clip.unsqueeze(1)                      # insert f dim → (1,1,10,12,384)
-        cond_clip = cond_clip[:, :, :, :5, :]                   # truncate b dim to 5 → (1,1,10,5,384)
-
-        # ------ bucket_clip : (1,1,50,1,384) -----------------------------
-        bucket_clip = last_prompts[:, start:start+50]           # (1, ≤50, 1, 384)
-        if bucket_clip.shape[1] < 50:
-            pad = torch.zeros_like(bucket_clip[:, :50-bucket_clip.shape[1]])
-            bucket_clip = torch.cat([bucket_clip, pad], dim=1)
-        bucket_clip = bucket_clip.unsqueeze(1)                  # (1,1,50,1,384)
+        # ------------------------------------------------------------
+        # cond_clip  : (bz, f=1, w=10, b=5, c=384)
+        # bucket_clip: (bz, f=1, w=50, b=1, c=384)
+        # Whisper-tiny hidden_state has 2 layers → pad up to 5
+        # ------------------------------------------------------------
+        clip_raw = audio_prompts[:, start:start + 10]           # (1, ≤10, L, 384)
+        if clip_raw.shape[1] < 10:                               # pad along w
+            pad_w = torch.zeros_like(clip_raw[:, :10 - clip_raw.shape[1]])
+            clip_raw = torch.cat([clip_raw, pad_w], dim=1)
+
+        # ---- pad L (= layers): repeat the last layer if too few -----
+        L_now = clip_raw.shape[2]
+        if L_now < 5:
+            pad_L = clip_raw[:, :, -1:].repeat(1, 1, 5 - L_now, 1)
+            clip_raw = torch.cat([clip_raw, pad_L], dim=2)
+        clip_raw = clip_raw[:, :, :5]                            # (1, 10, 5, 384)
+
+        cond_clip = clip_raw.unsqueeze(1)                        # (1, 1, 10, 5, 384)
+
+        # ------------------------------------------------------------
+        bucket_raw = last_prompts[:, start:start + 50]           # (1, ≤50, 1, 384)
+        if bucket_raw.shape[1] < 50:
+            pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
+            bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
+        bucket_clip = bucket_raw.unsqueeze(1)                    # (1, 1, 50, 1, 384)
 
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
 
         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])    # (tokens, 1024)
+        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])
 
+
+
+
+
     # -------- diffusion --------------------------------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
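
For reference, a minimal standalone sketch of the padding the new hunk performs (the helper name pad_cond_clip and the dummy tensor below are illustrative only, not part of sonic.py): zero-pad the window axis up to 10, repeat the last Whisper hidden-state layer up to 5 layers, then insert the frame axis.

import torch

def pad_cond_clip(clip_raw: torch.Tensor, w: int = 10, b: int = 5) -> torch.Tensor:
    # clip_raw: (1, ≤w, L, 384) → (1, 1, w, b, 384)
    if clip_raw.shape[1] < w:                      # zero-pad the window axis to w
        pad_w = torch.zeros_like(clip_raw[:, :w - clip_raw.shape[1]])
        clip_raw = torch.cat([clip_raw, pad_w], dim=1)
    if clip_raw.shape[2] < b:                      # repeat the last layer along the layer axis
        pad_b = clip_raw[:, :, -1:].repeat(1, 1, b - clip_raw.shape[2], 1)
        clip_raw = torch.cat([clip_raw, pad_b], dim=2)
    return clip_raw[:, :, :b].unsqueeze(1)         # truncate to b layers, insert the f axis

dummy = torch.randn(1, 7, 2, 384)                  # e.g. a short chunk with Whisper-tiny's 2 layers
print(pad_cond_clip(dummy).shape)                  # torch.Size([1, 1, 10, 5, 384])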