openfree committed on
Commit 0178f77 · verified · 1 Parent(s): 581c19e

Update sonic.py

Files changed (1)
  1. sonic.py +31 -15
sonic.py CHANGED
@@ -73,31 +73,47 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
 
     ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []
 
+
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
 
-        # ------ cond_clip : (bz=1, f=1, w=10, b=5, c=384) ----------------
-        cond_clip = audio_prompts[:, start:start+10]            # (1, ≤10, 12, 384)
-        if cond_clip.shape[1] < 10:                             # pad along w
-            pad = torch.zeros_like(cond_clip[:, :10-cond_clip.shape[1]])
-            cond_clip = torch.cat([cond_clip, pad], dim=1)
-        cond_clip = cond_clip.unsqueeze(1)                      # insert f dim → (1,1,10,12,384)
-        cond_clip = cond_clip[:, :, :, :5, :]                   # truncate b dim to 5 → (1,1,10,5,384)
-
-        # ------ bucket_clip : (1,1,50,1,384) -----------------------------
-        bucket_clip = last_prompts[:, start:start+50]           # (1, ≤50, 1, 384)
-        if bucket_clip.shape[1] < 50:
-            pad = torch.zeros_like(bucket_clip[:, :50-bucket_clip.shape[1]])
-            bucket_clip = torch.cat([bucket_clip, pad], dim=1)
-        bucket_clip = bucket_clip.unsqueeze(1)                  # (1,1,50,1,384)
+        # ------------------------------------------------------------
+        # cond_clip  : (bz, f=1, w=10, b=5, c=384)
+        # bucket_clip: (bz, f=1, w=50, b=1, c=384)
+        # Whisper-tiny hidden_state has 2 layers → pad up to 5
+        # ------------------------------------------------------------
+        clip_raw = audio_prompts[:, start:start + 10]           # (1, ≤10, L, 384)
+        if clip_raw.shape[1] < 10:                               # pad along w
+            pad_w = torch.zeros_like(clip_raw[:, :10 - clip_raw.shape[1]])
+            clip_raw = torch.cat([clip_raw, pad_w], dim=1)
+
+        # ---- pad L (= layers): repeat the last layer if too few -----
+        L_now = clip_raw.shape[2]
+        if L_now < 5:
+            pad_L = clip_raw[:, :, -1:].repeat(1, 1, 5 - L_now, 1)
+            clip_raw = torch.cat([clip_raw, pad_L], dim=2)
+        clip_raw = clip_raw[:, :, :5]                            # (1, 10, 5, 384)
+
+        cond_clip = clip_raw.unsqueeze(1)                        # (1, 1, 10, 5, 384)
+
+        # ------------------------------------------------------------
+        bucket_raw = last_prompts[:, start:start + 50]           # (1, ≤50, 1, 384)
+        if bucket_raw.shape[1] < 50:
+            pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
+            bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
+        bucket_clip = bucket_raw.unsqueeze(1)                    # (1, 1, 50, 1, 384)
 
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
 
         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])    # (tokens, 1024)
+        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])
 
+
+
+
+
     # -------- diffusion --------------------------------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
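
For reference, a minimal standalone sketch of the padding the new hunk performs (the helper name pad_cond_clip and the dummy tensor below are illustrative only, not part of sonic.py): zero-pad the window axis up to 10, repeat the last Whisper hidden-state layer up to 5 layers, then insert the frame axis.

import torch

def pad_cond_clip(clip_raw: torch.Tensor, w: int = 10, b: int = 5) -> torch.Tensor:
    # clip_raw: (1, ≤w, L, 384) → (1, 1, w, b, 384)
    if clip_raw.shape[1] < w:                      # zero-pad the window axis to w
        pad_w = torch.zeros_like(clip_raw[:, :w - clip_raw.shape[1]])
        clip_raw = torch.cat([clip_raw, pad_w], dim=1)
    if clip_raw.shape[2] < b:                      # repeat the last layer along the layer axis
        pad_b = clip_raw[:, :, -1:].repeat(1, 1, b - clip_raw.shape[2], 1)
        clip_raw = torch.cat([clip_raw, pad_b], dim=2)
    return clip_raw[:, :, :b].unsqueeze(1)         # truncate to b layers, insert the f axis

dummy = torch.randn(1, 7, 2, 384)                  # e.g. a short chunk with Whisper-tiny's 2 layers
print(pad_cond_clip(dummy).shape)                  # torch.Size([1, 1, 10, 5, 384])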