openfree committed on
Commit 43cb38b · verified · 1 Parent(s): 914dc02

Update app.py

Files changed (1)
  1. app.py +241 -220
app.py CHANGED
@@ -1,223 +1,244 @@
- import spaces
- import gradio as gr
- import os
- import numpy as np
- from pydub import AudioSegment
- import hashlib
- import io
- from sonic import Sonic
  from PIL import Image
- import torch
-
- # Download the models required on first run
- cmd = (
-     'python3 -m pip install "huggingface_hub[cli]" accelerate; '
-     'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
-     'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
-     'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
  )
- os.system(cmd)
-
- pipe = Sonic()
-
- def get_md5(content_bytes: bytes):
-     """Compute the MD5 hash and return its 32-character hex digest."""
-     return hashlib.md5(content_bytes).hexdigest()
-
- tmp_path = './tmp_path/'
- res_path = './res_path/'
- os.makedirs(tmp_path, exist_ok=True)
- os.makedirs(res_path, exist_ok=True)
-
- @spaces.GPU(duration=600)  # duration set to 600 seconds (10 minutes) for video processing
- def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
-     """
-     Generate the actual video with the Sonic pipeline.
-     Chooses inference_steps for audio up to 60 seconds long,
-     then runs face detection and video generation.
-     """
-     expand_ratio = 0.0
-     min_resolution = 512
-
-     # Compute the audio duration
-     audio = AudioSegment.from_file(audio_path)
-     duration = len(audio) / 1000.0  # seconds
-
-     # Choose inference_steps from the audio length (min 25 frames, max 750 frames)
-     inference_steps = min(max(int(duration * 12.5), 25), 750)
-     print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
-
-     # Face detection
-     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
-     print(f"[INFO] Face detection info: {face_info}")
-
-     # If at least one face is detected, run the pipeline
-     if face_info['face_num'] > 0:
-         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
-         pipe.process(
-             img_path,
-             audio_path,
-             res_video_path,
-             min_resolution=min_resolution,
-             inference_steps=inference_steps,
-             dynamic_scale=dynamic_scale
  )
-         return res_video_path
-     else:
-         # No face detected at all: return -1
-         return -1
-
- def process_sonic(image, audio, dynamic_scale):
-     """
-     Called from the Gradio interface:
-     1. Validate the image/audio inputs
-     2. MD5 hash -> file name
-     3. Check the cache -> generate the video if there is no cached result
-     """
-     if image is None:
-         raise gr.Error("Please upload an image")
-     if audio is None:
-         raise gr.Error("Please upload an audio file")
-
-     # (1) Image MD5
-     buf_img = io.BytesIO()
-     image.save(buf_img, format="PNG")
-     img_bytes = buf_img.getvalue()
-     img_md5 = get_md5(img_bytes)
-
-     # (2) Audio MD5
-     sampling_rate, arr = audio[:2]
-     if len(arr.shape) == 1:
-         arr = arr[:, None]
-     audio_segment = AudioSegment(
-         arr.tobytes(),
-         frame_rate=sampling_rate,
-         sample_width=arr.dtype.itemsize,
-         channels=arr.shape[1]
-     )
-     # Convert to mono/16 kHz for Whisper compatibility
-     audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)
-
-     MAX_DURATION_MS = 60000
-     if len(audio_segment) > MAX_DURATION_MS:
-         audio_segment = audio_segment[:MAX_DURATION_MS]
-
-     buf_audio = io.BytesIO()
-     audio_segment.export(buf_audio, format="wav")
-     audio_bytes = buf_audio.getvalue()
-     audio_md5 = get_md5(audio_bytes)
-
-     # (3) File paths
-     image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
-     audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
-     res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
-
-     if not os.path.exists(image_path):
-         with open(image_path, "wb") as f:
-             f.write(img_bytes)
-     if not os.path.exists(audio_path):
-         with open(audio_path, "wb") as f:
-             f.write(audio_bytes)
-
-     # (4) Reuse the cached result if it exists
-     if os.path.exists(res_video_path):
-         print(f"[INFO] Using cached result: {res_video_path}")
-         return res_video_path
-     else:
-         print(f"[INFO] Generating new video with dynamic_scale={dynamic_scale}")
-         video_result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
-         return video_result
-
- def get_example():
-     return []
-
- css = """
- .gradio-container {
-     font-family: 'Arial', sans-serif;
- }
- .main-header {
-     text-align: center;
-     color: #2a2a2a;
-     margin-bottom: 2em;
- }
- .parameter-section {
-     background-color: #f5f5f5;
-     padding: 1em;
-     border-radius: 8px;
-     margin: 1em 0;
- }
- .example-section {
-     margin-top: 2em;
- }
- """
-
- with gr.Blocks(css=css) as demo:
-     gr.HTML("""
-     <div class="main-header">
-         <h1>🎭 Sonic: Advanced Portrait Animation</h1>
-         <p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(
-                 type='pil',
-                 label="Portrait Image",
-                 elem_id="image_input"
-             )
-             audio_input = gr.Audio(
-                 label="Voice/Audio Input (up to 1 minute)",
-                 elem_id="audio_input",
-                 type="numpy"
-             )
-         with gr.Column():
-             dynamic_scale = gr.Slider(
-                 minimum=0.5,
-                 maximum=2.0,
-                 value=1.0,
-                 step=0.1,
-                 label="Animation Intensity",
-                 info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
-             )
-             process_btn = gr.Button(
-                 "Generate Animation",
-                 variant="primary",
-                 elem_id="process_btn"
-             )
-
-         with gr.Column():
-             video_output = gr.Video(
-                 label="Generated Animation",
-                 elem_id="video_output"
-             )
-
-     process_btn.click(
-         fn=process_sonic,
-         inputs=[image_input, audio_input, dynamic_scale],
-         outputs=video_output,
-     )
-
-     gr.Examples(
-         examples=get_example(),
-         fn=process_sonic,
-         inputs=[image_input, audio_input, dynamic_scale],
-         outputs=video_output,
-         cache_examples=False
-     )
-
-     gr.HTML("""
-     <div style="text-align: center; margin-top: 2em;">
-         <div style="margin-bottom: 1em;">
-             <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
-                 <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
-             </a>
-             <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
-                 <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
-             </a>
-         </div>
-         <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio (now supports up to 1 minute!)</p>
-     </div>
-     """)
-
- demo.launch(share=True)
+ import os, math, torch, cv2
  from PIL import Image
+ from omegaconf import OmegaConf
+ from tqdm import tqdm
+
+ from diffusers import AutoencoderKLTemporalDecoder
+ from diffusers.schedulers import EulerDiscreteScheduler
+ from transformers import WhisperModel, CLIPVisionModelWithProjection, AutoFeatureExtractor
+
+ from src.utils.util import save_videos_grid, seed_everything
+ from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
+ from src.models.base.unet_spatio_temporal_condition import (
+     UNetSpatioTemporalConditionModel, add_ip_adapters,
  )
+ from src.pipelines.pipeline_sonic import SonicPipeline
+ from src.models.audio_adapter.audio_proj import AudioProjModel
+ from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
+ from src.utils.RIFE.RIFE_HDv3 import RIFEModel
+ from src.dataset.face_align.align import AlignImage
+
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+ # ------------------------------------------------------------------
+ # single image + speech → video-tensor generator
+ # ------------------------------------------------------------------
+ def test(
+     pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
+     width, height, batch,
+ ):
+     # ---- align the batch dimensions --------------------------------
+     for k, v in batch.items():
+         if isinstance(v, torch.Tensor):
+             batch[k] = v.unsqueeze(0).to(pipe.device).float()
+
+     ref_img = batch["ref_img"]
+     clip_img = batch["clip_images"]
+     face_mask = batch["face_mask"]
+     image_embeds = image_encoder(clip_img).image_embeds  # (1,1024)
+
+     audio_feature = batch["audio_feature"]  # (1, 80, T)
+     audio_len = int(batch["audio_len"])
+     step = int(config.step)
+
+     window = 16_000  # 1-sec chunks
+     audio_prompts, last_prompts = [], []
+
+     for i in range(0, audio_feature.shape[-1], window):
+         chunk = audio_feature[:, :, i : i + window]  # (1, 80, win)
+         layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
+         last = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
+         audio_prompts.append(torch.stack(layers, dim=2))  # (1, w, L, 384)
+         last_prompts.append(last)
+
+     if not audio_prompts:
+         raise ValueError("[ERROR] No speech recognised in the provided audio.")
+
+     audio_prompts = torch.cat(audio_prompts, dim=1)
+     last_prompts = torch.cat(last_prompts, dim=1)
+
+     # padding rule
+     audio_prompts = torch.cat(
+         [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
+          torch.zeros_like(audio_prompts[:, :6])], dim=1)
+     last_prompts = torch.cat(
+         [torch.zeros_like(last_prompts[:, :24]), last_prompts,
+          torch.zeros_like(last_prompts[:, :26])], dim=1)
+
+     total_tokens = audio_prompts.shape[1]
+     num_chunks = max(1, math.ceil(total_tokens / (2 * step)))
+
+     ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []
+
+     for i in tqdm(range(num_chunks)):
+         start = i * 2 * step
+
+         # ------------ cond_clip : (1,1,10,5,384) ------------------
+         clip_raw = audio_prompts[:, start : start + 10]  # (1, ≤10, L, 384)
+
+         # ★ W-padding must be along dim=1!
+         if clip_raw.shape[1] < 10:
+             pad_w = torch.zeros_like(clip_raw[:, : 10 - clip_raw.shape[1]])
+             clip_raw = torch.cat([clip_raw, pad_w], dim=1)
+
+         # ★ L-padding is along dim=2
+         while clip_raw.shape[2] < 5:
+             clip_raw = torch.cat([clip_raw, clip_raw[:, :, -1:]], dim=2)
+         clip_raw = clip_raw[:, :, :5]  # (1,10,5,384)
+
+         cond_clip = clip_raw.unsqueeze(1)  # (1,1,10,5,384)
+
+         # ------------ bucket_clip : (1,1,50,1,384) -----------------
+         bucket_raw = last_prompts[:, start : start + 50]
+         if bucket_raw.shape[1] < 50:  # ★ dim=1
+             pad_w = torch.zeros_like(bucket_raw[:, : 50 - bucket_raw.shape[1]])
+             bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
+         bucket_clip = bucket_raw.unsqueeze(1)  # (1,1,50,1,384)
+
+         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
+
+         ref_list.append(ref_img[0])
+         audio_list.append(audio_pe(cond_clip).squeeze(0))  # (50,1024)
+         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0))
+         motion_buckets.append(motion[0])
+
+     # ---- call Stable-Video-Diffusion --------------------------------
+     video = pipe(
+         ref_img, clip_img, face_mask,
+         audio_list, uncond_list, motion_buckets,
+         height=height, width=width,
+         num_frames=len(audio_list),
+         decode_chunk_size=config.decode_chunk_size,
+         motion_bucket_scale=config.motion_bucket_scale,
+         fps=config.fps,
+         noise_aug_strength=config.noise_aug_strength,
+         min_guidance_scale1=config.min_appearance_guidance_scale,
+         max_guidance_scale1=config.max_appearance_guidance_scale,
+         min_guidance_scale2=config.audio_guidance_scale,
+         max_guidance_scale2=config.audio_guidance_scale,
+         overlap=config.overlap,
+         shift_offset=config.shift_offset,
+         frames_per_batch=config.n_sample_frames,
+         num_inference_steps=config.num_inference_steps,
+         i2i_noise_strength=config.i2i_noise_strength,
+     ).frames
+
+     video = (video * 0.5 + 0.5).clamp(0, 1)
+     return video.to(pipe.device).unsqueeze(0).cpu()
+
+
+ # ------------------------------------------------------------------
+ # Sonic class
+ # ------------------------------------------------------------------
+ class Sonic:
+     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
+     config = OmegaConf.load(config_file)
+
+     def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
+         cfg = self.config
+         cfg.use_interframe = enable_interpolate_frame
+         self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
+         cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)
+
+         self._load_models(cfg)
+         print("Sonic init done")
+
+     # --------------------------------------------------------------
+     def _load_models(self, cfg):
+         dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]
+
+         vae = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
+         sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
+         img_e = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
+         unet = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
+         add_ip_adapters(unet, [32], [cfg.ip_audio_scale])
+
+         a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
+         a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)
+
+         unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+         a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+         a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
+
+         whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
+         whisper.requires_grad_(False)
+
+         self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny"))
+         self.face_det = AlignImage(self.device, det_path=os.path.join(BASE_DIR, "checkpoints/yoloface_v5m.pt"))
+         if cfg.use_interframe:
+             self.rife = RIFEModel(device=self.device)
+             self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))
+
+         img_e.to(dtype); vae.to(dtype); unet.to(dtype)
+
+         self.pipe = SonicPipeline(unet=unet, image_encoder=img_e, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
+         self.image_encoder = img_e
+         self.audio2token = a2t
+         self.audio2bucket = a2b
+         self.whisper = whisper
+
+     # --------------------------------------------------------------
+     def preprocess(self, img_path: str, expand_ratio: float = 1.0):
+         img = cv2.imread(img_path)
+         h, w = img.shape[:2]
+         _, _, faces = self.face_det(img, maxface=True)
+         if faces:
+             x1, y1, ww, hh = faces[0]
+             return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)}
+         return {"face_num": 0, "crop_bbox": None}
+
+     # --------------------------------------------------------------
+     @torch.no_grad()
+     def process(
+         self,
+         img_path: str,
+         audio_path: str,
+         out_path: str,
+         min_resolution: int = 512,
+         inference_steps: int = 25,
+         dynamic_scale: float = 1.0,
+         keep_resolution: bool = False,
+         seed: int | None = None,
+     ):
+         cfg = self.config
+         if seed is not None: cfg.seed = seed
+         cfg.num_inference_steps = inference_steps
+         cfg.motion_bucket_scale = dynamic_scale
+         seed_everything(cfg.seed)
+
+         sample = image_audio_to_tensor(
+             self.face_det, self.feature_extractor,
+             img_path, audio_path,
+             limit=-1, image_size=min_resolution, area=cfg.area,
+         )
+         if sample is None:
+             return -1
+
+         h, w = sample["ref_img"].shape[-2:]
+         resolution = (f"{(Image.open(img_path).width // 2) * 2}x{(Image.open(img_path).height // 2) * 2}"
+                       if keep_resolution else f"{w}x{h}")
+
+         video = test(
+             self.pipe, cfg, self.whisper, self.audio2token,
+             self.audio2bucket, self.image_encoder,
+             w, h, sample,
+         )
+
+         if cfg.use_interframe:
+             out = video.to(self.device)
+             frames = []
+             for i in tqdm(range(out.shape[2] - 1), ncols=0):
+                 mid = self.rife.inference(out[:, :, i], out[:, :, i + 1]).clamp(0, 1).detach()
+                 frames.extend([out[:, :, i], mid])
+             frames.append(out[:, :, -1])
+             video = torch.stack(frames, 2).cpu()
+
+         tmp = out_path.replace(".mp4", "_noaudio.mp4")
+         save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
+         os.system(
+             f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
+             f"-vcodec libx264 -acodec aac -crf 18 -shortest '{out_path}' -y -loglevel error"
  )
+         os.remove(tmp)
+         return 0
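
A minimal usage sketch of the refactored class above, mirroring the call pattern the previous app.py used (preprocess to check for a face, then process to render). The module name and the input/output paths are placeholders, and it assumes the checkpoints referenced in config/inference/sonic.yaml have been downloaded.

    # Hypothetical driver for the Sonic class above; module name and file paths are placeholders.
    from sonic import Sonic  # assuming this file is importable as sonic.py

    pipe = Sonic(device_id=0)  # falls back to CPU when CUDA is unavailable

    face_info = pipe.preprocess("portrait.png", expand_ratio=0.5)
    if face_info["face_num"] > 0:
        # process() returns 0 on success and -1 when no usable face/audio sample is built
        pipe.process(
            "portrait.png",
            "speech.wav",
            "result.mp4",
            min_resolution=512,
            inference_steps=25,
            dynamic_scale=1.0,
        )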