Spaces:

VIDraft
/

Portrait-Animation

Running on Zero

App Files Files Community

openfree committed 29 days ago

Commit

ec118f6

verified ·

1 Parent(s): 2399e79

Update sonic.py

Browse files

Files changed (1) hide show

sonic.py +295 -292

sonic.py CHANGED Viewed

@@ -1,12 +1,8 @@
-# sonic.py
-# ---------------------------------------------------------------------
-#  Sonic – single-image + speech → talking-head video  (offline edition)
-# ---------------------------------------------------------------------
-import os, math
-from typing import Dict, Any, List
 import torch
 from PIL import Image
 from omegaconf import OmegaConf
 from tqdm import tqdm
 import cv2
@@ -17,311 +13,318 @@ from transformers import WhisperModel, CLIPVisionModelWithProjection, AutoFeatur
 from src.utils.util import save_videos_grid, seed_everything
 from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
-from src.models.base.unet_spatio_temporal_condition import (
-    UNetSpatioTemporalConditionModel,
-    add_ip_adapters,
-)
 from src.pipelines.pipeline_sonic import SonicPipeline
 from src.models.audio_adapter.audio_proj import AudioProjModel
 from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
 from src.utils.RIFE.RIFE_HDv3 import RIFEModel
 from src.dataset.face_align.align import AlignImage
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-# ------------------------------------------------------------------ #
-#                     헬퍼 : diffusers 경로 자동 찾기                  #
-# ------------------------------------------------------------------ #
-def _locate_diffusers_dir(root: str) -> str:
-    """
-    `root` 하위 디렉터리에서 diffusers 스냅샷(model_index.json or config.json)
-    이 들어 있는 실제 모델 폴더를 찾아서 반환한다. 존재하지 않으면 오류.
-    """
-    for cur, _dirs, files in os.walk(root):
-        if {"model_index.json", "config.json"} & set(files):
-            return cur
-    raise FileNotFoundError(
-        f"[ERROR] No diffusers model files found under '{root}'. "
-        "Check that the checkpoint was downloaded correctly."
-    )
-# ------------------------------------------------------------------ #
-#                        영상 생성용 내부 함수                         #
-# ------------------------------------------------------------------ #
-def _gen_video_tensor(
-    pipe: SonicPipeline,
-    cfg: OmegaConf,
-    wav_enc: WhisperModel,
-    audio_pe: AudioProjModel,
-    audio2bucket: Audio2bucketModel,
-    image_encoder: CLIPVisionModelWithProjection,
-    width: int,
-    height: int,
-    batch: Dict[str, torch.Tensor],
-) -> torch.Tensor:
-    """
-    single 이미지 + 오디오 feature → video tensor (C,T,H,W)
-    """
-    # -------- batch 차원 보정 --------------------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
-    ref_img   = batch["ref_img"]        # (1,C,H,W)
-    clip_img  = batch["clip_images"]
-    face_mask = batch["face_mask"]
-    image_embeds = image_encoder(clip_img).image_embeds
-    audio_feat: torch.Tensor = batch["audio_feature"]          # (1, 80, T)
-    audio_len:  int          = int(batch["audio_len"])         # scalar
-    step:       int          = int(cfg.step)
-    # step 이 전체 길이보다 크면 최소 1 로 보정
-    if audio_len < step:
-        step = max(1, audio_len)
-    # -------- Whisper encoder 1초 단위로 수행 ----------------------
-    window = 16_000                                             # 1-s chunk
-    aud_prompts: List[torch.Tensor] = []
-    last_prompts: List[torch.Tensor] = []
-    for i in range(0, audio_feat.shape[-1], window):
-        chunk = audio_feat[:, :, i : i + window]
-        # 모든 hidden-states / 마지막 hidden-state
-        layers: List[torch.Tensor] = wav_enc.encoder(
-            chunk, output_hidden_states=True
-        ).hidden_states
-        last_hidden = wav_enc.encoder(chunk).last_hidden_state  # (1, 80, 384)
-        # Whisper layer 는 6개 → AudioProj 가 기대하는 5개로 truncate
-        prompt = torch.stack(layers, dim=2)[:, :, :5]           # (1,80,5,384)
-        aud_prompts.append(prompt)
-        last_prompts.append(last_hidden.unsqueeze(-2))          # (1,80,1,384)
-    if len(aud_prompts) == 0:
-        raise ValueError("[ERROR] No speech recognised in the provided audio.")
-    # concat 뒤 padding 규칙 적용
-    aud_prompts = torch.cat(aud_prompts, dim=1)                 # (1, 80*…, 5, 384)
-    last_prompts = torch.cat(last_prompts, dim=1)               # (1, 80*…, 1, 384)
-    aud_prompts = torch.cat(
-        [torch.zeros_like(aud_prompts[:, :4]), aud_prompts, torch.zeros_like(aud_prompts[:, :6])],
-        dim=1,
-    )
-    last_prompts = torch.cat(
-        [torch.zeros_like(last_prompts[:, :24]), last_prompts, torch.zeros_like(last_prompts[:, :26])],
-        dim=1,
-    )
-    # --------  f=10 / w=5 로 clip 자르기 --------------------------
-    ref_list, aud_list, uncond_list, mb_list = [], [], [], []
-    total_tokens = aud_prompts.shape[1]
-    n_chunks = max(1, math.ceil(total_tokens / (2 * step)))
-    for i in tqdm(range(n_chunks), desc="audio-chunks", ncols=0):
-        s = i * 2 * step
-        cond_clip = aud_prompts[:, s : s + 10]                  # (1,10,5,384)
-        if cond_clip.shape[1] < 10:                             # 뒤쪽 padding
-            pad = torch.zeros_like(cond_clip[:, : 10 - cond_clip.shape[1]])
-            cond_clip = torch.cat([cond_clip, pad], dim=1)
-        bucket_clip = last_prompts[:, s : s + 50]               # (1,50,1,384)
-        if bucket_clip.shape[1] < 50:
-            pad = torch.zeros_like(bucket_clip[:, : 50 - bucket_clip.shape[1]])
-            bucket_clip = torch.cat([bucket_clip, pad], dim=1)
-        # (bz,f,w,b,c) 5-D 로 변환
-        cond_clip      = cond_clip.unsqueeze(3)                 # (1,10,5,1,384)
-        bucket_clip    = bucket_clip.unsqueeze(3)               # (1,50,1,1,384)
-        uncond_clip    = torch.zeros_like(cond_clip)
-        motion_bucket  = audio2bucket(bucket_clip, image_embeds) * 16 + 16
-        ref_list      .append(ref_img[0])
-        aud_list      .append(audio_pe(cond_clip).squeeze(0)[0])      # (ctx,1024)
-        uncond_list   .append(audio_pe(uncond_clip).squeeze(0)[0])    # (ctx,1024)
-        mb_list       .append(motion_bucket[0])
-    # --------  UNet 파이프라인 실행 --------------------------------
-    video = (
-        pipe(
-            ref_img,
-            clip_img,
-            face_mask,
-            aud_list,
-            uncond_list,
-            mb_list,
-            height=height,
-            width=width,
-            num_frames=len(aud_list),
-            decode_chunk_size=cfg.decode_chunk_size,
-            motion_bucket_scale=cfg.motion_bucket_scale,
-            fps=cfg.fps,
-            noise_aug_strength=cfg.noise_aug_strength,
-            min_guidance_scale1=cfg.min_appearance_guidance_scale,
-            max_guidance_scale1=cfg.max_appearance_guidance_scale,
-            min_guidance_scale2=cfg.audio_guidance_scale,
-            max_guidance_scale2=cfg.audio_guidance_scale,
-            overlap=cfg.overlap,
-            shift_offset=cfg.shift_offset,
-            frames_per_batch=cfg.n_sample_frames,
-            num_inference_steps=cfg.num_inference_steps,
-            i2i_noise_strength=cfg.i2i_noise_strength,
-        ).frames
-        * 0.5
-        + 0.5
-    ).clamp(0, 1)
-    # (B,C,T,H,W)   → (C,T,H,W)
-    return video.to(pipe.device).squeeze(0).cpu()
-# ------------------------------------------------------------------ #
-#                         Sonic  –  main class                        #
-# ------------------------------------------------------------------ #
-class Sonic:
-    config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
-    config      = OmegaConf.load(config_file)
-    def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
-        cfg                = self.config
-        cfg.use_interframe = enable_interpolate_frame
-        # diffusers 모델 상위 폴더 (로컬 다운로드 경로)
-        self.diffusers_root = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)
-        self.device = (
-            f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
         )
-        self._load_models(cfg)
-        print("Sonic init done")
-    # -------------------------------------------------------------- #
-    def _load_models(self, cfg):
-        # dtype
-        dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]
-        diff_root = _locate_diffusers_dir(self.diffusers_root)
-        # diffusers 모듈들
-        vae   = AutoencoderKLTemporalDecoder.from_pretrained(diff_root, subfolder="vae", variant="fp16")
-        sched = EulerDiscreteScheduler.from_pretrained(diff_root, subfolder="scheduler")
-        img_e = CLIPVisionModelWithProjection.from_pretrained(diff_root, subfolder="image_encoder", variant="fp16")
-        unet  = UNetSpatioTemporalConditionModel.from_pretrained(diff_root, subfolder="unet", variant="fp16")
-        add_ip_adapters(unet, [32], [cfg.ip_audio_scale])
-        # 오디오 어댑터
-        a2t = AudioProjModel(seq_len=10, blocks=5, channels=384,
-                             intermediate_dim=1024, output_dim=1024, context_tokens=32).to(self.device)
-        a2b = Audio2bucketModel(seq_len=50, blocks=1, channels=384,
-                                clip_channels=1024, intermediate_dim=1024, output_dim=1,
-                                context_tokens=2).to(self.device)
-        # 체크포인트 로드
-        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
-        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
-        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
-        # Whisper
-        whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
         whisper.requires_grad_(False)
-        # 이미지 / 얼굴 / 보간
-        self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny"))
-        self.face_det = AlignImage(self.device, det_path=os.path.join(BASE_DIR, "checkpoints/yoloface_v5m.pt"))
-        if cfg.use_interframe:
-            self.rife = RIFEModel(device=self.device)
-            self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))
-        # dtype 적용
-        for m in (vae, img_e, unet):
-            m.to(dtype)
-        self.pipe          = SonicPipeline(unet=unet, image_encoder=img_e, vae=vae, scheduler=sched).to(self.device, dtype=dtype)
-        self.image_encoder = img_e
-        self.audio2token   = a2t
-        self.audio2bucket  = a2b
-        self.whisper       = whisper
-    # -------------------------------------------------------------- #
-    def preprocess(self, image_path: str, expand_ratio: float = 1.0) -> Dict[str, Any]:
-        img = cv2.imread(image_path)
-        h, w = img.shape[:2]
-        _, _, bboxes = self.face_det(img, maxface=True)
-        if bboxes:
             x1, y1, ww, hh = bboxes[0]
-            crop = process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)
-            return {"face_num": 1, "crop_bbox": crop}
-        return {"face_num": 0, "crop_bbox": None}
-    # -------------------------------------------------------------- #
     @torch.no_grad()
-    def process(
-        self,
-        image_path: str,
-        audio_path: str,
-        output_path: str,
-        min_resolution: int = 512,
-        inference_steps: int = 25,
-        dynamic_scale: float = 1.0,
-        keep_resolution: bool = False,
-        seed: int | None = None,
-    ) -> int:
-        cfg = self.config
-        if seed is not None:
-            cfg.seed = seed
-        cfg.num_inference_steps   = inference_steps
-        cfg.motion_bucket_scale   = dynamic_scale
-        seed_everything(cfg.seed)
-        # 이미지·오디오 tensor 변환
-        data = image_audio_to_tensor(
-            self.face_det,
-            self.feature_extractor,
-            image_path,
-            audio_path,
-            limit=-1,
-            image_size=min_resolution,
-            area=cfg.area,
-        )
-        if data is None:
             return -1
-        h, w = data["ref_img"].shape[-2:]
         if keep_resolution:
-            im = Image.open(image_path)
-            resolution = f"{(im.width // 2) * 2}x{(im.height // 2) * 2}"
         else:
-            resolution = f"{w}x{h}"
-        # video tensor 생성
-        video = _gen_video_tensor(
-            self.pipe, cfg, self.whisper, self.audio2token, self.audio2bucket,
-            self.image_encoder, w, h, data,
-        )
-        # 중간 프레임 보간
-        if cfg.use_interframe:
-            out = video.to(self.device)
-            frames = []
-            for i in tqdm(range(out.shape[1] - 1), desc="interpolate", ncols=0):
-                frames.extend([out[:, i], self.rife.inference(out[:, i], out[:, i + 1]).clamp(0, 1)])
-            frames.append(out[:, -1])
-            video = torch.stack(frames, 1).cpu()  # (C,T',H,W)
-        # 저장
-        tmp = output_path.replace(".mp4", "_noaudio.mp4")
-        save_videos_grid(video.unsqueeze(0), tmp, n_rows=1, fps=cfg.fps * (2 if cfg.use_interframe else 1))
-        os.system(
-            f"ffmpeg -loglevel error -y -i '{tmp}' -i '{audio_path}' -s {resolution} "
-            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}'"
-        )
-        os.remove(tmp)
         return 0

+import os
 import torch
+import torch.utils.checkpoint
 from PIL import Image
+import numpy as np
 from omegaconf import OmegaConf
 from tqdm import tqdm
 import cv2
 from src.utils.util import save_videos_grid, seed_everything
 from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
+from src.models.base.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel, add_ip_adapters
 from src.pipelines.pipeline_sonic import SonicPipeline
 from src.models.audio_adapter.audio_proj import AudioProjModel
 from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
 from src.utils.RIFE.RIFE_HDv3 import RIFEModel
 from src.dataset.face_align.align import AlignImage
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
def test(
    pipe,
    config,
    wav_enc,
    audio_pe,
    audio2bucket,
    image_encoder,
    width,
    height,
    batch
):
    """Run the Sonic pipeline on one preprocessed sample and return the frames.

    Args:
        pipe: Callable pipeline exposing ``.device``; its result must expose
            ``.frames``.
        config: Inference config providing ``step``, ``decode_chunk_size``,
            ``motion_bucket_scale``, ``fps``, ``noise_aug_strength``, the
            guidance-scale fields, ``overlap``, ``shift_offset``,
            ``n_sample_frames``, ``num_inference_steps`` and
            ``i2i_noise_strength``.
        wav_enc: Model whose ``.encoder(features, output_hidden_states=True)``
            returns ``hidden_states`` and ``last_hidden_state``.
        audio_pe: Module projecting a window of audio features to the audio
            conditioning passed to ``pipe``.
        audio2bucket: Module mapping a window of last-layer audio features
            plus image embeddings to a motion-bucket value.
        image_encoder: Module whose output exposes ``.image_embeds``.
        width: Output width forwarded to ``pipe``.
        height: Output height forwarded to ``pipe``.
        batch: Mapping with keys ``ref_img``, ``clip_images``, ``face_mask``,
            ``audio_feature`` and ``audio_len``. It is not modified.

    Returns:
        The pipeline frames rescaled with ``x * 0.5 + 0.5``, clamped to
        ``[0, 1]`` and moved to the CPU.

    Raises:
        ValueError: If ``audio_len`` is shorter than one ``step`` so that no
            frame would be generated.
    """
    # Build a batched copy instead of overwriting the caller's dict in place:
    # tensors gain a leading batch axis, move to the pipeline device and are
    # cast to float32; every other value is passed through untouched.
    batch = {
        key: value.unsqueeze(0).to(pipe.device).float() if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }

    ref_img = batch['ref_img']
    clip_img = batch['clip_images']
    face_mask = batch['face_mask']
    image_embeds = image_encoder(clip_img).image_embeds

    audio_feature = batch['audio_feature']
    # int() accepts both a plain number and a single-element tensor.
    audio_len = int(batch['audio_len'])
    step = int(config.step)

    # One conditioning clip is produced per `step` units of audio. Fail early
    # with a clear message rather than calling the pipeline with zero frames.
    num_clips = audio_len // step
    if num_clips < 1:
        raise ValueError(
            f"Audio is too short to generate any frame: audio_len={audio_len}, step={step}"
        )

    # Encode the audio features window by window. A single forward pass with
    # output_hidden_states=True yields both the per-layer states and the final
    # state, so the encoder does not need to run twice on the same window.
    window = 3000
    audio_prompts = []
    last_audio_prompts = []
    for start in range(0, audio_feature.shape[-1], window):
        encoded = wav_enc.encoder(
            audio_feature[:, :, start:start + window],
            output_hidden_states=True,
        )
        # Stack the per-layer states on a new axis 2.
        audio_prompts.append(torch.stack(encoded.hidden_states, dim=2))
        last_audio_prompts.append(encoded.last_hidden_state.unsqueeze(-2))

    # Trim to twice the audio length, then zero-pad along the time axis:
    # 4 steps before / 6 after for the 10-step conditioning window, and
    # 24 before / 26 after for the 50-step motion-bucket window.
    audio_prompts = torch.cat(audio_prompts, dim=1)[:, :audio_len * 2]
    audio_prompts = torch.cat(
        [
            torch.zeros_like(audio_prompts[:, :4]),
            audio_prompts,
            torch.zeros_like(audio_prompts[:, :6]),
        ],
        1,
    )
    last_audio_prompts = torch.cat(last_audio_prompts, dim=1)[:, :audio_len * 2]
    last_audio_prompts = torch.cat(
        [
            torch.zeros_like(last_audio_prompts[:, :24]),
            last_audio_prompts,
            torch.zeros_like(last_audio_prompts[:, :26]),
        ],
        1,
    )

    audio_tensor_list = []
    uncond_audio_tensor_list = []
    motion_buckets = []
    for clip_idx in tqdm(range(num_clips)):
        # Consecutive clips start 2 * step feature positions apart.
        start = clip_idx * 2 * step
        audio_clip = audio_prompts[:, start:start + 10].unsqueeze(0)
        audio_clip_for_bucket = last_audio_prompts[:, start:start + 50].unsqueeze(0)

        motion_bucket = audio2bucket(audio_clip_for_bucket, image_embeds)
        motion_bucket = motion_bucket * 16 + 16
        motion_buckets.append(motion_bucket[0])

        cond_audio_clip = audio_pe(audio_clip).squeeze(0)
        # An all-zero clip provides the unconditional audio branch.
        uncond_audio_clip = audio_pe(torch.zeros_like(audio_clip)).squeeze(0)
        audio_tensor_list.append(cond_audio_clip[0])
        uncond_audio_tensor_list.append(uncond_audio_clip[0])

    video = pipe(
        ref_img,
        clip_img,
        face_mask,
        audio_tensor_list,
        uncond_audio_tensor_list,
        motion_buckets,
        height=height,
        width=width,
        num_frames=len(audio_tensor_list),
        decode_chunk_size=config.decode_chunk_size,
        motion_bucket_scale=config.motion_bucket_scale,
        fps=config.fps,
        noise_aug_strength=config.noise_aug_strength,
        min_guidance_scale1=config.min_appearance_guidance_scale,
        max_guidance_scale1=config.max_appearance_guidance_scale,
        min_guidance_scale2=config.audio_guidance_scale,
        max_guidance_scale2=config.audio_guidance_scale,
        overlap=config.overlap,
        shift_offset=config.shift_offset,
        frames_per_batch=config.n_sample_frames,
        num_inference_steps=config.num_inference_steps,
        i2i_noise_strength=config.i2i_noise_strength
    ).frames

    # Apply the x * 0.5 + 0.5 rescale and clamp to [0, 1].
    video = (video * 0.5 + 0.5).clamp(0, 1)
    return video.cpu()
class Sonic():
    """Single-image + audio talking-head video generator.

    The inference config is loaded once at class-definition time and is shared
    by every instance; ``__init__`` and ``process`` write run-specific values
    (interpolation flag, seed, step count, motion scale) into that shared
    object.
    """

    config_file = os.path.join(BASE_DIR, 'config/inference/sonic.yaml')
    config = OmegaConf.load(config_file)

    def __init__(self,
                 device_id=0,
                 enable_interpolate_frame=True,
                 ):
        """Load every model needed for inference.

        Args:
            device_id: CUDA device index; any value below 0 selects the CPU.
            enable_interpolate_frame: When true, also load the RIFE model used
                by ``process`` to insert one interpolated frame between each
                pair of generated frames.

        Raises:
            ValueError: If ``config.weight_dtype`` is not one of ``fp16``,
                ``fp32`` or ``bf16``.
        """
        config = self.config
        config.use_interframe = enable_interpolate_frame

        # Validate the requested precision before any heavy model is loaded.
        dtype_by_name = {
            "fp16": torch.float16,
            "fp32": torch.float32,
            "bf16": torch.bfloat16,
        }
        if config.weight_dtype not in dtype_by_name:
            raise ValueError(
                f"Do not support weight dtype: {config.weight_dtype} during training"
            )
        weight_dtype = dtype_by_name[config.weight_dtype]

        device = 'cuda:{}'.format(device_id) if device_id > -1 else 'cpu'

        # BASE_DIR is absolute, so joining again on a later instantiation
        # leaves an already-resolved path unchanged.
        config.pretrained_model_name_or_path = os.path.join(BASE_DIR, config.pretrained_model_name_or_path)
        model_root = config.pretrained_model_name_or_path

        vae = AutoencoderKLTemporalDecoder.from_pretrained(
            model_root,
            subfolder="vae",
            variant="fp16")
        val_noise_scheduler = EulerDiscreteScheduler.from_pretrained(
            model_root,
            subfolder="scheduler")
        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            model_root,
            subfolder="image_encoder",
            variant="fp16")
        unet = UNetSpatioTemporalConditionModel.from_pretrained(
            model_root,
            subfolder="unet",
            variant="fp16")
        add_ip_adapters(unet, [32], [config.ip_audio_scale])

        audio2token = AudioProjModel(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=1024, context_tokens=32).to(device)
        audio2bucket = Audio2bucketModel(seq_len=50, blocks=1, channels=384, clip_channels=1024, intermediate_dim=1024, output_dim=1, context_tokens=2).to(device)

        # Load the fine-tuned weights on the CPU first; strict=True makes any
        # key mismatch an immediate error.
        checkpoints = (
            (unet, config.unet_checkpoint_path),
            (audio2token, config.audio2token_checkpoint_path),
            (audio2bucket, config.audio2bucket_checkpoint_path),
        )
        for module, relative_path in checkpoints:
            module.load_state_dict(
                torch.load(os.path.join(BASE_DIR, relative_path), map_location="cpu"),
                strict=True,
            )

        whisper_dir = os.path.join(BASE_DIR, 'checkpoints/whisper-tiny/')
        whisper = WhisperModel.from_pretrained(whisper_dir).to(device).eval()
        whisper.requires_grad_(False)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_dir)

        det_path = os.path.join(BASE_DIR, 'checkpoints/yoloface_v5m.pt')
        self.face_det = AlignImage(device, det_path=det_path)

        # Always define the attribute so process() can test for it even when
        # interpolation was disabled for this instance.
        self.rife = None
        if config.use_interframe:
            rife = RIFEModel(device=device)
            rife.load_model(os.path.join(BASE_DIR, 'checkpoints', 'RIFE/'))
            self.rife = rife

        image_encoder.to(weight_dtype)
        vae.to(weight_dtype)
        unet.to(weight_dtype)

        pipe = SonicPipeline(
            unet=unet,
            image_encoder=image_encoder,
            vae=vae,
            scheduler=val_noise_scheduler,
        )
        pipe = pipe.to(device=device, dtype=weight_dtype)

        self.pipe = pipe
        self.whisper = whisper
        self.audio2token = audio2token
        self.audio2bucket = audio2bucket
        self.image_encoder = image_encoder
        self.device = device
        print('init done')

    def preprocess(self,
              image_path, expand_ratio=1.0):
        """Detect the main face in an image and compute its crop box.

        Args:
            image_path: Path of the image to analyse.
            expand_ratio: Expansion factor forwarded to ``process_bbox``.

        Returns:
            A dict with ``face_num`` (number of boxes returned by the
            detector) and ``crop_bbox`` (the processed box of the first
            detection, or ``None`` when no face was found).

        Raises:
            ValueError: If the image cannot be read.
        """
        face_image = cv2.imread(image_path)
        if face_image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError(f'Could not read image: {image_path}')
        h, w = face_image.shape[:2]

        _, _, bboxes = self.face_det(face_image, maxface=True)
        face_num = len(bboxes)

        # Stays None when nothing is detected, so the return below never
        # references an unbound name.
        crop_bbox = None
        if face_num > 0:
            x1, y1, ww, hh = bboxes[0]
            bbox = x1, y1, x1 + ww, y1 + hh
            # NOTE(review): the keyword is spelled `expand_radio` in the
            # original call; presumably it matches process_bbox's signature.
            crop_bbox = process_bbox(bbox, expand_radio=expand_ratio, height=h, width=w)

        return {
            'face_num': face_num,
            'crop_bbox': crop_bbox,
        }

    def crop_image(self,
                   input_image_path,
                   output_image_path,
                   crop_bbox):
        """Write the ``crop_bbox`` region of an image to a new file.

        Args:
            input_image_path: Path of the source image.
            output_image_path: Destination path for the cropped image.
            crop_bbox: Sequence indexed as ``(x1, y1, x2, y2)``.

        Raises:
            ValueError: If the source image cannot be read.
        """
        face_image = cv2.imread(input_image_path)
        if face_image is None:
            raise ValueError(f'Could not read image: {input_image_path}')
        left, top, right, bottom = crop_bbox[0], crop_bbox[1], crop_bbox[2], crop_bbox[3]
        cv2.imwrite(output_image_path, face_image[top:bottom, left:right])

    @torch.no_grad()
    def process(self,
                image_path,
                audio_path,
                output_path,
                min_resolution=512,
                inference_steps=25,
                dynamic_scale=1.0,
                keep_resolution=False,
                seed=None):
        """Generate a talking-head video and mux it with the input audio.

        Args:
            image_path: Path of the reference image.
            audio_path: Path of the driving audio.
            output_path: Path of the final video file.
            min_resolution: Passed to ``image_audio_to_tensor`` as
                ``image_size``.
            inference_steps: Number of inference steps for the pipeline.
            dynamic_scale: Stored as ``config.motion_bucket_scale``.
            keep_resolution: When true, the output is scaled to the source
                image size rounded down to even numbers; otherwise to the
                size of the preprocessed reference tensor.
            seed: Optional seed. ``None`` keeps the seed currently stored in
                the shared config; any other value, including 0, replaces it.

        Returns:
            ``-1`` when ``image_audio_to_tensor`` returns no data, otherwise
            ``0``.
        """
        # Imported locally so this method does not depend on the file's
        # top-level import block.
        import subprocess

        config = self.config
        device = self.device

        # `is not None` rather than truthiness so that seed=0 is honoured.
        if seed is not None:
            config.seed = seed
        config.num_inference_steps = inference_steps
        config.motion_bucket_scale = dynamic_scale
        seed_everything(config.seed)

        # The intermediate silent video sits next to the output. Deriving it
        # from the extension guarantees it differs from output_path even when
        # the name has no lowercase ".mp4" in it.
        stem, ext = os.path.splitext(output_path)
        video_path = f'{stem}_noaudio{ext or ".mp4"}'

        with Image.open(image_path) as source_image:
            raw_w, raw_h = source_image.size

        test_data = image_audio_to_tensor(self.face_det, self.feature_extractor, image_path, audio_path, limit=config.frame_num, image_size=min_resolution, area=config.area)
        if test_data is None:
            return -1

        height, width = test_data['ref_img'].shape[-2:]
        if keep_resolution:
            # Round down to even numbers before handing the size to ffmpeg.
            resolution = f'{raw_w//2*2}x{raw_h//2*2}'
        else:
            resolution = f'{width}x{height}'

        video = test(
            self.pipe,
            config,
            wav_enc=self.whisper,
            audio_pe=self.audio2token,
            audio2bucket=self.audio2bucket,
            image_encoder=self.image_encoder,
            width=width,
            height=height,
            batch=test_data,
        )

        # The config is shared between instances, so also require that this
        # instance actually loaded the interpolation model.
        rife = getattr(self, 'rife', None)
        interpolate = bool(config.use_interframe) and rife is not None
        if interpolate:
            out = video.to(device)
            results = []
            video_len = out.shape[2]
            # Emit each frame followed by the interpolated frame between it
            # and its successor, then the final frame on its own.
            for idx in tqdm(range(video_len - 1), ncols=0):
                current_frame = out[:, :, idx]
                next_frame = out[:, :, idx + 1]
                middle = rife.inference(current_frame, next_frame).clamp(0, 1).detach()
                results.append(current_frame)
                results.append(middle)
            results.append(out[:, :, video_len - 1])
            video = torch.stack(results, 2).cpu()

        # Interpolation doubles the frame count, so double the frame rate.
        fps = config.fps * 2 if interpolate else config.fps
        save_videos_grid(video, video_path, n_rows=video.shape[0], fps=fps)

        # An argument list (no shell) keeps quotes or shell metacharacters in
        # the paths from breaking or hijacking the command.
        command = [
            'ffmpeg', '-y',
            '-i', video_path,
            '-i', audio_path,
            '-s', resolution,
            '-vcodec', 'libx264',
            '-acodec', 'aac',
            '-crf', '18',
            '-shortest',
            output_path,
        ]
        try:
            completed = subprocess.run(command, check=False)
            if completed.returncode != 0:
                print(f'ffmpeg exited with code {completed.returncode}; {output_path} may be missing or incomplete')
        except OSError as err:
            # e.g. the ffmpeg binary is not installed; report and carry on,
            # as the shell-based version did.
            print(f'could not run ffmpeg: {err}')
        finally:
            # The silent intermediate file is removed whether or not muxing
            # succeeded.
            if os.path.exists(video_path):
                os.remove(video_path)
        return 0