import os import math import torch import random import torchaudio import folder_paths import numpy as np import platform import subprocess import sys import importlib.util import importlib.machinery import argparse from omegaconf import OmegaConf from PIL import Image import shutil import decimal from decimal import Decimal, ROUND_UP def import_inference_script(script_path): """Import a Python file as a module using its file path.""" if not os.path.exists(script_path): raise ImportError(f"Script not found: {script_path}") module_name = "latentsync_inference" spec = importlib.util.spec_from_file_location(module_name, script_path) if spec is None: raise ImportError(f"Failed to create module spec for {script_path}") module = importlib.util.module_from_spec(spec) sys.modules[module_name] = module try: spec.loader.exec_module(module) except Exception as e: del sys.modules[module_name] raise ImportError(f"Failed to execute module: {str(e)}") return module def check_ffmpeg(): try: if platform.system() == "Windows": # Check if ffmpeg exists in PATH ffmpeg_path = shutil.which("ffmpeg.exe") if ffmpeg_path is None: # Look for ffmpeg in common locations possible_paths = [ os.path.join(os.environ.get("ProgramFiles", "C:\\Program Files"), "ffmpeg", "bin"), os.path.join(os.environ.get("ProgramFiles(x86)", "C:\\Program Files (x86)"), "ffmpeg", "bin"), os.path.join(os.path.dirname(os.path.abspath(__file__)), "ffmpeg", "bin"), ] for path in possible_paths: if os.path.exists(os.path.join(path, "ffmpeg.exe")): # Add to PATH os.environ["PATH"] = path + os.pathsep + os.environ.get("PATH", "") return True print("FFmpeg not found. Please install FFmpeg and add it to PATH") return False return True else: subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): print("FFmpeg not found. Please install FFmpeg") return False def check_and_install_dependencies(): if not check_ffmpeg(): raise RuntimeError("FFmpeg is required but not found") required_packages = [ 'omegaconf', 'pytorch_lightning', 'transformers', 'accelerate', 'huggingface_hub', 'einops', 'diffusers' ] def is_package_installed(package_name): return importlib.util.find_spec(package_name) is not None def install_package(package): python_exe = sys.executable try: subprocess.check_call([python_exe, '-m', 'pip', 'install', package], stdout=subprocess.PIPE, stderr=subprocess.PIPE) print(f"Successfully installed {package}") except subprocess.CalledProcessError as e: print(f"Error installing {package}: {str(e)}") raise RuntimeError(f"Failed to install required package: {package}") for package in required_packages: if not is_package_installed(package): print(f"Installing required package: {package}") try: install_package(package) except Exception as e: print(f"Warning: Failed to install {package}: {str(e)}") raise def normalize_path(path): """Normalize path to handle spaces and special characters""" return os.path.normpath(path).replace('\\', '/') def get_ext_dir(subpath=None, mkdir=False): dir = os.path.dirname(__file__) if subpath is not None: dir = os.path.join(dir, subpath) dir = os.path.abspath(dir) if mkdir and not os.path.exists(dir): os.makedirs(dir) return dir def save_and_reload_frames(frames, temp_dir): final_frames = [] for frame in frames: # Convert to proper range (0-1) frame = frame.float() / max(frame.max(), 1.0) # Ensure CHW format if frame.shape[0] != 3: frame = frame.permute(2, 0, 1) final_frames.append(frame) stacked = torch.stack(final_frames) print(f"Stacked min/max: {stacked.min()}, {stacked.max()}") return stacked.to(device='cpu', dtype=torch.float32) def setup_models(): cur_dir = get_ext_dir() ckpt_dir = os.path.join(cur_dir, "checkpoints") whisper_dir = os.path.join(ckpt_dir, "whisper") # Create directories if they don't exist os.makedirs(ckpt_dir, exist_ok=True) os.makedirs(whisper_dir, exist_ok=True) unet_path = os.path.join(ckpt_dir, "latentsync_unet.pt") whisper_path = os.path.join(whisper_dir, "tiny.pt") if not (os.path.exists(unet_path) and os.path.exists(whisper_path)): print("Downloading required model checkpoints... This may take a while.") try: from huggingface_hub import snapshot_download snapshot_download(repo_id="chunyu-li/LatentSync", allow_patterns=["latentsync_unet.pt", "whisper/tiny.pt"], local_dir=ckpt_dir, local_dir_use_symlinks=False) print("Model checkpoints downloaded successfully!") except Exception as e: print(f"Error downloading models: {str(e)}") print("\nPlease download models manually:") print("1. Visit: https://huggingface.co/chunyu-li/LatentSync") print("2. Download: latentsync_unet.pt and whisper/tiny.pt") print(f"3. Place them in: {ckpt_dir}") print(f" with whisper/tiny.pt in: {whisper_dir}") raise RuntimeError("Model download failed. See instructions above.") class LatentSyncNode: def __init__(self): check_and_install_dependencies() setup_models() @classmethod def INPUT_TYPES(s): return {"required": { "images": ("IMAGE",), "audio": ("AUDIO", ), "seed": ("INT", {"default": 1247}), },} CATEGORY = "LatentSyncNode" RETURN_TYPES = ("IMAGE", ) RETURN_NAMES = ("images", ) FUNCTION = "inference" def inference(self, images, audio, seed): cur_dir = get_ext_dir() ckpt_dir = os.path.join(cur_dir, "checkpoints") output_dir = folder_paths.get_output_directory() temp_dir = os.path.join(output_dir, "temp_frames") os.makedirs(output_dir, exist_ok=True) os.makedirs(temp_dir, exist_ok=True) # Create a temporary video file from the input frames output_name = ''.join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)) temp_video_path = os.path.join(output_dir, f"temp_{output_name}.mp4") output_video_path = os.path.join(output_dir, f"latentsync_{output_name}_out.mp4") # Save frames as temporary video import torchvision.io as io if isinstance(images, list): frames = torch.stack(images) else: frames = images print(f"Initial frame count: {frames.shape[0]}") frames = (frames * 255).byte() if len(frames.shape) == 3: frames = frames.unsqueeze(0) print(f"Frame count before writing video: {frames.shape[0]}") if isinstance(frames, torch.Tensor): frames = frames.cpu() try: io.write_video(temp_video_path, frames, fps=25, video_codec='h264') except TypeError: # Fallback for newer versions import av container = av.open(temp_video_path, mode='w') stream = container.add_stream('h264', rate=25) stream.width = frames.shape[2] stream.height = frames.shape[1] for frame in frames: frame = av.VideoFrame.from_ndarray(frame.numpy(), format='rgb24') packet = stream.encode(frame) container.mux(packet) # Flush stream packet = stream.encode(None) container.mux(packet) container.close() video_path = normalize_path(temp_video_path) if not os.path.exists(ckpt_dir): print("Downloading model checkpoints... This may take a while.") from huggingface_hub import snapshot_download snapshot_download(repo_id="chunyu-li/LatentSync", allow_patterns=["latentsync_unet.pt", "whisper/tiny.pt"], local_dir=ckpt_dir, local_dir_use_symlinks=False) print("Model checkpoints downloaded successfully!") inference_script_path = os.path.join(cur_dir, "scripts", "inference.py") unet_config_path = normalize_path(os.path.join(cur_dir, "configs", "unet", "second_stage.yaml")) scheduler_config_path = normalize_path(os.path.join(cur_dir, "configs")) ckpt_path = normalize_path(os.path.join(ckpt_dir, "latentsync_unet.pt")) whisper_ckpt_path = normalize_path(os.path.join(ckpt_dir, "whisper", "tiny.pt")) # resample audio to 16k hz and save to wav waveform = audio["waveform"] sample_rate = audio["sample_rate"] if waveform.dim() == 3: # Expected shape: [channels, samples] waveform = waveform.squeeze(0) if sample_rate != 16000: new_sample_rate = 16000 waveform_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)(waveform) waveform, sample_rate = waveform_16k, new_sample_rate audio_path = normalize_path(os.path.join(output_dir, f"latentsync_{output_name}_audio.wav")) torchaudio.save(audio_path, waveform, sample_rate) print(f"Using video path: {video_path}") print(f"Video file exists: {os.path.exists(video_path)}") print(f"Video file size: {os.path.getsize(video_path)} bytes") assert os.path.exists(video_path), f"video_path not exists: {video_path}" assert os.path.exists(audio_path), f"audio_path not exists: {audio_path}" try: # Add the package root to Python path package_root = os.path.dirname(cur_dir) if package_root not in sys.path: sys.path.insert(0, package_root) # Add the current directory to Python path if cur_dir not in sys.path: sys.path.insert(0, cur_dir) # Import the inference module inference_module = import_inference_script(inference_script_path) # Create a Namespace object with the arguments args = argparse.Namespace( unet_config_path=unet_config_path, inference_ckpt_path=ckpt_path, video_path=video_path, audio_path=audio_path, video_out_path=output_video_path, seed=seed, scheduler_config_path=scheduler_config_path, whisper_ckpt_path=whisper_ckpt_path ) # Load the config config = OmegaConf.load(unet_config_path) # Call main with both config and args inference_module.main(config, args) # Load the processed video back as frames processed_frames = io.read_video(output_video_path, pts_unit='sec')[0] # [T, H, W, C] print(f"Frame count after reading video: {processed_frames.shape[0]}") # Process frames following wav2lip.py pattern out_tensor_list = [] for frame in processed_frames: # Convert to numpy and ensure correct format frame = frame.numpy() # Convert frame to float32 and normalize frame = frame.astype(np.float32) / 255.0 # Convert back to tensor frame = torch.from_numpy(frame) # Ensure we have 3 channels if len(frame.shape) == 2: # If grayscale frame = frame.unsqueeze(2).repeat(1, 1, 3) elif frame.shape[2] == 4: # If RGBA frame = frame[:, :, :3] # Change to [C, H, W] format frame = frame.permute(2, 0, 1) out_tensor_list.append(frame) processed_frames = io.read_video(output_video_path, pts_unit='sec')[0] # [T, H, W, C] processed_frames = processed_frames.float() / 255.0 print(f"Frame count after normalization: {processed_frames.shape[0]}") # Fix dimensions for VideoCombine compatibility if len(processed_frames.shape) == 3: processed_frames = processed_frames.unsqueeze(0) if processed_frames.shape[0] == 1 and len(processed_frames.shape) == 4: processed_frames = processed_frames.squeeze(0) if processed_frames.shape[0] == 3: # If in CHW format processed_frames = processed_frames.permute(1, 2, 0) # Convert to HWC if processed_frames.shape[-1] == 4: # If RGBA processed_frames = processed_frames[..., :3] print(f"Final frame count: {processed_frames.shape[0]}") print(f"Final shape: {processed_frames.shape}") # Clean up if os.path.exists(temp_video_path): os.remove(temp_video_path) if os.path.exists(output_video_path): os.remove(output_video_path) shutil.rmtree(temp_dir, ignore_errors=True) except Exception as e: # Clean up on error if os.path.exists(temp_video_path): os.remove(temp_video_path) if os.path.exists(output_video_path): os.remove(output_video_path) shutil.rmtree(temp_dir, ignore_errors=True) print(f"Error during inference: {str(e)}") import traceback traceback.print_exc() raise return (processed_frames,) class VideoLengthAdjuster: @classmethod def INPUT_TYPES(s): return { "required": { "images": ("IMAGE",), "audio": ("AUDIO",), "mode": (["normal", "pingpong", "loop_to_audio"], {"default": "normal"}), "fps": ("FLOAT", {"default": 25.0, "min": 1.0, "max": 120.0}), "pingpong_smoothing": ("INT", {"default": 2, "min": 0, "max": 10}), } } CATEGORY = "LatentSyncNode" RETURN_TYPES = ("IMAGE", "AUDIO") RETURN_NAMES = ("images", "audio") FUNCTION = "adjust" def adjust(self, images, audio, mode, fps=25.0, pingpong_smoothing=2): # --- High-Precision Initialization --- ctx = decimal.getcontext() ctx.rounding = ROUND_UP # --- Audio Validation --- waveform = audio["waveform"].squeeze(0) if waveform.numel() < 10: raise ValueError("Audio input too short for processing") sample_rate = Decimal(str(audio["sample_rate"])) fps_dec = Decimal(str(fps)).quantize(Decimal('1.000')) # --- Frame Preparation --- original_frames = [images[i] for i in range(images.shape[0])] original_count = len(original_frames) # --- Ping-Pong Processing --- if mode == "pingpong": reversed_frames = original_frames[::-1] for i in range(int(pingpong_smoothing)): # Convert to int alpha = (i + 1) / (pingpong_smoothing + 1) original_frames[-1 - i] = original_frames[-1 - i] * (1 - float(alpha)) + reversed_frames[i] * float(alpha) frames = original_frames + reversed_frames[int(pingpong_smoothing):] # Convert to int else: frames = original_frames.copy() # --- Integer Conversion for Indexing --- audio_duration = Decimal(waveform.shape[1]) / sample_rate exact_frames_needed = int((audio_duration * fps_dec).to_integral_value()) # Convert to int final_video_duration = exact_frames_needed / float(fps_dec) # Use float for duration required_samples = int((final_video_duration * float(sample_rate))) # Convert to int # --- Frame Adjustment --- current_frames = len(frames) if current_frames < exact_frames_needed: repeat_times = math.ceil(exact_frames_needed / current_frames) frames = (frames * repeat_times)[:exact_frames_needed] # Now using integers elif current_frames > exact_frames_needed: frames = frames[:exact_frames_needed] # --- Audio Trimming --- adjusted_audio = waveform[:, :required_samples] return ( torch.stack(frames), {"waveform": adjusted_audio.unsqueeze(0), "sample_rate": int(sample_rate)} ) NODE_CLASS_MAPPINGS = { "D_LatentSyncNode": LatentSyncNode, "D_VideoLengthAdjuster": VideoLengthAdjuster, } NODE_DISPLAY_NAME_MAPPINGS = { "D_LatentSyncNode": "LatentSync Node", "D_VideoLengthAdjuster": "Video Length Adjuster", }