ibrahim313 committed on
Commit 24bde82 · verified
1 Parent(s): 885d086

Upload 12 files

Files changed (12)
  1. GUI.py +361 -0
  2. README.md +1 -12
  3. audio.py +179 -0
  4. config.ini +51 -0
  5. degradations.py +764 -0
  6. easy_functions.py +196 -0
  7. enhance.py +22 -0
  8. hparams.py +99 -0
  9. inference.py +781 -0
  10. install.py +96 -0
  11. requirements.txt +18 -0
  12. run.py +496 -0
GUI.py ADDED
@@ -0,0 +1,361 @@
+ import tkinter as tk
+ from tkinter import filedialog, ttk
+ import configparser
+ import os
+
+ try:
+     with open('installed.txt', 'r') as file:
+         version = file.read()
+ except FileNotFoundError:
+     print("SyncKing-Kong does not appear to have installed correctly.")
+     print("Please try to install it again.")
+     print("https://github.com/anothermartz/Easy-Wav2Lip/issues")
+     input()
+     exit()
+
+ print("opening GUI")
+
+ runfile = 'run.txt'
+ if os.path.exists(runfile):
+     os.remove(runfile)
+
+ import webbrowser
+
+ def open_github_link(event):
+     webbrowser.open("https://github.com/anothermartz/Easy-Wav2Lip?tab=readme-ov-file#advanced-tweaking")
+
+ def read_config():
+     # Read the config.ini file
+     config = configparser.ConfigParser()
+     config.read("config.ini")
+     return config
+
+ def save_config(config):
+     # Save the updated config back to config.ini
+     with open("config.ini", "w") as config_file:
+         config.write(config_file)
+
+ def open_video_file():
+     file_path = filedialog.askopenfilename(title="Select a video file", filetypes=[("All files", "*.*")])
+     if file_path:
+         video_file_var.set(file_path)
+
+ def open_vocal_file():
+     file_path = filedialog.askopenfilename(title="Select a vocal file", filetypes=[("All files", "*.*")])
+     if file_path:
+         vocal_file_var.set(file_path)
+
+ # frame-to-preview entry: allow only whole numbers (or empty input)
+ def validate_frame_preview(P):
+     if P == "":
+         return True  # Allow empty input
+     try:
+         num = float(P)
+         if num.is_integer():
+             return True
+     except ValueError:
+         pass
+     return False
+
+ def start_syncking_kong():
+     print("Saving config")
+     config["OPTIONS"]["video_file"] = str(video_file_var.get())
+     config["OPTIONS"]["vocal_file"] = str(vocal_file_var.get())
+     config["OPTIONS"]["quality"] = str(quality_var.get())
+     config["OPTIONS"]["output_height"] = str(output_height_combobox.get())
+     config["OPTIONS"]["wav2lip_version"] = str(wav2lip_version_var.get())
+     config["OPTIONS"]["use_previous_tracking_data"] = str(use_previous_tracking_data_var.get())
+     config["OPTIONS"]["nosmooth"] = str(nosmooth_var.get())
+     config["OPTIONS"]["preview_window"] = str(preview_window_var.get())
+     config["PADDING"]["u"] = str(padding_vars["u"].get())
+     config["PADDING"]["d"] = str(padding_vars["d"].get())
+     config["PADDING"]["l"] = str(padding_vars["l"].get())
+     config["PADDING"]["r"] = str(padding_vars["r"].get())
+     config["MASK"]["size"] = str(size_var.get())
+     config["MASK"]["feathering"] = str(feathering_var.get())
+     config["MASK"]["mouth_tracking"] = str(mouth_tracking_var.get())
+     config["MASK"]["debug_mask"] = str(debug_mask_var.get())
+     config["OTHER"]["batch_process"] = str(batch_process_var.get())
+     config["OTHER"]["output_suffix"] = str(output_suffix_var.get())
+     config["OTHER"]["include_settings_in_suffix"] = str(include_settings_in_suffix_var.get())
+     config["OTHER"]["preview_settings"] = str(preview_settings_var.get())
+     config["OTHER"]["frame_to_preview"] = str(frame_to_preview_var.get())
+     save_config(config)  # Save the updated config
+     # Signal the runner by creating run.txt, then close the GUI
+     with open("run.txt", "w") as f:
+         f.write("run")
+     exit()
+
+ root = tk.Tk()
+ root.title("SyncKing-Kong GUI")
+ root.geometry("800x720")
+ root.configure(bg="lightblue")
+
+ # Read the existing config.ini
+ config = read_config()
+
+ row = 0
+ tk.Label(root, text=version, bg="lightblue").grid(row=row, column=0, sticky="w")
+ # Create a label for the video file
+ row += 1
+ video_label = tk.Label(root, text="Video File Path:", bg="lightblue")
+ video_label.grid(row=row, column=0, sticky="e")
+
+ # Entry widget for the video file path
+ video_file_var = tk.StringVar()
+ video_entry = tk.Entry(root, textvariable=video_file_var, width=80)
+ video_entry.grid(row=row, column=1, sticky="w")
+
+ # Create a button to open the file dialog
+ select_button = tk.Button(root, text="...", command=open_video_file)
+ select_button.grid(row=row, column=1, sticky="w", padx=490)
+
+ # Set the default value based on the existing config
+ video_file_var.set(config["OPTIONS"].get("video_file", ""))
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ # String input for vocal_file
+ row += 1
+
+ # Create a label for the input box
+ vocal_file_label = tk.Label(root, text="Vocal File Path:", bg="lightblue")
+ vocal_file_label.grid(row=row, column=0, sticky="e")
+
+ # Create an input box for the vocal file path
+ vocal_file_var = tk.StringVar()
+ vocal_file_entry = tk.Entry(root, textvariable=vocal_file_var, width=80)
+ vocal_file_entry.grid(row=row, column=1, sticky="w")
+
+ # Create a button to open the file dialog
+ select_button = tk.Button(root, text="...", command=open_vocal_file)
+ select_button.grid(row=row, column=1, sticky="w", padx=490)
+
+ # Set the initial value from the config (if available)
+ vocal_file_var.set(config["OPTIONS"].get("vocal_file", ""))
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ # Dropdown box for quality options
+ row += 1
+ quality_label = tk.Label(root, text="Select Quality:", bg="lightblue")
+ quality_label.grid(row=row, column=0, sticky="e")
+ quality_options = ["Fast", "Improved", "Enhanced"]
+ quality_var = tk.StringVar()
+ quality_var.set(config["OPTIONS"].get("quality", "Improved"))
+ quality_dropdown = tk.OptionMenu(root, quality_var, *quality_options)
+ quality_dropdown.grid(row=row, column=1, sticky="w")
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ # Output height
+ row += 1
+ output_height_label = tk.Label(root, text="Output height:", bg="lightblue")
+ output_height_label.grid(row=row, column=0, sticky="e")
+ output_height_options = ["half resolution", "full resolution"]
+ output_height_combobox = ttk.Combobox(root, values=output_height_options)
+ output_height_combobox.set(config["OPTIONS"].get("output_height", "full resolution"))  # Set default value
+ output_height_combobox.grid(row=row, column=1, sticky="w")
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ # Dropdown box for Wav2Lip version options
+ row += 1
+ wav2lip_version_label = tk.Label(root, text="Select Wav2Lip version:", bg="lightblue")
+ wav2lip_version_label.grid(row=row, column=0, sticky="e")
+ wav2lip_version_options = ["Wav2Lip", "Wav2Lip_GAN"]
+ wav2lip_version_var = tk.StringVar()
+ wav2lip_version_var.set(config["OPTIONS"].get("wav2lip_version", "Wav2Lip"))
+ wav2lip_version_dropdown = tk.OptionMenu(root, wav2lip_version_var, *wav2lip_version_options)
+ wav2lip_version_dropdown.grid(row=row, column=1, sticky="w")
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+ # output_suffix
+ row += 1
+ output_suffix_label = tk.Label(root, text="Output File Suffix:", bg="lightblue")
+ output_suffix_label.grid(row=row, column=0, sticky="e")
+ output_suffix_var = tk.StringVar()
+ output_suffix_var.set(config["OTHER"].get("output_suffix", "_SyncKing-Kong"))
+ output_suffix_entry = tk.Entry(root, textvariable=output_suffix_var, width=20)
+ output_suffix_entry.grid(row=row, column=1, sticky="w")
+
+ # configparser stores booleans as strings, so read them with getboolean() rather than get()
+ include_settings_in_suffix_var = tk.BooleanVar()
+ include_settings_in_suffix_var.set(config["OTHER"].getboolean("include_settings_in_suffix", fallback=True))
+ include_settings_in_suffix_checkbox = tk.Checkbutton(root, text="Add Settings to Suffix", variable=include_settings_in_suffix_var, bg="lightblue")
+ include_settings_in_suffix_checkbox.grid(row=row, column=1, sticky="w", padx=130)
+
+ # batch_process
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+ row += 1
+ batch_process_label = tk.Label(root, text="Batch Process:", bg="lightblue")
+ batch_process_label.grid(row=row, column=0, sticky="e")
+ batch_process_var = tk.BooleanVar()
+ batch_process_var.set(config["OTHER"].getboolean("batch_process", fallback=True))  # Set default value
+ batch_process_checkbox = tk.Checkbutton(root, text="", variable=batch_process_var, bg="lightblue")
+ batch_process_checkbox.grid(row=row, column=1, sticky="w")
+
+ # Dropdown box for preview window options
+ row += 1
+ preview_window_label = tk.Label(root, text="Preview Window:", bg="lightblue")
+ preview_window_label.grid(row=row, column=0, sticky="e")
+ preview_window_options = ["Face", "Full", "Both", "None"]
+ preview_window_var = tk.StringVar()
+ preview_window_var.set(config["OPTIONS"].get("preview_window", "Face"))
+ preview_window_dropdown = tk.OptionMenu(root, preview_window_var, *preview_window_options)
+ preview_window_dropdown.grid(row=row, column=1, sticky="w")
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ # Button to start SyncKing-Kong
+ row += 1
+ start_button = tk.Button(root, text="Start SyncKing-Kong", command=start_syncking_kong, bg="#5af269", font=("Arial", 16))
+ start_button.grid(row=row, column=0, sticky="w", padx=290, columnspan=2)
+
+ row += 1
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+
+ row += 1
+ tk.Label(root, text="Advanced Tweaking:", bg="lightblue", font=("Arial", 16)).grid(row=row, column=0, sticky="w")
+ row += 1
+ # Create a label with a custom cursor
+ link = tk.Label(root, text="(Click here to see readme)", bg="lightblue", fg="blue", font=("Arial", 10), cursor="hand2")
+ link.grid(row=row, column=0)
+
+ # Bind the click event to the label
+ link.bind("<Button-1>", open_github_link)
+
+ # Process one frame only
+ preview_settings_var = tk.BooleanVar()
+ preview_settings_var.set(config["OTHER"].getboolean("preview_settings", fallback=True))  # Set default value
+ preview_settings_checkbox = tk.Checkbutton(root, text="Process one frame only - Frame to process:", variable=preview_settings_var, bg="lightblue")
+ preview_settings_checkbox.grid(row=row, column=1, sticky="w")
+
+ frame_to_preview_var = tk.StringVar()
+ frame_to_preview_entry = tk.Entry(root, textvariable=frame_to_preview_var, validate="key", width=3, validatecommand=(root.register(validate_frame_preview), "%P"))
+ frame_to_preview_entry.grid(row=row, column=1, sticky="w", padx=255)
+ frame_to_preview_var.set(config["OTHER"].get("frame_to_preview", "100"))
+
+ # Checkbox for nosmooth option
+ row += 1
+ nosmooth_var = tk.BooleanVar()
+ nosmooth_var.set(config["OPTIONS"].getboolean("nosmooth", fallback=True))  # Set default value
+ nosmooth_checkbox = tk.Checkbutton(root, text="nosmooth - unticking will smooth face detection between 5 frames", variable=nosmooth_var, bg="lightblue")
+ nosmooth_checkbox.grid(row=row, column=1, sticky="w")
+
+ # Checkbox for use_previous_tracking_data option
+ row += 1
+ use_previous_tracking_data_var = tk.BooleanVar()
+ use_previous_tracking_data_var.set(config["OPTIONS"].getboolean("use_previous_tracking_data", fallback=True))  # Set default value
+ use_previous_tracking_data_checkbox = tk.Checkbutton(root, text="Keep previous face tracking data if using same video", variable=use_previous_tracking_data_var, bg="lightblue")
+ use_previous_tracking_data_checkbox.grid(row=row, column=1, sticky="w")
+
+ # padding
+ row += 1
+ tk.Label(root, text="Padding:", bg="lightblue", font=("Arial", 12)).grid(row=row, column=1, sticky="sw", pady=10)
+ row += 1
+ tk.Label(root, text="(Up, Down, Left, Right)", bg="lightblue").grid(row=row, column=1, rowspan=4, sticky="w", padx=100)
+ padding_vars = {}
+
+ # Create a list of padding labels and their corresponding keys
+ padding_labels = [("U:", "u"), ("D:", "d"), ("L:", "l"), ("R:", "r")]
+
+ # Validation function to allow only integers (including negatives)
+ def validate_integer(P):
+     if P == "" or P == "-" or P.lstrip("-").isdigit():
+         return True
+     return False
+
+ # Create the padding labels and entry widgets using a loop
+ for label_text, key in padding_labels:
+     label = tk.Label(root, text=label_text, bg="lightblue")
+     label.grid(row=row, column=1, sticky="w", padx=50)
+
+     # Create a StringVar for the current key
+     padding_var = tk.StringVar()
+
+     # Set validation to allow positive and negative integers
+     entry = tk.Entry(root, textvariable=padding_var, width=3, validate="key", validatecommand=(root.register(validate_integer), "%P"))
+     entry.grid(row=row, column=1, sticky="w", padx=70)
+
+     # Set the default value from the config
+     padding_var.set(config["PADDING"].get(key, ""))
+
+     # Store the StringVar in the dictionary
+     padding_vars[key] = padding_var
+
+     # Increment the row
+     row += 1
+
+ tk.Label(root, text="", bg="lightblue").grid(row=row, column=0, sticky="w")
+ row += 1
+
+ # mask size: allow 0-6 in steps of 0.1 (or empty input)
+ def validate_custom_number(P):
+     if P == "":
+         return True  # Allow empty input
+     try:
+         num = float(P)
+         if 0 <= num <= 6 and (num.is_integer() or (num * 10) % 1 == 0):
+             return True
+     except ValueError:
+         pass
+     return False
+
+ row += 1
+ tk.Label(root, text="Mask settings:", bg="lightblue", font=("Arial", 12)).grid(row=row, column=1, sticky="sw")
+ row += 1
+ size_label = tk.Label(root, text="Mask size:", bg="lightblue", padx=50)
+ size_label.grid(row=row, column=1, sticky="w")
+ size_var = tk.StringVar()
+ size_entry = tk.Entry(root, textvariable=size_var, validate="key", width=3, validatecommand=(root.register(validate_custom_number), "%P"))
+ size_entry.grid(row=row, column=1, sticky="w", padx=120)
+ size_var.set(config["MASK"].get("size", "2.5"))
+
+ # feathering: allow whole numbers 0-3 (or empty input)
+ def validate_feather(P):
+     if P == "":
+         return True  # Allow empty input
+     try:
+         num = float(P)
+         if 0 <= num <= 3 and num.is_integer():
+             return True
+     except ValueError:
+         pass
+     return False
+
+ row += 1
+ feathering_label = tk.Label(root, text="Feathering:", bg="lightblue", padx=50)
+ feathering_label.grid(row=row, column=1, sticky="w")
+ feathering_var = tk.StringVar()
+ feathering_entry = tk.Entry(root, textvariable=feathering_var, validate="key", width=3, validatecommand=(root.register(validate_feather), "%P"))
+ feathering_entry.grid(row=row, column=1, sticky="w", padx=120)
+ feathering_var.set(config["MASK"].get("feathering", "2.5"))
+
+ # mouth_tracking
+ row += 1
+ mouth_tracking_var = tk.BooleanVar()
+ mouth_tracking_var.set(config["MASK"].getboolean("mouth_tracking", fallback=True))  # Set default value
+ mouth_tracking_checkbox = tk.Checkbutton(root, text="track mouth for mask on every frame", variable=mouth_tracking_var, bg="lightblue", padx=50)
+ mouth_tracking_checkbox.grid(row=row, column=1, sticky="w")
+
+ # debug_mask
+ row += 1
+ debug_mask_var = tk.BooleanVar()
+ debug_mask_var.set(config["MASK"].getboolean("debug_mask", fallback=True))  # Set default value
+ debug_mask_checkbox = tk.Checkbutton(root, text="highlight mask for debugging", variable=debug_mask_var, bg="lightblue", padx=50)
+ debug_mask_checkbox.grid(row=row, column=1, sticky="w")
+
+ # Increase spacing between all rows (uniformly); use a separate loop
+ # variable so the grid row counter is not clobbered
+ for r in range(row):
+     root.rowconfigure(r, weight=1)
+
+ root.mainloop()
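
Note: GUI.py talks to the launcher purely through the filesystem: it deletes any stale run.txt on startup, and the Start button saves config.ini, writes run.txt, and exits. run.py is part of this commit but not shown above, so the other side of that handshake can only be sketched; the function and file names below are assumptions for illustration, not code from this repository.

    import os
    import subprocess

    def launch_gui_and_wait(runfile="run.txt"):
        # Hypothetical launcher step: open the GUI and block until it exits,
        # then check whether it left run.txt behind as the "start" signal.
        subprocess.run(["python", "GUI.py"])
        if os.path.exists(runfile):
            os.remove(runfile)  # consume the signal
            return True         # caller should start processing config.ini
        return False            # user closed the GUI without pressing Start
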
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: Lipsing
- emoji: 🐠
- colorFrom: yellow
- colorTo: pink
- sdk: gradio
- sdk_version: 5.5.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This is the README file.
audio.py ADDED
@@ -0,0 +1,179 @@
+ import librosa
+ import librosa.filters
+ import numpy as np
+
+ # import tensorflow as tf
+ from scipy import signal
+ from scipy.io import wavfile
+ from hparams import hparams as hp
+
+
+ def load_wav(path, sr):
+     return librosa.core.load(path, sr=sr)[0]
+
+
+ def save_wav(wav, path, sr):
+     wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+     # proposed by @dsmiller
+     wavfile.write(path, sr, wav.astype(np.int16))
+
+
+ def save_wavenet_wav(wav, path, sr):
+     # NOTE: librosa.output was removed in librosa 0.8; this call requires librosa < 0.8
+     librosa.output.write_wav(path, wav, sr=sr)
+
+
+ def preemphasis(wav, k, preemphasize=True):
+     if preemphasize:
+         return signal.lfilter([1, -k], [1], wav)
+     return wav
+
+
+ def inv_preemphasis(wav, k, inv_preemphasize=True):
+     if inv_preemphasize:
+         return signal.lfilter([1], [1, -k], wav)
+     return wav
+
+
+ def get_hop_size():
+     hop_size = hp.hop_size
+     if hop_size is None:
+         assert hp.frame_shift_ms is not None
+         hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
+     return hop_size
+
+
+ def linearspectrogram(wav):
+     D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
+     S = _amp_to_db(np.abs(D)) - hp.ref_level_db
+
+     if hp.signal_normalization:
+         return _normalize(S)
+     return S
+
+
+ def melspectrogram(wav):
+     D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
+     S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db
+
+     if hp.signal_normalization:
+         return _normalize(S)
+     return S
+
+
+ def _lws_processor():
+     import lws
+
+     return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")
+
+
+ def _stft(y):
+     if hp.use_lws:
+         # _lws_processor takes no arguments
+         return _lws_processor().stft(y).T
+     else:
+         return librosa.stft(
+             y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size
+         )
+
+
+ ##########################################################
+ # These are only correct when using lws!!! (This was messing with WaveNet quality for a long time!)
+ def num_frames(length, fsize, fshift):
+     """Compute the number of time frames of a spectrogram"""
+     pad = fsize - fshift
+     if length % fshift == 0:
+         M = (length + pad * 2 - fsize) // fshift + 1
+     else:
+         M = (length + pad * 2 - fsize) // fshift + 2
+     return M
+
+
+ def pad_lr(x, fsize, fshift):
+     """Compute left and right padding"""
+     M = num_frames(len(x), fsize, fshift)
+     pad = fsize - fshift
+     T = len(x) + 2 * pad
+     r = (M - 1) * fshift + fsize - T
+     return pad, pad + r
+
+
+ ##########################################################
+ # Librosa-correct padding
+ def librosa_pad_lr(x, fsize, fshift):
+     return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
+
+
+ # Conversions
+ _mel_basis = None
+
+
+ def _linear_to_mel(spectrogram):
+     global _mel_basis
+     if _mel_basis is None:
+         _mel_basis = _build_mel_basis()
+     return np.dot(_mel_basis, spectrogram)
+
+
+ def _build_mel_basis():
+     assert hp.fmax <= hp.sample_rate // 2
+     return librosa.filters.mel(
+         sr=hp.sample_rate,
+         n_fft=hp.n_fft,
+         n_mels=hp.num_mels,
+         fmin=hp.fmin,
+         fmax=hp.fmax,
+     )
+
+
+ def _amp_to_db(x):
+     min_level = np.exp(hp.min_level_db / 20 * np.log(10))
+     return 20 * np.log10(np.maximum(min_level, x))
+
+
+ def _db_to_amp(x):
+     return np.power(10.0, (x) * 0.05)
+
+
+ def _normalize(S):
+     if hp.allow_clipping_in_normalization:
+         if hp.symmetric_mels:
+             return np.clip(
+                 (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db))
+                 - hp.max_abs_value,
+                 -hp.max_abs_value,
+                 hp.max_abs_value,
+             )
+         else:
+             return np.clip(
+                 hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)),
+                 0,
+                 hp.max_abs_value,
+             )
+
+     assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
+     if hp.symmetric_mels:
+         return (2 * hp.max_abs_value) * (
+             (S - hp.min_level_db) / (-hp.min_level_db)
+         ) - hp.max_abs_value
+     else:
+         return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))
+
+
+ def _denormalize(D):
+     if hp.allow_clipping_in_normalization:
+         if hp.symmetric_mels:
+             return (
+                 (np.clip(D, -hp.max_abs_value, hp.max_abs_value) + hp.max_abs_value)
+                 * -hp.min_level_db
+                 / (2 * hp.max_abs_value)
+             ) + hp.min_level_db
+         else:
+             return (
+                 np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value
+             ) + hp.min_level_db
+
+     if hp.symmetric_mels:
+         return (
+             (D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)
+         ) + hp.min_level_db
+     else:
+         return (D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db
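
Note: a minimal usage sketch for this module, assuming hparams.py defines the usual fields (sample_rate, n_fft, win_size, hop_size or frame_shift_ms, num_mels, fmin, fmax, and the normalization flags) and that "speech.wav" is a placeholder input file:

    import numpy as np
    import audio
    from hparams import hparams as hp

    wav = audio.load_wav("speech.wav", hp.sample_rate)  # 1-D float32 waveform
    mel = audio.melspectrogram(wav)                     # shape: (num_mels, num_frames)
    print(mel.shape, mel.dtype)

    # Optional sanity check: _denormalize should invert _normalize
    if hp.signal_normalization:
        recovered = audio._denormalize(mel)
        assert np.isfinite(recovered).all()
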
config.ini ADDED
@@ -0,0 +1,51 @@
+
+ [OPTIONS]
+
+ video_file =
+ vocal_file =
+
+ quality = Improved
+ # Options:
+ ; Fast: Wav2Lip only
+ ; Improved: Wav2Lip with a feathered mask around the mouth to remove the square around the face
+ ; Enhanced: Wav2Lip + mask + GFPGAN upscaling done on the face
+ ; Experimental: test version of applying GFPGAN - see release notes
+
+ output_height = full resolution
+
+ # Options:
+ ; full resolution
+ ; half resolution
+ ; video height in pixels, e.g. 480
+
+ wav2lip_version = Wav2Lip
+ # Wav2Lip or Wav2Lip_GAN
+
+ # Please consult the readme for this and the rest of the options:
+ ; https://github.com/anothermartz/Easy-Wav2Lip#advanced-tweaking
+
+ use_previous_tracking_data = True
+
+ nosmooth = True
+
+ preview_window = Full
+
+ [PADDING]
+ u = 0
+ d = 0
+ l = 0
+ r = 0
+
+ [MASK]
+ size = 2.5
+ feathering = 2
+ mouth_tracking = False
+ debug_mask = False
+
+ [OTHER]
+ batch_process = False
+ output_suffix = _SyncKing-Kong
+ include_settings_in_suffix = False
+ preview_settings = False
+ frame_to_preview = 100
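
Note: configparser exposes every value in this file as a string, so consumers should use the typed getters rather than raw indexing. A small sketch, using section and key names taken from the file above:

    import configparser

    config = configparser.ConfigParser()
    config.read("config.ini")

    quality = config["OPTIONS"].get("quality", "Improved")              # plain string
    nosmooth = config["OPTIONS"].getboolean("nosmooth", fallback=True)  # real bool
    mask_size = config["MASK"].getfloat("size", fallback=2.5)           # real float
    pad_u = config["PADDING"].getint("u", fallback=0)                   # real int

    print(quality, nosmooth, mask_size, pad_u)
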
degradations.py ADDED
@@ -0,0 +1,764 @@
+ import cv2
+ import math
+ import numpy as np
+ import random
+ import torch
+ from scipy import special
+ from scipy.stats import multivariate_normal
+ from torchvision.transforms.functional import rgb_to_grayscale
+
+ # -------------------------------------------------------------------- #
+ # --------------------------- blur kernels --------------------------- #
+ # -------------------------------------------------------------------- #
+
+
+ # --------------------------- util functions --------------------------- #
+ def sigma_matrix2(sig_x, sig_y, theta):
+     """Calculate the rotated sigma matrix (two dimensional matrix).
+
+     Args:
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+
+     Returns:
+         ndarray: Rotated sigma matrix.
+     """
+     d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]])
+     u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
+     return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
+
+
+ def mesh_grid(kernel_size):
+     """Generate the mesh grid, centering at zero.
+
+     Args:
+         kernel_size (int):
+
+     Returns:
+         xy (ndarray): with the shape (kernel_size, kernel_size, 2)
+         xx (ndarray): with the shape (kernel_size, kernel_size)
+         yy (ndarray): with the shape (kernel_size, kernel_size)
+     """
+     ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
+     xx, yy = np.meshgrid(ax, ax)
+     xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)),
+                     yy.reshape(kernel_size * kernel_size, 1))).reshape(kernel_size, kernel_size, 2)
+     return xy, xx, yy
+
+
+ def pdf2(sigma_matrix, grid):
+     """Calculate the PDF of the bivariate Gaussian distribution.
+
+     Args:
+         sigma_matrix (ndarray): with the shape (2, 2)
+         grid (ndarray): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size.
+
+     Returns:
+         kernel (ndarray): un-normalized kernel.
+     """
+     inverse_sigma = np.linalg.inv(sigma_matrix)
+     kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
+     return kernel
+
+
+ def cdf2(d_matrix, grid):
+     """Calculate the CDF of the standard bivariate Gaussian distribution.
+     Used in the skewed Gaussian distribution.
+
+     Args:
+         d_matrix (ndarray): skew matrix.
+         grid (ndarray): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size.
+
+     Returns:
+         cdf (ndarray): skewed cdf.
+     """
+     rv = multivariate_normal([0, 0], [[1, 0], [0, 1]])
+     grid = np.dot(grid, d_matrix)
+     cdf = rv.cdf(grid)
+     return cdf
+
+
+ def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
+     """Generate a bivariate isotropic or anisotropic Gaussian kernel.
+
+     In the isotropic mode, only `sig_x` is used; `sig_y` and `theta` are ignored.
+
+     Args:
+         kernel_size (int):
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+         grid (ndarray, optional): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size. Default: None
+         isotropic (bool):
+
+     Returns:
+         kernel (ndarray): normalized kernel.
+     """
+     if grid is None:
+         grid, _, _ = mesh_grid(kernel_size)
+     if isotropic:
+         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+     else:
+         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+     kernel = pdf2(sigma_matrix, grid)
+     kernel = kernel / np.sum(kernel)
+     return kernel
+
+
+ def bivariate_generalized_Gaussian(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
+     """Generate a bivariate generalized Gaussian kernel.
+
+     ``Paper: Parameter Estimation For Multivariate Generalized Gaussian Distributions``
+
+     In the isotropic mode, only `sig_x` is used; `sig_y` and `theta` are ignored.
+
+     Args:
+         kernel_size (int):
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+         beta (float): shape parameter, beta = 1 is the normal distribution.
+         grid (ndarray, optional): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size. Default: None
+
+     Returns:
+         kernel (ndarray): normalized kernel.
+     """
+     if grid is None:
+         grid, _, _ = mesh_grid(kernel_size)
+     if isotropic:
+         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+     else:
+         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+     inverse_sigma = np.linalg.inv(sigma_matrix)
+     kernel = np.exp(-0.5 * np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta))
+     kernel = kernel / np.sum(kernel)
+     return kernel
+
+
+ def bivariate_plateau(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
+     """Generate a plateau-like anisotropic kernel.
+
+     1 / (1 + x^(beta))
+
+     Reference: https://stats.stackexchange.com/questions/203629/is-there-a-plateau-shaped-distribution
+
+     In the isotropic mode, only `sig_x` is used; `sig_y` and `theta` are ignored.
+
+     Args:
+         kernel_size (int):
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+         beta (float): shape parameter, beta = 1 is the normal distribution.
+         grid (ndarray, optional): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size. Default: None
+
+     Returns:
+         kernel (ndarray): normalized kernel.
+     """
+     if grid is None:
+         grid, _, _ = mesh_grid(kernel_size)
+     if isotropic:
+         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+     else:
+         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+     inverse_sigma = np.linalg.inv(sigma_matrix)
+     kernel = np.reciprocal(np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta) + 1)
+     kernel = kernel / np.sum(kernel)
+     return kernel
+
+
+ def random_bivariate_Gaussian(kernel_size,
+                               sigma_x_range,
+                               sigma_y_range,
+                               rotation_range,
+                               noise_range=None,
+                               isotropic=True):
+     """Randomly generate bivariate isotropic or anisotropic Gaussian kernels.
+
+     In the isotropic mode, only `sigma_x_range` is used; `sigma_y_range` and `rotation_range` are ignored.
+
+     Args:
+         kernel_size (int):
+         sigma_x_range (tuple): [0.6, 5]
+         sigma_y_range (tuple): [0.6, 5]
+         rotation_range (tuple): [-math.pi, math.pi]
+         noise_range (tuple, optional): multiplicative kernel noise,
+             [0.75, 1.25]. Default: None
+
+     Returns:
+         kernel (ndarray):
+     """
+     assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+     assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
+     sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
+     if isotropic is False:
+         assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
+         assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
+         sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
+         rotation = np.random.uniform(rotation_range[0], rotation_range[1])
+     else:
+         sigma_y = sigma_x
+         rotation = 0
+
+     kernel = bivariate_Gaussian(kernel_size, sigma_x, sigma_y, rotation, isotropic=isotropic)
+
+     # add multiplicative noise
+     if noise_range is not None:
+         assert noise_range[0] < noise_range[1], 'Wrong noise range.'
+         noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
+         kernel = kernel * noise
+     kernel = kernel / np.sum(kernel)
+     return kernel
+
+
+ def random_bivariate_generalized_Gaussian(kernel_size,
+                                           sigma_x_range,
+                                           sigma_y_range,
+                                           rotation_range,
+                                           beta_range,
+                                           noise_range=None,
+                                           isotropic=True):
+     """Randomly generate bivariate generalized Gaussian kernels.
+
+     In the isotropic mode, only `sigma_x_range` is used; `sigma_y_range` and `rotation_range` are ignored.
+
+     Args:
+         kernel_size (int):
+         sigma_x_range (tuple): [0.6, 5]
+         sigma_y_range (tuple): [0.6, 5]
+         rotation_range (tuple): [-math.pi, math.pi]
+         beta_range (tuple): [0.5, 8]
+         noise_range (tuple, optional): multiplicative kernel noise,
+             [0.75, 1.25]. Default: None
+
+     Returns:
+         kernel (ndarray):
+     """
+     assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+     assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
+     sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
+     if isotropic is False:
+         assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
+         assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
+         sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
+         rotation = np.random.uniform(rotation_range[0], rotation_range[1])
+     else:
+         sigma_y = sigma_x
+         rotation = 0
+
+     # assume beta_range[0] < 1 < beta_range[1]
+     if np.random.uniform() < 0.5:
+         beta = np.random.uniform(beta_range[0], 1)
+     else:
+         beta = np.random.uniform(1, beta_range[1])
+
+     kernel = bivariate_generalized_Gaussian(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
+
+     # add multiplicative noise
+     if noise_range is not None:
+         assert noise_range[0] < noise_range[1], 'Wrong noise range.'
+         noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
+         kernel = kernel * noise
+     kernel = kernel / np.sum(kernel)
+     return kernel
+
+
+ def random_bivariate_plateau(kernel_size,
+                              sigma_x_range,
+                              sigma_y_range,
+                              rotation_range,
+                              beta_range,
+                              noise_range=None,
+                              isotropic=True):
+     """Randomly generate bivariate plateau kernels.
+
+     In the isotropic mode, only `sigma_x_range` is used; `sigma_y_range` and `rotation_range` are ignored.
+
+     Args:
+         kernel_size (int):
+         sigma_x_range (tuple): [0.6, 5]
+         sigma_y_range (tuple): [0.6, 5]
+         rotation_range (tuple): [-math.pi/2, math.pi/2]
+         beta_range (tuple): [1, 4]
+         noise_range (tuple, optional): multiplicative kernel noise,
+             [0.75, 1.25]. Default: None
+
+     Returns:
+         kernel (ndarray):
+     """
+     assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+     assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
+     sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
+     if isotropic is False:
+         assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
+         assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
+         sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
+         rotation = np.random.uniform(rotation_range[0], rotation_range[1])
+     else:
+         sigma_y = sigma_x
+         rotation = 0
+
+     # TODO: this may not be proper
+     if np.random.uniform() < 0.5:
+         beta = np.random.uniform(beta_range[0], 1)
+     else:
+         beta = np.random.uniform(1, beta_range[1])
+
+     kernel = bivariate_plateau(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
+     # add multiplicative noise
+     if noise_range is not None:
+         assert noise_range[0] < noise_range[1], 'Wrong noise range.'
+         noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
+         kernel = kernel * noise
+     kernel = kernel / np.sum(kernel)
+
+     return kernel
+
+
+ def random_mixed_kernels(kernel_list,
+                          kernel_prob,
+                          kernel_size=21,
+                          sigma_x_range=(0.6, 5),
+                          sigma_y_range=(0.6, 5),
+                          rotation_range=(-math.pi, math.pi),
+                          betag_range=(0.5, 8),
+                          betap_range=(0.5, 8),
+                          noise_range=None):
+     """Randomly generate mixed kernels.
+
+     Args:
+         kernel_list (tuple): a list of kernel type names; supported types are
+             ['iso', 'aniso', 'generalized_iso', 'generalized_aniso',
+             'plateau_iso', 'plateau_aniso']
+         kernel_prob (tuple): corresponding kernel probability for each
+             kernel type
+         kernel_size (int):
+         sigma_x_range (tuple): [0.6, 5]
+         sigma_y_range (tuple): [0.6, 5]
+         rotation_range (tuple): [-math.pi, math.pi]
+         betag_range (tuple): shape range for generalized Gaussian kernels, [0.5, 8]
+         betap_range (tuple): shape range for plateau kernels, [0.5, 8]
+         noise_range (tuple, optional): multiplicative kernel noise,
+             [0.75, 1.25]. Default: None
+
+     Returns:
+         kernel (ndarray):
+     """
+     kernel_type = random.choices(kernel_list, kernel_prob)[0]
+     if kernel_type == 'iso':
+         kernel = random_bivariate_Gaussian(
+             kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=True)
+     elif kernel_type == 'aniso':
+         kernel = random_bivariate_Gaussian(
+             kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=False)
+     elif kernel_type == 'generalized_iso':
+         kernel = random_bivariate_generalized_Gaussian(
+             kernel_size,
+             sigma_x_range,
+             sigma_y_range,
+             rotation_range,
+             betag_range,
+             noise_range=noise_range,
+             isotropic=True)
+     elif kernel_type == 'generalized_aniso':
+         kernel = random_bivariate_generalized_Gaussian(
+             kernel_size,
+             sigma_x_range,
+             sigma_y_range,
+             rotation_range,
+             betag_range,
+             noise_range=noise_range,
+             isotropic=False)
+     elif kernel_type == 'plateau_iso':
+         kernel = random_bivariate_plateau(
+             kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=True)
+     elif kernel_type == 'plateau_aniso':
+         kernel = random_bivariate_plateau(
+             kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=False)
+     return kernel
+
+
+ # suppress divide-by-zero warnings from the sinc kernel below (its center value is patched afterwards)
+ np.seterr(divide='ignore', invalid='ignore')
+
+
+ def circular_lowpass_kernel(cutoff, kernel_size, pad_to=0):
+     """2D sinc filter.
+
+     Reference: https://dsp.stackexchange.com/questions/58301/2-d-circularly-symmetric-low-pass-filter
+
+     Args:
+         cutoff (float): cutoff frequency in radians (pi is max)
+         kernel_size (int): horizontal and vertical size, must be odd.
+         pad_to (int): pad kernel size to desired size, must be odd or zero.
+     """
+     assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+     kernel = np.fromfunction(
+         lambda x, y: cutoff * special.j1(cutoff * np.sqrt(
+             (x - (kernel_size - 1) / 2)**2 + (y - (kernel_size - 1) / 2)**2)) / (2 * np.pi * np.sqrt(
+                 (x - (kernel_size - 1) / 2)**2 + (y - (kernel_size - 1) / 2)**2)), [kernel_size, kernel_size])
+     kernel[(kernel_size - 1) // 2, (kernel_size - 1) // 2] = cutoff**2 / (4 * np.pi)
+     kernel = kernel / np.sum(kernel)
+     if pad_to > kernel_size:
+         pad_size = (pad_to - kernel_size) // 2
+         kernel = np.pad(kernel, ((pad_size, pad_size), (pad_size, pad_size)))
+     return kernel
+ return kernel
410
+
411
+
412
+ # ------------------------------------------------------------- #
413
+ # --------------------------- noise --------------------------- #
414
+ # ------------------------------------------------------------- #
415
+
416
+ # ----------------------- Gaussian Noise ----------------------- #
417
+
418
+
419
+ def generate_gaussian_noise(img, sigma=10, gray_noise=False):
420
+ """Generate Gaussian noise.
421
+
422
+ Args:
423
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
424
+ sigma (float): Noise scale (measured in range 255). Default: 10.
425
+
426
+ Returns:
427
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
428
+ float32.
429
+ """
430
+ if gray_noise:
431
+ noise = np.float32(np.random.randn(*(img.shape[0:2]))) * sigma / 255.
432
+ noise = np.expand_dims(noise, axis=2).repeat(3, axis=2)
433
+ else:
434
+ noise = np.float32(np.random.randn(*(img.shape))) * sigma / 255.
435
+ return noise
436
+
437
+
438
+ def add_gaussian_noise(img, sigma=10, clip=True, rounds=False, gray_noise=False):
439
+ """Add Gaussian noise.
440
+
441
+ Args:
442
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
443
+ sigma (float): Noise scale (measured in range 255). Default: 10.
444
+
445
+ Returns:
446
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
447
+ float32.
448
+ """
449
+ noise = generate_gaussian_noise(img, sigma, gray_noise)
450
+ out = img + noise
451
+ if clip and rounds:
452
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
453
+ elif clip:
454
+ out = np.clip(out, 0, 1)
455
+ elif rounds:
456
+ out = (out * 255.0).round() / 255.
457
+ return out
458
+
459
+
460
+ def generate_gaussian_noise_pt(img, sigma=10, gray_noise=0):
461
+ """Add Gaussian noise (PyTorch version).
462
+
463
+ Args:
464
+ img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
465
+ scale (float | Tensor): Noise scale. Default: 1.0.
466
+
467
+ Returns:
468
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
469
+ float32.
470
+ """
471
+ b, _, h, w = img.size()
472
+ if not isinstance(sigma, (float, int)):
473
+ sigma = sigma.view(img.size(0), 1, 1, 1)
474
+ if isinstance(gray_noise, (float, int)):
475
+ cal_gray_noise = gray_noise > 0
476
+ else:
477
+ gray_noise = gray_noise.view(b, 1, 1, 1)
478
+ cal_gray_noise = torch.sum(gray_noise) > 0
479
+
480
+ if cal_gray_noise:
481
+ noise_gray = torch.randn(*img.size()[2:4], dtype=img.dtype, device=img.device) * sigma / 255.
482
+ noise_gray = noise_gray.view(b, 1, h, w)
483
+
484
+ # always calculate color noise
485
+ noise = torch.randn(*img.size(), dtype=img.dtype, device=img.device) * sigma / 255.
486
+
487
+ if cal_gray_noise:
488
+ noise = noise * (1 - gray_noise) + noise_gray * gray_noise
489
+ return noise
490
+
491
+
492
+ def add_gaussian_noise_pt(img, sigma=10, gray_noise=0, clip=True, rounds=False):
493
+ """Add Gaussian noise (PyTorch version).
494
+
495
+ Args:
496
+ img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
497
+ scale (float | Tensor): Noise scale. Default: 1.0.
498
+
499
+ Returns:
500
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
501
+ float32.
502
+ """
503
+ noise = generate_gaussian_noise_pt(img, sigma, gray_noise)
504
+ out = img + noise
505
+ if clip and rounds:
506
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
507
+ elif clip:
508
+ out = torch.clamp(out, 0, 1)
509
+ elif rounds:
510
+ out = (out * 255.0).round() / 255.
511
+ return out
512
+
513
+
514
+ # ----------------------- Random Gaussian Noise ----------------------- #
515
+ def random_generate_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0):
516
+ sigma = np.random.uniform(sigma_range[0], sigma_range[1])
517
+ if np.random.uniform() < gray_prob:
518
+ gray_noise = True
519
+ else:
520
+ gray_noise = False
521
+ return generate_gaussian_noise(img, sigma, gray_noise)
522
+
523
+
524
+ def random_add_gaussian_noise(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
525
+ noise = random_generate_gaussian_noise(img, sigma_range, gray_prob)
526
+ out = img + noise
527
+ if clip and rounds:
528
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
529
+ elif clip:
530
+ out = np.clip(out, 0, 1)
531
+ elif rounds:
532
+ out = (out * 255.0).round() / 255.
533
+ return out
534
+
535
+
536
+ def random_generate_gaussian_noise_pt(img, sigma_range=(0, 10), gray_prob=0):
537
+ sigma = torch.rand(
538
+ img.size(0), dtype=img.dtype, device=img.device) * (sigma_range[1] - sigma_range[0]) + sigma_range[0]
539
+ gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
540
+ gray_noise = (gray_noise < gray_prob).float()
541
+ return generate_gaussian_noise_pt(img, sigma, gray_noise)
542
+
543
+
544
+ def random_add_gaussian_noise_pt(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
545
+ noise = random_generate_gaussian_noise_pt(img, sigma_range, gray_prob)
546
+ out = img + noise
547
+ if clip and rounds:
548
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
549
+ elif clip:
550
+ out = torch.clamp(out, 0, 1)
551
+ elif rounds:
552
+ out = (out * 255.0).round() / 255.
553
+ return out
554
+
555
+
556
+ # ----------------------- Poisson (Shot) Noise ----------------------- #
557
+
558
+
559
+ def generate_poisson_noise(img, scale=1.0, gray_noise=False):
560
+ """Generate poisson noise.
561
+
562
+ Reference: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/noise.py#L37-L219
563
+
564
+ Args:
565
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
566
+ scale (float): Noise scale. Default: 1.0.
567
+ gray_noise (bool): Whether generate gray noise. Default: False.
568
+
569
+ Returns:
570
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
571
+ float32.
572
+ """
573
+ if gray_noise:
574
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
575
+ # round and clip image for counting vals correctly
576
+ img = np.clip((img * 255.0).round(), 0, 255) / 255.
577
+ vals = len(np.unique(img))
578
+ vals = 2**np.ceil(np.log2(vals))
579
+ out = np.float32(np.random.poisson(img * vals) / float(vals))
580
+ noise = out - img
581
+ if gray_noise:
582
+ noise = np.repeat(noise[:, :, np.newaxis], 3, axis=2)
583
+ return noise * scale
584
+
585
+
586
+ def add_poisson_noise(img, scale=1.0, clip=True, rounds=False, gray_noise=False):
587
+ """Add poisson noise.
588
+
589
+ Args:
590
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
591
+ scale (float): Noise scale. Default: 1.0.
592
+ gray_noise (bool): Whether generate gray noise. Default: False.
593
+
594
+ Returns:
595
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
596
+ float32.
597
+ """
598
+ noise = generate_poisson_noise(img, scale, gray_noise)
599
+ out = img + noise
600
+ if clip and rounds:
601
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
602
+ elif clip:
603
+ out = np.clip(out, 0, 1)
604
+ elif rounds:
605
+ out = (out * 255.0).round() / 255.
606
+ return out
607
+
608
+
609
+ def generate_poisson_noise_pt(img, scale=1.0, gray_noise=0):
610
+ """Generate a batch of poisson noise (PyTorch version)
611
+
612
+ Args:
613
+ img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
614
+ scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
615
+ Default: 1.0.
616
+ gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
617
+ 0 for False, 1 for True. Default: 0.
618
+
619
+ Returns:
620
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
621
+ float32.
622
+ """
623
+ b, _, h, w = img.size()
624
+ if isinstance(gray_noise, (float, int)):
625
+ cal_gray_noise = gray_noise > 0
626
+ else:
627
+ gray_noise = gray_noise.view(b, 1, 1, 1)
628
+ cal_gray_noise = torch.sum(gray_noise) > 0
629
+ if cal_gray_noise:
630
+ img_gray = rgb_to_grayscale(img, num_output_channels=1)
631
+ # round and clip image for counting vals correctly
632
+ img_gray = torch.clamp((img_gray * 255.0).round(), 0, 255) / 255.
633
+ # use for-loop to get the unique values for each sample
634
+ vals_list = [len(torch.unique(img_gray[i, :, :, :])) for i in range(b)]
635
+ vals_list = [2**np.ceil(np.log2(vals)) for vals in vals_list]
636
+ vals = img_gray.new_tensor(vals_list).view(b, 1, 1, 1)
637
+ out = torch.poisson(img_gray * vals) / vals
638
+ noise_gray = out - img_gray
639
+ noise_gray = noise_gray.expand(b, 3, h, w)
640
+
641
+ # always calculate color noise
642
+ # round and clip image for counting vals correctly
643
+ img = torch.clamp((img * 255.0).round(), 0, 255) / 255.
644
+ # use for-loop to get the unique values for each sample
645
+ vals_list = [len(torch.unique(img[i, :, :, :])) for i in range(b)]
646
+ vals_list = [2**np.ceil(np.log2(vals)) for vals in vals_list]
647
+ vals = img.new_tensor(vals_list).view(b, 1, 1, 1)
648
+ out = torch.poisson(img * vals) / vals
649
+ noise = out - img
650
+ if cal_gray_noise:
651
+ noise = noise * (1 - gray_noise) + noise_gray * gray_noise
652
+ if not isinstance(scale, (float, int)):
653
+ scale = scale.view(b, 1, 1, 1)
654
+ return noise * scale
655
+
656
+
657
+ def add_poisson_noise_pt(img, scale=1.0, clip=True, rounds=False, gray_noise=0):
658
+ """Add poisson noise to a batch of images (PyTorch version).
659
+
660
+ Args:
661
+ img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
662
+ scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
663
+ Default: 1.0.
664
+ gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
665
+ 0 for False, 1 for True. Default: 0.
666
+
667
+ Returns:
668
+ (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
669
+ float32.
670
+ """
671
+ noise = generate_poisson_noise_pt(img, scale, gray_noise)
672
+ out = img + noise
673
+ if clip and rounds:
674
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
675
+ elif clip:
676
+ out = torch.clamp(out, 0, 1)
677
+ elif rounds:
678
+ out = (out * 255.0).round() / 255.
679
+ return out
680
+
681
+
682
+ # ----------------------- Random Poisson (Shot) Noise ----------------------- #
683
+
684
+
685
+ def random_generate_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0):
686
+ scale = np.random.uniform(scale_range[0], scale_range[1])
687
+ if np.random.uniform() < gray_prob:
688
+ gray_noise = True
689
+ else:
690
+ gray_noise = False
691
+ return generate_poisson_noise(img, scale, gray_noise)
692
+
693
+
694
+ def random_add_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
695
+ noise = random_generate_poisson_noise(img, scale_range, gray_prob)
696
+ out = img + noise
697
+ if clip and rounds:
698
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
699
+ elif clip:
700
+ out = np.clip(out, 0, 1)
701
+ elif rounds:
702
+ out = (out * 255.0).round() / 255.
703
+ return out
704
+
705
+
706
+ def random_generate_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0):
707
+ scale = torch.rand(
708
+ img.size(0), dtype=img.dtype, device=img.device) * (scale_range[1] - scale_range[0]) + scale_range[0]
709
+ gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
710
+ gray_noise = (gray_noise < gray_prob).float()
711
+ return generate_poisson_noise_pt(img, scale, gray_noise)
712
+
713
+
714
+ def random_add_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
715
+ noise = random_generate_poisson_noise_pt(img, scale_range, gray_prob)
716
+ out = img + noise
717
+ if clip and rounds:
718
+ out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
719
+ elif clip:
720
+ out = torch.clamp(out, 0, 1)
721
+ elif rounds:
722
+ out = (out * 255.0).round() / 255.
723
+ return out
724
+
725
+
726
+ # ------------------------------------------------------------------------ #
727
+ # --------------------------- JPEG compression --------------------------- #
728
+ # ------------------------------------------------------------------------ #
729
+
730
+
731
+ def add_jpg_compression(img, quality=90):
732
+ """Add JPG compression artifacts.
733
+
734
+ Args:
735
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
736
+ quality (float): JPG compression quality. 0 for lowest quality, 100 for
737
+ best quality. Default: 90.
738
+
739
+ Returns:
740
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
741
+ float32.
742
+ """
743
+ img = np.clip(img, 0, 1)
744
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
745
+ _, encimg = cv2.imencode('.jpg', img * 255., encode_param)
746
+ img = np.float32(cv2.imdecode(encimg, 1)) / 255.
747
+ return img
748
+
749
+
750
+ def random_add_jpg_compression(img, quality_range=(90, 100)):
751
+ """Randomly add JPG compression artifacts.
752
+
753
+ Args:
754
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
755
+ quality_range (tuple[float] | list[float]): JPG compression quality
756
+ range. 0 for lowest quality, 100 for best quality.
757
+ Default: (90, 100).
758
+
759
+ Returns:
760
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
761
+ float32.
762
+ """
763
+ quality = np.random.uniform(quality_range[0], quality_range[1])
764
+ return add_jpg_compression(img, quality)
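
Note: chained together, the helpers above form the classical noise-then-compress degradation pipeline used when synthesizing training data for restoration models such as GFPGAN. A sketch on a float32 image in [0, 1]; the branch probability and parameter ranges are illustrative only:

    import numpy as np
    from degradations import (random_add_gaussian_noise,
                              random_add_poisson_noise,
                              random_add_jpg_compression)

    def degrade(img, gray_prob=0.4):
        # img: float32 array, shape (h, w, c), range [0, 1]
        if np.random.uniform() < 0.5:
            img = random_add_gaussian_noise(img, sigma_range=(1, 15), gray_prob=gray_prob)
        else:
            img = random_add_poisson_noise(img, scale_range=(0.05, 2), gray_prob=gray_prob)
        img = random_add_jpg_compression(img, quality_range=(60, 95))
        return np.clip(img, 0, 1)
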
easy_functions.py ADDED
@@ -0,0 +1,196 @@
import torch
import subprocess
import json
import os
import dlib
import gdown
import pickle
import re
from fractions import Fraction
from models import Wav2Lip
from base64 import b64encode
from urllib.parse import urlparse
from torch.hub import download_url_to_file, get_dir
from IPython.display import HTML, display

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'


def get_video_details(filename):
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_format",
        "-show_streams",
        "-of",
        "json",
        filename,
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    info = json.loads(result.stdout)

    # Get video stream
    video_stream = next(
        stream for stream in info["streams"] if stream["codec_type"] == "video"
    )

    # Get resolution
    width = int(video_stream["width"])
    height = int(video_stream["height"])
    resolution = width * height

    # Get fps - avg_frame_rate is a fraction string such as "30000/1001",
    # so parse it explicitly rather than calling eval() on ffprobe output
    fps = float(Fraction(video_stream["avg_frame_rate"]))

    # Get length
    length = float(info["format"]["duration"])

    return width, height, fps, length


def show_video(file_path):
    """Function to display video in Colab"""
    mp4 = open(file_path, "rb").read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    width, _, _, _ = get_video_details(file_path)
    display(
        HTML(
            """
            <video controls width=%d>
                <source src="%s" type="video/mp4">
            </video>
            """
            % (min(width, 1280), data_url)
        )
    )


def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)

    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    elif minutes > 0:
        return f"{minutes}m {seconds}s"
    else:
        return f"{seconds}s"


def _load(checkpoint_path):
    if device != "cpu":
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage
        )
    return checkpoint


def load_model(path):
    # If a cached, pre-loaded model exists alongside the checkpoint, load it and return
    working_directory = os.getcwd()
    folder, filename_with_extension = os.path.split(path)
    filename, file_type = os.path.splitext(filename_with_extension)
    results_file = os.path.join(folder, filename + ".pk1")  # ".pk1" (digit one) is the cache extension used throughout
    if os.path.exists(results_file):
        with open(results_file, "rb") as f:
            return pickle.load(f)
    model = Wav2Lip()
    print("Loading {}".format(path))
    checkpoint = _load(path)
    s = checkpoint["state_dict"]
    new_s = {}
    for k, v in s.items():
        new_s[k.replace("module.", "")] = v
    model.load_state_dict(new_s)

    model = model.to(device)
    # Cache the loaded model so subsequent runs can skip checkpoint parsing
    with open(results_file, "wb") as f:
        pickle.dump(model.eval(), f)
    # os.remove(path)
    return model.eval()


def get_input_length(filename):
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            filename,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return float(result.stdout)


def is_url(string):
    url_regex = re.compile(r"^(https?|ftp)://[^\s/$.?#].[^\s]*$")
    return bool(url_regex.match(string))


def load_predictor():
    checkpoint = os.path.join(
        "checkpoints", "shape_predictor_68_face_landmarks_GTX.dat"
    )
    predictor = dlib.shape_predictor(checkpoint)
    mouth_detector = dlib.get_frontal_face_detector()

    # Serialize the variables
    with open(os.path.join("checkpoints", "predictor.pkl"), "wb") as f:
        pickle.dump(predictor, f)

    with open(os.path.join("checkpoints", "mouth_detector.pkl"), "wb") as f:
        pickle.dump(mouth_detector, f)

    # delete the .dat file as it is no longer needed
    # os.remove(output)


def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
    """Load file from http url, downloading the model if necessary.

    Ref: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py

    Args:
        url (str): URL to be downloaded.
        model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir.
            Default: None.
        progress (bool): Whether to show the download progress. Default: True.
        file_name (str): The downloaded file name. If None, use the file name in the url. Default: None.

    Returns:
        str: The path to the downloaded file.
    """
    if model_dir is None:  # use the pytorch hub_dir
        hub_dir = get_dir()
        model_dir = os.path.join(hub_dir, "checkpoints")

    os.makedirs(model_dir, exist_ok=True)

    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if file_name is not None:
        filename = file_name
    cached_file = os.path.abspath(os.path.join(model_dir, filename))
    if not os.path.exists(cached_file):
        print(f'Downloading: "{url}" to {cached_file}\n')
        download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
    return cached_file


def g_colab():
    try:
        import google.colab

        return True
    except ImportError:
        return False
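A minimal sketch of the ffprobe helpers above in use. It assumes ffprobe is on the PATH and uses a hypothetical "input.mp4":

    from easy_functions import format_time, get_input_length, get_video_details

    width, height, fps, length = get_video_details("input.mp4")
    print(f"{width}x{height} @ {fps:.3f} fps")
    print("duration:", format_time(get_input_length("input.mp4")))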
enhance.py ADDED
@@ -0,0 +1,22 @@
import warnings
from gfpgan import GFPGANer

warnings.filterwarnings("ignore")


def load_sr():
    run_params = GFPGANer(
        model_path="checkpoints/GFPGANv1.4.pth",
        upscale=1,
        arch="clean",
        channel_multiplier=2,
        bg_upsampler=None,
    )
    return run_params


def upscale(image, properties):
    _, _, output = properties.enhance(
        image, has_aligned=False, only_center_face=False, paste_back=True
    )
    return output
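A minimal usage sketch, assuming checkpoints/GFPGANv1.4.pth has already been downloaded by install.py; "crop.jpg" is a hypothetical BGR face crop:

    import cv2
    from enhance import load_sr, upscale

    run_params = load_sr()               # GFPGANer configured for 1x restoration
    face_crop = cv2.imread("crop.jpg")   # hypothetical uint8 BGR image
    restored = upscale(face_crop, run_params)
    cv2.imwrite("crop_restored.jpg", restored)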
hparams.py ADDED
@@ -0,0 +1,99 @@
from glob import glob
import os


def get_image_list(data_root, split):
    filelist = []

    with open("filelists/{}.txt".format(split)) as f:
        for line in f:
            line = line.strip()
            if " " in line:
                line = line.split()[0]
            filelist.append(os.path.join(data_root, line))

    return filelist


class HParams:
    def __init__(self, **kwargs):
        self.data = {}

        for key, value in kwargs.items():
            self.data[key] = value

    def __getattr__(self, key):
        if key not in self.data:
            raise AttributeError("'HParams' object has no attribute %s" % key)
        return self.data[key]

    def set_hparam(self, key, value):
        self.data[key] = value


# Default hyperparameters
hparams = HParams(
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality
    # network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value
    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,
    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    frame_shift_ms=None,  # Can replace hop_size parameter. (Recommended: 12.5)
    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # faster and cleaner convergence)
    max_abs_value=4.0,
    # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not
    # be too big to avoid gradient explosion,
    # not too small for fast convergence)
    # Contribution by @begeekmyfriend
    # Spectrogram Pre-Emphasis (Lfilter: Reduces spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply filter
    preemphasis=0.97,  # filter coefficient.
    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
    fmax=7600,  # To be increased/reduced depending on data.
    ###################### Our training parameters #################################
    img_size=96,
    fps=25,
    batch_size=16,
    initial_learning_rate=1e-4,
    nepochs=200000000000000000,  ### ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs
    num_workers=16,
    checkpoint_interval=3000,
    eval_interval=3000,
    save_optimizer_state=True,
    syncnet_wt=0.0,  # is initially zero, will be set automatically to 0.03 later. Leads to faster convergence.
    syncnet_batch_size=64,
    syncnet_lr=1e-4,
    syncnet_eval_interval=10000,
    syncnet_checkpoint_interval=10000,
    disc_wt=0.07,
    disc_initial_learning_rate=1e-4,
)


def hparams_debug_string():
    # HParams stores its values in .data (the class defines no values() method)
    values = hparams.data
    hp = [
        "  %s: %s" % (name, values[name])
        for name in sorted(values)
        if name != "sentences"
    ]
    return "Hyperparameters:\n" + "\n".join(hp)
inference.py ADDED
@@ -0,0 +1,781 @@
print("\rloading torch       ", end="")
import torch

print("\rloading numpy       ", end="")
import numpy as np

print("\rloading Image       ", end="")
from PIL import Image

print("\rloading argparse    ", end="")
import argparse

print("\rloading configparser", end="")
import configparser

print("\rloading math        ", end="")
import math

print("\rloading os          ", end="")
import os

print("\rloading subprocess  ", end="")
import subprocess

print("\rloading pickle      ", end="")
import pickle

print("\rloading cv2         ", end="")
import cv2

print("\rloading audio       ", end="")
import audio

print("\rloading RetinaFace  ", end="")
from batch_face import RetinaFace

print("\rloading re          ", end="")
import re

print("\rloading partial     ", end="")
from functools import partial

print("\rloading tqdm        ", end="")
from tqdm import tqdm

print("\rloading warnings    ", end="")
import warnings

warnings.filterwarnings(
    "ignore", category=UserWarning, module="torchvision.transforms.functional_tensor"
)
print("\rloading upscale     ", end="")
from enhance import upscale

print("\rloading load_sr     ", end="")
from enhance import load_sr

print("\rloading load_model  ", end="")
from easy_functions import load_model, g_colab

print("\rimports loaded!     ")

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
gpu_id = 0 if torch.cuda.is_available() else -1

if device == 'cpu':
    print('Warning: No GPU detected so inference will be done on the CPU which is VERY SLOW!')

parser = argparse.ArgumentParser(
    description="Inference code to lip-sync videos in the wild using Wav2Lip models"
)

parser.add_argument(
    "--checkpoint_path",
    type=str,
    help="Name of saved checkpoint to load weights from",
    required=True,
)

parser.add_argument(
    "--segmentation_path",
    type=str,
    default="checkpoints/face_segmentation.pth",
    help="Name of saved checkpoint of segmentation network",
    required=False,
)

parser.add_argument(
    "--face",
    type=str,
    help="Filepath of video/image that contains faces to use",
    required=True,
)
parser.add_argument(
    "--audio",
    type=str,
    help="Filepath of video/audio file to use as raw audio source",
    required=True,
)
parser.add_argument(
    "--outfile",
    type=str,
    help="Video path to save result. See default for an example.",
    default="results/result_voice.mp4",
)

# Note: the boolean-ish options below are passed as the strings "True"/"False"
# by run.py, which is why they are compared with str(...) == "True" throughout
# this file.
parser.add_argument(
    "--static",
    type=bool,
    help="If True, then use only first video frame for inference",
    default=False,
)
parser.add_argument(
    "--fps",
    type=float,
    help="Can be specified only if input is a static image (default: 25)",
    default=25.0,
    required=False,
)

parser.add_argument(
    "--pads",
    nargs="+",
    type=int,
    default=[0, 10, 0, 0],
    help="Padding (top, bottom, left, right). Please adjust to include at least the chin",
)

parser.add_argument(
    "--wav2lip_batch_size", type=int, help="Batch size for Wav2Lip model(s)", default=1
)

parser.add_argument(
    "--out_height",
    default=480,
    type=int,
    help="Output video height. Best results are obtained at 480 or 720",
)

parser.add_argument(
    "--crop",
    nargs="+",
    type=int,
    default=[0, -1, 0, -1],
    help="Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. "
    "Useful if multiple faces are present. -1 implies the value will be auto-inferred based on height, width",
)

parser.add_argument(
    "--box",
    nargs="+",
    type=int,
    default=[-1, -1, -1, -1],
    help="Specify a constant bounding box for the face. Use only as a last resort if the face is not detected. "
    "Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).",
)

parser.add_argument(
    "--rotate",
    default=False,
    action="store_true",
    help="Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg. "
    "Use if you get a flipped result, despite feeding a normal looking video",
)

parser.add_argument(
    "--nosmooth",
    type=str,
    default=False,
    help="Prevent smoothing face detections over a short temporal window",
)

parser.add_argument(
    "--no_seg",
    default=False,
    action="store_true",
    help="Prevent using face segmentation",
)

parser.add_argument(
    "--no_sr", default=False, action="store_true", help="Prevent using super resolution"
)

parser.add_argument(
    "--sr_model",
    type=str,
    default="gfpgan",
    help="Name of upscaler - gfpgan or RestoreFormer",
    required=False,
)

parser.add_argument(
    "--fullres",
    default=3,
    type=int,
    help="used only to determine if full res is used so that no resizing needs to be done if so",
)

parser.add_argument(
    "--debug_mask",
    type=str,
    default=False,
    help="Makes background grayscale to see the mask better",
)

parser.add_argument(
    "--preview_settings", type=str, default=False, help="Processes only one frame"
)

parser.add_argument(
    "--mouth_tracking",
    type=str,
    default=False,
    help="Tracks the mouth in every frame for the mask",
)

parser.add_argument(
    "--mask_dilation",
    default=150,
    type=float,
    help="size of mask around mouth",
    required=False,
)

parser.add_argument(
    "--mask_feathering",
    default=151,
    type=int,
    help="amount of feathering of mask around mouth",
    required=False,
)

parser.add_argument(
    "--quality",
    type=str,
    help="Choose between Fast, Improved and Enhanced",
    default="Fast",
)

with open(os.path.join("checkpoints", "predictor.pkl"), "rb") as f:
    predictor = pickle.load(f)

with open(os.path.join("checkpoints", "mouth_detector.pkl"), "rb") as f:
    mouth_detector = pickle.load(f)

# creating variables to prevent failing when a face isn't detected
kernel = last_mask = x = y = w = h = None

g_colab = g_colab()

if not g_colab:
    # Load the config file
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Get the value of the "preview_window" variable
    preview_window = config.get('OPTIONS', 'preview_window')

all_mouth_landmarks = []

model = detector = detector_model = None


def do_load(checkpoint_path):
    global model, detector, detector_model
    model = load_model(checkpoint_path)
    detector = RetinaFace(
        gpu_id=gpu_id, model_path="checkpoints/mobilenet.pth", network="mobilenet"
    )
    detector_model = detector.model


def face_rect(images):
    face_batch_size = 8
    num_batches = math.ceil(len(images) / face_batch_size)
    prev_ret = None
    for i in range(num_batches):
        batch = images[i * face_batch_size : (i + 1) * face_batch_size]
        all_faces = detector(batch)  # return faces list of all images
        for faces in all_faces:
            if faces:
                box, landmarks, score = faces[0]
                prev_ret = tuple(map(int, box))
            # yield the last successful box when no face is found in this frame
            yield prev_ret


def create_tracked_mask(img, original_img):
    global kernel, last_mask, x, y, w, h  # Add last_mask to global variables

    # Convert color space from BGR to RGB if necessary
    cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
    cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB, original_img)

    # Detect face
    faces = mouth_detector(img)
    if len(faces) == 0:
        if last_mask is not None:
            last_mask = cv2.resize(last_mask, (img.shape[1], img.shape[0]))
            mask = last_mask  # use the last successful mask
        else:
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
            return img, None
    else:
        face = faces[0]
        shape = predictor(img, face)

        # Get points for mouth
        mouth_points = np.array(
            [[shape.part(i).x, shape.part(i).y] for i in range(48, 68)]
        )

        # Calculate bounding box dimensions
        x, y, w, h = cv2.boundingRect(mouth_points)

        # Set kernel size as a fraction of bounding box size
        kernel_size = int(max(w, h) * args.mask_dilation)
        # if kernel_size % 2 == 0:  # Ensure kernel size is odd
        #     kernel_size += 1

        # Create kernel
        kernel = np.ones((kernel_size, kernel_size), np.uint8)

        # Create binary mask for mouth
        mask = np.zeros(img.shape[:2], dtype=np.uint8)
        cv2.fillConvexPoly(mask, mouth_points, 255)

        last_mask = mask  # Update last_mask with the new mask

    # Dilate the mask
    dilated_mask = cv2.dilate(mask, kernel)

    # Calculate distance transform of dilated mask
    dist_transform = cv2.distanceTransform(dilated_mask, cv2.DIST_L2, 5)

    # Normalize distance transform
    cv2.normalize(dist_transform, dist_transform, 0, 255, cv2.NORM_MINMAX)

    # Convert normalized distance transform to binary mask and convert it to uint8
    _, masked_diff = cv2.threshold(dist_transform, 50, 255, cv2.THRESH_BINARY)
    masked_diff = masked_diff.astype(np.uint8)

    # make sure blur is an odd number
    blur = args.mask_feathering
    if blur % 2 == 0:
        blur += 1
    # Set blur size as a fraction of bounding box size
    blur = int(max(w, h) * blur)
    if blur % 2 == 0:  # Ensure blur size is odd
        blur += 1
    masked_diff = cv2.GaussianBlur(masked_diff, (blur, blur), 0)

    # Convert numpy arrays to PIL Images
    input1 = Image.fromarray(img)
    input2 = Image.fromarray(original_img)

    # Convert mask to single channel where pixel values are from the alpha channel of the current mask
    mask = Image.fromarray(masked_diff)

    # Ensure images are the same size
    assert input1.size == input2.size == mask.size

    # Paste input1 onto input2 using the mask
    input2.paste(input1, (0, 0), mask)

    # Convert the final PIL Image back to a numpy array
    input2 = np.array(input2)

    # input2 = cv2.cvtColor(input2, cv2.COLOR_BGR2RGB)
    cv2.cvtColor(input2, cv2.COLOR_BGR2RGB, input2)

    return input2, mask


def create_mask(img, original_img):
    global kernel, last_mask, x, y, w, h  # Add last_mask to global variables

    # Convert color space from BGR to RGB if necessary
    cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
    cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB, original_img)

    if last_mask is not None:
        last_mask = np.array(last_mask)  # Convert PIL Image to numpy array
        last_mask = cv2.resize(last_mask, (img.shape[1], img.shape[0]))
        mask = last_mask  # use the last successful mask
        mask = Image.fromarray(mask)

    else:
        # Detect face
        faces = mouth_detector(img)
        if len(faces) == 0:
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
            return img, None
        else:
            face = faces[0]
            shape = predictor(img, face)

            # Get points for mouth
            mouth_points = np.array(
                [[shape.part(i).x, shape.part(i).y] for i in range(48, 68)]
            )

            # Calculate bounding box dimensions
            x, y, w, h = cv2.boundingRect(mouth_points)

            # Set kernel size as a fraction of bounding box size
            kernel_size = int(max(w, h) * args.mask_dilation)
            # if kernel_size % 2 == 0:  # Ensure kernel size is odd
            #     kernel_size += 1

            # Create kernel
            kernel = np.ones((kernel_size, kernel_size), np.uint8)

            # Create binary mask for mouth
            mask = np.zeros(img.shape[:2], dtype=np.uint8)
            cv2.fillConvexPoly(mask, mouth_points, 255)

            # Dilate the mask
            dilated_mask = cv2.dilate(mask, kernel)

            # Calculate distance transform of dilated mask
            dist_transform = cv2.distanceTransform(dilated_mask, cv2.DIST_L2, 5)

            # Normalize distance transform
            cv2.normalize(dist_transform, dist_transform, 0, 255, cv2.NORM_MINMAX)

            # Convert normalized distance transform to binary mask and convert it to uint8
            _, masked_diff = cv2.threshold(dist_transform, 50, 255, cv2.THRESH_BINARY)
            masked_diff = masked_diff.astype(np.uint8)

            if not args.mask_feathering == 0:
                blur = args.mask_feathering
                # Set blur size as a fraction of bounding box size
                blur = int(max(w, h) * blur)
                if blur % 2 == 0:  # Ensure blur size is odd
                    blur += 1
                masked_diff = cv2.GaussianBlur(masked_diff, (blur, blur), 0)

            # Convert mask to single channel where pixel values are from the alpha channel of the current mask
            mask = Image.fromarray(masked_diff)

            last_mask = mask  # Update last_mask with the final mask after dilation and feathering

    # Convert numpy arrays to PIL Images
    input1 = Image.fromarray(img)
    input2 = Image.fromarray(original_img)

    # Resize mask to match image size
    # mask = Image.fromarray(mask)
    mask = mask.resize(input1.size)

    # Ensure images are the same size
    assert input1.size == input2.size == mask.size

    # Paste input1 onto input2 using the mask
    input2.paste(input1, (0, 0), mask)

    # Convert the final PIL Image back to a numpy array
    input2 = np.array(input2)

    # input2 = cv2.cvtColor(input2, cv2.COLOR_BGR2RGB)
    cv2.cvtColor(input2, cv2.COLOR_BGR2RGB, input2)

    return input2, mask


def get_smoothened_boxes(boxes, T):
    for i in range(len(boxes)):
        if i + T > len(boxes):
            window = boxes[len(boxes) - T :]
        else:
            window = boxes[i : i + T]
        boxes[i] = np.mean(window, axis=0)
    return boxes


def face_detect(images, results_file="last_detected_face.pkl"):
    # If results file exists, load it and return
    if os.path.exists(results_file):
        print("Using face detection data from last input")
        with open(results_file, "rb") as f:
            return pickle.load(f)

    results = []
    pady1, pady2, padx1, padx2 = args.pads

    tqdm_partial = partial(tqdm, position=0, leave=True)
    for image, rect in tqdm_partial(
        zip(images, face_rect(images)),
        total=len(images),
        desc="detecting face in every frame",
        ncols=100,
    ):
        if rect is None:
            cv2.imwrite(
                "temp/faulty_frame.jpg", image
            )  # check this frame where the face was not detected.
            raise ValueError(
                "Face not detected! Ensure the video contains a face in all the frames."
            )

        y1 = max(0, rect[1] - pady1)
        y2 = min(image.shape[0], rect[3] + pady2)
        x1 = max(0, rect[0] - padx1)
        x2 = min(image.shape[1], rect[2] + padx2)

        results.append([x1, y1, x2, y2])

    boxes = np.array(results)
    if str(args.nosmooth) == "False":
        boxes = get_smoothened_boxes(boxes, T=5)
    results = [
        [image[y1:y2, x1:x2], (y1, y2, x1, x2)]
        for image, (x1, y1, x2, y2) in zip(images, boxes)
    ]

    # Save results to file
    with open(results_file, "wb") as f:
        pickle.dump(results, f)

    return results


def datagen(frames, mels):
    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
    print("\r" + " " * 100, end="\r")
    if args.box[0] == -1:
        if not args.static:
            face_det_results = face_detect(frames)  # BGR2RGB for CNN face detection
        else:
            face_det_results = face_detect([frames[0]])
    else:
        print("Using the specified bounding box instead of face detection...")
        y1, y2, x1, x2 = args.box
        face_det_results = [[f[y1:y2, x1:x2], (y1, y2, x1, x2)] for f in frames]

    for i, m in enumerate(mels):
        idx = 0 if args.static else i % len(frames)
        frame_to_save = frames[idx].copy()
        face, coords = face_det_results[idx].copy()

        face = cv2.resize(face, (args.img_size, args.img_size))

        img_batch.append(face)
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

            img_masked = img_batch.copy()
            img_masked[:, args.img_size // 2 :] = 0

            img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
            mel_batch = np.reshape(
                mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]
            )

            yield img_batch, mel_batch, frame_batch, coords_batch
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    if len(img_batch) > 0:
        img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

        img_masked = img_batch.copy()
        img_masked[:, args.img_size // 2 :] = 0

        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.0
        mel_batch = np.reshape(
            mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]
        )

        yield img_batch, mel_batch, frame_batch, coords_batch


mel_step_size = 16


def _load(checkpoint_path):
    if device != "cpu":
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage
        )
    return checkpoint


def main():
    args.img_size = 96
    frame_number = 11

    # use splitext rather than split(".") so paths containing dots are handled
    if os.path.isfile(args.face) and os.path.splitext(args.face)[1].lower() in [
        ".jpg", ".png", ".jpeg",
    ]:
        args.static = True

    if not os.path.isfile(args.face):
        raise ValueError("--face argument must be a valid path to video/image file")

    elif os.path.splitext(args.face)[1].lower() in [".jpg", ".png", ".jpeg"]:
        full_frames = [cv2.imread(args.face)]
        fps = args.fps

    else:
        if args.fullres != 1:
            print("Resizing video...")
        video_stream = cv2.VideoCapture(args.face)
        fps = video_stream.get(cv2.CAP_PROP_FPS)

        full_frames = []
        while 1:
            still_reading, frame = video_stream.read()
            if not still_reading:
                video_stream.release()
                break

            if args.fullres != 1:
                aspect_ratio = frame.shape[1] / frame.shape[0]
                frame = cv2.resize(
                    frame, (int(args.out_height * aspect_ratio), args.out_height)
                )

            if args.rotate:
                frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)

            y1, y2, x1, x2 = args.crop
            if x2 == -1:
                x2 = frame.shape[1]
            if y2 == -1:
                y2 = frame.shape[0]

            frame = frame[y1:y2, x1:x2]

            full_frames.append(frame)

    if not args.audio.endswith(".wav"):
        print("Converting audio to .wav")
        subprocess.check_call(
            [
                "ffmpeg",
                "-y",
                "-loglevel",
                "error",
                "-i",
                args.audio,
                "temp/temp.wav",
            ]
        )
        args.audio = "temp/temp.wav"

    print("analysing audio...")
    wav = audio.load_wav(args.audio, 16000)
    mel = audio.melspectrogram(wav)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        raise ValueError(
            "Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again"
        )

    mel_chunks = []

    mel_idx_multiplier = 80.0 / fps
    i = 0
    while 1:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > len(mel[0]):
            mel_chunks.append(mel[:, len(mel[0]) - mel_step_size :])
            break
        mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
        i += 1

    full_frames = full_frames[: len(mel_chunks)]
    if str(args.preview_settings) == "True":
        full_frames = [full_frames[0]]
        mel_chunks = [mel_chunks[0]]
    print(str(len(full_frames)) + " frames to process")
    batch_size = args.wav2lip_batch_size
    if str(args.preview_settings) == "True":
        gen = datagen(full_frames, mel_chunks)
    else:
        gen = datagen(full_frames.copy(), mel_chunks)

    for i, (img_batch, mel_batch, frames, coords) in enumerate(
        tqdm(
            gen,
            total=int(np.ceil(float(len(mel_chunks)) / batch_size)),
            desc="Processing Wav2Lip",
            ncols=100,
        )
    ):
        if i == 0:
            if not args.quality == "Fast":
                print(
                    f"mask size: {args.mask_dilation}, feathering: {args.mask_feathering}"
                )
                if not args.quality == "Improved":
                    print("Loading", args.sr_model)
                    run_params = load_sr()

            print("Starting...")
            frame_h, frame_w = full_frames[0].shape[:-1]
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter("temp/result.mp4", fourcc, fps, (frame_w, frame_h))

        img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
        mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

        with torch.no_grad():
            pred = model(mel_batch, img_batch)

        pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.0

        for p, f, c in zip(pred, frames, coords):
            # cv2.imwrite('temp/f.jpg', f)

            y1, y2, x1, x2 = c

            if (
                str(args.debug_mask) == "True"
            ):  # makes the background black & white so you can see the mask better
                f = cv2.cvtColor(f, cv2.COLOR_BGR2GRAY)
                f = cv2.cvtColor(f, cv2.COLOR_GRAY2BGR)

            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
            cf = f[y1:y2, x1:x2]

            if args.quality == "Enhanced":
                p = upscale(p, run_params)

            if args.quality in ["Enhanced", "Improved"]:
                if str(args.mouth_tracking) == "True":
                    p, last_mask = create_tracked_mask(p, cf)
                else:
                    p, last_mask = create_mask(p, cf)

            f[y1:y2, x1:x2] = p

            if not g_colab:
                # Display the frame
                if preview_window == "Face":
                    cv2.imshow("face preview - press Q to abort", p)
                elif preview_window == "Full":
                    cv2.imshow("full preview - press Q to abort", f)
                elif preview_window == "Both":
                    cv2.imshow("face preview - press Q to abort", p)
                    cv2.imshow("full preview - press Q to abort", f)

                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    exit()  # Abort when 'Q' is pressed

            if str(args.preview_settings) == "True":
                cv2.imwrite("temp/preview.jpg", f)
                if not g_colab:
                    cv2.imshow("preview - press Q to close", f)
                    if cv2.waitKey(-1) & 0xFF == ord('q'):
                        exit()  # Close when 'Q' is pressed

            else:
                out.write(f)

    # Close the window(s) when done
    cv2.destroyAllWindows()

    out.release()

    if str(args.preview_settings) == "False":
        print("converting to final video")

        subprocess.check_call([
            "ffmpeg",
            "-y",
            "-loglevel",
            "error",
            "-i",
            "temp/result.mp4",
            "-i",
            args.audio,
            "-c:v",
            "libx264",
            args.outfile
        ])


if __name__ == "__main__":
    args = parser.parse_args()
    do_load(args.checkpoint_path)
    main()
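A minimal sketch of invoking inference.py directly. run.py normally builds this command; the file paths here are hypothetical, and the checkpoints are the ones install.py downloads:

    import subprocess
    import sys

    subprocess.run(
        [
            sys.executable, "inference.py",
            "--checkpoint_path", "checkpoints/Wav2Lip.pth",
            "--face", "input.mp4",
            "--audio", "speech.wav",
            "--outfile", "results/result_voice.mp4",
            "--quality", "Improved",
        ],
        check=True,
    )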
install.py ADDED
@@ -0,0 +1,96 @@
version = 'v8.3'

import os
import re
import argparse
import shutil
import subprocess
import warnings
from IPython.display import clear_output

from easy_functions import (format_time,
                            load_file_from_url,
                            load_model,
                            load_predictor)

warnings.filterwarnings("ignore", category=UserWarning, module="torchvision.transforms.functional_tensor")


# Get the location of the basicsr package
def get_basicsr_location():
    result = subprocess.run(['pip', 'show', 'basicsr'], capture_output=True, text=True)
    for line in result.stdout.split('\n'):
        if 'Location: ' in line:
            return line.split('Location: ')[1]
    return None


# Move and replace a file to the basicsr location
def move_and_replace_file_to_basicsr(file_name):
    basicsr_location = get_basicsr_location()
    if basicsr_location:
        destination = os.path.join(basicsr_location, file_name)
        # Move and replace the file
        shutil.copyfile(file_name, destination)
        print(f'File replaced at {destination}')
    else:
        print('Could not find basicsr location.')


# Replace basicsr's degradations.py with the patched copy shipped in this repo
file_to_replace = 'degradations.py'
move_and_replace_file_to_basicsr(file_to_replace)


from enhance import load_sr

working_directory = os.getcwd()

# download and initialize both wav2lip models
print("downloading wav2lip essentials")
load_file_from_url(
    url="https://github.com/anothermartz/Easy-Wav2Lip/releases/download/Prerequesits/Wav2Lip_GAN.pth",
    model_dir="checkpoints",
    progress=True,
    file_name="Wav2Lip_GAN.pth",
)
model = load_model(os.path.join(working_directory, "checkpoints", "Wav2Lip_GAN.pth"))
print("wav2lip_gan loaded")
load_file_from_url(
    url="https://github.com/anothermartz/Easy-Wav2Lip/releases/download/Prerequesits/Wav2Lip.pth",
    model_dir="checkpoints",
    progress=True,
    file_name="Wav2Lip.pth",
)
model = load_model(os.path.join(working_directory, "checkpoints", "Wav2Lip.pth"))
print("wav2lip loaded")

# download gfpgan files
print("downloading gfpgan essentials")
load_file_from_url(
    url="https://github.com/anothermartz/Easy-Wav2Lip/releases/download/Prerequesits/GFPGANv1.4.pth",
    model_dir="checkpoints",
    progress=True,
    file_name="GFPGANv1.4.pth",
)
load_sr()

# load face detectors
print("initializing face detectors")
load_file_from_url(
    url="https://github.com/anothermartz/Easy-Wav2Lip/releases/download/Prerequesits/shape_predictor_68_face_landmarks_GTX.dat",
    model_dir="checkpoints",
    progress=True,
    file_name="shape_predictor_68_face_landmarks_GTX.dat",
)

load_predictor()

# write a file to signify setup is done
with open("installed.txt", "w") as f:
    f.write(version)
print("Installation complete!")
print(
    "If you just updated from v8 - make sure to download the updated Easy-Wav2Lip.bat too!"
)
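A quick sanity check after running the installer. install.py writes the version string to installed.txt, which GUI.py reads on startup:

    # installed.txt should contain the version string written above, e.g. v8.3
    with open("installed.txt") as f:
        print("installed:", f.read().strip())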
requirements.txt ADDED
@@ -0,0 +1,18 @@
basicsr==1.4.2
batch-face==1.4.0
dlib==19.24.2
facexlib==0.3.0
gdown==4.7.1
gfpgan==1.3.8
imageio-ffmpeg==0.4.9
importlib-metadata==6.8.0
ipython==8.16.1
librosa==0.10.1
moviepy==1.0.3
numpy==1.26.1
opencv-python==4.8.1.78
scipy==1.11.3
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.1.0
torchaudio==2.1.0
torchvision==0.16.0
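The pinned dependencies can be installed programmatically as well as from the command line; this sketch is equivalent to `pip install -r requirements.txt`:

    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    )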
run.py ADDED
@@ -0,0 +1,496 @@
import os
import sys
import re
import argparse
from easy_functions import (format_time,
                            get_input_length,
                            get_video_details,
                            show_video,
                            g_colab)
import contextlib
import shutil
import subprocess
import time
from IPython.display import Audio, Image, clear_output, display
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import configparser

parser = argparse.ArgumentParser(description='SyncKing-Kong main run file')

parser.add_argument('-video_file', type=str,
                    help='Input video file path', required=False, default=False)
parser.add_argument('-vocal_file', type=str,
                    help='Input audio file path', required=False, default=False)
parser.add_argument('-output_file', type=str,
                    help='Output video file path', required=False, default=False)
args = parser.parse_args()

# retrieve variables from config.ini
config = configparser.ConfigParser()

config.read('config.ini')
if args.video_file:
    video_file = args.video_file
else:
    video_file = config['OPTIONS']['video_file']

if args.vocal_file:
    vocal_file = args.vocal_file
else:
    vocal_file = config['OPTIONS']['vocal_file']
quality = config['OPTIONS']['quality']
output_height = config['OPTIONS']['output_height']
wav2lip_version = config['OPTIONS']['wav2lip_version']
use_previous_tracking_data = config['OPTIONS']['use_previous_tracking_data']
nosmooth = config.getboolean('OPTIONS', 'nosmooth')
U = config.getint('PADDING', 'U')
D = config.getint('PADDING', 'D')
L = config.getint('PADDING', 'L')
R = config.getint('PADDING', 'R')
size = config.getfloat('MASK', 'size')
feathering = config.getint('MASK', 'feathering')
mouth_tracking = config.getboolean('MASK', 'mouth_tracking')
debug_mask = config.getboolean('MASK', 'debug_mask')
batch_process = config.getboolean('OTHER', 'batch_process')
output_suffix = config['OTHER']['output_suffix']
include_settings_in_suffix = config.getboolean('OTHER', 'include_settings_in_suffix')

if g_colab():
    preview_input = config.getboolean("OTHER", "preview_input")
else:
    preview_input = False
preview_settings = config.getboolean("OTHER", "preview_settings")
frame_to_preview = config.getint("OTHER", "frame_to_preview")

working_directory = os.getcwd()


start_time = time.time()

video_file = video_file.strip('"')
vocal_file = vocal_file.strip('"')

# check video_file exists
if video_file == "":
    sys.exit("video_file cannot be blank")

if os.path.isdir(video_file):
    sys.exit(f"{video_file} is a directory, you need to point to a file")

if not os.path.exists(video_file):
    sys.exit(f"Could not find file: {video_file}")

if wav2lip_version == "Wav2Lip_GAN":
    checkpoint_path = os.path.join(working_directory, "checkpoints", "Wav2Lip_GAN.pth")
else:
    checkpoint_path = os.path.join(working_directory, "checkpoints", "Wav2Lip.pth")

# remap feathering values (3 -> 5 first, so that 2 -> 3 is not remapped again)
if feathering == 3:
    feathering = 5
if feathering == 2:
    feathering = 3

resolution_scale = 1
res_custom = False
if output_height == "half resolution":
    resolution_scale = 2
elif output_height == "full resolution":
    resolution_scale = 1
else:
    res_custom = True
    resolution_scale = 3

in_width, in_height, in_fps, in_length = get_video_details(video_file)
out_height = round(in_height / resolution_scale)

if res_custom:
    out_height = int(output_height)
fps_for_static_image = 30


if output_suffix == "" and not include_settings_in_suffix:
    sys.exit(
        "Current suffix settings will overwrite your input video! Please add a suffix or tick include_settings_in_suffix"
    )

frame_to_preview = max(frame_to_preview - 1, 0)

if include_settings_in_suffix:
    if wav2lip_version == "Wav2Lip_GAN":
        output_suffix = f"{output_suffix}_GAN"
    output_suffix = f"{output_suffix}_{quality}"
    if output_height != "full resolution":
        output_suffix = f"{output_suffix}_{out_height}"
    if nosmooth:
        output_suffix = f"{output_suffix}_nosmooth1"
    else:
        output_suffix = f"{output_suffix}_nosmooth0"
    if U != 0 or D != 0 or L != 0 or R != 0:
        output_suffix = f"{output_suffix}_pads-"
        if U != 0:
            output_suffix = f"{output_suffix}U{U}"
        if D != 0:
            output_suffix = f"{output_suffix}D{D}"
        if L != 0:
            output_suffix = f"{output_suffix}L{L}"
        if R != 0:
            output_suffix = f"{output_suffix}R{R}"
    if quality != "Fast":  # quality values are capitalized: Fast/Improved/Enhanced
        output_suffix = f"{output_suffix}_mask-S{size}F{feathering}"
    if mouth_tracking:
        output_suffix = f"{output_suffix}_mt"
    if debug_mask:
        output_suffix = f"{output_suffix}_debug"
if preview_settings:
    output_suffix = f"{output_suffix}_preview"


rescaleFactor = str(round(1 // resolution_scale))
pad_up = str(round(U * resolution_scale))
pad_down = str(round(D * resolution_scale))
pad_left = str(round(L * resolution_scale))
pad_right = str(round(R * resolution_scale))
################################################################################


######################### reconstruct input paths ##############################
# Extract each part of the path
folder, filename_with_extension = os.path.split(video_file)
filename, file_type = os.path.splitext(filename_with_extension)

# Extract filenumber if it exists
filenumber_match = re.search(r"\d+$", filename)
if filenumber_match:  # if there is a filenumber - extract it
    filenumber = str(filenumber_match.group())
    filenamenonumber = re.sub(r"\d+$", "", filename)
else:  # if there is no filenumber - make it blank
    filenumber = ""
    filenamenonumber = filename

# if vocal_file is blank - use the video as audio
if vocal_file == "":
    vocal_file = video_file
# if not, check that the vocal_file file exists
else:
    if not os.path.exists(vocal_file):
        sys.exit(f"Could not find file: {vocal_file}")
    if os.path.isdir(vocal_file):
        sys.exit(f"{vocal_file} is a directory, you need to point to a file")

# Extract each part of the path
audio_folder, audio_filename_with_extension = os.path.split(vocal_file)
audio_filename, audio_file_type = os.path.splitext(audio_filename_with_extension)

# Extract filenumber if it exists
audio_filenumber_match = re.search(r"\d+$", audio_filename)
if audio_filenumber_match:  # if there is a filenumber - extract it
    audio_filenumber = str(audio_filenumber_match.group())
    audio_filenamenonumber = re.sub(r"\d+$", "", audio_filename)
else:  # if there is no filenumber - make it blank
    audio_filenumber = ""
    audio_filenamenonumber = audio_filename
################################################################################

# set process_failed to False so that it may be set to True if one or more processings fail
process_failed = False


temp_output = os.path.join(working_directory, "temp", "output.mp4")
temp_folder = os.path.join(working_directory, "temp")

last_input_video = None
last_input_audio = None

# --------------------------Batch processing loop-------------------------------!
while True:

    # construct input_video
    input_video = os.path.join(folder, filenamenonumber + str(filenumber) + file_type)
    input_videofile = os.path.basename(input_video)

    # construct input_audio
    input_audio = os.path.join(
        audio_folder, audio_filenamenonumber + str(audio_filenumber) + audio_file_type
    )
    input_audiofile = os.path.basename(input_audio)

    # see if filenames are different:
    if filenamenonumber + str(filenumber) != audio_filenamenonumber + str(
        audio_filenumber
    ):
        output_filename = (
            filenamenonumber
            + str(filenumber)
            + "_"
            + audio_filenamenonumber
            + str(audio_filenumber)
        )
    else:
        output_filename = filenamenonumber + str(filenumber)

    # construct output_video
    output_video = os.path.join(folder, output_filename + output_suffix + ".mp4")
    output_video = os.path.normpath(output_video)
    output_videofile = os.path.basename(output_video)

    # remove last outputs
    if os.path.exists("temp"):
        shutil.rmtree("temp")
    os.makedirs("temp", exist_ok=True)

    # preview inputs (if enabled)
    if preview_input:
        print("input video:")
        show_video(input_video)
        if vocal_file != "":
            print("input audio:")
            display(Audio(input_audio))
        else:
            print("using", input_videofile, "for audio")
        print("You may want to check now that they're the correct files!")

    last_input_video = input_video
    last_input_audio = input_audio
    shutil.copy(input_video, temp_folder)
    shutil.copy(input_audio, temp_folder)

    # rename temp file to include padding or else changing padding does nothing
    temp_input_video = os.path.join(temp_folder, input_videofile)
    renamed_temp_input_video = os.path.join(
        temp_folder, str(U) + str(D) + str(L) + str(R) + input_videofile
    )
    shutil.copy(temp_input_video, renamed_temp_input_video)
    temp_input_video = renamed_temp_input_video
    temp_input_videofile = os.path.basename(renamed_temp_input_video)
    temp_input_audio = os.path.join(temp_folder, input_audiofile)

    # trim video if it's longer than the audio
    video_length = get_input_length(temp_input_video)
    audio_length = get_input_length(temp_input_audio)

    if preview_settings:
        batch_process = False

        preview_length_seconds = 1
        converted_preview_frame = frame_to_preview / in_fps
        preview_start_time = min(
            converted_preview_frame, video_length - preview_length_seconds
        )

        preview_video_path = os.path.join(
            temp_folder,
            "preview_"
            + str(preview_start_time)
            + "_"
            + str(U)
            + str(D)
            + str(L)
            + str(R)
            + input_videofile,
        )
        preview_audio_path = os.path.join(temp_folder, "preview_" + input_audiofile)

        subprocess.call(
            [
                "ffmpeg",
                "-loglevel",
                "error",
                "-i",
                temp_input_video,
                "-ss",
                str(preview_start_time),
                "-to",
                str(preview_start_time + preview_length_seconds),
                "-c",
                "copy",
                preview_video_path,
            ]
        )
        subprocess.call(
            [
                "ffmpeg",
                "-loglevel",
                "error",
                "-i",
                temp_input_audio,
                "-ss",
                str(preview_start_time),
                "-to",
                str(preview_start_time + 1),
                "-c",
                "copy",
                preview_audio_path,
            ]
        )
        temp_input_video = preview_video_path
        temp_input_audio = preview_audio_path

    if video_length > audio_length:
        trimmed_video_path = os.path.join(
            temp_folder, "trimmed_" + temp_input_videofile
        )
        with open(os.devnull, "w") as devnull:
            with contextlib.redirect_stdout(devnull), contextlib.redirect_stderr(
                devnull
            ):
                ffmpeg_extract_subclip(
                    temp_input_video, 0, audio_length, targetname=trimmed_video_path
                )
        temp_input_video = trimmed_video_path
    # check if face detection has already happened on this clip
    last_detected_face = os.path.join(working_directory, "last_detected_face.pkl")
    if os.path.isfile("last_file.txt"):
        with open("last_file.txt", "r") as file:
            last_file = file.readline()
        if last_file != temp_input_video or use_previous_tracking_data == "False":
            if os.path.isfile(last_detected_face):
                os.remove(last_detected_face)

    # ----------------------------Process the inputs!-----------------------------!
    print(
        f"Processing{' preview of' if preview_settings else ''} "
        f"{input_videofile} using {input_audiofile} for audio"
    )

    # execute Wav2Lip & upscaler

    cmd = [
        sys.executable,
        "inference.py",
        "--face",
        temp_input_video,
        "--audio",
        temp_input_audio,
        "--outfile",
        temp_output,
        "--pads",
        str(pad_up),
        str(pad_down),
        str(pad_left),
        str(pad_right),
        "--checkpoint_path",
        checkpoint_path,
        "--out_height",
        str(out_height),
        "--fullres",
        str(resolution_scale),
        "--quality",
        quality,
        "--mask_dilation",
        str(size),
        "--mask_feathering",
        str(feathering),
        "--nosmooth",
        str(nosmooth),
        "--debug_mask",
        str(debug_mask),
        "--preview_settings",
        str(preview_settings),
        "--mouth_tracking",
        str(mouth_tracking),
    ]

    # Run the command
    subprocess.run(cmd)

    if preview_settings:
        if os.path.isfile(os.path.join(temp_folder, "preview.jpg")):
            print("preview successful! Check out temp/preview.jpg")
            with open("last_file.txt", "w") as f:
                f.write(temp_input_video)
            # end processing timer and format the time it took
            end_time = time.time()
            elapsed_time = end_time - start_time
            formatted_setup_time = format_time(elapsed_time)
            print(f"Execution time: {formatted_setup_time}")
            break

        else:
            print("Processing failed! :( see line above 👆")
            print("Consider searching the issues tab on the github:")
            print("https://github.com/anothermartz/Easy-Wav2Lip/issues")
            exit()

    # rename temp file and move to correct directory
    if os.path.isfile(temp_output):
        if os.path.isfile(output_video):
            os.remove(output_video)
        shutil.copy(temp_output, output_video)
        # show output video
        with open("last_file.txt", "w") as f:
            f.write(temp_input_video)
        print(f"{output_filename} successfully lip synced! It will be found here:")
        print(output_video)

        # end processing timer and format the time it took
        end_time = time.time()
        elapsed_time = end_time - start_time
        formatted_setup_time = format_time(elapsed_time)
        print(f"Execution time: {formatted_setup_time}")

    else:
        print("Processing failed! :( see line above 👆")
        print("Consider searching the issues tab on the github:")
        print("https://github.com/anothermartz/Easy-Wav2Lip/issues")
        process_failed = True

    if not batch_process:
        if process_failed:
            exit()
        else:
            break

    elif filenumber == "" and audio_filenumber == "":
        print("Files not set for batch processing")
        break

    # -----------------------------Batch Processing!------------------------------!
    if filenumber != "":  # if video has a filenumber
        match = re.search(r"\d+", filenumber)
        # add 1 to video filenumber
        filenumber = (
            f"{filenumber[:match.start()]}{int(match.group())+1:0{len(match.group())}d}"
        )

    if audio_filenumber != "":  # if audio has a filenumber
        match = re.search(r"\d+", audio_filenumber)
        # add 1 to audio filenumber
        audio_filenumber = f"{audio_filenumber[:match.start()]}{int(match.group())+1:0{len(match.group())}d}"

    # construct input_video
    input_video = os.path.join(folder, filenamenonumber + str(filenumber) + file_type)
    input_videofile = os.path.basename(input_video)
    # construct input_audio
    input_audio = os.path.join(
        audio_folder, audio_filenamenonumber + str(audio_filenumber) + audio_file_type
    )
    input_audiofile = os.path.basename(input_audio)

    # now check which input files exist and what to do for each scenario

    # both +1 files exist - continue processing
    if os.path.exists(input_video) and os.path.exists(input_audio):
        continue

    # video +1 only - continue with last audio file
    if os.path.exists(input_video) and input_video != last_input_video:
        if audio_filenumber != "":  # if audio has a filenumber
            match = re.search(r"\d+", audio_filenumber)
            # take 1 from audio filenumber
            audio_filenumber = f"{audio_filenumber[:match.start()]}{int(match.group())-1:0{len(match.group())}d}"
        continue

    # audio +1 only - continue with last video file
    if os.path.exists(input_audio) and input_audio != last_input_audio:
        if filenumber != "":  # if video has a filenumber
            match = re.search(r"\d+", filenumber)
            # take 1 from video filenumber
            filenumber = f"{filenumber[:match.start()]}{int(match.group())-1:0{len(match.group())}d}"
        continue

    # neither +1 files exist or current files already processed - finish processing
    print("Finished all sequentially numbered files")
    if process_failed:
        sys.exit("Processing failed on at least one video")
    else:
        break
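A minimal sketch of the filenumber increment used by the batch loop above: the trailing digits are incremented while their zero-padding width is preserved, which is how Video009.mp4 advances to Video010.mp4:

    import re

    filenumber = "009"
    match = re.search(r"\d+", filenumber)
    filenumber = f"{filenumber[:match.start()]}{int(match.group()) + 1:0{len(match.group())}d}"
    print(filenumber)  # -> "010"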