# Source: MultiTalk-Code / eval_avlr (Hugging Face repository page header).
# Uploaded by ameerazam08: "Upload folder using huggingface_hub".
# Commit 6931c7b (verified).
# Copyright (c) Meta Platforms, Inc. and its affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import pdb
import time
import dlib
import torch
import logging
import numpy as np
from ffmpy import FFmpeg
from pathlib import Path
from import wavfile
from collections import OrderedDict, defaultdict
import math
from utils import (
from fairseq import utils, checkpoint_utils
from fairseq.dataclass.configs import GenerationConfig
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Ordered mapping from a step label to the seconds that step took; written by
# the `track_time` decorator below.
TIME_TRACKER = OrderedDict() # to track time-taken per step
# Largest fraction of video frames that is allowed to be missing.
# NOTE(review): not referenced in the visible part of this file.
MAX_MISSING_FRAMES_RATIO = 0.75 #max video frames that is ok to be missing
# True when torch reports an available CUDA device; evaluated once at import.
USE_CUDA = torch.cuda.is_available()
def load_noise_samples(noise_path):
    """Index the noise WAV files found under ``noise_path`` by category.

    The directory tree is searched recursively for ``*.wav`` files. The stem
    of the directory that directly contains a file is used as its category.

    NOTE: the download/extract step for ``noise_samples.tgz`` is disabled in
    this file, so the samples must already exist under ``noise_path``.

    Args:
        noise_path: ``pathlib.Path`` of the directory holding the samples.

    Returns:
        A ``defaultdict(list)`` mapping each category name to the list of
        WAV file paths (as strings) found for it. Empty when no ``*.wav``
        file exists.
    """
    noise_dict = defaultdict(list)
    for wav_filepath in noise_path.rglob("*.wav"):
        category = wav_filepath.parent.stem
        # Register the file under its category; without this the mapping
        # would always come back empty.
        noise_dict[category].append(str(wav_filepath))
    return noise_dict
def load_av_models(av_models_path):
    """Load one AV-HuBERT model per sub-directory of ``av_models_path``.

    Sub-directories are visited in sorted order; plain files are skipped.
    The directory stem is the key of the returned mapping, and the language
    label is the text after the last ``-`` in the part of the stem that
    precedes the first ``_``.

    Args:
        av_models_path: ``pathlib.Path`` whose sub-directories each hold the
            label files, ``tokenizer.model`` and checkpoint of one model.

    Returns:
        ``defaultdict(dict)`` mapping key -> ``{"model": list of models,
        "task": task, "generator": generator built with beam size 1}``.
    """
    av_resources = defaultdict(dict)
    for model_dir in sorted(av_models_path.glob("*")):  # deterministic order
        if model_dir.is_file():
            # only directories describe a model
            continue

        key = model_dir.stem
        lang_label = key.partition('_')[0].rpartition('-')[2]
        label_path = str(model_dir)
        # NOTE(review): the checkpoint file name is an empty string here, so
        # this resolves to the model directory itself -- the real file name
        # looks lost; confirm against the upstream source.
        ckpt_path = str(model_dir / "")

        arg_overrides = {
            "modalities": ["audio", "video"],
            "data": label_path,
            "labels": [lang_label],
            "label_dir": label_path,
            "tokenizer_bpe_model": f"{label_path}/tokenizer.model",
            "noise_prob": 0,
            "noise_wav": None,
        }
        loaded, _, task = checkpoint_utils.load_model_ensemble_and_task(
            [ckpt_path], arg_overrides
        )

        # switch to inference mode and move to the GPU when one is available
        models = [net.eval() for net in loaded]
        if USE_CUDA:
            models = [net.cuda() for net in models]
        generator = task.build_generator(models, GenerationConfig(beam=1))

        entry = av_resources[key]
        entry["model"] = models
        entry["task"] = task
        entry["generator"] = generator
    return av_resources
def add_noise(signal, noise, snr):
    """Mix ``noise`` into ``signal`` at a given signal-to-noise ratio.

    The noise is repeated when it is shorter than the signal and truncated
    (from its start) when it is longer, then scaled so that the RMS ratio
    between signal and noise matches ``snr`` (in dB). If the mixture leaves
    the 16-bit range it is scaled down so that its largest peak just fits.

    Args:
        signal: 1D array in [-32768, 32767] (16-bit depth).
        noise: 1D array in [-32768, 32767] (16-bit depth).
        snr: float, or a ``(low, high)`` tuple from which the SNR is drawn
            uniformly at random.

    Returns:
        1D ``np.int16`` array with the same length as ``signal``. When the
        noise is empty or completely silent the signal is returned unmixed.
    """
    signal = signal.astype(np.float32)
    noise = noise.astype(np.float32)

    if isinstance(snr, tuple):
        assert len(snr) == 2
        snr = np.random.uniform(snr[0], snr[1])
    else:
        snr = float(snr)

    if len(signal) == 0:
        # nothing to mix; also avoids max()/min() on an empty array below
        return signal.astype(np.int16)

    mixed = signal
    if len(noise) > 0:
        if len(signal) > len(noise):
            ratio = int(np.ceil(len(signal) / len(noise)))
            noise = np.concatenate([noise] * ratio)
        noise = noise[: len(signal)]

        amp_s = np.sqrt(np.mean(np.square(signal), axis=-1))
        amp_n = np.sqrt(np.mean(np.square(noise), axis=-1))
        if amp_n > 0:
            # silent noise would otherwise divide by zero and yield NaNs
            mixed = signal + noise * (amp_s / amp_n) / (10 ** (snr / 20))

    # Avoid clipping noise
    max_int16 = np.iinfo(np.int16).max
    min_int16 = np.iinfo(np.int16).min
    peak_hi = mixed.max(axis=0)
    peak_lo = mixed.min(axis=0)
    if peak_hi > max_int16 or peak_lo < min_int16:
        # Scale by whichever peak is larger in magnitude; the two rates must
        # be mutually exclusive, otherwise a dominant positive peak would be
        # amplified by the negative-peak rate.
        if peak_hi >= abs(peak_lo):
            reduction_rate = max_int16 / peak_hi
        else:
            reduction_rate = min_int16 / peak_lo
        mixed = mixed * reduction_rate
    return mixed.astype(np.int16)
def linear_interpolate(landmarks, start_idx, stop_idx):
    """Fill the entries strictly between two indices by linear interpolation.

    ``landmarks[start_idx]`` and ``landmarks[stop_idx]`` are the anchors; every
    entry between them is overwritten with an evenly spaced blend of the two.
    ``landmarks`` is modified in place and also returned.
    """
    first = landmarks[start_idx]
    last = landmarks[stop_idx]
    span = stop_idx - start_idx
    delta = last - first
    for offset in range(1, span):
        landmarks[start_idx + offset] = first + offset / float(span) * delta
    return landmarks
def landmarks_interpolate(landmarks):
    """Interpolate landmarks so that every frame has one.

    Gaps between two detected frames are filled by ``linear_interpolate``;
    frames before the first detection or after the last one are filled with
    a reference to the nearest detected landmark.

    :param list landmarks: landmarks detected in raw videos, with ``None``
        for frames where detection failed. Modified in place.
    :return: the filled list (same length as the input), or ``None`` when no
        frame has a landmark.
    """
    valid_frames_idx = [idx for idx, lm in enumerate(landmarks) if lm is not None]
    if not valid_frames_idx:
        return None

    # Only real gaps need interpolation; adjacent detections are left alone.
    for prev_idx, next_idx in zip(valid_frames_idx, valid_frames_idx[1:]):
        if next_idx - prev_idx > 1:
            landmarks = linear_interpolate(landmarks, prev_idx, next_idx)

    # -- Corner case: frames at the beginning or at the end failed to be
    # detected. Pad with the nearest detection, keeping the list length.
    first_valid = valid_frames_idx[0]
    last_valid = valid_frames_idx[-1]
    landmarks[:first_valid] = [landmarks[first_valid]] * first_valid
    landmarks[last_valid:] = [landmarks[last_valid]] * (
        len(landmarks) - last_valid
    )

    assert all(lm is not None for lm in landmarks), "not every frame has landmark"
    return landmarks
def track_time(func):
    """Decorator that records how long each call to ``func`` takes.

    After every successful call, the elapsed time in seconds (rounded to two
    decimals) is stored in ``TIME_TRACKER`` under the repr of the function
    name, i.e. the name wrapped in quotes. A later call overwrites the
    earlier entry. Nothing is recorded when ``func`` raises.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same signature, return value and metadata
        (``__name__``, ``__doc__``, ...) as ``func``.
    """
    import functools

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrap_func(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or skewed durations
        t1 = time.perf_counter()
        result = func(*args, **kwargs)
        t2 = time.perf_counter()
        TIME_TRACKER[f"{func.__name__!r}"] = round(t2 - t1, 2)
        return result
    return wrap_func
def load_needed_models_for_lip_movement(metadata_path):
    """Load the detector, landmark predictor and mean-face metadata.

    ``metadata_path`` is created if it does not exist. The download step for
    the shape-predictor file is disabled in this file, so
    ``shape_predictor_68_face_landmarks.dat`` must already be present there.

    Args:
        metadata_path: ``pathlib.Path`` of the directory holding the
            shape-predictor file and the mean-face metadata.

    Returns:
        Tuple ``(detector, predictor, mean_face_landmarks)``.
    """
    metadata_path.mkdir(parents=True, exist_ok=True)

    # dlib's face detector (NOTE: can be replaced by any other model)
    logger.debug("Loading frontal face detector!")
    face_detector = dlib.get_frontal_face_detector()

    # 68-point landmark predictor, read from the metadata directory
    logger.debug("Loading shape predictor!")
    filename = "shape_predictor_68_face_landmarks.dat"
    landmark_predictor = dlib.shape_predictor(str(metadata_path / filename))

    # mean-face landmarks
    logger.debug("Loading mean-face metadata!")
    mean_face = load_meanface_metadata(metadata_path)

    return face_detector, landmark_predictor, mean_face
def mix_audio_with_noise(webcam_video, audio_file, out_file, noise_wav_file, snr):
    """Extract the audio track of a video and mix a noise recording into it.

    Args:
        webcam_video: path of the input video.
        audio_file: path where the extracted audio is written (16-bit PCM,
            16 kHz, mono).
        out_file: path where the noisy WAV file is written.
        noise_wav_file: path of the noise WAV file.
        snr: signal-to-noise ratio passed to ``add_noise``.

    Returns:
        The mixed samples, as returned by ``add_noise``.
    """
    # get audio from webcam video
    FFmpeg(
        inputs={webcam_video: None},
        outputs={audio_file: "-v quiet -vn -acodec pcm_s16le -ar 16000 -ac 1"},
    ).run()
    # read audio WAV file
    sr, audio = wavfile.read(audio_file)
    # read noise WAV file
    logger.debug(f"Noise Wav used is {noise_wav_file}")
    # NOTE(review): the noise file's sample rate is discarded, so it is
    # presumably expected to match the 16 kHz of the extracted audio -- confirm.
    _, noise_wav = wavfile.read(noise_wav_file)
    # mix audio + noise
    mixed = add_noise(audio, noise_wav, snr)
    # save resulting noisy audio WAV file
    wavfile.write(out_file, sr, mixed)
    return mixed
# Runs one inference step on the "test" subset of an AV-HuBERT task and
# returns the decoded hypothesis.
# NOTE(review): this definition is truncated in this copy of the file. The
# parameter list after `def infer_av_hubert(` is missing, and several
# statements below have no body or closing bracket. The body reads
# `av_task`, `av_models`, `av_generator` and `duration`, so those are
# presumably parameters -- TODO confirm against the upstream source.
def infer_av_hubert(
# Nested helper: decodes token ids `x` with the first label processor of the
# dataset registered under `gen_subset_name`, passing along the generator's
# symbols-to-strip set.
def decode_fn(x, av_task, gen, gen_subset_name):
# NOTE(review): `dictionary` is not used in the visible lines.
dictionary = av_task.target_dictionary
symbols_ignore = gen.symbols_to_strip_from_output
return av_task.datasets[gen_subset_name].label_processors[0].decode(
x, symbols_ignore
logger.debug("Preparing manifest & label files.")
gen_subset = "test"
# The manifest and label files are placed in the task's label directory.
av_label_path = av_task.cfg.label_dir
av_label_ext = av_task.cfg.labels[0]
manifest_filepath = Path(av_label_path) / f"{gen_subset}.tsv"
# Clip length expressed as audio samples and video frames.
# NOTE(review): neither value is used in the visible lines; presumably they
# are written into the manifest -- confirm.
a_frames = int(duration * 16000) # NOTE: 16000 is the audio sample rate
v_frames = int(25 * duration) #NOTE: 25 is the video framerate per second
# NOTE(review): the statements that write the manifest content are missing.
with open(manifest_filepath, "w") as fout:
label_filepath = f"{av_label_path}/{gen_subset}.{av_label_ext}"
# NOTE(review): the statements that write the label content are missing.
with open(label_filepath, "w") as fo:
logger.debug(f"Manifest filepath: {manifest_filepath}")
logger.debug(f"Label filepath: {label_filepath}")
# Load the dataset described by the files above.
av_task.load_dataset(gen_subset, task_cfg=av_task.cfg)
# NOTE(review): the arguments of get_batch_iterator are missing.
itr = av_task.get_batch_iterator(
# Only the first batch is used.
sample = next(itr)
if USE_CUDA: sample = utils.move_to_cuda(sample)
hypos = av_task.inference_step(av_generator, av_models, sample)
# ref = decode_fn(sample['target'][0].int().cpu())
# Take the first hypothesis of the first sample and decode its tokens.
hypo = hypos[0][0]['tokens'].int().cpu()
hypo = decode_fn(hypo, av_task, av_generator, gen_subset)
# clear un-needed files
# NOTE(review): no file-removal statement is visible before this log message.
logger.debug("Done cleaning un-needed files")
return hypo