# Spaces: Runtime error
from tqdm import tqdm | |
import sys | |
import torch | |
import shutil | |
import perth | |
from pathlib import Path | |
import argparse | |
import os | |
import librosa | |
import soundfile as sf | |
from chatterbox.models.s3tokenizer import S3_SR | |
from chatterbox.models.s3gen import S3GEN_SR, S3Gen | |
# Audio file extensions (without the leading dot) recognised by this script.
AUDIO_EXTENSIONS = [
    "wav",
    "mp3",
    "flac",
    "opus",
]
def _output_name(wav_fpath, writable_formats):
    """Return the file name under which the converted *wav_fpath* is saved.

    The original name is kept when its extension names a format soundfile can
    write; otherwise the extension is replaced by ``.wav`` so that
    ``sf.write`` can always infer an output format from the file name.
    """
    if wav_fpath.suffix.lstrip(".").upper() in writable_formats:
        return wav_fpath.name
    return wav_fpath.with_suffix(".wav").name


def main():
    """Convert one audio sample, or a folder of samples, to a target voice.

    Command-line arguments:
        input: Path to a single audio file or to a folder of audio files.
        target_speaker: Path to the sample for the target speaker.
        -o/--output_folder: Root folder for results (default ``vc_outputs``).
        -g/--gpu_id: CUDA device index; the CPU is used when omitted.
        --no-watermark: Skip watermarking of the converted audio.

    Layout written under the output folder:
        input/          copies of the source samples
        output/         converted samples
        output/target/  copy of the target speaker sample

    Raises:
        FileNotFoundError: If no audio file is found at ``input``.
    """
    parser = argparse.ArgumentParser(description="Voice Conversion")
    parser.add_argument("input", type=str, help="Path to input (a sample or folder of samples).")
    parser.add_argument("target_speaker", type=str, help="Path to the sample for the target speaker.")
    parser.add_argument("-o", "--output_folder", type=str, default="vc_outputs")
    parser.add_argument("-g", "--gpu_id", type=int, default=None)
    parser.add_argument("--no-watermark", action="store_true", help="Skip watermarking")
    args = parser.parse_args()

    # Discover the inputs first so that a bad path fails fast, before any
    # folder is created or the checkpoint is loaded.
    input_path = Path(args.input)
    wav_fpaths = []
    if input_path.is_dir():
        for ext in AUDIO_EXTENSIONS:
            wav_fpaths.extend(input_path.glob(f"*.{ext}"))
    else:
        wav_fpaths.append(input_path)
    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    if not wav_fpaths:
        raise FileNotFoundError(f"Didn't find any audio in '{input_path}'")

    # Folders
    output_folder = Path(args.output_folder)
    output_orig_folder = output_folder / "input"
    output_vc_folder = output_folder / "output"
    ref_folder = output_vc_folder / "target"
    output_orig_folder.mkdir(exist_ok=True, parents=True)
    output_vc_folder.mkdir(exist_ok=True)
    ref_folder.mkdir(exist_ok=True)

    device = torch.device("cpu" if args.gpu_id is None else f"cuda:{args.gpu_id}")

    ## s3gen
    s3g_fp = "checkpoints/s3gen.pt"
    s3gen = S3Gen()
    # map_location lets a checkpoint saved from another device (e.g. a GPU)
    # be loaded on the device selected for this run.
    s3gen.load_state_dict(torch.load(s3g_fp, map_location=device))
    s3gen.to(device)
    s3gen.eval()

    # Only the first 10 seconds of the target speaker sample are loaded.
    ref_24, _ = librosa.load(args.target_speaker, sr=S3GEN_SR, duration=10)
    # NOTE(review): the reference stays on the CPU, as before; presumably
    # S3Gen moves it to its own device -- confirm when running on a GPU.
    ref_24 = torch.tensor(ref_24).float()
    shutil.copy(args.target_speaker, ref_folder / Path(args.target_speaker).name)

    watermarker = None if args.no_watermark else perth.PerthImplicitWatermarker()

    # Formats soundfile can infer from a file extension when writing.
    writable_formats = sf.available_formats()

    # Pure inference: disable autograd so no graph is recorded and the output
    # can be converted to NumPy directly.
    with torch.no_grad():
        for wav_fpath in tqdm(wav_fpaths):
            shutil.copy(wav_fpath, output_orig_folder / wav_fpath.name)

            audio_16, _ = librosa.load(str(wav_fpath), sr=S3_SR)
            # [None, ] adds a leading dimension in front of the samples.
            audio_16 = torch.tensor(audio_16).float().to(device)[None, ]
            s3_tokens, _ = s3gen.tokenizer(audio_16)

            wav = s3gen(s3_tokens.to(device), ref_24, S3GEN_SR)
            wav = wav.view(-1).cpu().numpy()
            if watermarker is not None:
                wav = watermarker.apply_watermark(wav, sample_rate=S3GEN_SR)

            save_path = output_vc_folder / _output_name(wav_fpath, writable_formats)
            sf.write(str(save_path), wav, samplerate=S3GEN_SR)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()