Spaces:
Running
Running
File size: 5,539 Bytes
3bbf2c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import os
import sys
from random import randint
from typing import List, Optional, Set, Union
from tortoise.utils.audio import get_voices, load_audio, load_voices
from tortoise.utils.text import split_and_recombine_text
def get_all_voices(extra_voice_dirs_str: str = ""):
extra_voice_dirs = extra_voice_dirs_str.split(",") if extra_voice_dirs_str else []
return sorted(get_voices(extra_voice_dirs)), extra_voice_dirs
def parse_voice_str(voice_str: str, all_voices: List[str]):
selected_voices = all_voices if voice_str == "all" else voice_str.split(",")
selected_voices = [v.split("&") if "&" in v else [v] for v in selected_voices]
for voices in selected_voices:
for v in voices:
if v != "random" and v not in all_voices:
raise ValueError(
f"voice {v} not available, use --list-voices to see available voices."
)
return selected_voices
def voice_loader(selected_voices: list, extra_voice_dirs: List[str]):
for voices in selected_voices:
yield voices, *load_voices(voices, extra_voice_dirs)
def parse_multiarg_text(text: List[str]):
return (" ".join(text) if text else "".join(line for line in sys.stdin)).strip()
def split_text(text: str, text_split: str):
if text_split:
desired_length, max_length = map(int, text_split.split(","))
if desired_length > max_length:
raise ValueError(
f"--text-split: desired_length ({desired_length}) must be <= max_length ({max_length})"
)
texts = split_and_recombine_text(text, desired_length, max_length)
else:
texts = split_and_recombine_text(text)
#
if not texts:
raise ValueError("no text provided")
return texts
def validate_output_dir(output_dir: str, selected_voices: list, candidates: int):
if output_dir:
os.makedirs(output_dir, exist_ok=True)
else:
if len(selected_voices) > 1:
raise ValueError('cannot have multiple voices without --output-dir"')
if candidates > 1:
raise ValueError('cannot have multiple candidates without --output-dir"')
return output_dir
def check_pydub(play: bool):
if play:
try:
import pydub
import pydub.playback
return pydub
except ImportError:
raise RuntimeError(
'--play requires pydub to be installed, which can be done with "pip install pydub"'
)
def get_seed(seed: Optional[int]):
return randint(0, 2**32 - 1) if seed is None else seed
from pathlib import Path
from typing import Any, Callable
import torch
import torchaudio
def run_and_save_tts(
call_tts,
text,
output_dir: Path,
return_deterministic_state,
return_filepaths=False,
voicefixer=True,
):
output_dir.mkdir(exist_ok=True)
if return_deterministic_state:
gen, dbg = call_tts(text)
torch.save(dbg, output_dir / "dbg.pt")
else:
gen = call_tts(text)
#
if not isinstance(gen, list):
gen = [gen]
gen = [g.squeeze(0).cpu() for g in gen]
fps = []
for i, g in enumerate(gen):
fps.append(output_dir / f"{i}.wav")
save_gen_with_voicefix(g, fps[-1], squeeze=False, voicefixer=voicefixer)
# torchaudio.save(output_dir/f'{i}.wav', g, 24000)
return fps if return_filepaths else gen
def infer_on_texts(
call_tts: Callable[[str], Any],
texts: List[str],
output_dir: Union[str, Path],
return_deterministic_state: bool,
lines_to_regen: Set[int],
logger=print,
return_filepaths=False,
voicefixer=True,
):
audio_chunks = []
base_p = Path(output_dir)
base_p.mkdir(exist_ok=True)
for text_idx, text in enumerate(texts):
line_p = base_p / f"{text_idx}"
line_p.mkdir(exist_ok=True)
#
if text_idx not in lines_to_regen:
files = list(line_p.glob("*.wav"))
if files:
logger(f"loading existing audio fragments for [{text_idx}]")
audio_chunks.append([load_audio(str(f), 24000) for f in files])
continue
else:
logger(f"no existing audio fragment for [{text_idx}]")
#
logger(f"generating audio for text {text_idx}: {text}")
audio_chunks.append(
run_and_save_tts(
call_tts,
text,
line_p,
return_deterministic_state,
voicefixer=voicefixer,
)
)
fnames = []
results = []
for i in range(len(audio_chunks[0])):
resultant = torch.cat([c[i] for c in audio_chunks], dim=-1)
fnames.append(base_p / f"combined-{i}.wav")
save_gen_with_voicefix(
resultant, fnames[-1], squeeze=False, voicefixer=False
) # do not run fix on combined!!
results.append(resultant)
# torchaudio.save(base_p/'combined.wav', resultant, 24000)
return fnames if return_filepaths else results
from voicefixer import VoiceFixer
vfixer = VoiceFixer()
def save_gen_with_voicefix(g, fpath, squeeze=True, voicefixer=True):
torchaudio.save(fpath, g.squeeze(0).cpu() if squeeze else g, 24000, format="wav")
if voicefixer:
vfixer.restore(
input=fpath,
output=fpath,
cuda=True,
mode=0,
# your_vocoder_func = convert_mel_to_wav # TODO test if integration with unvinet improves things
)
|