|
|
|
|
|
|
|
from cog import BasePredictor, Input, Path |
|
|
|
import os |
|
import re |
|
import torch |
|
import torchaudio |
|
import numpy as np |
|
import tempfile |
|
from einops import rearrange |
|
from ema_pytorch import EMA |
|
from vocos import Vocos |
|
from pydub import AudioSegment |
|
from model import CFM, UNetT, DiT, MMDiT |
|
from cached_path import cached_path |
|
from model.utils import ( |
|
get_tokenizer, |
|
convert_char_to_pinyin, |
|
save_spectrogram, |
|
) |
|
from transformers import pipeline |
|
import librosa |
|
|
|
# Select the torch backend once at import time: CUDA first, then Apple MPS,
# and plain CPU as the last resort.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# --- Mel-spectrogram / audio front-end settings -----------------------------
target_sample_rate = 24000  # Hz; reference audio is resampled to this rate
n_mel_channels = 100        # mel bins, also used as the transformer's mel_dim
hop_length = 256            # samples per mel frame
target_rms = 0.1            # reference clips quieter than this are boosted

# --- Sampling settings handed to the CFM sampler -----------------------------
nfe_step = 32               # passed as `steps`
cfg_strength = 2.0
ode_method = "euler"        # passed as the odeint `method`
sway_sampling_coef = -1.0
speed = 1.0                 # divides the estimated length of the generated part

# Not referenced by the code visible in this file; kept for importers.
fix_duration = None
|
|
|
|
|
class Predictor(BasePredictor):
    """Cog predictor that speaks ``gen_text`` in the voice of a reference clip.

    ``setup`` loads a Whisper ASR pipeline, the F5-TTS model and the Vocos
    vocoder once; ``predict`` then runs reference-audio conversion, optional
    transcription, mel generation, vocoding and optional silence removal.
    """

    # Reference clips longer than this many milliseconds are truncated.
    _MAX_REF_AUDIO_MS = 15000
    # Longest text (in characters) accepted for synthesis.
    _MAX_GEN_TEXT_CHARS = 200
    # Pause punctuation, counted as one extra character each when estimating
    # the output length.  This must be a character class: as a bare sequence
    # the pattern only matches that exact run of characters (and `?` would act
    # as a quantifier), so no punctuation would ever be counted.
    _PAUSE_PUNCT_RE = re.compile(r"[。,、;:?!]")

    @staticmethod
    def load_model(exp_name, model_cls, model_cfg, ckpt_step):
        """Build a CFM model and load its EMA weights from the F5-TTS hub repo.

        Declared as a static method so that both ``self.load_model(...)`` (as
        used in ``setup``) and ``Predictor.load_model(...)`` work; without it
        the instance call passes ``self`` as an unexpected extra argument.

        Args:
            exp_name: Sub-directory of ``hf://SWivid/F5-TTS`` holding the checkpoint.
            model_cls: Transformer backbone class, instantiated with ``model_cfg``.
            model_cfg: Keyword arguments for ``model_cls``.
            ckpt_step: Training step used in the checkpoint file name.

        Returns:
            Tuple ``(ema_model, model)``; ``model`` has had the EMA parameters
            copied into it.
        """
        ckpt_path = str(cached_path(f"hf://SWivid/F5-TTS/{exp_name}/model_{ckpt_step}.pt"))
        # NOTE(review): torch.load unpickles the file, so the checkpoint is
        # only as trustworthy as the repository it is downloaded from.
        checkpoint = torch.load(ckpt_path, map_location=device)
        vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")

        backbone = model_cls(
            **model_cfg,
            text_num_embeds=vocab_size,
            mel_dim=n_mel_channels,
        )
        model = CFM(
            transformer=backbone,
            mel_spec_kwargs=dict(
                target_sample_rate=target_sample_rate,
                n_mel_channels=n_mel_channels,
                hop_length=hop_length,
            ),
            odeint_kwargs=dict(
                method=ode_method,
            ),
            vocab_char_map=vocab_char_map,
        ).to(device)

        ema_model = EMA(model, include_online_model=False).to(device)
        ema_model.load_state_dict(checkpoint["ema_model_state_dict"])
        ema_model.copy_params_from_ema_to_model()

        return ema_model, model

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        print("Loading Whisper model...")
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3-turbo",
            # Half precision only where an accelerator is present; several CPU
            # kernels are not implemented for float16.
            torch_dtype=torch.float16 if device != "cpu" else torch.float32,
            device=device,
        )

        print("Loading F5-TTS model...")
        F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
        self.F5TTS_ema_model, self.F5TTS_base_model = self.load_model(
            "F5TTS_Base", DiT, F5TTS_model_cfg, 1200000
        )

        # Loaded once here instead of on every predict() call.
        print("Loading Vocos vocoder...")
        self.vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")

    @staticmethod
    def _export_reference_wav(ref_audio_orig):
        """Decode the upload, clip it to 15 s and write it to a temporary wav.

        Returns the path of the temporary file; the caller must delete it.
        """
        print("Converting audio...")
        # Decode first so that a bad upload does not leave a temp file behind.
        aseg = AudioSegment.from_file(ref_audio_orig)
        if len(aseg) > Predictor._MAX_REF_AUDIO_MS:
            print("Audio is over 15s, clipping to only first 15s.")
            aseg = aseg[:Predictor._MAX_REF_AUDIO_MS]
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            ref_audio = f.name
        aseg.export(ref_audio, format="wav")
        return ref_audio

    def _transcribe(self, ref_audio):
        """Return the Whisper transcript of the wav file at ``ref_audio``."""
        print("No reference text provided, transcribing reference audio...")
        transcript = self.pipe(
            ref_audio,
            chunk_length_s=30,
            batch_size=128,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=False,
        )["text"].strip()
        print("Finished transcription")
        return transcript

    @classmethod
    def _text_len(cls, text):
        """Character count of ``text`` with pause punctuation counted twice."""
        return len(text) + len(cls._PAUSE_PUNCT_RE.findall(text))

    def predict(
        self,
        gen_text: str = Input(description="Text to generate"),
        ref_audio_orig: Path = Input(description="Reference audio"),
        remove_silence: bool = Input(description="Remove silences", default=True),
        ref_text: str = Input(
            description="Transcript of the reference audio (leave empty to transcribe it automatically)",
            default="",
        ),
    ) -> Path:
        """Run a single prediction on the model

        Args:
            gen_text: Text to synthesise; at most 200 characters.
            ref_audio_orig: Audio file whose voice is imitated. Only the first
                15 seconds are used.
            remove_silence: When true, segments more than 30 dB below the peak
                are cut from the result.
            ref_text: Transcript of the reference audio. When blank, the clip
                is transcribed with the Whisper pipeline loaded in ``setup``.

        Returns:
            Path of a temporary wav file holding the generated speech.

        Raises:
            ValueError: If ``gen_text`` is longer than 200 characters.
        """
        print(gen_text)
        if len(gen_text) > self._MAX_GEN_TEXT_CHARS:
            raise ValueError("Please keep your text under 200 chars.")

        # When predict() is called directly rather than through Cog, an omitted
        # argument is presumably the Input(...) descriptor instead of a string;
        # treat anything that is not a string as "not provided".
        if not isinstance(ref_text, str):
            ref_text = ""

        ref_audio = self._export_reference_wav(ref_audio_orig)
        try:
            if not ref_text.strip():
                ref_text = self._transcribe(ref_audio)
            else:
                print("Using custom reference text...")
            audio, sr = torchaudio.load(ref_audio)
        finally:
            # The intermediate wav is only needed up to this point.
            os.remove(ref_audio)

        # torchaudio.load yields (channels, samples).  The sampler output is
        # later rearranged with a batch size of exactly 1, so multi-channel
        # references are averaged down to mono.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        # Boost quiet references up to target_rms; the inverse gain is applied
        # to the output below.  A silent clip (rms == 0) is left untouched to
        # avoid dividing by zero.
        rms = torch.sqrt(torch.mean(torch.square(audio)))
        boosted = bool(0 < rms < target_rms)
        if boosted:
            audio = audio * target_rms / rms
        if sr != target_sample_rate:
            audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
        audio = audio.to(device)

        text_list = [ref_text + gen_text]
        final_text_list = convert_char_to_pinyin(text_list)

        # Total length in mel frames: the reference itself plus the generated
        # part, scaled by the text-length ratio.  max(..., 1) covers an empty
        # transcript.
        ref_audio_len = audio.shape[-1] // hop_length
        ref_text_len = max(self._text_len(ref_text), 1)
        gen_text_len = self._text_len(gen_text)
        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)

        print("Generating audio using F5-TTS")
        with torch.inference_mode():
            generated, _ = self.F5TTS_base_model.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )
            # Drop the frames that reproduce the reference prompt.
            generated = generated[:, ref_audio_len:, :]
            generated_mel_spec = rearrange(generated, "1 n d -> 1 d n")

            # Vocoding stays inside inference_mode so the sampler's inference
            # tensor is never fed to autograd-tracking parameters.
            print("Running vocoder")
            generated_wave = self.vocos.decode(generated_mel_spec.cpu())

        if boosted:
            generated_wave = generated_wave * rms / target_rms

        generated_wave = generated_wave.squeeze().cpu().numpy()

        if remove_silence:
            print("Removing audio silences... This may take a moment")
            non_silent_intervals = librosa.effects.split(generated_wave, top_db=30)
            if len(non_silent_intervals):
                # One concatenate call instead of growing an array per interval.
                generated_wave = np.concatenate(
                    [generated_wave[start:end] for start, end in non_silent_intervals]
                )
            else:
                generated_wave = generated_wave[:0]

        # Reserve a file name, close the handle, then let torchaudio write it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
            wav_path = tmp_wav.name
        # torchaudio.save needs a 2-D (channels, time) tensor.
        waveform = torch.tensor(generated_wave, dtype=torch.float32).reshape(1, -1)
        torchaudio.save(wav_path, waveform, target_sample_rate)

        return Path(wav_path)