from __future__ import annotations import functools import io import os import tempfile import traceback import filelock import numpy as np import uuid import subprocess import time from enums import coqui_lock_name from tts_sentence_parsing import init_sentence_state, get_sentence, clean_sentence, detect_language from tts_utils import prepare_speech, get_no_audio, chunk_speed_change, combine_audios from utils import cuda_vis_check, get_lock_file import torch n_gpus1 = torch.cuda.device_count() if torch.cuda.is_available() else 0 n_gpus1, gpu_ids = cuda_vis_check(n_gpus1) def list_models(): from TTS.utils.manage import ModelManager return ModelManager().list_tts_models() def get_xtt(model_name="tts_models/multilingual/multi-dataset/xtts_v2", deepspeed=True, use_gpu=True, gpu_id='auto'): if n_gpus1 == 0: use_gpu = False # By using XTTS you agree to CPML license https://coqui.ai/cpml os.environ["COQUI_TOS_AGREED"] = "1" from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import Xtts from TTS.utils.generic_utils import get_user_data_dir # This will trigger downloading model print("Downloading if not downloaded Coqui XTTS V2") from TTS.utils.manage import ModelManager ModelManager().download_model(model_name) model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--")) print("XTTS downloaded") print("Loading XTTS") config = XttsConfig() config.load_json(os.path.join(model_path, "config.json")) # Config will have more correct languages, they may be added before we append here ##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"] supported_languages = config.languages model = Xtts.init_from_config(config) with filelock.FileLock(get_lock_file(coqui_lock_name)): model.load_checkpoint( config, checkpoint_dir=os.path.dirname(os.path.join(model_path, "model.pth")), checkpoint_path=os.path.join(model_path, "model.pth"), vocab_path=os.path.join(model_path, "vocab.json"), eval=True, use_deepspeed=deepspeed, ) if use_gpu: if gpu_id == 'auto': model.cuda() else: model.cuda(device='cuda:%d' % gpu_id) print("Done loading TTS") return model, supported_languages def get_latent(speaker_wav, voice_cleanup=False, model=None, gpt_cond_len=30, max_ref_length=60, sr=24000): if model is None: model, supported_languages = get_xtt() if voice_cleanup: speaker_wav = filter_wave_1(speaker_wav) # speaker_wav = filter_wave_2(speaker_wav) else: speaker_wav = speaker_wav # create as function as we can populate here with voice cleanup/filtering # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference # latent = (gpt_cond_latent, speaker_embedding) with filelock.FileLock(get_lock_file(coqui_lock_name)): latent = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=gpt_cond_len, max_ref_length=max_ref_length, load_sr=sr) return latent def get_voice_streaming(prompt, language, latent, suffix="0", model=None, sr=24000, tts_speed=1.0): if model is None: model, supported_languages = get_xtt() gpt_cond_latent, speaker_embedding = latent try: t0 = time.time() chunks = model.inference_stream( prompt, language, gpt_cond_latent, speaker_embedding, repetition_penalty=7.0, temperature=0.85, ) first_chunk = True for i, chunk in enumerate(chunks): if first_chunk: first_chunk_time = time.time() - t0 first_chunk = False chunk = chunk.detach().cpu().numpy().squeeze() chunk = (chunk * 32767).astype(np.int16) chunk = chunk_speed_change(chunk, sr, tts_speed=tts_speed) yield chunk.tobytes() except RuntimeError as e: if "device-side assert" in str(e): print(f"Restarted required due to exception: %s" % str(e), flush=True) else: print("Failed to generate wave: %s" % str(e)) traceback.print_exc() except Exception as e: traceback.print_exc() print("Failed to generate wave: %s" % str(e)) def generate_speech(response, model=None, language='autodetect', supported_languages=None, latent=None, sentence_state=None, return_as_byte=True, return_nonbyte_as_file=False, sr=24000, tts_speed=1.0, return_gradio=False, is_final=False, verbose=False, debug=False): if model is None or supported_languages is None: model, supported_languages = get_xtt() if sentence_state is None: sentence_state = init_sentence_state() if latent is None: latent = get_latent("models/female.wav", model=model) sentence, sentence_state, _ = get_sentence(response, sentence_state=sentence_state, is_final=is_final, verbose=verbose) if sentence: t0 = time.time() if verbose: print("sentence_to_wave: %s" % sentence) audio = sentence_to_wave(sentence, supported_languages, tts_speed, model=model, latent=latent, return_as_byte=return_as_byte, return_nonbyte_as_file=return_nonbyte_as_file, sr=sr, language=language, return_gradio=return_gradio) if verbose: print("done sentence_to_wave: %s" % (time.time() - t0), flush=True) else: if verbose and debug: # too much in general print("No audio", flush=True) no_audio = get_no_audio(sr=sr, return_as_byte=return_as_byte, return_nonbyte_as_file=return_nonbyte_as_file) if return_gradio: import gradio as gr audio = gr.Audio(value=no_audio, autoplay=False) else: audio = no_audio return audio, sentence, sentence_state def sentence_to_wave(sentence, supported_languages, tts_speed, latent=None, return_as_byte=False, return_nonbyte_as_file=False, sr=24000, model=None, return_gradio=True, language='autodetect', verbose=False): """ generate speech audio file per sentence """ import noisereduce as nr import wave sentence = clean_sentence(sentence, verbose=verbose) sentence_list = [sentence] try: wav_bytestream = b"" for sentence in sentence_list: # have to lock entire sentence, model doesn't handle threads, # this is ok since usually have many sentences with filelock.FileLock(get_lock_file(coqui_lock_name)): if any(c.isalnum() for c in sentence): if language == "autodetect": # on first call autodetect, next sentence calls will use same language language = detect_language(sentence, supported_languages, verbose=verbose) # exists at least 1 alphanumeric (utf-8) audio_stream = get_voice_streaming( sentence, language, latent, model=model, tts_speed=tts_speed, ) else: # likely got a ' or " or some other text without alphanumeric in it audio_stream = None if audio_stream is not None: frame_length = 0 for chunk in audio_stream: try: wav_bytestream += chunk frame_length += len(chunk) except Exception as e: print("Exception in chunk appending: %s" % str(e), flush=True) continue # Filter output for better voice filter_output = False if filter_output: data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream) // 2, offset=0) float_data = data_s16 * 0.5 ** 15 reduced_noise = nr.reduce_noise(y=float_data, sr=sr, prop_decrease=0.8, n_fft=1024) wav_bytestream = (reduced_noise * 32767).astype(np.int16) if return_as_byte: wav_bytestream = wav_bytestream.tobytes() if audio_stream is not None: if not return_as_byte: if return_nonbyte_as_file: tmpdir = os.getenv('TMPDDIR', tempfile.mkdtemp()) audio_unique_filename = os.path.join(tmpdir, str(uuid.uuid4()) + ".wav") with wave.open(audio_unique_filename, "w") as f: f.setnchannels(1) # 2 bytes per sample. f.setsampwidth(2) f.setframerate(sr) f.writeframes(wav_bytestream) ret_value = audio_unique_filename else: data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream) // 2, offset=0) float_data = data_s16 * 0.5 ** 15 reduced_noise = nr.reduce_noise(y=float_data, sr=sr, prop_decrease=0.8, n_fft=1024) wav_np = (reduced_noise * 32767).astype(np.int16) ret_value = wav_np else: ret_value = wav_bytestream if return_gradio: import gradio as gr return gr.Audio(value=ret_value, autoplay=True) else: return ret_value except RuntimeError as e: if "device-side assert" in str(e): print(f"Restarted required due to exception: %s" % str(e), flush=True) else: print("Failed to generate wave: %s" % str(e)) raise def get_role_to_wave_map(): # only for test and initializing state roles_map = {} roles_map["Female AI Assistant"] = "models/female.wav" roles_map["Male AI Assistant"] = "models/male.wav" roles_map["AI Beard The Pirate"] = "models/pirate_by_coqui.wav" roles_map["None"] = "" return roles_map def allowed_roles(): return list(get_role_to_wave_map().keys()) def get_roles(choices=None, value=None): if choices is None: choices = allowed_roles() if value is None: value = choices[0] import gradio as gr chatbot_role = gr.Dropdown( label="Speech Style", choices=choices, value=value, ) return chatbot_role def predict_from_text(response, chatbot_role, language, roles_map, tts_speed, model=None, supported_languages=None, return_as_byte=True, sr=24000, return_prefix_every_yield=False, include_audio0=True, return_dict=False, verbose=False): if chatbot_role == "None": return audio0 = prepare_speech(sr=sr) if not return_prefix_every_yield and include_audio0: if not return_dict: yield audio0 else: yield dict(audio=audio0, sr=sr) latent = get_latent(roles_map[chatbot_role], model=model) sentence_state = init_sentence_state() generate_speech_func = functools.partial(generate_speech, model=model, language=language, supported_languages=supported_languages, latent=latent, sentence_state=sentence_state, return_as_byte=return_as_byte, sr=sr, tts_speed=tts_speed, verbose=verbose) while True: audio1, sentence, sentence_state = generate_speech_func(response, is_final=False) if sentence is not None: if return_prefix_every_yield and include_audio0: audio_out = combine_audios([audio0], audio=audio1, channels=1, sample_width=2, sr=sr, expect_bytes=return_as_byte, verbose=verbose) else: audio_out = audio1 if not return_dict: yield audio_out else: yield dict(audio=audio_out, sr=sr) else: break audio1, sentence, sentence_state = generate_speech_func(response, is_final=True) if return_prefix_every_yield and include_audio0: audio_out = combine_audios([audio0], audio=audio1, channels=1, sample_width=2, sr=sr, expect_bytes=return_as_byte, verbose=verbose) else: audio_out = audio1 if not return_dict: yield audio_out else: yield dict(audio=audio_out, sr=sr) def filter_wave_1(speaker_wav): try: cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02" resample_filter = "-ac 1 -ar 22050" out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" # ffmpeg to know output format # we will use newer ffmpeg as that has afftn denoise filter shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split( " ") command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True) speaker_wav = out_filename print("Filtered microphone input") except subprocess.CalledProcessError: # There was an error - command exited with non-zero code print("Error: failed filtering, use original microphone input") return speaker_wav def filter_wave_2(speaker_wav): # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end # This is fast filtering not perfect # Apply all on demand lowpassfilter = denoise = trim = loudness = True if lowpassfilter: lowpass_highpass = "lowpass=8000,highpass=75," else: lowpass_highpass = "" if trim: # better to remove silence in beginning and end for microphone trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02," else: trim_silence = "" try: out_filename = ( speaker_wav + str(uuid.uuid4()) + ".wav" ) # ffmpeg to know output format # we will use newer ffmpeg as that has afftn denoise filter shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split( " " ) command_result = subprocess.run( [item for item in shell_command], capture_output=False, text=True, check=True, ) speaker_wav = out_filename print("Filtered microphone input") except subprocess.CalledProcessError: # There was an error - command exited with non-zero code print("Error: failed filtering, use original microphone input") return speaker_wav def get_languages_gr(visible=True, value=None): import gradio as gr choices = [ "autodetect", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu" ] if value is None: value = choices[0] language_gr = gr.Dropdown( label="Language", info="Select an output language for the synthesised speech", choices=choices, value=value, visible=visible, ) return language_gr