Spaces:
Running
Running
import IPython | |
from huggingface_hub.inference_api import InferenceApi | |
import torch | |
from TTS.api import TTS | |
import wave | |
import espeakng | |
import subprocess | |
from scipy.io import wavfile | |
from transformers import pipeline | |
import os | |
import numpy as np | |
def synth_mms(text: str, model: str):
    '''
    Synthesize text with a Huggingface "text-to-speech" pipeline.
    (The hosted Inference API could be used instead, but that requires a stored API token.)
    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN; passed to the pipeline as-is
    Returns:
        The 'audio' and 'sampling_rate' entries of the pipeline output,
        or None when no model is given.
    '''
    # Nothing to synthesize without a model.
    if model is None:
        return None

    # A new pipeline is built on every call.
    # Change device if it should use GPU.
    synthesizer = pipeline("text-to-speech", model=model, device=-1)
    result = synthesizer(text)
    audio = result['audio']
    rate = result['sampling_rate']
    return audio, rate
def synth_coqui(text: str, model: str):
    '''
    Use Coqui inference API to synthesize text.
    Inputs:
        text: Text to synthesize
        model: Model code; when None, nothing is synthesized
    Returns:
        Tuple of (waveform as a numpy array, sampling rate), or None when
        model is None.
        The sampling rate is read from the loaded model's synthesizer
        (output_sample_rate) when that attribute is available; otherwise
        22050 is assumed, as in the previous implementation.
    '''
    if model is None:
        return None

    # Run on the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The model is loaded on every call.
    tts = TTS(model, progress_bar=False).to(device)

    wav = tts.tts(text=text)

    # Prefer the rate the model actually produces over a hard-coded value;
    # fall back to 22050 if the synthesizer or the attribute is missing.
    synthesizer = getattr(tts, "synthesizer", None)
    sampling_rate = getattr(synthesizer, "output_sample_rate", None) or 22050

    return np.array(wav), sampling_rate
def synth_espeakng(text: str, model: str):
    '''
    Use ESpeak-NG to synthesize text.
    Inputs:
        text: Text to synthesize
        model: Voice code, passed to espeak-ng as -v<model>
    Returns:
        Tuple of (waveform array, sampling rate) as read back from the WAV
        file espeak-ng writes, or None when model is None.
    '''
    import tempfile

    if model is None:
        return None

    # Write into a private temporary directory so concurrent calls cannot
    # overwrite each other's output, and so the file is removed even when
    # reading it back fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "test.wav")
        # "-w" and the path are separate argv entries; passing them as one
        # string would hand espeak-ng a file name fused with the space.
        subprocess.run(['espeak-ng', f'-v{model}', '-w', wav_path, text])
        # If espeak-ng failed, the file is absent and this read raises.
        sampling_rate, wav = wavfile.read(wav_path)

    return wav, sampling_rate