Spaces:
Running
Running
from functools import lru_cache | |
import sherpa_onnx | |
from huggingface_hub import hf_hub_download | |
def get_file(
    repo_id: str,
    filename: str,
    subfolder: str = ".",
) -> str:
    """Fetch a single file from a Hugging Face Hub repository.

    This is a thin pass-through to ``hf_hub_download``: the three arguments
    are forwarded unchanged as keyword arguments.

    Args:
      repo_id: Repository identifier, e.g. ``"csukuangfj/vits-ljs"``.
      filename: Name of the file to fetch from the repository.
      subfolder: Folder inside the repository that contains the file.
        Defaults to ``"."``.

    Returns:
      The value returned by ``hf_hub_download`` for the requested file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
def _get_vits_vctk(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build the offline TTS engine for the ``csukuangfj/vits-vctk`` model.

    Downloads ``vits-vctk.onnx``, ``lexicon.txt`` and ``tokens.txt`` from the
    repository and wires them into a VITS configuration.

    Args:
      repo_id: Must be exactly ``"csukuangfj/vits-vctk"``.
      speed: Requested speaking speed. It is handed to the model as
        ``length_scale = 1.0 / speed``.

    Returns:
      A ``sherpa_onnx.OfflineTts`` built with provider ``"cpu"``, two
      threads and ``debug=True``.

    Raises:
      ValueError: If ``repo_id`` is not the repository this loader handles.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, and ValueError is what the rest of this file
    # raises for an unsupported repo id.
    if repo_id != "csukuangfj/vits-vctk":
        raise ValueError(f"Unsupported repo_id: {repo_id}")

    model = get_file(repo_id=repo_id, filename="vits-vctk.onnx", subfolder=".")
    lexicon = get_file(repo_id=repo_id, filename="lexicon.txt", subfolder=".")
    tokens = get_file(repo_id=repo_id, filename="tokens.txt", subfolder=".")

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=model,
        lexicon=lexicon,
        tokens=tokens,
        length_scale=1.0 / speed,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider="cpu",
        debug=True,
        num_threads=2,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(model=model_config)

    return sherpa_onnx.OfflineTts(tts_config)
def _get_vits_ljs(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build the offline TTS engine for the ``csukuangfj/vits-ljs`` model.

    Downloads ``vits-ljs.onnx``, ``lexicon.txt`` and ``tokens.txt`` from the
    repository and wires them into a VITS configuration.

    Args:
      repo_id: Must be exactly ``"csukuangfj/vits-ljs"``.
      speed: Requested speaking speed. It is handed to the model as
        ``length_scale = 1.0 / speed``.

    Returns:
      A ``sherpa_onnx.OfflineTts`` built with provider ``"cpu"``, two
      threads and ``debug=True``.

    Raises:
      ValueError: If ``repo_id`` is not the repository this loader handles.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, and ValueError is what the rest of this file
    # raises for an unsupported repo id.
    if repo_id != "csukuangfj/vits-ljs":
        raise ValueError(f"Unsupported repo_id: {repo_id}")

    model = get_file(repo_id=repo_id, filename="vits-ljs.onnx", subfolder=".")
    lexicon = get_file(repo_id=repo_id, filename="lexicon.txt", subfolder=".")
    tokens = get_file(repo_id=repo_id, filename="tokens.txt", subfolder=".")

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=model,
        lexicon=lexicon,
        tokens=tokens,
        length_scale=1.0 / speed,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider="cpu",
        debug=True,
        num_threads=2,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(model=model_config)

    return sherpa_onnx.OfflineTts(tts_config)
def _get_vits_piper(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build an offline TTS engine for a coqui, MMS, piper or mimic3 repo.

    The name of the ONNX file is derived from ``repo_id``:

    * ids containing ``"coqui"`` or ``"vits-mms"`` use ``model.onnx``;
    * ids containing ``"piper"`` must look like
      ``<owner>/vits-piper-<voice>`` and use ``<voice>.onnx``;
    * ids containing ``"mimic3"`` must look like
      ``<owner>/vits-mimic3-<voice>`` and use ``<voice>.onnx``.

    No lexicon is used (``lexicon=""``); ``tokens.txt`` is always downloaded.

    Args:
      repo_id: Hugging Face repository identifier of the model.
      speed: Requested speaking speed. It is handed to the model as
        ``length_scale = 1.0 / speed``.

    Returns:
      A ``sherpa_onnx.OfflineTts`` built with provider ``"cpu"``, two
      threads and ``debug=True``.

    Raises:
      ValueError: If ``repo_id`` matches none of the families above, or a
        piper/mimic3 id does not have the ``<owner>/<prefix><voice>`` shape.
    """
    if "coqui" in repo_id or "vits-mms" in repo_id:
        name = "model"
    else:
        # Same precedence as before: "piper" is checked ahead of "mimic3".
        for family in ("piper", "mimic3"):
            if family in repo_id:
                break
        else:
            raise ValueError(f"Unsupported {repo_id}")

        prefix = f"vits-{family}-"
        parts = repo_id.split("/")
        repo_name = parts[1] if len(parts) > 1 else ""
        name = repo_name[len(prefix):]
        # Verify the shape before trusting the slice. Cutting off
        # len(prefix) characters blindly would produce a bogus file name
        # (or an IndexError when the id has no "/"), which then fails far
        # from the real cause.
        if not repo_name.startswith(prefix) or not name:
            raise ValueError(
                f"Unsupported {repo_id}: expected '<owner>/{prefix}<voice>'"
            )

    # These two repo families are given an empty data_dir; every other repo
    # points at /tmp/espeak-ng-data. Nothing in this function creates that
    # directory, so it has to be in place before the engine is built.
    if "vits-coqui-uk-mai" in repo_id or "vits-mms" in repo_id:
        data_dir = ""
    else:
        data_dir = "/tmp/espeak-ng-data"

    model = get_file(repo_id=repo_id, filename=f"{name}.onnx", subfolder=".")
    tokens = get_file(repo_id=repo_id, filename="tokens.txt", subfolder=".")

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=model,
        lexicon="",
        data_dir=data_dir,
        tokens=tokens,
        length_scale=1.0 / speed,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider="cpu",
        debug=True,
        num_threads=2,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(model=model_config)

    return sherpa_onnx.OfflineTts(tts_config)
def _get_vits_mms(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Load an MMS VITS model.

    Pure delegation: ``_get_vits_piper`` already handles repository ids that
    contain ``"vits-mms"``, so both arguments are forwarded unchanged.

    Args:
      repo_id: Hugging Face repository identifier of the model.
      speed: Requested speaking speed, forwarded as-is.

    Returns:
      The ``sherpa_onnx.OfflineTts`` produced by ``_get_vits_piper``.
    """
    tts = _get_vits_piper(repo_id, speed)
    return tts
def _get_vits_zh_aishell3(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build the offline TTS engine for ``csukuangfj/vits-zh-aishell3``.

    Downloads ``vits-aishell3.onnx``, ``lexicon.txt``, ``tokens.txt`` and
    ``rule.fst`` from the repository. The FST file is passed to the engine
    configuration as ``rule_fsts``.

    Args:
      repo_id: Must be exactly ``"csukuangfj/vits-zh-aishell3"``.
      speed: Requested speaking speed. It is handed to the model as
        ``length_scale = 1.0 / speed``.

    Returns:
      A ``sherpa_onnx.OfflineTts`` built with provider ``"cpu"``, two
      threads and ``debug=True``.

    Raises:
      ValueError: If ``repo_id`` is not the repository this loader handles.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, and ValueError is what the rest of this file
    # raises for an unsupported repo id.
    if repo_id != "csukuangfj/vits-zh-aishell3":
        raise ValueError(f"Unsupported repo_id: {repo_id}")

    model = get_file(repo_id=repo_id, filename="vits-aishell3.onnx", subfolder=".")
    lexicon = get_file(repo_id=repo_id, filename="lexicon.txt", subfolder=".")
    tokens = get_file(repo_id=repo_id, filename="tokens.txt", subfolder=".")
    rule_fst = get_file(repo_id=repo_id, filename="rule.fst", subfolder=".")

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=model,
        lexicon=lexicon,
        tokens=tokens,
        length_scale=1.0 / speed,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider="cpu",
        debug=True,
        num_threads=2,
    )
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=model_config,
        rule_fsts=rule_fst,
    )

    return sherpa_onnx.OfflineTts(tts_config)
def _get_vits_hf(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build an offline TTS engine for a ``*-hf-*`` style VITS repository.

    The ONNX file is named after part of ``repo_id``:

    * if the id contains ``"fanchen"`` or ``"vits-cantonese-hf-xiaomaiiwn"``,
      the text after the last ``"/"`` is used;
    * otherwise the text after the last ``"-"`` is used.

    ``lexicon.txt``, ``tokens.txt`` and ``rule.fst`` are downloaded from the
    same repository; the FST file is passed to the engine configuration as
    ``rule_fsts``.

    Args:
      repo_id: Hugging Face repository identifier of the model.
      speed: Requested speaking speed. It is handed to the model as
        ``length_scale = 1.0 / speed``.

    Returns:
      A ``sherpa_onnx.OfflineTts`` built with provider ``"cpu"``, two
      threads and ``debug=True``.
    """
    named_after_repo = (
        "fanchen" in repo_id or "vits-cantonese-hf-xiaomaiiwn" in repo_id
    )
    separator = "/" if named_after_repo else "-"
    stem = repo_id.split(separator)[-1]

    # Fetched in the same order as before: model, lexicon, tokens, rule FST.
    onnx_path, lexicon_path, tokens_path, rule_fst_path = (
        get_file(repo_id=repo_id, filename=wanted, subfolder=".")
        for wanted in (f"{stem}.onnx", "lexicon.txt", "tokens.txt", "rule.fst")
    )

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=onnx_path,
        lexicon=lexicon_path,
        tokens=tokens_path,
        length_scale=1.0 / speed,
    )
    model_config = sherpa_onnx.OfflineTtsModelConfig(
        vits=vits_config,
        provider="cpu",
        debug=True,
        num_threads=2,
    )
    engine_config = sherpa_onnx.OfflineTtsConfig(
        model=model_config,
        rule_fsts=rule_fst_path,
    )

    return sherpa_onnx.OfflineTts(engine_config)
def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Return a TTS engine for ``repo_id`` using the per-language registries.

    The registries are searched in a fixed order (English, Arabic, Turkish,
    Persian, Hindi, Gujarati). The loader stored under ``repo_id`` in the
    first registry that contains it is called with ``(repo_id, speed)``.

    Args:
      repo_id: Key identifying the model in one of the registries.
      speed: Requested speaking speed, forwarded to the loader.

    Returns:
      The ``sherpa_onnx.OfflineTts`` produced by the matching loader.

    Raises:
      ValueError: If ``repo_id`` is not a key of any registry.
    """
    # NOTE(review): ``lru_cache`` is imported at the top of the file but is
    # not applied in the visible code, so every call builds a new engine.
    # Confirm whether caching was intended.
    registries = (
        english_models,
        arabic_models,
        turkish_models,
        persian_models,
        hindi_models,
        gujarati_models,
    )
    for registry in registries:
        if repo_id in registry:
            loader = registry[repo_id]
            return loader(repo_id, speed)

    raise ValueError(f"Unsupported repo_id: {repo_id}")
# Registry of English voices: repo id -> loader function.
# Key order is kept exactly as before because ``language_to_models`` turns
# the keys into a list.
english_models = {
    **dict.fromkeys(
        (
            "csukuangfj/vits-piper-en_US-glados",
            # coqui-ai
            "csukuangfj/vits-coqui-en-ljspeech",
            "csukuangfj/vits-coqui-en-ljspeech-neon",
            "csukuangfj/vits-coqui-en-vctk",
            # piper, US (the first entry of this group is an en_GB voice)
            "csukuangfj/vits-piper-en_GB-sweetbbak-amy",
            "csukuangfj/vits-piper-en_US-amy-low",
            "csukuangfj/vits-piper-en_US-amy-medium",
            "csukuangfj/vits-piper-en_US-arctic-medium",  # 18 speakers
            "csukuangfj/vits-piper-en_US-danny-low",
            "csukuangfj/vits-piper-en_US-hfc_male-medium",
            "csukuangfj/vits-piper-en_US-joe-medium",
            "csukuangfj/vits-piper-en_US-kathleen-low",
            "csukuangfj/vits-piper-en_US-kusal-medium",
            "csukuangfj/vits-piper-en_US-l2arctic-medium",  # 24 speakers
            "csukuangfj/vits-piper-en_US-lessac-low",
            "csukuangfj/vits-piper-en_US-lessac-medium",
            "csukuangfj/vits-piper-en_US-lessac-high",
            "csukuangfj/vits-piper-en_US-libritts-high",  # 904 speakers
            "csukuangfj/vits-piper-en_US-libritts_r-medium",  # 904 speakers
            "csukuangfj/vits-piper-en_US-ryan-low",
            "csukuangfj/vits-piper-en_US-ryan-medium",
            "csukuangfj/vits-piper-en_US-ryan-high",
            # piper, GB
            "csukuangfj/vits-piper-en_GB-alan-low",
            "csukuangfj/vits-piper-en_GB-alan-medium",
            "csukuangfj/vits-piper-en_GB-alba-medium",
            "csukuangfj/vits-piper-en_GB-jenny_dioco-medium",
            "csukuangfj/vits-piper-en_GB-northern_english_male-medium",
            "csukuangfj/vits-piper-en_GB-semaine-medium",
            "csukuangfj/vits-piper-en_GB-southern_english_female-low",
            "csukuangfj/vits-piper-en_GB-vctk-medium",
        ),
        _get_vits_piper,
    ),
    # These two repositories have dedicated loaders.
    "csukuangfj/vits-vctk": _get_vits_vctk,  # 109 speakers
    "csukuangfj/vits-ljs": _get_vits_ljs,
}
# Per-language registries: repo id -> loader function. Every entry below is
# served by ``_get_vits_piper``.
arabic_models = dict.fromkeys(
    (
        "csukuangfj/vits-piper-ar_JO-kareem-low",
        "csukuangfj/vits-piper-ar_JO-kareem-medium",
    ),
    _get_vits_piper,
)

turkish_models = dict.fromkeys(
    (
        "csukuangfj/vits-piper-tr_TR-dfki-medium",
        "csukuangfj/vits-piper-tr_TR-fahrettin-medium",
    ),
    _get_vits_piper,
)

persian_models = dict.fromkeys(
    (
        "csukuangfj/vits-piper-fa_IR-amir-medium",
        "csukuangfj/vits-piper-fa_IR-gyro-medium",
        "csukuangfj/vits-mimic3-fa-haaniye_low",
    ),
    _get_vits_piper,
)

gujarati_models = {"csukuangfj/vits-mimic3-gu_IN-cmu-indic_low": _get_vits_piper}

# NOTE(review): this id contains none of "coqui", "vits-mms", "piper" or
# "mimic3", so ``_get_vits_piper`` raises ValueError for it. Confirm the
# intended Hindi repository id.
hindi_models = {"vosk-model-hi-0.22": _get_vits_piper}
# Language name -> list of repo ids registered for that language. The ids
# keep the insertion order of the corresponding registry.
language_to_models = {
    language: list(registry)
    for language, registry in (
        ("English", english_models),
        ("Arabic", arabic_models),
        ("Hindi", hindi_models),
        ("Gujarati", gujarati_models),
        ("Persian", persian_models),
        ("Turkish", turkish_models),
    )
}