Spaces:
Running
Running
File size: 4,331 Bytes
a31d739 b5a171c c0f10bd 09a8764 b5a171c a31d739 a581dbd 53825e7 a31d739 cfc25e8 02e1a35 cfc25e8 a31d739 3887b31 8583009 a31d739 cfc25e8 a31d739 e970b40 a31d739 e970b40 a31d739 e970b40 a31d739 c64f48b a31d739 502d6ca cfc25e8 a31d739 67c5fd4 02e1a35 a31d739 cfc25e8 a31d739 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import gradio as gr
import librosa
import numpy as np
import torch
import re
from num2words import num2words
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Hub identifier of the fine-tuned text-to-speech weights.
checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"

# The processor and the vocoder are loaded from the Microsoft repositories;
# only the text-to-speech model itself is loaded from `checkpoint`.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker label -> path of the .npy file holding that speaker's embedding.
speaker_embeddings = dict(
    Female="spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    Male="spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    Experimental="spkemb/embeddings.npy",
)
# Character substitutions applied to the text before it is tokenised: Swedish
# letters are rewritten with ASCII letters, and hyphens and typographic quotes
# are removed.
# NOTE(review): presumably these characters are not handled well by the
# tokenizer/model - confirm against the checkpoint's vocabulary.
_CHAR_SUBSTITUTIONS = str.maketrans({
    'Ä': 'ae',
    'Å': 'o',
    'Ö': 'oe',
    'ä': 'ae',
    'å': 'o',
    'ö': 'oe',
    'ô': 'oe',
    '-': '',
    '‘': '',
    '’': '',
    '“': '',
    '”': '',
})


def _spell_out_numbers(text):
    """Replace every standalone run of digits in *text* with Swedish words."""
    # Substituting match by match keeps a short number (e.g. "1") from also
    # rewriting the digits of a longer one (e.g. "10"), and keeps numbers with
    # leading zeros intact, both of which chained str.replace calls get wrong.
    return re.sub(
        r'\b\d+\b',
        lambda match: num2words(int(match.group()), lang="sv"),
        text,
    )


def predict(text, speaker):
    """Synthesise speech for *text* using the selected speaker embedding.

    Args:
        text: Text to speak. If it is empty after stripping whitespace, or
            longer than 200 characters after stripping, a fixed Swedish
            notice is spoken instead.
        speaker: Key into the module-level ``speaker_embeddings`` mapping.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is an ``int16`` NumPy
        array holding the generated waveform.

    Raises:
        KeyError: If *speaker* is not a key of ``speaker_embeddings``.
    """
    stripped = text.strip()
    if not stripped or len(stripped) > 200:
        # The notice is spoken like any other input instead of raising.
        text = "Du måste ha minst ett och max 200 tecken."

    # Numbers first, so the Swedish words they expand to are also passed
    # through the character substitutions.
    text = _spell_out_numbers(text).translate(_CHAR_SUBSTITUTIONS)

    inputs = processor(text=text, return_tensors="pt")
    # Limit the input length to what the model's configuration allows.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    speaker_embedding = np.load(speaker_embeddings[speaker])
    # unsqueeze(0) adds a leading dimension (presumably the batch axis the
    # model expects - confirm against generate_speech).
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    # Scale by 32767 and store as int16 (assumes the vocoder output lies in
    # [-1, 1]).
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
# Heading passed to gr.Interface as `title`.
title = "SpeechT5 finetuned Swedish, TTS "
# Introductory text passed to gr.Interface as `description`; it ends with a
# Markdown link to the fine-tuned checkpoint.
description = """
SpeechT5 text-to-speech model finetuned on the Swedish language from the
Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB ram) so
please have patience if it takes some time. As a company founded by a female
coder, our resources are extremely limited (female founders in tech only get approx.
1 % of the venture capital and the women who receive funding seldom are the
ones actually handling the tech). We are in a very biased sphere where
female coders' companies seldom get the resources which would normally
be necessary to do what they do. The app uses the SpeechT5 model
finetuned for swedish by GreenCounsel, available here: [https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv](https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv).
"""
# HTML fragment passed to gr.Interface as `article`: reference links followed
# by the BibTeX entry for the SpeechT5 paper.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original SpeechT5</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<pre>
@article{Ao2021SpeechT5,
title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
author = {Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
eprint={2110.07205},
archivePrefix={arXiv},
primaryClass={eess.AS},
year={2021}
}
</pre>
</div>
"""
# Sample inputs offered in the UI; each entry is [input text, speaker label].
examples = [
    [
        "GreenCounsel grundades i Malmö för sex år sedan.",
        "Female",
    ],
    [
        "Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.",
        "Male",
    ],
    [
        "GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.",
        "Female",
    ],
    [
        "Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.",
        "Male",
    ],
    [
        "Talsyntesen bygger på en engelsk modell och kan därför upplevas som att jag bryter lite på engelska.",
        "Female",
    ],
]
# Build the web UI: a text box and a speaker selector feed `predict`, and the
# value it returns is shown in an audio component.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=[
                "Female",
                "Male",
                "Experimental",
            ],
            value="Female",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start the app as soon as the module runs. The stray trailing "|" that
# followed this call was not valid Python and has been removed.
demo.launch()