# Hugging Face Space: running on T4
import spaces | |
import gradio as gr | |
import io | |
import os | |
import re | |
import torch | |
import torchaudio | |
from pathlib import Path | |
from whisperspeech.pipeline import Pipeline | |
# Markdown shown at the top of the demo page (rendered via gr.Markdown).
title = """# 🙋🏻♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is an Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
### How to Use
Input text with the language identifiers provided to create multilingual speech. Optionally you can add an audio sample to make a voice print. Scroll down and try the api <3 Gradio.
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
"""
# Rows offered in the "examples" table of the UI. Each row is
# [text to speak, speaker audio URL or None]; language tags such as <en>/<pl>
# switch the language for the text that follows them.
text_examples = [
    [
        "This is the first demo of Whisper Speech, a fully open source "
        "text-to-speech model trained by Collabora and Lion on the Juwels "
        "supercomputer.",
        None,
    ],
    [
        "World War II or the Second World War was a global conflict that "
        "lasted from 1939 to 1945. The vast majority of the world's countries, "
        "including all the great powers, fought as part of two opposing "
        "military alliances: the Allies and the Axis.",
        "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg",
    ],
    [
        "<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, "
        "modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli "
        "na superkomputerze <en>Jewels.",
        None,
    ],
    [
        "<en> WhisperSpeech is an Open Source library that helps you convert "
        "text to speech. <pl>Teraz także po Polsku! <en>I think I just tried "
        "saying \"now also in Polish\", don't judge me...",
        None,
    ],
    # Disabled example (single column, no speaker URL entry):
    # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
    [
        "<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.",
        None,
    ],
    # Disabled example (single column, no speaker URL entry):
    # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
]
def parse_multilingual_text(input_text):
    """Split text annotated with ``<lang>`` tags into (language, text) segments.

    A tag such as ``<pl>`` sets the language for all text that follows it,
    until the next tag. Text before the first tag is labelled ``'en'``.

    Args:
        input_text: The raw string, optionally containing ``<lang>`` tags.

    Returns:
        A non-empty list of ``(lang, text)`` tuples in input order. Each text
        is wrapped in one leading and one trailing space. If the input holds
        no text at all (empty, or tags only), ``[("en", "")]`` is returned.
    """
    # Group 1 captures a tag name; group 2 captures a run of plain text.
    # A "<" that does not open a well-formed <word> tag is treated as ordinary
    # text, so inputs like "1 < 2" are neither truncated nor split in two.
    pattern = r"<(\w+)>|((?:[^<]|<(?!\w+>))+)"
    cur_lang = "en"
    segments = []
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            # add spaces to give it some time to switch languages
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    """Run the pipeline stages on language-tagged segments and return audio.

    Args:
        pipe: Pipeline object providing ``extract_spk_emb``,
            ``default_speaker``, ``t2s``, ``s2a`` and ``vocoder``.
        segments: Iterable of ``(lang, text)`` pairs.
        speaker: A ``str``/``Path`` to a reference recording, or anything else
            (e.g. ``None``) to fall back to ``speaker_url``.
        speaker_url: Used as the reference recording when ``speaker`` is not
            a path; ignored when falsy.
        cps: Tempo value forwarded to ``pipe.t2s.generate``.

    Returns:
        The decoded audio after ``.cpu()`` (presumably a torch tensor; confirm
        against the pipeline's vocoder).
    """
    # Voice selection priority: local file, then URL, then pipeline default.
    if isinstance(speaker, (str, Path)):
        voice = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        voice = pipe.extract_spk_emb(speaker_url)
    else:
        voice = pipe.default_speaker

    # Transpose [(lang, text), ...] into two parallel lists.
    langs, texts = map(list, zip(*segments))
    print(texts, langs)

    t2s_tokens = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
    s2a_tokens = pipe.s2a.generate(t2s_tokens, voice.unsqueeze(0))
    decoded = pipe.vocoder.decode(s2a_tokens)
    return decoded.cpu()
def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
    """Gradio callback: turn tagged text into audio for the output player.

    Args:
        multilingual_text: Text to speak, optionally containing ``<lang>`` tags.
        speaker_audio: Value of the speaker audio input, forwarded to
            ``generate_audio`` as ``speaker``.
        speaker_url: Value of the URL textbox, forwarded to ``generate_audio``.
        cps: Tempo value from the slider.

    Returns:
        A ``(24000, samples)`` tuple, where ``samples`` is the transposed
        generated audio converted to a NumPy array.

    Raises:
        gr.Error: If no text (or only whitespace) was entered.
    """
    # Reject missing and whitespace-only input as well as the empty string, so
    # the model is never asked to speak nothing.
    if not multilingual_text or not multilingual_text.strip():
        raise gr.Error("Please enter some text for me to speak!")
    segments = parse_multilingual_text(multilingual_text)
    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)
    # NOTE: returning MP3 bytes (torchaudio.save(..., 24000, format='mp3') into
    # an io.BytesIO) was tried but did not work in Safari, so raw samples are
    # returned instead.
    return (24000, audio.T.numpy())
# Tempo used for the slider's initial value and for running an example row.
_DEFAULT_CPS = 14


def _run_example(multilingual_text, speaker_url):
    """Adapt a two-column example row to the four-argument demo callback.

    Example rows only carry (text, speaker URL), while whisper_speech_demo
    also expects a speaker audio file and a tempo; without this adapter,
    enabling example caching or run-on-click would call it with too few
    arguments.
    """
    return whisper_speech_demo(multilingual_text, None, speaker_url, _DEFAULT_CPS)


# Build the page: intro text, an input column next to the output player,
# the examples table, and the button wiring.
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Enter multilingual text💬📝",
                value=text_examples[0][0],
                info="You can use `<en>` for English and `<pl>` for Polish, see examples below.",
            )
            cps = gr.Slider(
                value=_DEFAULT_CPS,
                minimum=10,
                maximum=15,
                step=.25,
                label="Tempo (in characters per second)",
            )
            speaker_input = gr.Audio(
                label="Upload or Record Speaker Audio (optional)🌬️💬",
                sources=["upload", "microphone"],
                type='filepath',
            )
            gr.Markdown(" \n ")  # fixes the bottom overflow from Audio
            url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="WhisperSpeech says…")

    with gr.Row():
        # Clicking a row fills the text and URL inputs. With caching disabled
        # the function is not run when the page is built.
        gr.Examples(
            examples=text_examples,
            inputs=[text_input, url_input],
            outputs=[output_audio],
            fn=_run_example,
            cache_examples=False,
            label="Try these to get started !🌟🌬️",
        )

    generate_button.click(
        whisper_speech_demo,
        inputs=[text_input, speaker_input, url_input, cps],
        outputs=output_audio,
    )

# Load the model once at start-up and run a short generation before serving.
pipe = Pipeline()  # torch_compile=True is left disabled
pipe.generate("WhisperSpeech warmup")
demo.launch(server_port=3000)  # share=True is left disabled