import spaces
import tempfile
import wave
import gradio as gr
import os
import re
import torch
import soundfile as sf
import numpy as np
import torch.nn.functional as F
from whisperspeech.pipeline import Pipeline
from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline
from whisperspeech.utils import resampler
# Markdown header rendered at the top of the Gradio page via gr.Markdown(title).
# It is user-facing runtime text: edit the wording here, not in the UI block.
title = """# 🙋🏻♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here:
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
### How to Use
Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print.Scroll down and try the api <3 Gradio.
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
"""
# text examples=[" Hello, how are you? Bonjour, comment ça va?", " Guten Tag Buongiorno こんにちは"]
# audio examples=["path/to/tonic.wav"]
# Function to parse the multilingual input text
def parse_multilingual_text(input_text, languages=None):
    """Split language-tagged text into ``(language, text)`` segments.

    The input is expected to look like ``"<en> Hello <fr> Bonjour"``: every
    segment starts with a ``<tag>`` and runs until the next tag or the end of
    the input.

    Args:
        input_text: Text containing ``<lang>`` tags. ``None`` or an empty
            string yields an empty list.
        languages: Optional container of accepted language tags. Defaults to
            the module-level ``LANGUAGES`` mapping.

    Returns:
        A list of ``(lang, text)`` tuples in input order. Segments whose tag
        is not an accepted language, or whose text is empty after stripping,
        are dropped.
    """
    if not input_text:
        return []
    if languages is None:
        languages = LANGUAGES

    # Whitespace around the segment text is optional so the final segment is
    # still matched when the input has no trailing whitespace (the previous
    # pattern demanded one and silently dropped e.g. "<en> Hello world").
    # DOTALL lets a single segment span several lines.
    pattern = r"<(\w+)>\s*(.*?)\s*(?=<\w+>|$)"
    segments = re.findall(pattern, input_text, flags=re.DOTALL)
    return [
        (lang, text.strip())
        for lang, text in segments
        if lang in languages and text.strip()
    ]
@spaces.GPU
def generate_segment_audio(text, lang, speaker_url, pipe):
    """Synthesize one text segment and return it as a NumPy array.

    Args:
        text: Segment text; ``bytes`` are decoded as UTF-8 and any other
            non-string value is converted with ``str()``.
        lang: Language tag passed to ``pipe.generate``.
        speaker_url: Speaker reference passed to ``pipe.generate``.
        pipe: Object exposing ``generate(text, speaker, lang)`` whose result
            has a ``.cpu()`` method (presumably a torch tensor -- confirm
            against the WhisperSpeech pipeline).

    Returns:
        The ``'samples_24k'`` output of the resampler, moved to the CPU and
        converted with ``.numpy()``.
    """
    # Coerce the input to ``str`` before handing it to the pipeline.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    elif not isinstance(text, str):
        text = str(text)

    waveform = pipe.generate(text, speaker_url, lang)

    # The resampler consumes an iterable of sample dicts and yields dicts
    # carrying the result under 'samples_24k'; only one item is fed in here.
    to_24k = resampler(newsr=24000)
    batch = [{'sample_rate': 24000, 'samples': waveform.cpu()}]
    resampled = next(to_24k(batch))['samples_24k']
    return resampled.cpu().numpy()
# Function to concatenate audio segments in stereo
def concatenate_audio_segments(segments):
    """Join audio segments end to end into one peak-normalized stereo array.

    Args:
        segments: Iterable of array-likes. Accepted layouts per segment:
            ``(n,)`` mono, ``(n, 1)`` mono, ``(n, 2)`` stereo, and the
            channels-first forms ``(1, n)`` / ``(2, n)`` with ``n > 2``.
            Mono data is copied to both output channels.

    Returns:
        A ``float32`` array of shape ``(total_frames, 2)``. The signal is
        scaled so its largest absolute sample is 1.0; all-silent or empty
        input is returned unscaled (an empty input gives shape ``(0, 2)``).
    """
    prepared = []
    for seg in segments:
        seg = np.asarray(seg, dtype=np.float32)
        if seg.ndim == 1:
            # Mono vector -> column, so it broadcasts across both channels.
            seg = seg[:, np.newaxis]
        elif seg.ndim == 2 and seg.shape[0] <= 2 and seg.shape[1] > 2:
            # A last axis longer than 2 cannot be the channel axis of a
            # 2-channel buffer, so this must be (channels, frames).
            seg = seg.T
        prepared.append(seg)

    total_length = sum(seg.shape[0] for seg in prepared)
    concatenated_audio = np.zeros((total_length, 2), dtype=np.float32)

    current_index = 0
    for seg in prepared:
        end_index = current_index + seg.shape[0]
        concatenated_audio[current_index:end_index, :] = seg
        current_index = end_index

    # np.max raises on an empty array, and dividing by a zero peak would fill
    # the output with NaN, so only normalize when there is a real peak.
    peak = float(np.max(np.abs(concatenated_audio))) if total_length else 0.0
    if peak > 0:
        concatenated_audio /= peak
    return concatenated_audio
@spaces.GPU
def whisper_speech_demo(multilingual_text, speaker_audio):
    """Synthesize speech for language-tagged text and return a WAV file path.

    Args:
        multilingual_text: Text in the form ``<lang> text <lang> text ...``.
        speaker_audio: Optional speaker reference, forwarded unchanged to
            ``generate_segment_audio`` (``None`` when no sample was given).
            NOTE(review): the pipeline presumably expects a file path or URL
            here -- confirm the UI component delivers one.

    Returns:
        Path to a temporary stereo WAV file (24 kHz, 16-bit PCM). The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If the text contains no valid ``<lang>`` segment.
    """
    segments = parse_multilingual_text(multilingual_text)
    if not segments:
        # This handler feeds a single Audio output, so a (None, message)
        # tuple cannot be displayed; raise a Gradio error instead.
        raise gr.Error(
            "No valid language segments found. Prefix each segment with a "
            "language tag in angle brackets, e.g. <en> Hello <fr> Bonjour"
        )

    pipe = Pipeline()
    audio_segments = [
        generate_segment_audio(text, lang, speaker_audio, pipe)
        for lang, text in segments
    ]

    audio_stereo = concatenate_audio_segments(audio_segments)
    # concatenate_audio_segments already returns (frames, 2). Stacking it
    # again and reshaping to (-1, 2) repeated every sample, doubling the
    # duration, so only duplicate the channel if the data is still 1-D.
    if audio_stereo.ndim == 1:
        audio_stereo = np.stack((audio_stereo, audio_stereo), axis=-1)

    # Reserve the name, then close the handle before soundfile reopens the
    # path; writing through a second handle fails on some platforms.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
        output_path = tmp_file.name
    sf.write(output_path, audio_stereo, 24000, format='WAV', subtype='PCM_16')
    return output_path
# Gradio UI: header, output player, trigger button, the two inputs, and a
# reference list of the language tags accepted by the text parser.
with gr.Blocks() as demo:
    gr.Markdown(title)
    output_audio = gr.Audio(label="Generated Speech")
    generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
    with gr.Row():
        # The placeholder shows the <lang> tag syntax the parser requires;
        # untagged text is rejected as having no valid segments.
        text_input = gr.Textbox(
            label="Enter multilingual text",
            placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola",
        )
        # type="filepath" hands the handler a path string. The default
        # ("numpy") would pass a (sample_rate, array) tuple, which the
        # handler forwards as a speaker path/URL.
        speaker_input = gr.Audio(
            label="Upload or Record Speaker Audio (optional)",
            sources=["upload", "microphone"],
            type="filepath",
        )
    with gr.Accordion("Available Languages and Their Tags"):
        # Bullet items: Markdown collapses single newlines into one
        # paragraph, so a plain "\n" join would render as a run-on line.
        language_list = "\n".join(
            f"- `<{lang}>`: {name}" for lang, name in LANGUAGES.items()
        )
        gr.Markdown(language_list)
    generate_button.click(
        whisper_speech_demo,
        inputs=[text_input, speaker_input],
        outputs=output_audio,
    )
demo.launch()