# `spaces` stays the very first import, exactly as in the original file.
# NOTE(review): ZeroGPU is understood to want it imported before torch - confirm.
import spaces

import os
import re
import tempfile
import wave

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline
from whisperspeech.utils import resampler

# Sample rate (Hz) used for resampling and for the WAV file that is written.
SAMPLE_RATE = 24000

title = """# 🙋🏻‍♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech

You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech) You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: Duplicate Space We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗

### How to Use

Input text with tahe language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print.Scroll down and try the api <3 Gradio. This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
"""

# Example input: "<en> Hello, how are you? <fr> Bonjour, comment ça va?"
# Example speaker sample: "path/to/tonic.wav"

# A "<lang>" tag followed by its text, running up to the next tag or the end
# of the input. Whitespace around the text is optional so the final segment is
# matched even without trailing whitespace; DOTALL lets text span newlines.
_SEGMENT_PATTERN = re.compile(r"<(\w+)>\s*(.*?)\s*(?=<\w+>|$)", re.DOTALL)


def parse_multilingual_text(input_text):
    """Split tagged input into ``(lang, text)`` pairs.

    Args:
        input_text: Text in which each segment is introduced by a ``<lang>``
            tag, e.g. ``"<en> Hello <fr> Bonjour"``.

    Returns:
        A list of ``(lang, text)`` tuples in input order. Segments whose tag
        is not a key of ``LANGUAGES`` and segments with no text are skipped.
    """
    parsed = []
    for lang, text in _SEGMENT_PATTERN.findall(input_text):
        text = text.strip()
        if text and lang in LANGUAGES:
            parsed.append((lang, text))
    return parsed


@spaces.GPU
def generate_segment_audio(text, lang, speaker_url, pipe):
    """Synthesize one text segment with the given pipeline.

    Args:
        text: Text to speak; ``bytes`` are decoded as UTF-8 and any other
            non-string value is converted with ``str``.
        lang: Language tag forwarded to ``pipe.generate``.
        speaker_url: Speaker reference forwarded to ``pipe.generate``
            (``None`` when the user supplied no sample).
        pipe: Object exposing ``generate(text, speaker, lang)``.

    Returns:
        The generated samples as a NumPy array.
        NOTE(review): presumably shaped ``(1, n_samples)`` - confirm.
    """
    if not isinstance(text, str):
        text = text.decode("utf-8") if isinstance(text, bytes) else str(text)

    audio_data = pipe.generate(text, speaker_url, lang)

    # The input is labelled with the same rate that is requested from the
    # resampler, so this step only routes the samples through the helper.
    resample_audio = resampler(newsr=SAMPLE_RATE)
    resampled = next(
        resample_audio([{"sample_rate": SAMPLE_RATE, "samples": audio_data.cpu()}])
    )["samples_24k"]

    audio_np = resampled.cpu().numpy()
    print("Shape after resampling:", audio_np.shape)  # Debug statement
    return audio_np


def _to_stereo_frames(segment):
    """Return *segment* as a ``(frames, 2)`` array, duplicating mono audio."""
    samples = np.asarray(segment)
    if samples.ndim == 2 and samples.shape[1] == 2:
        return samples  # already (frames, 2)
    if samples.ndim == 2 and samples.shape[0] == 2:
        return samples.T  # (2, frames) -> (frames, 2)

    # Drop singleton axes such as the leading one of a (1, N) array so that
    # mono audio is always handled as a flat run of samples.
    mono = np.atleast_1d(np.squeeze(samples))
    if mono.ndim != 1:
        raise ValueError(f"Unsupported audio segment shape: {samples.shape}")
    return np.stack((mono, mono), axis=-1)


def concatenate_audio_segments(segments):
    """Join audio segments back to back and peak-normalize the result.

    Each segment is converted to ``(frames, 2)`` and the segments are
    concatenated along the time axis, so segments of different lengths are
    supported and no padding is inserted between them.

    Args:
        segments: Non-empty sequence of NumPy arrays, one per segment.

    Returns:
        A ``(total_frames, 2)`` array scaled so that its largest absolute
        sample is 1.0 (left unscaled when the audio is empty or silent).

    Raises:
        ValueError: If ``segments`` is empty or a segment has a shape that
            cannot be interpreted as mono or stereo audio.
    """
    if len(segments) == 0:
        raise ValueError("No audio segments to concatenate.")

    concatenated_audio = np.concatenate(
        [_to_stereo_frames(seg) for seg in segments], axis=0
    )
    print("Concatenated audio shape:", concatenated_audio.shape)  # Debug statement

    # Guard the division: silent or empty audio has no peak to scale by.
    peak = np.max(np.abs(concatenated_audio)) if concatenated_audio.size else 0
    if peak > 0:
        concatenated_audio = concatenated_audio / peak
    return concatenated_audio


@spaces.GPU
def whisper_speech_demo(multilingual_text, speaker_audio):
    """Gradio callback: synthesize tagged multilingual text to a WAV file.

    Args:
        multilingual_text: Text made of ``<lang>``-tagged segments.
        speaker_audio: Optional speaker sample from the audio input, or
            ``None``; it is forwarded unchanged to the pipeline.

    Returns:
        Path of a temporary 16-bit PCM WAV file with the generated speech.

    Raises:
        gr.Error: If the text contains no segment with a known language tag.
    """
    segments = parse_multilingual_text(multilingual_text)
    if not segments:
        # Only one output component is wired up, so the message is surfaced
        # as an error instead of being returned next to a ``None`` value.
        raise gr.Error(
            "No valid language segments found. Please use the format: <lang> text"
        )

    pipe = Pipeline()

    audio_segments = []
    for lang, text in segments:
        audio_np = generate_segment_audio(str(text), lang, speaker_audio, pipe)
        print("Audio segment shape:", audio_np.shape)  # Debug statement
        audio_segments.append(audio_np)

    # Already (frames, 2); writing it as-is avoids duplicating every frame.
    concatenated_audio = concatenate_audio_segments(audio_segments)
    print("Final concatenated audio shape:", concatenated_audio.shape)  # Debug statement

    # Reserve a path, then close the handle before soundfile opens the file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name
    sf.write(output_path, concatenated_audio, SAMPLE_RATE, format="WAV", subtype="PCM_16")
    return output_path


with gr.Blocks() as demo:
    gr.Markdown(title)
    output_audio = gr.Audio(label="Generated Speech")
    generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
    with gr.Row():
        text_input = gr.Textbox(
            label="Enter multilingual text",
            placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola",
        )
        # type="filepath" hands the callback a path rather than the default
        # (sample_rate, ndarray) tuple.
        # NOTE(review): assumes Pipeline.generate accepts a path/URL speaker
        # reference - confirm.
        speaker_input = gr.Audio(
            label="Upload or Record Speaker Audio (optional)",
            sources=["upload", "microphone"],
            type="filepath",
        )
    with gr.Accordion("Available Languages and Their Tags"):
        language_list = "\n".join([f"{lang}: {LANGUAGES[lang]}" for lang in LANGUAGES])
        gr.Markdown(language_list)
    generate_button.click(
        whisper_speech_demo,
        inputs=[text_input, speaker_input],
        outputs=output_audio,
    )

demo.launch()