whisperspeech

Paused

App Files Files Community

whisperspeech / app.py

Tonic

Update app.py

54d10de verified 10 months ago

raw

history blame

6.63 kB

	# import spaces
	import tempfile
	import wave
	import gradio as gr
	import os
	import re
	import torch
	import soundfile as sf
	import numpy as np
	import torch.nn.functional as F
	from whisperspeech.pipeline import Pipeline
	from whisperspeech.languages import LANGUAGES
	from whisperspeech.pipeline import Pipeline
	from whisperspeech.utils import resampler

	# Markdown banner shown at the top of the demo page (rendered via gr.Markdown(title)).
	title = """# 🙋🏻‍♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech

	You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
	You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>

	We're celebrating the release of the whisperspeech at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗

	### How to Use
	Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print. Scroll down and try the api <3 Gradio.
	This space runs on ZeroGPU, so you need to be patient while you acquire the GPU and load the model the first time you make a request !
	"""

	# text examples=["<en> Hello, how are you? <fr> Bonjour, comment ça va?", "<de> Guten Tag <it> Buongiorno <jp> こんにちは"]
	# audio examples=["path/to/tonic.wav"]

	# Function to parse the multilingual input text
	def parse_multilingual_text(input_text, languages=None):
	    """Split ``<lang> text`` markup into a list of ``(lang, text)`` pairs.

	    Each segment starts at a ``<tag>`` and runs until the next ``<tag>`` or
	    the end of the input. Segments whose tag is not a known language, or
	    whose text is empty after stripping whitespace, are dropped.

	    Args:
	        input_text: Tagged text such as ``"<en> Hello <fr> Bonjour"``.
	            ``None`` or an empty string yields an empty list.
	        languages: Optional container of accepted language tags. Defaults
	            to the module-level ``LANGUAGES`` mapping.

	    Returns:
	        A list of ``(lang, text)`` tuples in input order, with ``text``
	        stripped of surrounding whitespace.
	    """
	    if languages is None:
	        languages = LANGUAGES
	    if not input_text:
	        return []

	    # Lazily capture everything after a tag up to (not including) the next
	    # tag or the very end of the string. DOTALL lets a segment span several
	    # lines; no trailing whitespace is required, so the last segment is kept.
	    pattern = r"<(\w+)>(.*?)(?=<\w+>|\Z)"

	    segments = []
	    for lang, text in re.findall(pattern, input_text, flags=re.DOTALL):
	        text = text.strip()
	        if text and lang in languages:
	            segments.append((lang, text))
	    return segments

	#@spaces.GPU
	def generate_segment_audio(text, lang, speaker_url, pipe):
	    """Synthesize a single text segment and return the audio as a NumPy array.

	    Args:
	        text: Text to speak. ``bytes`` are decoded as UTF-8; any other
	            non-string value is converted with ``str()``.
	        lang: Language tag forwarded to ``pipe.generate``.
	        speaker_url: Speaker reference forwarded to ``pipe.generate``.
	        pipe: Object exposing ``generate(text, speaker, lang)``; its result
	            must support ``.cpu()``.

	    Returns:
	        The ``'samples_24k'`` output of the 24000 Hz resampler, moved to CPU
	        and converted with ``.numpy()``.
	    """
	    # Normalise the input to str before handing it to the pipeline.
	    if isinstance(text, bytes):
	        text = text.decode("utf-8")
	    elif not isinstance(text, str):
	        text = str(text)

	    generated = pipe.generate(text, speaker_url, lang)

	    # The generated audio is labelled as 24000 Hz and pushed through the
	    # resampler; the result is read back from the 'samples_24k' key.
	    to_24k = resampler(newsr=24000)
	    batch = [{'sample_rate': 24000, 'samples': generated.cpu()}]
	    resampled = next(to_24k(batch))['samples_24k']

	    segment = resampled.cpu().numpy()
	    print("Shape after resampling:", segment.shape) # Debug statement
	    return segment

	# Function to join audio segments end to end (no padding is applied)
	def concatenate_audio_segments(segments):
	    """Concatenate audio segment arrays along axis 1.

	    Args:
	        segments: Sequence of arrays accepted by ``np.concatenate`` with
	            ``axis=1``; they must therefore be at least 2-D and agree on
	            every axis except axis 1.

	    Returns:
	        The single array produced by joining all segments along axis 1.
	    """
	    joined = np.concatenate(segments, axis=1)
	    print("Concatenated audio shape:", joined.shape) # Debug statement
	    return joined

	# The rest of the code in app.py remains the same

	# @spaces.GPU
	def whisper_speech_demo(multilingual_text, speaker_audio):
	    """Gradio handler: turn tagged multilingual text into a WAV file.

	    The text is split into ``(lang, text)`` segments, each segment is
	    synthesized, the resulting arrays are joined, and the result is written
	    to a temporary 16-bit PCM WAV file at 24000 Hz.

	    Args:
	        multilingual_text: Text in ``<lang> text`` format.
	        speaker_audio: Optional speaker reference, passed unchanged as the
	            speaker argument of ``pipe.generate``; ``None`` when omitted.
	            NOTE(review): presumably must be a file path / URL - confirm
	            against what the Audio input component delivers.

	    Returns:
	        Path of the generated WAV file. The file is created with
	        ``delete=False`` and is not removed by this function.

	    Raises:
	        gr.Error: If no valid language segment is found in the input.
	    """
	    segments = parse_multilingual_text(multilingual_text)
	    if not segments:
	        # The click handler is wired to a single output component, so the
	        # message is surfaced as a Gradio error instead of an extra return value.
	        raise gr.Error("No valid language segments found. Please use the format: <lang> text")

	    pipe = Pipeline()
	    audio_segments = []

	    for lang, text in segments:
	        audio_np = generate_segment_audio(text, lang, speaker_audio, pipe)
	        print("Audio segment shape:", audio_np.shape) # Debug statement
	        audio_segments.append(audio_np)

	    concatenated_audio = concatenate_audio_segments(audio_segments)
	    print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement

	    # Only reserve a unique path here; close the handle before soundfile
	    # reopens the file by name (an open handle blocks that on some platforms).
	    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
	        wav_path = tmp_file.name

	    # Segments are joined along axis 1, so time runs along axis 1, while
	    # soundfile expects (frames, channels): transpose before writing.
	    sf.write(wav_path, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
	    return wav_path

	# Build the Gradio UI: banner, output player, generate button, inputs and
	# a reference list of the supported language tags.
	with gr.Blocks() as demo:
	    gr.Markdown(title)
	    output_audio = gr.Audio(label="Generated Speech")
	    generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
	    with gr.Row():
	        text_input = gr.Textbox(label="Enter multilingual text", placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola")
	        # type="filepath" hands the handler a path string; the default
	        # ("numpy") would deliver a (sample_rate, ndarray) tuple, which is
	        # then forwarded as the speaker reference of pipe.generate.
	        speaker_input = gr.Audio(
	            label="Upload or Record Speaker Audio (optional)",
	            sources=["upload", "microphone"],
	            type="filepath",
	        )
	    with gr.Accordion("Available Languages and Their Tags"):
	        language_list = "\n".join([f"{lang}: {LANGUAGES[lang]}" for lang in LANGUAGES])
	        gr.Markdown(language_list)
	    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input], outputs=output_audio)

	demo.launch()