File size: 4,813 Bytes
33d9042
4c1c145
33d9042
 
 
 
9488c79
 
33d9042
 
 
 
85d5a02
 
33d9042
 
 
 
 
 
 
a71b09f
33d9042
e27c13f
33d9042
 
 
 
 
 
 
 
 
 
 
8b6e3fd
 
 
 
33d9042
 
8b6e3fd
 
e27c13f
 
 
8b6e3fd
33d9042
 
 
 
 
8b6e3fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33d9042
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import spaces
import tempfile
import gradio as gr
import os
from whisperspeech.pipeline import Pipeline
import torch
import soundfile as sf
import numpy as np
import torch.nn.functional as F
from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline

title = """# 🙋🏻‍♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech

You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is an open-source text-to-speech system built by inverting Whisper, previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable.
You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3> 
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻  [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

def _split_items(value):
    """Normalize None, a comma-separated string, or a list into a clean list.

    Items are stripped of surrounding whitespace and empty items are dropped.
    """
    if value is None:
        return []
    parts = value.split(',') if isinstance(value, str) else value
    return [part.strip() for part in parts if part and part.strip()]


def _prepare_waveform(samples):
    """Return *samples* as a NumPy array ready to be written as PCM audio."""
    audio_np = np.asarray(samples)

    # NOTE(review): soundfile expects (frames, channels); the pipeline output is
    # presumably channels-first like torchaudio tensors -- confirm. 1-D (mono)
    # input is left untouched.
    if audio_np.ndim == 2 and audio_np.shape[0] < audio_np.shape[1]:
        audio_np = audio_np.T

    # Scale down only when the signal would clip; skip empty input, on which
    # max() would raise.
    if audio_np.size:
        peak = np.max(np.abs(audio_np))
        if peak > 1.0:
            audio_np = audio_np / peak
    return audio_np


@spaces.GPU
def whisper_speech_demo(text, lang, speaker_audio=None, mix_lang=None, mix_text=None):
    """Synthesize speech for *text* and return the path of a temporary WAV file.

    Args:
        text: Text to speak (the first segment when mixing languages).
        lang: Language code for *text*. A comma-separated string, a list of
            codes, or None are all accepted in the mixed-language case.
        speaker_audio: Optional reference speaker audio, either a file path
            (what ``gr.Audio(type="filepath")`` supplies) or an object with a
            ``.name`` attribute holding the path.
        mix_lang: Optional extra language codes (comma-separated string or
            list) for mixed-language synthesis.
        mix_text: Optional extra text segments (comma-separated string or
            list) for mixed-language synthesis.

    Returns:
        Path to a 24 kHz, 24-bit PCM WAV file. The file is created with
        ``delete=False``, so the caller owns its cleanup.
    """
    # `resampler` was used by this function without ever being imported.
    from whisperspeech.utils import resampler

    pipe = Pipeline()

    # Accept both a plain path and a tempfile-like object exposing `.name`.
    speaker_url = None
    if speaker_audio is not None:
        speaker_url = getattr(speaker_audio, "name", speaker_audio) or None

    extra_langs = _split_items(mix_lang)
    extra_texts = _split_items(mix_text)

    if extra_langs and extra_texts:
        mixed_langs = _split_items(lang) + extra_langs
        mixed_texts = [text] + extra_texts
        stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
        audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
    else:
        audio_data = pipe.generate(text, speaker_url, lang)

    # NOTE(review): 22050 Hz is the source rate the original code assumed;
    # confirm it matches the vocoder's real output rate.
    resample_audio = resampler(newsr=24000)
    resampled = next(
        resample_audio([{'sample_rate': 22050, 'samples': audio_data.cpu()}])
    )['samples_24k']

    audio_np = _prepare_waveform(resampled.numpy())

    # Reserve a unique path, then close the handle before soundfile reopens it;
    # writing to a path that is still held open fails on some platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file_name = tmp_file.name

    sf.write(tmp_file_name, audio_np, 24000, 'PCM_24')
    return tmp_file_name

def _mixed_language_demo(mixed_text, selected_langs):
    """Adapt the "Mixed Language TTS" tab inputs to ``whisper_speech_demo``.

    The text box holds comma-separated segments and the checkbox group holds
    the selected language codes. The first segment/language become the primary
    ``text``/``lang``; the remainder are passed as comma-separated ``mix_text``
    and ``mix_lang`` strings.

    Raises:
        gr.Error: If no text was entered or no language was selected.
    """
    segments = [seg.strip() for seg in (mixed_text or "").split(',') if seg.strip()]
    langs = list(selected_langs or [])

    if not segments:
        raise gr.Error("Please enter some text.")
    if not langs:
        raise gr.Error("Please select at least one language.")

    # NOTE(review): segments are paired with languages in the order Gradio
    # reports the checkbox selection -- confirm that order suits the pipeline.
    return whisper_speech_demo(
        segments[0],
        langs[0],
        None,
        ",".join(langs[1:]),
        ",".join(segments[1:]),
    )


with gr.Blocks() as demo:
    gr.Markdown(title)

    language_choices = list(LANGUAGES.keys())

    with gr.Tabs():
        with gr.TabItem("Standard TTS"):
            with gr.Row():
                text_input = gr.Textbox(label="Enter text")
                lang_input = gr.Dropdown(choices=language_choices, label="Language")
                speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath")
                generate_button = gr.Button("Generate Speech")
            output_audio_standard = gr.Audio(label="🌬️💬📝WhisperSpeech")

            # `inputs` may only contain components; the mix_* parameters of
            # whisper_speech_demo fall back to their None defaults.
            generate_button.click(
                whisper_speech_demo,
                inputs=[text_input, lang_input, speaker_input],
                outputs=output_audio_standard,
            )

        with gr.TabItem("Mixed Language TTS"):
            with gr.Row():
                mix_text_input = gr.Textbox(label="Enter mixed language text", placeholder="e.g., Hello, Cześć")
                mix_lang_input = gr.CheckboxGroup(choices=language_choices, label="Select Languages")
                mix_generate_button = gr.Button("Generate Mixed Speech")
            output_audio_mixed = gr.Audio(label="🌬️💬📝WhisperSpeech Mixed")

            # The adapter maps the two components onto the five-argument
            # synthesis function instead of padding `inputs` with None.
            mix_generate_button.click(
                _mixed_language_demo,
                inputs=[mix_text_input, mix_lang_input],
                outputs=output_audio_mixed,
            )

demo.launch()