Update app.py

app.py CHANGED
@@ -7,6 +7,7 @@ import torchaudio
 import gradio as gr
 import tempfile
 import os
+import numpy as np
 
 llasa_1b ='SebastianBodza/Kartoffel-1B-v0.2'
 
@@ -20,7 +21,6 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 
 model_path = "srinivasbilla/xcodec2"
-
 Codec_model = XCodec2Model.from_pretrained(model_path)
 Codec_model.eval().cuda()
 
@@ -32,57 +32,37 @@ whisper_turbo_pipe = pipeline(
 )
 
 
-vad_model, utils = torch.hub.load(
-    "snakers4/silero-vad",
-    model="silero_vad",
-    force_reload=False,
-    source="github")
-
-get_speech_timestamps, *_ = utils
-
-
-def remove_silence_silero(waveform, sample_rate, vad_model):
-    """
-    Remove leading and trailing silence using Silero VAD.
-
-    Args:
-        waveform (torch.Tensor): Input audio waveform
-        sample_rate (int): Sample rate of the waveform
-        vad_model: Silero VAD model
-    """
-    original_waveform = waveform
-
-    # Resample to 16kHz for the VAD if needed
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-        waveform_16k = resampler(waveform)
-    else:
-        waveform_16k = waveform
-
-    # Get speech timestamps
-    speech_timestamps = get_speech_timestamps(waveform_16k[0], vad_model, sampling_rate=16000)
-
-    if len(speech_timestamps) == 0:
-        return original_waveform
-
-    first_speech = speech_timestamps[0]['start']
-    # Add small padding before speech (0.1 seconds)
-    padding_samples = int(0.1 * sample_rate)
-    start_idx = max(0, int(first_speech * sample_rate/16000) - padding_samples)
-
-    # Same for the end
-    last_speech = speech_timestamps[-1]['end']
-    end_idx = min(original_waveform.size(1), int(last_speech * sample_rate/16000) + padding_samples)
-
-    # Trim the original waveform (not the resampled one)
-    trimmed_wav = original_waveform[:, start_idx:end_idx]
-
-    # added padding of 16 at the start and end
-    return torch.nn.functional.pad(trimmed_wav, (16, 16), "constant", 0)
-
-
+def normalize_audio(waveform: torch.Tensor, target_db: float = -20) -> torch.Tensor:
+    """
+    Normalize audio volume to target dB and limit gain range.
+
+    Args:
+        waveform (torch.Tensor): Input audio waveform
+        target_db (float): Target dB level (default: -20)
+
+    Returns:
+        torch.Tensor: Normalized audio waveform
+    """
+    # Calculate current dB
+    eps = 1e-10
+    current_db = 20 * torch.log10(torch.max(torch.abs(waveform)) + eps)
+
+    # Calculate required gain
+    gain_db = target_db - current_db
+
+    # Limit gain to -3 to 3 dB range
+    gain_db = torch.clamp(gain_db, min=-3, max=3)
+
+    # Apply gain
+    gain_factor = 10 ** (gain_db / 20)
+    normalized = waveform * gain_factor
+
+    # Final peak normalization
+    max_amplitude = torch.max(torch.abs(normalized))
+    if max_amplitude > 0:
+        normalized = normalized / max_amplitude
+
+    return normalized
 
 def ids_to_speech_tokens(speech_ids):
 
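
For reference, a quick numeric check of the gain math in the new normalize_audio. This is a standalone sketch, not part of app.py; the 0.05 peak is an arbitrary illustration:

    import torch

    # A peak of 0.05 is about -26 dBFS, so reaching the -20 dBFS target would
    # need +6 dB of gain, but the clamp caps what is applied at +3 dB.
    peak = torch.tensor(0.05)
    current_db = 20 * torch.log10(peak + 1e-10)             # ~ -26.02
    gain_db = torch.clamp(-20 - current_db, min=-3, max=3)  # -> 3.0
    print((peak * 10 ** (gain_db / 20)).item())             # ~ 0.0706

If I read the function right, the final peak normalization divides out any positive scalar gain again (the max of waveform * g is g times the max), so the returned audio always peaks at 1.0 and the dB step mostly acts as a guard against pathological inputs.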
@@ -105,18 +85,19 @@ def extract_speech_ids(speech_tokens_str):
     return speech_ids
 
 @spaces.GPU(duration=30)
-def infer(sample_audio_path, target_text, progress=gr.Progress()):
+def infer(sample_audio_path, target_text, temp, top_p_val, min_new_tokens, do_sample, progress=gr.Progress()):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         progress(0, 'Loading and trimming audio...')
         waveform, sample_rate = torchaudio.load(sample_audio_path)
-        waveform = remove_silence_silero(waveform, sample_rate, vad_model)
 
-
-
+        waveform = normalize_audio(waveform)
+
         if len(waveform[0])/sample_rate > 15:
             gr.Warning("Trimming audio to first 15secs.")
             waveform = waveform[:, :sample_rate*15]
+        waveform = torch.nn.functional.pad(waveform, (0, int(sample_rate*0.5)), "constant", 0)
+
 
         # Check if the audio is stereo (i.e., has more than one channel)
         if waveform.size(0) > 1:
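
The new trailing pad appends half a second of silence to the reference clip. As a minimal sketch of the pad semantics used here (the tiny tensor is illustrative): given a (left, right) tuple, torch.nn.functional.pad pads the last dimension, so (0, n) adds n zeros at the end only:

    import torch
    import torch.nn.functional as F

    x = torch.ones(1, 4)                    # stand-in for [channels, samples]
    print(F.pad(x, (0, 3), "constant", 0))  # tensor([[1., 1., 1., 1., 0., 0., 0.]])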
@@ -132,11 +113,12 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
 
         if len(target_text) == 0:
             return None
-        elif len(target_text) > 300:
-            gr.Warning("Text is too long. Please keep it under 300 characters.")
-            target_text = target_text[:300]
+        elif len(target_text) > 500:
+            gr.Warning("Text is too long. Please keep it under 500 characters.")
+            target_text = target_text[:500]
 
         input_text = prompt_text + ' ' + target_text
+        print("Transcribed text:", input_text)
 
         #TTS start!
         with torch.no_grad():
@@ -159,7 +141,7 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
                 chat,
                 tokenize=True,
                 return_tensors='pt',
-                continue_final_message=True
+                continue_final_message=True,
             )
             input_ids = input_ids.to('cuda')
             speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
@@ -168,11 +150,13 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             outputs = model.generate(
                 input_ids,
                 max_length=2048,  # We trained our model with a max length of 2048
-                eos_token_id=speech_end_id,
-                do_sample=True,
-                top_p=1,
-                temperature=0.8,
+                eos_token_id=speech_end_id,
+                do_sample=do_sample,
+                top_p=top_p_val,
+                temperature=temp,
+                min_new_tokens=min_new_tokens,
             )
+
             # Extract the speech tokens
             generated_ids = outputs[0][input_ids.shape[1]-len(speech_ids_prefix):-1]
 
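
These sampling knobs are standard transformers generation arguments; min_new_tokens suppresses the end token until that many new tokens have been produced, which is what the "Min Length" slider relies on to avoid the one-click outputs. A sketch of the same settings collected in a GenerationConfig, using the UI defaults (illustrative only; app.py passes the kwargs to generate directly):

    from transformers import GenerationConfig

    gen_cfg = GenerationConfig(
        max_length=2048,   # model trained with a 2048 context
        do_sample=True,
        top_p=1.0,
        temperature=0.8,
        min_new_tokens=3,  # force at least a few speech tokens
    )
    # eos_token_id is set at call time to the <|SPEECH_GENERATION_END|> id.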
@@ -198,6 +182,34 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
 
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
+
+    with gr.Accordion("Model Settings", open=False):
+        temperature = gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.8,
+            step=0.1,
+            label="Temperature",
+            info="Higher values = more random/creative output"
+        )
+        top_p = gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=1.0,
+            step=0.1,
+            label="Top P",
+            info="Nucleus sampling threshold"
+        )
+        min_new_tokens = gr.Slider(
+            minimum=0,
+            maximum=128,
+            value=3,
+            step=1,
+            label="Min Length",
+            info="If the model just produces a click, you can force it to create longer generations."
+        )
+        do_sample = gr.Checkbox(label="Sample", value=True, info="Sample from the distribution")
+
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
 
@@ -211,6 +223,10 @@ with gr.Blocks() as app_tts:
         inputs=[
             ref_audio_input,
             gen_text_input,
+            temperature,
+            top_p,
+            min_new_tokens,
+            do_sample
         ],
         outputs=[audio_output, raw_output_display] # Include both outputs
     )
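
Gradio passes component values to the callback positionally, in the order of the inputs list, so the four new components line up with the four new parameters of infer. A self-contained sketch of that wiring pattern (hypothetical echo demo, not part of app.py):

    import gradio as gr

    def echo(text, temp, top_p_val, min_new_tokens, do_sample):
        # Values arrive in the same order as the components in `inputs`.
        return f"{text} (temp={temp}, top_p={top_p_val}, min={min_new_tokens}, sample={do_sample})"

    with gr.Blocks() as demo:
        text = gr.Textbox()
        temp = gr.Slider(0.1, 1.0, value=0.8, step=0.1)
        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.1)
        min_tok = gr.Slider(0, 128, value=3, step=1)
        sample = gr.Checkbox(value=True)
        out = gr.Textbox()
        gr.Button("Run").click(echo, inputs=[text, temp, top_p, min_tok, sample], outputs=out)
    # demo.launch()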
@@ -230,7 +246,7 @@ with gr.Blocks() as app:
 
     This is a local web UI for my finetune of the llasa 1b SOTA (imo) Zero Shot Voice Cloning and TTS model.
 
-    The checkpoints support German.
+    The checkpoints support German. If the audio is of low quality, the model may struggle to generate speech. Turn the **temperature** up to get more coherent results.
 
     If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
     """