llasa-3b-tts

Running on Zero

App Files Files Community

gorbiz committed on Jan 28

Commit

793e8f1

verified ·

1 Parent(s): 3f8297c

Default wav file(?)

Browse files

Files changed (1) hide show

app.py +23 -30

app.py CHANGED Viewed

@@ -6,8 +6,21 @@ from xcodec2.modeling_xcodec2 import XCodec2Model
 import torchaudio
 import gradio as gr
 import tempfile
-llasa_3b ='srinivasbilla/llasa-3b'
 tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
@@ -30,19 +43,16 @@ whisper_turbo_pipe = pipeline(
 )
 def ids_to_speech_tokens(speech_ids):
     """Render each speech id as its ``<|s_{id}|>`` token string.

     Returns a list with one formatted token per element of ``speech_ids``,
     in iteration order.
     """
     return [f"<|s_{code}|>" for code in speech_ids]
 def extract_speech_ids(speech_tokens_str):
     speech_ids = []
     for token_str in speech_tokens_str:
         if token_str.startswith('<|s_') and token_str.endswith('|>'):
             num_str = token_str[4:-2]
             num = int(num_str)
             speech_ids.append(num)
         else:
@@ -58,12 +68,9 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             gr.Warning("Trimming audio to first 15secs.")
             waveform = waveform[:, :sample_rate*15]
-        # Check if the audio is stereo (i.e., has more than one channel)
         if waveform.size(0) > 1:
-            # Convert stereo to mono by averaging the channels
             waveform_mono = torch.mean(waveform, dim=0, keepdim=True)
         else:
-            # If already mono, just use the original waveform
             waveform_mono = waveform
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
@@ -78,18 +85,13 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
         input_text = prompt_text + ' ' + target_text
-        #TTS start!
         with torch.no_grad():
-            # Encode the prompt wav
             vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
             vq_code_prompt = vq_code_prompt[0,0,:]
-            # Convert int 12345 to token <|s_12345|>
             speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
             formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
-            # Tokenize the text and the speech prefix
             chat = [
                 {"role": "user", "content": "Convert the text to speech:" + formatted_text},
                 {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
@@ -104,29 +106,20 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             input_ids = input_ids.to('cuda')
             speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
-            # Generate the speech autoregressively
             outputs = model.generate(
                 input_ids,
-                max_length=2048,  # We trained our model with a max length of 2048
                 eos_token_id= speech_end_id ,
                 do_sample=True,
                 top_p=1,
                 temperature=0.8
             )
-            # Extract the speech tokens
             generated_ids = outputs[0][input_ids.shape[1]-len(speech_ids_prefix):-1]
             speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-            # Convert  token <|s_23456|> to int 23456
             speech_tokens = extract_speech_ids(speech_tokens)
             speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)
-            # Decode the speech tokens to speech waveform
             gen_wav = Codec_model.decode_code(speech_tokens)
-            # if only need the generated part
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
             progress(1, 'Synthesized!')
@@ -135,19 +128,20 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
-    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
     generate_btn = gr.Button("Synthesize", variant="primary")
     audio_output = gr.Audio(label="Synthesized Audio")
     generate_btn.click(
         infer,
-        inputs=[
-            ref_audio_input,
-            gen_text_input,
-        ],
         outputs=[audio_output],
     )
@@ -173,5 +167,4 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     )
     gr.TabbedInterface([app_tts], ["TTS"])
 app.launch(ssr_mode=False)

 import torchaudio
 import gradio as gr
 import tempfile
+import requests  # Added import for downloading the default WAV
+# Download the default WAV file
+default_wav_url = "https://file.thatvoid.com/main/20250127T095211591Z-ee8c576d2304e5195ddfce77a45e0377.wav"
+default_wav_path = "default_voice.wav"
+try:
+    response = requests.get(default_wav_url)
+    response.raise_for_status()
+    with open(default_wav_path, "wb") as f:
+        f.write(response.content)
+except Exception as e:
+    print(f"Failed to download default WAV: {e}")
+    default_wav_path = None  # Fallback to requiring user input
+llasa_3b = 'srinivasbilla/llasa-3b'
 tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
 )
 def ids_to_speech_tokens(speech_ids):
     """Render each speech id as its ``<|s_{id}|>`` token string.

     Returns a list with one formatted token per element of ``speech_ids``,
     in iteration order.
     """
     return [f"<|s_{code}|>" for code in speech_ids]
 def extract_speech_ids(speech_tokens_str):
     speech_ids = []
     for token_str in speech_tokens_str:
         if token_str.startswith('<|s_') and token_str.endswith('|>'):
             num_str = token_str[4:-2]
             num = int(num_str)
             speech_ids.append(num)
         else:
             gr.Warning("Trimming audio to first 15secs.")
             waveform = waveform[:, :sample_rate*15]
         if waveform.size(0) > 1:
             waveform_mono = torch.mean(waveform, dim=0, keepdim=True)
         else:
             waveform_mono = waveform
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         input_text = prompt_text + ' ' + target_text
         with torch.no_grad():
             vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
             vq_code_prompt = vq_code_prompt[0,0,:]
             speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
             formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
             chat = [
                 {"role": "user", "content": "Convert the text to speech:" + formatted_text},
                 {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
             input_ids = input_ids.to('cuda')
             speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
             outputs = model.generate(
                 input_ids,
+                max_length=2048,
                 eos_token_id= speech_end_id ,
                 do_sample=True,
                 top_p=1,
                 temperature=0.8
             )
             generated_ids = outputs[0][input_ids.shape[1]-len(speech_ids_prefix):-1]
             speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
             speech_tokens = extract_speech_ids(speech_tokens)
             speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)
             gen_wav = Codec_model.decode_code(speech_tokens)
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
             progress(1, 'Synthesized!')
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
+    # Set default value for the audio input
+    ref_audio_input = gr.Audio(
+        label="Reference Audio",
+        type="filepath",
+        value=default_wav_path if default_wav_path else None  # Use downloaded file or fallback
+    )
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
     generate_btn = gr.Button("Synthesize", variant="primary")
     audio_output = gr.Audio(label="Synthesized Audio")
     generate_btn.click(
         infer,
+        inputs=[ref_audio_input, gen_text_input],
         outputs=[audio_output],
     )
     )
     gr.TabbedInterface([app_tts], ["TTS"])
 app.launch(ssr_mode=False)