Spaces:

leenag
/

Multilingual_TTS

Running

leenag committed on May 7

Commit

5a1a7ec

verified ·

1 Parent(s): c953361

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,9 +1,7 @@
 import gradio as gr
 import torch
 from transformers import VitsModel, AutoTokenizer
-import soundfile as sf
-import tempfile
-import os
 LANG_MODEL_MAP = {
     "English": "facebook/mms-tts-eng",
@@ -25,35 +23,26 @@ def load_model_and_tokenizer(language):
     return cache[model_name]
 def tts(language, text):
-    try:
-        if not text.strip():
-            return "Please enter some text.", None
-        tokenizer, model = load_model_and_tokenizer(language)
-        inputs = tokenizer(text, return_tensors="pt").to(device)
-        with torch.no_grad():
-            output = model(**inputs)
-        # Save to temporary WAV file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            sf.write(f.name, output.waveform.cpu().numpy(), samplerate=16000)
-            return "Here is your audio output", f.name
-    except Exception as e:
-        return f"Error: {str(e)}", None
 iface = gr.Interface(
     fn=tts,
     inputs=[
-        gr.Dropdown(label="Select Language", choices=list(LANG_MODEL_MAP.keys()), value="English"),
         gr.Textbox(label="Enter Text")
     ],
-    outputs=[
-        gr.Textbox(label="Status"),
-        gr.Audio(label="Synthesized Speech", type="filepath")
-    ],
-    title="Multilingual TTS with Meta MMS",
     description="Generate speech from text using Meta's MMS models for English, Hindi, Tamil, Malayalam, and Kannada."
 )

 import gradio as gr
 import torch
+import numpy as np
 from transformers import VitsModel, AutoTokenizer
 LANG_MODEL_MAP = {
     "English": "facebook/mms-tts-eng",
     return cache[model_name]
 def tts(language, text):
+    """Synthesize speech for ``text`` using the MMS model for ``language``.
+
+    Args:
+        language: One of the keys of ``LANG_MODEL_MAP``.
+        text: The text to speak.
+
+    Returns:
+        A ``(sample_rate, waveform)`` tuple, the format consumed by a
+        ``gr.Audio(type="numpy")`` output component.
+
+    Raises:
+        gr.Error: If ``language`` is missing or not a known language.
+    """
+    # `language` may be None (nothing selected) or an unknown name; show a
+    # readable message in the UI instead of failing inside the model lookup.
+    if language not in LANG_MODEL_MAP:
+        raise gr.Error("Please select a language.")
+    # `text` can be None as well as empty/whitespace-only.
+    if not text or not text.strip():
+        return 16000, np.zeros(1)  # empty waveform if no text
+    tokenizer, model = load_model_and_tokenizer(language)
+    inputs = tokenizer(text, return_tensors="pt").to(device)
+    with torch.no_grad():
+        output = model(**inputs)
+    # Drop the batch dimension so the audio component gets a 1-D signal.
+    waveform = output.waveform.squeeze().cpu().numpy()
+    # Report the rate stored in the model config rather than assuming 16 kHz.
+    return model.config.sampling_rate, waveform
 iface = gr.Interface(
     fn=tts,
     inputs=[
+        # Pre-select a language explicitly so the default does not depend on
+        # how the installed Gradio version treats a dropdown with no value.
+        gr.Dropdown(
+            choices=list(LANG_MODEL_MAP.keys()),
+            value="English",
+            label="Select Language",
+        ),
         gr.Textbox(label="Enter Text")
     ],
+    # type="numpy" accepts the (sample_rate, waveform) tuple returned by tts.
+    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
+    title="Multilingual Text-to-Speech (MMS)",
     description="Generate speech from text using Meta's MMS models for English, Hindi, Tamil, Malayalam, and Kannada."
 )