"""Gradio demo: Urdu (Nastaliq script) text-to-speech using Facebook's MMS TTS model."""

import gradio as gr
import torch
from transformers import VitsModel, AutoTokenizer

# 1. Load the model (Nastaliq-based) and tokenizer once at startup.
# This checkpoint is intended for Urdu text in its traditional (Nastaliq) script.
model_name = "facebook/mms-tts-urd"
model = VitsModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inference-only usage: disable dropout/training-mode layers for deterministic output.
model.eval()


# 2. Define the inference function
def generate_urdu_speech(urdu_text):
    """Synthesize speech for the given Urdu text.

    Args:
        urdu_text: Input text, expected in Urdu (Nastaliq) script.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is a 1-D
        NumPy array — the format Gradio's Audio component expects for
        ``type="numpy"``.

    Raises:
        gr.Error: If the input is empty or whitespace-only (an empty token
            sequence would otherwise crash the model with an opaque error).
    """
    if not urdu_text or not urdu_text.strip():
        raise gr.Error("Please enter some Urdu text to synthesize.")

    # Tokenize the input text
    inputs = tokenizer(urdu_text, return_tensors="pt")

    # Perform inference; no_grad avoids building the autograd graph.
    with torch.no_grad():
        output = model(**inputs).waveform

    # Convert the (1, num_samples) PyTorch tensor to a 1-D NumPy array.
    waveform = output.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    # Gradio's Audio component expects (sample_rate, audio_data).
    return (sample_rate, waveform)


# 3. Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # Urdu TTS Demo (Nastaliq Script)
    Enter text in Urdu (Nastaliq) script, and this demo will synthesize speech
    using the Facebook MMS TTS model for Urdu.
    """)

    # Text input for Urdu (Nastaliq)
    text_input = gr.Textbox(
        label="Enter Urdu text",
        placeholder="مثال کے طور پر...",
        lines=3,
    )

    # Audio output
    audio_output = gr.Audio(label="Generated Urdu Speech", type="numpy")

    # Generate button
    generate_button = gr.Button("Generate Speech")

    # Wire up the button to the function
    generate_button.click(
        fn=generate_urdu_speech,
        inputs=text_input,
        outputs=audio_output,
    )

# 4. Launch the Gradio app
if __name__ == "__main__":
    demo.launch()