from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import util

# Hugging Face checkpoint used for speech synthesis
model_id = "facebook/mms-tts-uig-script_arabic"

# Prefer CUDA when torch reports it as available; otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the VITS model, placing the model on the chosen device
tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
tts_model = VitsModel.from_pretrained(model_id).to(device)

def generate_audio(input_text, script, output_path="tts_output.wav"):
    """
    Synthesize speech for the given text and save it as a WAV file.

    Args:
        input_text: Text to synthesize.
        script: Name of the script ``input_text`` is written in. Any value
            other than ``"Uyghur Arabic"`` causes the text to be passed
            through ``util.ug_latn_to_arab`` before synthesis.
        output_path: Destination of the WAV file. Defaults to
            ``"tts_output.wav"`` (relative to the current working
            directory), so successive calls that keep the default
            overwrite the same file. Pass a distinct path per call when
            results must not clobber each other.

    Returns:
        The path the audio was written to (i.e. ``output_path``).
    """
    # Convert text to Uyghur Arabic if needed
    if script != "Uyghur Arabic":
        input_text = util.ug_latn_to_arab(input_text)

    # Tokenize and move inputs to the same device as the model
    tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)

    # Inference only: no gradients are needed, and the result is moved to
    # the CPU so it can be converted to a NumPy array for saving
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform.cpu()

    # Take the rate from the loaded model's config rather than hard-coding
    # it, so the WAV header stays correct if the checkpoint is changed
    sample_rate = tts_model.config.sampling_rate

    # [0] selects the first (only) entry of the output, since a single text
    # was tokenized above
    scipy.io.wavfile.write(output_path, rate=sample_rate, data=waveform.numpy()[0])

    # Return the audio file path
    return output_path