from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import util

# Model ID and setup: the tokenizer and model are loaded once at import time
# and shared by every call to generate_audio().
model_id = "facebook/mms-tts-uig-script_arabic"
tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
tts_model = VitsModel.from_pretrained(model_id)

# Automatically allocate the device: use CUDA when available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts_model = tts_model.to(device)


def generate_audio(input_text, script, *, output_path="tts_output.wav"):
    """Synthesize speech for ``input_text`` and write it to a WAV file.

    Args:
        input_text: The text to synthesize.
        script: Name of the script ``input_text`` is written in. Any value
            other than ``"Uyghur Arabic"`` causes the text to be passed
            through ``util.ug_latn_to_arab`` before tokenization.
        output_path: Where the WAV file is written. With the default value
            every call overwrites the same file in the current working
            directory; pass a distinct path per call when results must not
            clobber each other.

    Returns:
        The path of the written WAV file (``output_path``).
    """
    # Convert text to Uyghur Arabic if needed.
    if script != "Uyghur Arabic":
        input_text = util.ug_latn_to_arab(input_text)

    # Tokenize and move the inputs to the same device as the model.
    tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)

    # Inference only: no gradients are needed. The waveform is moved back to
    # the CPU so it can be converted to a NumPy array for saving.
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform.cpu()

    # Take the sample rate from the loaded checkpoint's config instead of
    # hard-coding it, so the WAV header stays correct if model_id changes.
    sample_rate = tts_model.config.sampling_rate

    # Index 0 selects the first (only) item of the returned waveform batch.
    scipy.io.wavfile.write(output_path, rate=sample_rate, data=waveform.numpy()[0])

    # Return the audio file path.
    return output_path