from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import util
# Model ID and setup
model_id = "facebook/mms-tts-uig-script_arabic"
tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
tts_model = VitsModel.from_pretrained(model_id)
# Select the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts_model = tts_model.to(device)
def generate_audio(input_text, script):
    """
    Generate speech audio for the given input text.

    If the text is not already in Uyghur Arabic script, it is first
    converted from Uyghur Latin via util.ug_latn_to_arab.
    Returns the path to the generated WAV file.
    """
    # Convert text to Uyghur Arabic script if needed
    if script != "Uyghur Arabic":
        input_text = util.ug_latn_to_arab(input_text)

    # Tokenize and move inputs to the same device as the model
    tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)

    # Perform inference; the waveform has shape (batch_size, num_samples)
    with torch.no_grad():
        tts_output = tts_model(**tts_inputs).waveform.cpu()  # move output back to CPU for saving

    # Save to a temporary file at the model's sampling rate (16 kHz for MMS-TTS)
    output_path = "tts_output.wav"
    sample_rate = tts_model.config.sampling_rate
    scipy.io.wavfile.write(output_path, rate=sample_rate, data=tts_output.numpy()[0])

    # Return the audio file path
    return output_path
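
# Minimal local usage sketch (not part of the original Space code): runs one
# synthesis call to check the pipeline end to end. The sample sentence is only
# illustrative placeholder text, and "Uyghur Latin" is an assumed value for the
# `script` argument; adjust both to match how the app actually calls
# generate_audio.
if __name__ == "__main__":
    wav_path = generate_audio("yaxshimusiz", "Uyghur Latin")
    print(f"Saved synthesized audio to {wav_path}")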