"""Gradio demo: text-to-speech with an adjustable pitch shift.

Loads the ``facebook/mms-tts-eng`` VITS model once at import time, then
serves a small web UI where the user types text and picks a pitch offset
in semitones. The synthesized (and optionally pitch-shifted) audio is
written to a temporary WAV file whose path is handed back to Gradio.
"""

import tempfile

import gradio as gr
import librosa
import numpy as np
import scipy.io.wavfile
import soundfile as sf
import torch
from transformers import AutoTokenizer, VitsModel

# Load the model and tokenizer once; every request reuses them.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")


def pitch_shift_np(audio_np, sampling_rate, pitch_shift):
    """Shift the pitch of a waveform by ``pitch_shift`` semitones.

    Args:
        audio_np: Waveform samples as a NumPy array.
        sampling_rate: Sampling rate of ``audio_np`` in Hz.
        pitch_shift: Number of semitones to shift (fractional values are
            allowed; negative values lower the pitch).

    Returns:
        The pitch-shifted waveform. When ``pitch_shift`` is zero the input
        array is returned unchanged.
    """
    # A zero shift is a no-op; skip the resynthesis so unshifted speech is
    # passed through untouched (and the request is cheaper).
    if pitch_shift == 0:
        return audio_np
    # ``sr`` and ``n_steps`` are passed by keyword, which is what recent
    # librosa releases require for this call.
    return librosa.effects.pitch_shift(
        audio_np, sr=sampling_rate, n_steps=pitch_shift
    )


def synthesize_speech(text, pitch_shift):
    """Synthesize ``text`` to speech and return the path of a WAV file.

    Args:
        text: The text to speak.
        pitch_shift: Pitch offset in semitones applied to the generated
            audio.

    Returns:
        Filesystem path of a temporary ``.wav`` file containing the audio.
        The file is created with ``delete=False`` so that Gradio can read it
        after this function returns; it is not removed here.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Blank input has nothing to speak; report that clearly in the UI
    # instead of sending it through the tokenizer and model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    sampling_rate = model.config.sampling_rate

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")

    # Generate waveform (inference only, so gradients are disabled)
    with torch.no_grad():
        output = model(**inputs).waveform.squeeze().numpy()

    # Pitch shift
    shifted_audio = pitch_shift_np(output, sampling_rate, pitch_shift)

    # Reserve a temporary file name, then close the handle *before* writing.
    # Re-opening a NamedTemporaryFile by name while it is still open is not
    # possible on Windows, so the write happens after the ``with`` block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        temp_file_path = fp.name
    sf.write(temp_file_path, shifted_audio, sampling_rate)

    return temp_file_path


# Create the Gradio interface
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.components.Textbox(lines=2, placeholder="Type your text here..."),
        gr.components.Slider(
            minimum=-2,
            maximum=2,
            step=0.1,
            # Start at "no shift" rather than at the slider's minimum.
            value=0,
            label="Pitch Shift (Semitones)",
        ),
    ],
    outputs=gr.components.Audio(type="filepath", label="Generated Speech"),
    title="Text to Speech Synthesis",
    description="Type text and convert it to speech using a TTS model. Use the slider to adjust the pitch.",
)

# Launch the application
interface.launch()