File size: 1,637 Bytes
c4ff521 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import gradio as gr
from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import numpy as np
import librosa
import soundfile as sf
import tempfile
# Checkpoint id shared by the model and its tokenizer so the two always match.
_MODEL_ID = "facebook/mms-tts-eng"

# Instantiate the VITS model and its tokenizer once, at import time; the
# request handler reuses these module-level objects on every call.
model = VitsModel.from_pretrained(_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
def pitch_shift_np(audio_np: np.ndarray, sampling_rate: int, pitch_shift: float) -> np.ndarray:
    """Return ``audio_np`` with its pitch shifted by ``pitch_shift`` steps.

    Thin wrapper around ``librosa.effects.pitch_shift``.

    Args:
        audio_np: Audio samples as a NumPy array.
        sampling_rate: Sampling rate of ``audio_np``; forwarded as ``sr``.
        pitch_shift: Amount to shift by; forwarded as ``n_steps``. The UI
            labels this value as semitones. May be negative or fractional.

    Returns:
        Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
    """
    shifted = librosa.effects.pitch_shift(
        audio_np,
        sr=sampling_rate,
        n_steps=pitch_shift,
    )
    return shifted
def synthesize_speech(text, pitch_shift):
    """Synthesize ``text`` to speech and return the path of a WAV file.

    The text is tokenized, run through the module-level VITS ``model`` with
    gradients disabled, optionally pitch-shifted, and written to a temporary
    ``.wav`` file at the model's configured sampling rate.

    Args:
        text: The sentence(s) to speak.
        pitch_shift: Pitch offset passed on to ``pitch_shift_np``; a falsy
            value (``0`` or ``None``) leaves the generated audio untouched.

    Returns:
        Filesystem path of the written WAV file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Nothing to speak: fail early with a message the Gradio UI can display
    # instead of pushing a blank prompt through the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Tokenize the input text into PyTorch tensors.
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        waveform = model(**inputs).waveform.squeeze().numpy()

    sampling_rate = model.config.sampling_rate

    # A zero shift asks for no change, so skip the pitch-shift call entirely.
    if pitch_shift:
        waveform = pitch_shift_np(waveform, sampling_rate, pitch_shift)

    # Reserve a unique path, then close the handle before soundfile opens the
    # same path for writing so the file is never held open twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        temp_file_path = fp.name
    sf.write(temp_file_path, waveform, sampling_rate)

    return temp_file_path
# Build the Gradio UI: a text box and a pitch slider feed synthesize_speech,
# and the file path it returns is played back by the Audio component.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.components.Textbox(lines=2, placeholder="Type your text here..."),
        gr.components.Slider(
            minimum=-2,
            maximum=2,
            step=0.1,
            # Start at "no shift". With no explicit value the slider falls
            # back to its minimum, so speech would default to -2 semitones.
            value=0,
            label="Pitch Shift (Semitones)",
        ),
    ],
    outputs=gr.components.Audio(type="filepath", label="Generated Speech"),
    title="Text to Speech Synthesis",
    description="Type text and convert it to speech using a TTS model. Use the slider to adjust the pitch.",
)

# Launch the application.
interface.launch()
|