Babyloncoder's picture
Create app.py
c4ff521 verified
raw
history blame
1.64 kB
import gradio as gr
from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import numpy as np
import librosa
import soundfile as sf
import tempfile
# Load the tokenizer and the VITS model once, at import time, from the same
# pretrained checkpoint so their vocabularies match.
_CHECKPOINT = "facebook/mms-tts-eng"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = VitsModel.from_pretrained(_CHECKPOINT)
def pitch_shift_np(audio_np, sampling_rate, pitch_shift):
    """Shift the pitch of an audio signal by a number of semitones.

    Args:
        audio_np: Audio samples, passed to ``librosa.effects.pitch_shift``.
        sampling_rate: Sampling rate of ``audio_np``, passed as ``sr``.
        pitch_shift: Number of semitones to shift by, passed as ``n_steps``.
            May be negative or fractional.

    Returns:
        The pitch-shifted audio. When ``pitch_shift`` is zero the input
        object itself is returned, untouched (not a copy).
    """
    # A zero shift is a no-op: skip librosa's processing entirely so the
    # signal is neither needlessly recomputed nor subtly altered.
    if pitch_shift == 0:
        return audio_np
    return librosa.effects.pitch_shift(
        audio_np, sr=sampling_rate, n_steps=pitch_shift
    )
def synthesize_speech(text, pitch_shift):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: The text to speak.
        pitch_shift: Pitch offset in semitones, forwarded to
            ``pitch_shift_np``.

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio.
        The file is created with ``delete=False`` so it outlives this call;
        nothing in this function removes it.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Nothing to speak: surface a readable message in the UI instead of
    # running the model on empty input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    sampling_rate = model.config.sampling_rate

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")

    # Generate the waveform; gradients are not needed for inference
    with torch.no_grad():
        output = model(**inputs).waveform.squeeze().numpy()

    # Pitch shift
    shifted_audio = pitch_shift_np(output, sampling_rate, pitch_shift)

    # Reserve a uniquely named file and close our handle first, so that
    # soundfile is the only writer when it opens the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        temp_file_path = fp.name
    sf.write(temp_file_path, shifted_audio, sampling_rate)
    return temp_file_path
# Create the Gradio interface: a text box and a pitch slider feed
# synthesize_speech, whose returned file path is played back as audio.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.components.Textbox(lines=2, placeholder="Type your text here..."),
        # Start at 0 (no shift). Without an explicit value the slider starts
        # at its minimum, i.e. every request would be shifted by -2 semitones
        # unless the user moves it.
        gr.components.Slider(
            minimum=-2,
            maximum=2,
            step=0.1,
            value=0,
            label="Pitch Shift (Semitones)",
        ),
    ],
    outputs=gr.components.Audio(type="filepath", label="Generated Speech"),
    title="Text to Speech Synthesis",
    description="Type text and convert it to speech using a TTS model. Use the slider to adjust the pitch.",
)

# Launch the application
interface.launch()