import tempfile

import gradio as gr
import librosa
import numpy as np
import scipy.io.wavfile
import soundfile as sf
import torch
from transformers import AutoTokenizer, VitsModel


# Hugging Face checkpoint shared by the synthesis model and its tokenizer.
_CHECKPOINT = "facebook/mms-tts-eng"

# Both objects are created once, at import time; synthesize_speech reads them
# as module globals on every call instead of reloading them.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = VitsModel.from_pretrained(_CHECKPOINT)


def pitch_shift_np(audio_np, sampling_rate, pitch_shift):
    """Shift the pitch of a waveform with librosa.

    Args:
        audio_np: Audio samples as a NumPy array; passed to librosa unchanged.
        sampling_rate: Sample rate of ``audio_np``, forwarded as ``sr``.
        pitch_shift: Size of the shift, forwarded as librosa's ``n_steps``
            (semitones with librosa's default of 12 bins per octave).
            May be fractional; negative values lower the pitch.

    Returns:
        The pitch-shifted waveform. When ``pitch_shift`` is zero the input
        array itself is returned (no copy, no processing).
    """
    # A zero shift cannot change the pitch, so skip librosa entirely and hand
    # back the samples exactly as they came in instead of reprocessing them.
    if pitch_shift == 0:
        return audio_np

    return librosa.effects.pitch_shift(
        audio_np, sr=sampling_rate, n_steps=pitch_shift
    )


def synthesize_speech(text, pitch_shift):
    """Synthesize speech for ``text`` and write it to a temporary WAV file.

    The text is tokenized and run through the module-level ``model``; the
    resulting waveform is pitch-shifted with ``pitch_shift_np`` and saved
    with soundfile at ``model.config.sampling_rate``.

    Args:
        text: The text to speak.
        pitch_shift: Pitch shift forwarded to ``pitch_shift_np``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front with a readable message rather than
    # passing an empty string into the tokenizer and model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = tokenizer(text, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # squeeze() drops the size-1 dimensions before converting to NumPy.
    audio = waveform.squeeze().numpy()
    sampling_rate = model.config.sampling_rate
    shifted_audio = pitch_shift_np(audio, sampling_rate, pitch_shift)

    # Reserve a unique path, then close the handle before soundfile reopens
    # the file by name. delete=False keeps the file after the handle closes
    # so the caller can still read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        temp_file_path = fp.name
    sf.write(temp_file_path, shifted_audio, sampling_rate)

    return temp_file_path


# Web UI: a text box and a pitch slider in, the generated audio file out.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.components.Textbox(lines=2, placeholder="Type your text here..."),
        gr.components.Slider(
            minimum=-2,
            maximum=2,
            step=0.1,
            # Start at "no shift". Without an explicit value the slider
            # starts at its minimum, i.e. -2 semitones.
            value=0,
            label="Pitch Shift (Semitones)",
        ),
    ],
    # synthesize_speech returns a path to a WAV file.
    outputs=gr.components.Audio(type="filepath", label="Generated Speech"),
    title="Text to Speech Synthesis",
    description="Type text and convert it to speech using a TTS model. Use the slider to adjust the pitch.",
)

# Start the Gradio server as soon as the script is executed.
interface.launch()