import gradio as gr
import librosa
import numpy as np
import torch
import requests
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
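
# Sketch (not called by predict() below): the table above could back a simple voice
# lookup, assuming each .npy file holds a single speaker-embedding vector as produced
# by the CMU ARCTIC prep script referenced in the article text further down.
def load_speaker_embedding(voice="BDL"):
    # Load the vector and add a batch dimension, the shape generate_speech expects.
    embedding = np.load(speaker_embeddings[voice], allow_pickle=True)
    return torch.tensor(embedding).unsqueeze(0)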
def getNews(search_key):
    # Fetch up to 3 articles matching the search key from NewsAPI and return the raw JSON string.
    return requests.get("https://newsapi.org/v2/everything?q=" + search_key + "&pagesize=3&apiKey=3bca07c913ec4703a23f6ba03e15b30b").content.decode("utf-8")
def getHeadlines():
    # Fetch the current US top headlines from NewsAPI and return the raw JSON string.
    return requests.get("https://newsapi.org/v2/top-headlines?country=us&apiKey=3bca07c913ec4703a23f6ba03e15b30b").content.decode("utf-8")
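
# Sketch (hypothetical helper, not called by predict() below): the two functions above
# return raw NewsAPI JSON strings; something like this could pull out just the article
# titles so the synthesized speech reads naturally. Field names follow the NewsAPI
# "articles" response schema.
import json

def newsToText(news_json):
    # Join the article titles into one sentence-per-article string for the TTS model.
    articles = json.loads(news_json).get("articles", [])
    return ". ".join(article["title"] for article in articles if article.get("title"))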
def predict(text, preset):
    # Return silence for empty input so the audio component always receives valid data.
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    # text = getNews()
    # inputs = processor(text=text, return_tensors="pt")
    # Tokenize the raw NewsAPI response for the given search term.
    inputs = processor(text=getNews(text), return_tensors="pt")

    # Limit the input length to what the model can handle.
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    # Load a fixed CMU ARCTIC speaker embedding and add a batch dimension.
    speaker_embedding = np.load("spkemb/cmu_us_awb_arctic-wav-arctic_a0002.npy", allow_pickle=True)
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    # Generate speech with the HiFi-GAN vocoder and convert to 16-bit PCM at 16 kHz.
    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
title = "SpeechT5: Speech Synthesis"
description = """
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
"""
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
</div>
"""
examples = [
    ["example 1", "US"],
    ["example 2", "US"],
]
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(label="Preset", choices=[
            "US",
            "International",
            "Technology",
            "KPop",
            "Surprise Me!"
        ], value="KPop"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(share=True)