import textwrap
import gradio as gr
import numpy as np
import torch
import requests

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the SpeechT5 TTS checkpoint, its text processor, and the HiFi-GAN vocoder.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Pre-computed speaker embeddings (x-vectors) from CMU ARCTIC, keyed by speaker ID.
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
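
# Illustrative helper, not called by the app itself: load one of the embeddings
# above by speaker ID and shape it into the batched tensor that
# `generate_speech` expects. `predict` below always uses the BDL entry.
def load_speaker_embedding(speaker_id):
    embedding = np.load(speaker_embeddings[speaker_id])
    return torch.tensor(embedding).unsqueeze(0)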

def getNews(search_key):
    # Query NewsAPI for up to three articles matching the search text and
    # return the raw JSON response body as a string.
    return requests.get(
        "https://newsapi.org/v2/everything?pageSize=3&apiKey=3bca07c913ec4703a23f6ba03e15b30b&q=" + search_key
    ).content.decode("utf-8")

def getHeadlines():
    # Fetch the current US top headlines (not used by the interface below).
    return requests.get(
        "https://newsapi.org/v2/top-headlines?country=us&apiKey=3bca07c913ec4703a23f6ba03e15b30b"
    ).content.decode("utf-8")
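
# A minimal sketch, not wired into `predict`: extract readable text from the
# NewsAPI JSON (article titles) instead of speaking the raw JSON string.
# Field names follow the NewsAPI "everything" response format.
def newsToText(search_key):
    response = requests.get(
        "https://newsapi.org/v2/everything?pageSize=3&apiKey=3bca07c913ec4703a23f6ba03e15b30b&q=" + search_key
    ).json()
    return " ".join(article.get("title", "") for article in response.get("articles", []))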


def predict(text, preset):
    # `preset` is currently unused; the text box alone drives the news search.
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    # Fetch news for the search text and shorten it to at most 250 characters
    # before tokenization.
    inputs = processor(text=textwrap.shorten(getNews(text), width=250), return_tensors="pt")

    # Limit the input to the model's maximum number of text positions.
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    # Always synthesize with the BDL speaker embedding.
    speaker_embedding = np.load(speaker_embeddings["BDL"])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Convert the float waveform in [-1, 1] to 16-bit PCM at 16 kHz.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
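
# Quick local check (commented out so only the Gradio app runs on launch):
# sr, wav = predict("technology", "US")
# print(sr, wav.shape, wav.dtype)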


title = "Create 423: News to Speech"

description = """
Create 423: News to Speech. Enter a search term and the app fetches matching articles from NewsAPI, then reads them aloud with SpeechT5.
"""

article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>

<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>

</div>
"""

examples = [
    ["example 1", "US"],
    ["example 2", "International"],
]

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(label="Preset", choices=[
            "US",
            "International",
            "Technology",
            "KPop",
            "Surprise Me!"
        ], value="KPop"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(share=False)