File size: 4,077 Bytes
90c5b6d
 
 
 
831b161
90c5b6d
ec39417
 
90c5b6d
831b161
cd96dae
ec39417
 
831b161
 
 
 
ec39417
325312c
831b161
 
325312c
90c5b6d
 
325312c
 
 
 
 
 
90c5b6d
 
 
1d1e03e
 
 
 
ec39417
90c5b6d
 
 
 
 
54811b2
 
 
 
 
90c5b6d
 
 
831b161
 
90c5b6d
ec39417
1d1e03e
ec39417
1d1e03e
 
 
 
 
90c5b6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec39417
 
 
 
 
325312c
ec39417
 
 
 
 
90c5b6d
 
be8cb80
 
 
 
 
90c5b6d
 
 
54811b2
 
ec39417
90c5b6d
 
 
 
 
 
 
 
 
 
 
7cbdcbc
90c5b6d
7cbdcbc
 
90c5b6d
 
 
 
ec39417
90c5b6d
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import gradio as gr
import numpy as np
import torch

from transformers import pipeline

from resemble_enhance.enhancer.inference import denoise, enhance

# Hub repository of the fine-tuned text-to-speech checkpoint and the exact
# revision of it that is passed to the pipeline.
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"

# Use CUDA device 0 when a GPU is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = 0

# Text-to-speech pipeline shared by every request.
pipe = pipeline(
    task="text-to-speech",
    model=checkpoint_finetuned,
    revision=revision,
    device=device,
    use_fast=True,
)

# Directory containing one ``.npy`` speaker-embedding file per voice.
embeddings_dir = "embeddings/nst-da-metricgan-plus/"

# Speaker id -> path of the embedding file loaded for that speaker.
speaker_embeddings = {
    speaker_id: embeddings_dir + file_name
    for speaker_id, file_name in (
        ("F23", "female_23_vestjylland.npy"),
        ("F24", "female_24_storkoebenhavn.npy"),
        ("F49", "female_49_nordjylland.npy"),
        ("M51", "male_51_vest_sydsjaelland.npy"),
        ("M18", "male_18_vest_sydsjaelland.npy"),
        ("M31", "male_31_fyn.npy"),
    )
}


# Integer sample type of the generated audio and its largest representable
# value, which is used as the scaling factor for the waveform.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


def predict(text, speaker, post_process):
    """Synthesize speech for ``text`` in the voice of the selected speaker.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            short-circuits to an empty clip.
        speaker: Speaker label; its first three characters (e.g. ``"F23"``)
            are the key looked up in ``speaker_embeddings``.
        post_process: When truthy, the synthesized waveform is run through
            ``enhance_audio`` before being returned.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an int16
        NumPy array.

    Raises:
        KeyError: If the first three characters of ``speaker`` are not a key
            of ``speaker_embeddings``.
    """
    if text is None or not text.strip():
        # Nothing to say: return an empty clip with the same int16 dtype as
        # the normal path so callers always see one sample format.
        return (16000, np.zeros(0, dtype=np.int16))

    text = replace_danish_letters(text)

    speaker_id = speaker[:3]
    speaker_embedding = np.load(speaker_embeddings[speaker_id])
    # The embedding is given a leading dimension of size 1 before it is
    # handed to the pipeline.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})

    if post_process:
        sr, audio = enhance_audio(speech["audio"], speech["sampling_rate"], device)
    else:
        sr, audio = speech["sampling_rate"], speech["audio"]

    # Assumes a float waveform nominally in [-1, 1] (TODO confirm for both
    # paths). Clamp first: a sample outside that range would exceed the int16
    # range after scaling and overflow in the cast instead of saturating.
    audio = np.clip(audio, -1.0, 1.0)
    audio = (audio * max_range).astype(np.int16)

    return sr, audio


# Ordered (character, substitute) pairs applied to the input text before it is
# synthesized.
replacements = [
    # Symbols and control characters.
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    # Upper-case letters.
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    # Lower-case letters.
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]


def replace_danish_letters(text):
    """Return ``text`` with every entry of ``replacements`` applied in order."""
    cleaned = text
    for pair in replacements:
        cleaned = cleaned.replace(pair[0], pair[1])
    return cleaned


def enhance_audio(waveform, sr, device="cuda"):
    """Denoise a waveform and then enhance it with resemble-enhance.

    Args:
        waveform: Audio samples; converted to a float32 torch tensor.
        sr: Sampling rate of ``waveform``.
        device: Device forwarded to ``denoise`` and ``enhance``.

    Returns:
        A ``(sampling_rate, samples)`` tuple: the sampling rate reported by
        ``enhance`` and the enhanced audio as a NumPy array on the CPU.
    """
    samples = torch.tensor(waveform).float()

    # Stage 1: denoising; its output rate feeds the enhancement stage.
    clean, clean_sr = denoise(samples, sr, device)

    # Stage 2: enhancement with fixed solver settings.
    enhance_settings = {"nfe": 64, "solver": "midpoint", "lambd": 0.1, "tau": 0.5}
    boosted, boosted_sr = enhance(clean, clean_sr, device, **enhance_settings)

    return boosted_sr, boosted.cpu().numpy()


# Heading shown at the top of the demo page.
title = "Danish Speech Synthesis"

# Markdown blurb under the title; links to the checkpoint's model page.
description = (
    "Synthesize long-form danish speech from text with the click of a button! "
    f"Demo uses the checkpoint [{checkpoint_finetuned}]"
    f"(https://huggingface.co/{checkpoint_finetuned})"
    " and 🤗 Transformers to synthesize speech."
)

# One example row: (input text, speaker label, enhance-audio flag).
examples = [
    [
        (
            "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. "
            "De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene."
        ),
        "F23 (Female, 23, Vestjylland)",
        True,
    ],
]

# Labels offered in the speaker selector. ``predict`` only reads the first
# three characters of the chosen label (the speaker id), so the rest of each
# label is purely descriptive and is kept in one consistent
# "ID (Sex, age, region)" format.
speaker_choices = [
    "F23 (Female, 23, Vestjylland)",
    "F24 (Female, 24, Storkøbenhavn)",
    "F49 (Female, 49, Nordjylland)",
    "M51 (Male, 51, Vest-sydsjælland)",
    "M18 (Male, 18, Vest-sydsjælland)",
    "M31 (Male, 31, Fyn)",
]

# Gradio UI: text box + speaker selector + enhancement toggle -> audio player.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=speaker_choices,
            # Default to the first speaker; taken from the list so the default
            # can never drift from the available choices.
            value=speaker_choices[0],
        ),
        gr.Checkbox(label="Enhance audio (takes substantially longer)"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

demo.launch()