import os
from transformers import VitsModel, VitsTokenizer
import torch
from shannlp import util, word_tokenize


def preprocess_string(input_string: str) -> str:
    """Prepare Shan text for the tokenizer.

    Parentheses are stripped, the text is segmented with shannlp's
    word_tokenize, and any purely numeric token is spelled out as a
    Shan number word so the TTS model never sees raw digits.
    """
    input_string = input_string.replace("(", "").replace(")", "")
    string_token = word_tokenize(input_string)
    num_to_shanword = util.num_to_shanword

    result = []
    for token in string_token:
        if token.strip().isdigit():
            # Convert digit tokens (e.g. "66") to their Shan word form.
            result.append(num_to_shanword(int(token)))
        else:
            result.append(token)

    return "".join(result)


def synthesize(model: str, input_string: str, speed: float = 1.0):
    """Synthesize Shan speech with an MMS VITS checkpoint.

    `model` selects one of the checkpoints below and `speed` scales the
    speaking rate. Returns ((sample_rate, waveform), processed_text).
    """
    # Use a read token from the environment if present; True falls back to
    # the locally cached Hugging Face credentials.
    auth_token = os.environ.get("TOKEN_READ_SECRET") or True

    model_id = {
        "original": "facebook/mms-tts-shn",
        "nova": "NorHsangPha/mms-tts-nova-train",
        "homhom": "NorHsangPha/mms-tts-shn-train",
    }[model]

    model = VitsModel.from_pretrained(model_id, token=auth_token)
    tokenizer = VitsTokenizer.from_pretrained(model_id, token=auth_token)

    # Pick the best available device: CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model.to(device)

    processed_string = preprocess_string(input_string)
    inputs = tokenizer(processed_string, return_tensors="pt").to(device)

    # Fix the seed so the stochastic duration/noise sampling is repeatable.
    torch.manual_seed(42)

    model.speaking_rate = speed
    model.noise_scale = 0.2
    model.noise_scale_w = 0.2
    model.length_scale = 1.0 / speed

    with torch.no_grad():
        output = model(**inputs).waveform

    output = output.squeeze().cpu().numpy()

    # MMS TTS checkpoints generate audio at 16 kHz.
    return ((16_000, output), processed_string)


TTS_EXAMPLES = [
    [
        "homhom",
        "မိူင်းတႆးၵေႃႈပႆႇပႃႈလႆႈၶိုၼ်း ပူၼ်ႉမႃးသိပ်းပီယဝ်ႉလူး",
        1.0,
    ],
    [
        "homhom",
        "လိၵ်ႈတႆးတႆးဢမ်ႇႁဵၼ်းၽႂ်တေႁဵၼ်း?",
        1.0,
    ],
    [
        "homhom",
        "ၼုင်ႈၶူဝ်းတႆးလႄႈဢမ်ႇသင် ၵေႃႈႁဝ်းပဵၼ်တႆး",
        1.0,
    ],
    [
        "homhom",
        "သိူဝ်တူၵ်းထၢင် ပၢင်တၢႆးၵျွၵ်းဢွၵ်ႇ",
        1.0,
    ],
    [
        "homhom",
        "ဝူဝ်းၽၢတ်ႇၽုင်ပဵၼ်ၼိူဝ်ႉ ၵူၼ်းၽၢတ်ႇၸိူဝ်ႉပဵၼ်ပိူၼ်ႈ",
        1.0,
    ],
    [
        "homhom",
        "ၵူၼ်းၵဝ်ႈၵေႃႉ ႁဵတ်းၵဝ်ႈၵေႇ ၽိုၼ်းသေႇၵူၼ်ႈမေႃႈဢမ်ႇမီး",
        1.0,
    ],
    [
        "homhom",
        "မိူဝ်ႈပိူၼ်ႈမႃးတွႆႇႁူဝ်ၸဝ်ႈၵဝ်ႇသေ တိုၵ်ႉတေဝႃႈ တႆးတိုၵ်ႉတွင်းယူႇႁိုဝ်",
        1.0,
    ],
    ["nova", "မႂ်ႇသုင်ၶႃႈ ယူႇလီၵိၼ်ဝၢၼ်ၵတ်းယဵၼ် လီယူႇၶႃႈၼေႃႈ။", 1.0],
    ["original", "ပဵၼ်ယၢမ်းဢၼ် ၸႂ်တိုၼ်ႇတဵၼ်ႈ ၽူင်ႉပိဝ် တႃႇၼုမ်ႇယိင်းၼုမ်ႇၸၢႆးၶဝ် ၸိူဝ်းဢၼ် တေလႆႈၶိုၼ်ႈႁဵၼ်းၼၼ်ႉယူႇ", 1.0],
    [
        "nova",
        "ပဵၼ်ၵၢၼ်ၾုၵ်ႇၾင်ၸႂ်ၵၼ်ၼႅၼ်ႈ ၼၵ်းပၵ်းၸႂ် ယွင်ႈၵုၼ်းယွင်ႈမုၼ်ဢူငဝ်း ၸိူဝ်းၽူႈလဵပ်ႈႁဵၼ်းႁူႉပိုၼ်း ၸဵမ်လဵၵ်ႉယႂ်ႇၼုမ်ႇထဝ်ႈ ၼႂ်းၸိူဝ်း ၽူႈႁၵ်ႉ ၸိူဝ်ႉၸၢတ်ႈလၢႆပၢၼ်လၢႆသႅၼ်းမႃး 66 ပီ ၼပ်ႉတင်ႈတႄႇ 1958 ဝၼ်းတီႈ 21 လိူၼ်မေႊ။",
        1.0,
    ],
]
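

# Minimal usage sketch (an assumption, not part of the original module): it
# calls synthesize() directly on the first bundled example and saves the audio
# with soundfile, an extra dependency that is not imported above.
if __name__ == "__main__":
    import soundfile as sf

    (sample_rate, waveform), text = synthesize("homhom", TTS_EXAMPLES[0][1], speed=1.0)
    # waveform is a mono float array sampled at 16 kHz.
    sf.write("output.wav", waveform, sample_rate)
    print("Synthesized:", text)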