File size: 6,395 Bytes
e072488
 
 
 
 
 
 
 
 
 
 
 
 
c6c1f65
e072488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import spaces
import gradio as gr
import torch
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import tempfile

device = "cuda" if torch.cuda.is_available() else "cpu"

####################
#  全局加载模型
####################
llasa_3b = "HKUSTAudio/Llasa-1B-multi-speakers-genshin-zh-en-ja-ko"
print("Loading tokenizer & model ...")
tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
model = AutoModelForCausalLM.from_pretrained(llasa_3b)
model.eval().to(device)

print("Loading XCodec2Model ...")
codec_model_path = "HKUSTAudio/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().to(device)

print("Models loaded.")

####################
#  推理用函数
####################
def extract_speech_ids(speech_tokens_str):
    """
    将类似 <|s_23456|> 还原为 int 23456
    """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            num_str = token_str[4:-2]
            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids
@spaces.GPU
def text2speech(input_text, speaker_choice):
    """
    将文本转为语音波形,并返回音频文件路径
    """
    with torch.no_grad():
        # 在输入文本前后拼接提示token
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": f"Speaker {speaker_choice} <|SPEECH_GENERATION_START|>"}
        ]

        # tokenizer.apply_chat_template 是 Llasa 风格的对话模式
        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        ).to(device)

        # 结束符
        speech_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

        # 文本生成
        outputs = model.generate(
            input_ids,
            max_length=2048,  # We trained our model with a max length of 2048
            eos_token_id= speech_end_id ,
            do_sample=True,    
            top_p=0.95,           #  Adjusts the diversity of generated content
            temperature=0.9,   #  Controls randomness in output
            repetition_penalty= 1.2,
        )

        # 把新生成的 token(不包括输入部分)取出来
        generated_ids = outputs[0][input_ids.shape[1]:-1]
        speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # 将 <|s_23456|> 提取成 [23456 ...]
        speech_tokens_int = extract_speech_ids(speech_tokens_str)
        speech_tokens_int = torch.tensor(speech_tokens_int).to(device).unsqueeze(0).unsqueeze(0)

        # 调用 XCodec2Model 解码波形
        gen_wav = Codec_model.decode_code(speech_tokens_int)  # [batch, channels, samples]

    # 获取音频数据和采样率
    audio = gen_wav[0, 0, :].cpu().numpy()
    sample_rate = 16000

    # 将音频保存到临时文件
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        sf.write(tmpfile.name, audio, sample_rate)
        audio_path = tmpfile.name

    return audio_path

####################
#  Gradio 界面
####################
speaker_choices =   [   
        'Paimon', 'Traveler', 'Nahida', 'Navia', 'Furina', 'Lyney', 'Layla', 'Neuvillette',
        'Kaveh', 'Tighnari', 'Alhaitham', 'Kaeya', 'Dehya', 'Zhongli', 'Cyno', 'Yoimiya',
        'Ningguang', 'Nilou', 'Faruzan', 'Wriothesley', 'Collei', 'Thoma', 'Noelle',
        'Venti', 'Lynette', 'Charlotte', 'Diona', 'Yelan', 'Clorinde', 'Sigewinne', 'Beidou',
        'Gorou', 'Lisa', 'Yanfei', 'Sucrose', 'Sayu', 'Ganyu', 'Chiori', 'Chongyun', 'Freminet',
        'Barbara', 'Baizhu', 'Kirara', 'Dainsleif', 'Klee', 'Albedo', 'Dori', 'Eula', 'Xiao',
        'Mona', 'Bennett', 'Amber', 'Xingqiu', 'Shenhe', 'Childe', 'Xiangling', 'Jean', 'Diluc',
        'Katheryne', 'Mika', 'Keqing', 'Candace'
    ]
#["puck", "kore"]

demo = gr.Interface(
    fn=text2speech,
    inputs=[gr.Textbox(label="Enter text", lines=5),
            gr.Dropdown(choices=speaker_choices, label="Select Speaker", value="Paimon")],
    outputs=gr.Audio(label="Generated Audio", type="filepath"),
    title="Llasa-1B TTS finetuned using simon3000/genshin-voice",
    description = (
    "Input a piece of text (Chinese, English, Japanese, Korean), select a speaker, "
    "and click to generate speech. If fail, try a few more times\n"
    "Speakers (角色选择):\n"
    "'Paimon(派蒙)', 'Traveler(旅行者)', 'Nahida(纳西妲)', 'Navia(纳维亚)', 'Furina(芙宁娜)', "
    "'Lyney(莱依拉)', 'Layla(莱依拉)', 'Neuvillette(诺维利特)', 'Kaveh(卡维赫)', 'Tighnari(提纳里)', "
    "'Alhaitham(艾尔海森)', 'Kaeya(凯亚)', 'Dehya(迪希雅)', 'Zhongli(钟离)', 'Cyno(赛诺)', 'Yoimiya(宵宫)', "
    "'Ningguang(凝光)', 'Nilou(妮露)', 'Faruzan(法露珊)', 'Wriothesley(维欧塞利)', 'Collei(可莉)', 'Thoma(托马)', "
    "'Noelle(诺艾尔)', 'Venti(温迪)', 'Lynette(莉妮特)', 'Charlotte(夏洛特)', 'Diona(迪奥娜)', 'Yelan(夜兰)', "
    "'Clorinde(克洛琳德)', 'Sigewinne(希格温)', 'Beidou(北斗)', 'Gorou(五郎)', 'Lisa(丽莎)', 'Yanfei(烟绯)', "
    "'Sucrose(砂糖)', 'Sayu(早柚)', 'Ganyu(甘雨)', 'Chiori(千里)', 'Chongyun(重云)', 'Freminet(弗雷明内)', "
    "'Barbara(芭芭拉)', 'Baizhu(白术)', 'Kirara(切尔拉)', 'Dainsleif(戴因斯雷布)', 'Klee(可莉)', 'Albedo(阿贝多)', "
    "'Dori(多莉)', 'Eula(优菈)', 'Xiao(魈)', 'Mona(莫娜)', 'Bennett(班尼特)', 'Amber(安柏)', 'Xingqiu(行秋)', "
    "'Shenhe(申鹤)', 'Childe(公子)', 'Xiangling(香菱)', 'Jean(琴)', 'Diluc(迪卢克)', 'Katheryne(凯瑟琳)', "
    "'Mika(米卡)', 'Keqing(刻晴)', 'Candace(坎蒂丝)'"
))

if __name__ == "__main__":
    demo.launch(
    share=True )