zhiweiliu committed on
Commit
b4dfa1e
·
verified ·
1 Parent(s): 35c3f00

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -0
app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import argparse
3
+ import torchaudio
4
+ from tts import StepAudioTTS
5
+ from tokenizer import StepAudioTokenizer
6
+ from datetime import datetime
7
+ import os
8
+
9
+
10
+
11
# Plain speech synthesis
def tts_common(text, speaker, emotion, language, speed):
    """Synthesize ``text`` with a preset speaker and optional style tags.

    Each non-empty tag among ``emotion``, ``language`` and ``speed`` is wrapped
    in parentheses and prepended to ``text`` in that order, e.g.
    ``"(emotion)(language)(speed)text"``. The tagged text is then passed to
    the module-level ``tts_engine`` together with ``speaker``.

    Returns:
        The first element of the ``(audio, sample_rate)`` pair produced by
        ``tts_engine``.
    """
    style_tags = "".join(
        f"({tag})" for tag in (emotion, language, speed) if tag
    )
    # NOTE(review): the sample rate is discarded here; confirm the Gradio
    # audio output can consume the bare engine output without it.
    waveform, _sample_rate = tts_engine(style_tags + text, speaker)
    return waveform
21
+
22
+
23
# RAP / humming mode
def tts_music(text_input_rap, speaker, mode_input):
    """Synthesize lyrics in the selected performance mode.

    ``mode_input`` is wrapped in parentheses and prepended to
    ``text_input_rap``; the result is passed to the module-level
    ``tts_engine`` together with ``speaker``.

    Returns:
        The first element of the ``(audio, sample_rate)`` pair produced by
        ``tts_engine``.
    """
    mode_tag = "({})".format(mode_input)
    # NOTE(review): the sample rate is discarded here; confirm the Gradio
    # audio output can consume the bare engine output without it.
    waveform, _sample_rate = tts_engine(mode_tag + text_input_rap, speaker)
    return waveform
28
+
29
+
30
# Voice cloning
def tts_clone(text, wav_file, speaker_prompt, emotion, language, speed):
    """Synthesize ``text`` in a voice described by a reference recording.

    A speaker description is assembled from ``wav_file`` (the value supplied
    by the reference-audio upload widget) and ``speaker_prompt`` (the text
    spoken in that recording). Non-empty ``emotion``, ``language`` and
    ``speed`` tags are each wrapped in parentheses and prepended to ``text``
    in that order.

    Returns:
        The first element of the ``(audio, sample_rate)`` pair produced by
        ``tts_engine``.
    """
    style_tags = "".join(
        f"({tag})" for tag in (emotion, language, speed) if tag
    )
    reference_voice = dict(
        wav_path=wav_file,
        speaker="custom_voice",
        prompt_text=speaker_prompt,
    )
    # The preset-speaker argument is passed as an empty string; the voice is
    # described by the third argument instead.
    # NOTE(review): the sample rate is discarded here; confirm the Gradio
    # audio output can consume the bare engine output without it.
    waveform, _sample_rate = tts_engine(style_tags + text, "", reference_voice)
    return waveform
45
+
46
+
47
def launch_demo(args=None):
    """Build the Step-Audio-TTS-3B Gradio interface and start serving it.

    The UI has three tabs -- common TTS, RAP/humming, and voice cloning --
    each wiring its button to the matching module-level handler
    (``tts_common``, ``tts_music``, ``tts_clone``).

    Args:
        args: Object exposing ``server_name`` and ``server_port`` attributes
            (for example an ``argparse.Namespace``). When ``args`` is ``None``
            or an attribute is missing, ``None`` is passed to
            ``launch`` for that setting.
    """
    # Option lists
    emotion_options = ["高兴1", "高兴2", "生气1", "生气2", "悲伤1", "撒娇1"]
    language_options = ["中文", "英文", "韩语", "日语", "四川话", "粤语", "广东话"]
    speed_options = ["慢速1", "慢速2", "快速1", "快速2"]
    speaker_options = ["Tingting", "nezha"]

    def style_dropdown(choices, label):
        # Shared by both tabs that offer style tags, so the labels cannot
        # drift apart; custom values let users type tags beyond the presets.
        return gr.Dropdown(
            choices,
            label=label,
            allow_custom_value=True,
            interactive=True,
        )

    def style_dropdowns():
        # Creation order is the on-screen order: emotion, language, speed.
        emotion = style_dropdown(emotion_options, "Emotion Style (情感风格)")
        language = style_dropdown(
            language_options, "Language/Dialect (语言/方言)"
        )
        speed = style_dropdown(speed_options, "Speech Rate (语速调节)")
        return emotion, language, speed

    # Gradio interface
    with gr.Blocks() as demo:
        gr.Markdown("## 🎙️ Step-Audio-TTS-3B Demo")

        # Plain speech synthesis
        with gr.Tab("Common TTS (普通语音合成)"):
            text_input = gr.Textbox(
                label="Input Text (输入文本)",
            )
            speaker_input = gr.Dropdown(
                speaker_options,
                label="Speaker Selection (音色选择)",
            )
            emotion_input, language_input, speed_input = style_dropdowns()
            submit_btn = gr.Button("🔊 Generate Speech (生成语音)")
            output_audio = gr.Audio(
                label="Output Audio (合成语音)",
                interactive=False,
            )

            submit_btn.click(
                tts_common,
                inputs=[
                    text_input,
                    speaker_input,
                    emotion_input,
                    language_input,
                    speed_input,
                ],
                outputs=output_audio,
            )

        # RAP / humming mode
        with gr.Tab("RAP/Humming Mode (RAP/哼唱模式)"):
            text_input_rap = gr.Textbox(
                label="Lyrics Input (歌词输入)",
            )
            speaker_input_rap = gr.Dropdown(
                speaker_options,
                label="Speaker Selection (音色选择)",
            )
            mode_input = gr.Radio(
                ["RAP", "Humming (哼唱)"],
                value="RAP",
                label="Generation Mode (生成模式)",
            )
            submit_btn_rap = gr.Button("🎤 Generate Performance (生成演绎)")
            output_audio_rap = gr.Audio(
                label="Performance Audio (演绎音频)", interactive=False
            )
            submit_btn_rap.click(
                tts_music,
                inputs=[text_input_rap, speaker_input_rap, mode_input],
                outputs=output_audio_rap,
            )

        # Voice cloning
        with gr.Tab("Voice Clone (语音克隆)"):
            text_input_clone = gr.Textbox(
                label="Target Text (目标文本)",
                placeholder="Text to be synthesized with cloned voice (待克隆语音合成的文本)",
            )
            audio_input = gr.File(
                label="Reference Audio Upload (参考音频上传)",
            )
            speaker_prompt = gr.Textbox(
                label="Exact text from reference audio (输入参考音频的准确文本)",
            )
            (
                emotion_input_clone,
                language_input_clone,
                speed_input_clone,
            ) = style_dropdowns()
            submit_btn_clone = gr.Button("🗣️ Synthesize Cloned Speech (合成克隆语音)")
            output_audio_clone = gr.Audio(
                label="Cloned Speech Output (克隆语音输出)",
                interactive=False,
            )
            submit_btn_clone.click(
                tts_clone,
                inputs=[
                    text_input_clone,
                    audio_input,
                    speaker_prompt,
                    emotion_input_clone,
                    language_input_clone,
                    speed_input_clone,
                ],
                outputs=output_audio_clone,
            )

    # Start the Gradio demo. Missing settings are passed through as None.
    demo.queue().launch(
        server_name=getattr(args, "server_name", None),
        server_port=getattr(args, "server_port", None),
    )
175
+
176
+
177
if __name__ == "__main__":
    # Command-line options for the web server. Both are optional; None is
    # forwarded unchanged to launch_demo.
    parser = argparse.ArgumentParser(description="Step-Audio-TTS-3B Gradio demo")
    parser.add_argument(
        "--server-name",
        type=str,
        default=None,
        help="Host/interface to bind the demo server to.",
    )
    parser.add_argument(
        "--server-port",
        type=int,
        default=None,
        help="Port to serve the demo on.",
    )
    args = parser.parse_args()

    model_id = "stepfun-ai/Step-Audio-TTS-3B"
    tokenizer_id = "stepfun-ai/Step-Audio-Tokenizer"
    encoder = StepAudioTokenizer(tokenizer_id)
    # Must stay a module-level name: the tts_* handlers look up `tts_engine`
    # as a global when a request comes in.
    tts_engine = StepAudioTTS(model_id, encoder)
    # launch_demo takes the parsed options; calling it with no argument
    # raised TypeError before the UI could start.
    launch_demo(args)