ayousanz committed
Commit 7037048 · verified · 1 parent: dbef717

Update from GitHub Actions - 2025-08-04 05:28:15

Deployment mode: full

Files changed (1): app.py (+223 -53)

app.py CHANGED
Old version (deleted lines marked with -):

@@ -1,63 +1,212 @@
 #!/usr/bin/env python3
 """
-Simplified Piper TTS Demo - Lightweight version without models
 """
 
 import gradio as gr
 
 
-def synthesize_speech_demo(
-    text, model_name, speaker_id, length_scale, noise_scale, noise_w
-):
-    """Demo function that returns a placeholder message"""
-    return f"""
-    🎙️ Piper TTS Demo (Simplified Version)
-
-    Model: {model_name}
-    Text: "{text}"
-
-    Parameters:
-    - Speaker ID: {speaker_id}
-    - Length Scale: {length_scale}
-    - Noise Scale: {noise_scale}
-    - Noise W: {noise_w}
-
-    Note: This is a lightweight demo without actual TTS models.
-    For the full version with working TTS, please use the 'full' deployment mode.
-    """
 
 
 def create_interface():
-    """Create simplified Gradio interface"""
 
-    with gr.Blocks(title="Piper TTS Demo - Simplified") as interface:
         gr.Markdown("""
-        # 🎙️ Piper TTS Demo (Simplified Version)
 
-        This is a lightweight demo interface without model files.
 
-        Features in the full version:
-        - ✅ Japanese text-to-speech with OpenJTalk phonemization
-        - ✅ English text-to-speech synthesis
-        - ✅ Real-time audio generation
-        - ✅ Adjustable voice parameters
-
-        Repository: [piper-plus](https://github.com/ayutaz/piper-plus)
         """)
 
         with gr.Row():
-            with gr.Column():
                 model_dropdown = gr.Dropdown(
-                    choices=["Japanese (Medium)", "English (Test)"],
                     label="Select Model",
-                    value="Japanese (Medium)",
                 )
 
                 text_input = gr.Textbox(
                     label="Text to synthesize",
                     placeholder="Enter text here...",
                     lines=3,
-                    value="こんにちは、世界!",
                 )
 
                 with gr.Accordion("Advanced Settings", open=False):
@@ -65,42 +214,73 @@ def create_interface():
                     label="Speaker ID",
                     value=0,
                     precision=0,
                 )
 
                 length_scale = gr.Slider(
-                    label="Length Scale (speaking rate)",
                     minimum=0.5,
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
                 )
 
                 noise_scale = gr.Slider(
-                    label="Noise Scale (expressiveness)",
                     minimum=0.0,
-                    maximum=2.0,
                     value=0.667,
                     step=0.01,
                 )
 
                 noise_w = gr.Slider(
-                    label="Noise W (phoneme duration variation)",
                     minimum=0.0,
-                    maximum=2.0,
                     value=0.8,
                     step=0.01,
                 )
 
                 synthesize_btn = gr.Button("Generate Speech", variant="primary")
 
-            with gr.Column():
-                output_text = gr.Textbox(
-                    label="Demo Output",
-                    lines=15,
                 )
 
         synthesize_btn.click(
-            fn=synthesize_speech_demo,
             inputs=[
                 text_input,
                 model_dropdown,
@@ -109,17 +289,7 @@ def create_interface():
                 noise_scale,
                 noise_w,
             ],
-            outputs=output_text,
-        )
-
-        gr.Examples(
-            examples=[
-                ["こんにちは、世界!", "Japanese (Medium)"],
-                ["音声合成のデモンストレーションです。", "Japanese (Medium)"],
-                ["Hello, world!", "English (Test)"],
-                ["This is a text-to-speech demonstration.", "English (Test)"],
-            ],
-            inputs=[text_input, model_dropdown],
         )
 
     return interface
 
Resulting app.py (added lines marked with +, unchanged context unprefixed):

 #!/usr/bin/env python3
 """
+Piper TTS Gradio Demo for Hugging Face Spaces
+Supports Japanese and English text-to-speech using ONNX models
 """
 
+import json
+import logging
+
 import gradio as gr
+import numpy as np
+import onnxruntime
+from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE
+
+# Download models if not present
+from download_models import download_models
+
+
+# Ensure models are downloaded
+download_models()
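
download_models.py is not shown in this commit; a minimal sketch of what it presumably does, assuming the voices are pulled from a Hugging Face model repo (the repo id below is hypothetical):

    import os
    from huggingface_hub import hf_hub_download

    def download_models():
        """Fetch the ONNX voices listed in MODELS if they are missing."""
        os.makedirs("models", exist_ok=True)
        for filename in [
            "ja_JP-test-medium.onnx",
            "ja_JP-test-medium.onnx.json",
            "test_voice.onnx",
            "test_voice.onnx.json",
        ]:
            if not os.path.exists(os.path.join("models", filename)):
                hf_hub_download(
                    repo_id="ayutaz/piper-plus-models",  # hypothetical repo id
                    filename=filename,
                    local_dir="models",
                )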
+
+
+# Import optional dependencies
+if PYOPENJTALK_AVAILABLE:
+    import pyopenjtalk
+if ESPEAK_AVAILABLE:
+    from espeak_phonemizer import Phonemizer
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Model configurations
+MODELS = {
+    "Japanese (Medium)": {
+        "path": "models/ja_JP-test-medium.onnx",
+        "config": "models/ja_JP-test-medium.onnx.json",
+        "language": "ja",
+    },
+    "English (Test)": {
+        "path": "models/test_voice.onnx",
+        "config": "models/test_voice.onnx.json",
+        "language": "en",
+    },
+}
+
+
+def load_model_config(config_path: str) -> dict:
+    """Load model configuration from JSON file"""
+    with open(config_path, encoding="utf-8") as f:
+        return json.load(f)
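
For reference, the fields this app reads from the .onnx.json voice config are sketched below; the values are illustrative, and real Piper configs carry many more entries:

    config = load_model_config("models/ja_JP-test-medium.onnx.json")
    # Roughly the shape this code relies on:
    # {
    #     "audio": {"sample_rate": 22050},
    #     "num_speakers": 1,
    #     "phoneme_id_map": {"^": [1], "$": [2], "a": [4], ...},  # phoneme -> list of IDs
    # }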
+
+
+def text_to_phonemes(text: str, language: str) -> list[str]:
+    """Convert text to phoneme strings based on language"""
+
+    if language == "ja":
+        if PYOPENJTALK_AVAILABLE:
+            # Get phonemes from OpenJTalk
+            labels = pyopenjtalk.extract_fullcontext(text)
+            phonemes = []
 
+            for label in labels:
+                # Extract phoneme from label
+                if "-" in label and "+" in label:
+                    phoneme = label.split("-")[1].split("+")[0]
+                    if phoneme not in ["sil", "pau"]:
+                        phonemes.append(phoneme)
 
+            # Add sentence markers
+            phonemes = ["^"] + phonemes + ["$"]
+        else:
+            logger.warning("pyopenjtalk not available, using fallback")
+            # Simple fallback - just use dummy phonemes
+            phonemes = ["^"] + list("aiueo") * 5 + ["$"]
 
+    elif ESPEAK_AVAILABLE:  # English
+        phonemizer = Phonemizer("en-us")
+        phoneme_str = phonemizer.phonemize(text)
+        # Convert phoneme string to list
+        phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
+    else:
+        logger.warning("espeak_phonemizer not available, using character fallback")
+        # Character-based fallback - filter non-alphabetic characters
+        cleaned_text = "".join(c.lower() for c in text if c.isalpha() or c.isspace())
+        phonemes = ["^"] + list(cleaned_text) + ["$"]
 
+    return phonemes
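
To make the label parsing concrete: pyopenjtalk.extract_fullcontext() returns HTS-style full-context labels in which the current phoneme sits between the "-" and "+" separators, which is exactly what split("-")[1].split("+")[0] pulls out. An abbreviated example (real labels are much longer):

    label = "k^o-N+n=i/A:-3+2+3"       # abbreviated full-context label
    label.split("-")[1].split("+")[0]  # -> "N"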
 
 
 
 
 
+
+def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
+    """Convert phonemes to model input IDs"""
+    phoneme_id_map = config.get("phoneme_id_map", {})
+
+    ids = []
+    for phoneme in phonemes:
+        if phoneme in phoneme_id_map:
+            ids.extend(phoneme_id_map[phoneme])
+        else:
+            # Use pad token for unknown phonemes
+            ids.append(0)
+
+    return ids
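
Note the extend() rather than append(): the config maps each phoneme to a list of IDs, so a single phoneme may contribute several IDs. With the illustrative map from the note above:

    config = {"phoneme_id_map": {"^": [1], "$": [2], "a": [4]}}
    phonemes_to_ids(["^", "a", "q", "$"], config)  # -> [1, 4, 0, 2]  ("q" is unknown, padded with 0)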
+
+
+def synthesize_speech(
+    text: str,
+    model_name: str,
+    speaker_id: int = 0,
+    length_scale: float = 1.0,
+    noise_scale: float = 0.667,
+    noise_w: float = 0.8,
+) -> tuple[int, np.ndarray]:
+    """Generate speech from text using selected model"""
+
+    if not text.strip():
+        raise gr.Error("Please enter some text")
+
+    if model_name not in MODELS:
+        raise gr.Error("Invalid model selected")
+
+    model_info = MODELS[model_name]
+    config = load_model_config(model_info["config"])
+
+    # Convert text to phoneme IDs
+    phonemes = text_to_phonemes(text, model_info["language"])
+    phoneme_ids = phonemes_to_ids(phonemes, config)
+
+    if not phoneme_ids:
+        raise gr.Error("Failed to convert text to phonemes")
+
+    # Load ONNX model
+    sess_options = onnxruntime.SessionOptions()
+    sess_options.inter_op_num_threads = 1
+    sess_options.intra_op_num_threads = 1
+
+    try:
+        model = onnxruntime.InferenceSession(
+            model_info["path"],
+            sess_options=sess_options,
+            providers=["CPUExecutionProvider"],
+        )
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        raise gr.Error(f"Failed to load model: {str(e)}") from e
+
+    # Prepare inputs
+    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
+    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)
+
+    # Handle speaker ID for multi-speaker models
+    sid = None
+    if config.get("num_speakers", 1) > 1:
+        sid = np.array([speaker_id], dtype=np.int64)
+
+    # Run inference
+    try:
+        inputs = {
+            "input": text_array,
+            "input_lengths": text_lengths,
+            "scales": scales,
+        }
+
+        if sid is not None:
+            inputs["sid"] = sid
+
+        audio = model.run(None, inputs)[0]
+
+        # Remove batch and channel dimensions
+        audio = audio.squeeze()
+
+        # Convert to int16
+        audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
+
+        sample_rate = config.get("audio", {}).get("sample_rate", 22050)
+
+        return sample_rate, audio
+
+    except Exception as e:
+        logger.error(f"Inference failed: {e}")
+        raise gr.Error(f"Failed to generate speech: {str(e)}") from e
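
The function returns the (sample_rate, int16_array) tuple that gr.Audio(type="numpy") consumes directly. Outside Gradio, the same output can be written to a WAV file with the standard library, roughly like this:

    import wave

    sample_rate, audio = synthesize_speech("こんにちは、世界!", "Japanese (Medium)")

    with wave.open("output.wav", "wb") as f:
        f.setnchannels(1)          # mono
        f.setsampwidth(2)          # int16 -> 2 bytes per sample
        f.setframerate(sample_rate)
        f.writeframes(audio.tobytes())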
 
 
 def create_interface():
+    """Create Gradio interface"""
 
+    with gr.Blocks(title="Piper TTS Demo") as interface:
         gr.Markdown("""
+        # 🎙️ Piper TTS Demo
 
+        High-quality text-to-speech synthesis supporting Japanese and English.
 
+        This demo uses ONNX models for fast CPU inference.
         """)
 
         with gr.Row():
+            with gr.Column(scale=2):
                 model_dropdown = gr.Dropdown(
+                    choices=list(MODELS.keys()),
                     label="Select Model",
+                    value=list(MODELS.keys())[0],
                 )
 
                 text_input = gr.Textbox(
                     label="Text to synthesize",
                     placeholder="Enter text here...",
                     lines=3,
                 )
 
                 with gr.Accordion("Advanced Settings", open=False):
@@ -65,42 +214,73 @@ def create_interface():
                     label="Speaker ID",
                     value=0,
                     precision=0,
+                    minimum=0,
+                    maximum=10,
+                    info="For multi-speaker models only",
                 )
 
                 length_scale = gr.Slider(
+                    label="Speed",
                     minimum=0.5,
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
+                    info="Lower = faster speech",
                 )
 
                 noise_scale = gr.Slider(
+                    label="Expressiveness",
                     minimum=0.0,
+                    maximum=1.0,
                     value=0.667,
                     step=0.01,
                 )
 
                 noise_w = gr.Slider(
+                    label="Phoneme Duration Variance",
                     minimum=0.0,
+                    maximum=1.0,
                     value=0.8,
                     step=0.01,
                 )
 
                 synthesize_btn = gr.Button("Generate Speech", variant="primary")
 
+            with gr.Column(scale=1):
+                audio_output = gr.Audio(
+                    label="Generated Speech",
+                    type="numpy",
+                    autoplay=True,
                 )
 
+                gr.Markdown("""
+                ### Tips:
+                - Japanese model expects hiragana/kanji text
+                - English model works with standard text
+                - Adjust speed for faster/slower speech
+                - Higher expressiveness = more natural variation
+                """)
+
+        # Examples
+        gr.Examples(
+            examples=[
+                ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
+                [
+                    "おはようございます。本日の会議は午後3時から始まります。",
+                    "Japanese (Medium)",
+                ],
+                ["Hello world! This is a text to speech demo.", "English (Test)"],
+                [
+                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
+                    "English (Test)",
+                ],
+            ],
+            inputs=[text_input, model_dropdown],
+        )
+
+        # Event handlers
         synthesize_btn.click(
+            fn=synthesize_speech,
             inputs=[
                 text_input,
                 model_dropdown,
@@ -109,17 +289,7 @@ def create_interface():
                 noise_scale,
                 noise_w,
             ],
+            outputs=audio_output,
         )
 
     return interface
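
The hunks end at return interface, so the launch code is unchanged by this commit and falls outside the diff; on a Hugging Face Space it is presumably the usual pattern:

    if __name__ == "__main__":
        demo = create_interface()
        demo.launch()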