ayousanz committed · verified
Commit dbef717 · 1 Parent(s): 93872e8

Update from GitHub Actions - 2025-08-01 17:05:49

Deployment mode: simple

Files changed (1):
  1. app.py +53 -223
app.py CHANGED

Old version of app.py (removed lines marked with "-"):

@@ -1,212 +1,63 @@
  #!/usr/bin/env python3
  """
- Piper TTS Gradio Demo for Hugging Face Spaces
- Supports Japanese and English text-to-speech using ONNX models
  """

- import json
- import logging
-
  import gradio as gr
- import numpy as np
- import onnxruntime
- from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE
-
- # Download models if not present
- from download_models import download_models
-
-
- # Ensure models are downloaded
- download_models()
-
-
- # Import optional dependencies
- if PYOPENJTALK_AVAILABLE:
-     import pyopenjtalk
- if ESPEAK_AVAILABLE:
-     from espeak_phonemizer import Phonemizer
-
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Model configurations
- MODELS = {
-     "Japanese (Medium)": {
-         "path": "models/ja_JP-test-medium.onnx",
-         "config": "models/ja_JP-test-medium.onnx.json",
-         "language": "ja",
-     },
-     "English (Test)": {
-         "path": "models/test_voice.onnx",
-         "config": "models/test_voice.onnx.json",
-         "language": "en",
-     },
- }
-
-
- def load_model_config(config_path: str) -> dict:
-     """Load model configuration from JSON file"""
-     with open(config_path, encoding="utf-8") as f:
-         return json.load(f)
-
-
- def text_to_phonemes(text: str, language: str) -> list[str]:
-     """Convert text to phoneme strings based on language"""
-
-     if language == "ja":
-         if PYOPENJTALK_AVAILABLE:
-             # Get phonemes from OpenJTalk
-             labels = pyopenjtalk.extract_fullcontext(text)
-             phonemes = []

-             for label in labels:
-                 # Extract phoneme from label
-                 if "-" in label and "+" in label:
-                     phoneme = label.split("-")[1].split("+")[0]
-                     if phoneme not in ["sil", "pau"]:
-                         phonemes.append(phoneme)

-             # Add sentence markers
-             phonemes = ["^"] + phonemes + ["$"]
-         else:
-             logger.warning("pyopenjtalk not available, using fallback")
-             # Simple fallback - just use dummy phonemes
-             phonemes = ["^"] + list("aiueo") * 5 + ["$"]

-     elif ESPEAK_AVAILABLE:  # English
-         phonemizer = Phonemizer("en-us")
-         phoneme_str = phonemizer.phonemize(text)
-         # Convert phoneme string to list
-         phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
-     else:
-         logger.warning("espeak_phonemizer not available, using character fallback")
-         # Character-based fallback - filter non-alphabetic characters
-         cleaned_text = "".join(c.lower() for c in text if c.isalpha() or c.isspace())
-         phonemes = ["^"] + list(cleaned_text) + ["$"]

-     return phonemes

-
- def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
-     """Convert phonemes to model input IDs"""
-     phoneme_id_map = config.get("phoneme_id_map", {})
-
-     ids = []
-     for phoneme in phonemes:
-         if phoneme in phoneme_id_map:
-             ids.extend(phoneme_id_map[phoneme])
-         else:
-             # Use pad token for unknown phonemes
-             ids.append(0)
-
-     return ids
-
-
- def synthesize_speech(
-     text: str,
-     model_name: str,
-     speaker_id: int = 0,
-     length_scale: float = 1.0,
-     noise_scale: float = 0.667,
-     noise_w: float = 0.8,
- ) -> tuple[int, np.ndarray]:
-     """Generate speech from text using selected model"""
-
-     if not text.strip():
-         raise gr.Error("Please enter some text")
-
-     if model_name not in MODELS:
-         raise gr.Error("Invalid model selected")
-
-     model_info = MODELS[model_name]
-     config = load_model_config(model_info["config"])
-
-     # Convert text to phoneme IDs
-     phonemes = text_to_phonemes(text, model_info["language"])
-     phoneme_ids = phonemes_to_ids(phonemes, config)
-
-     if not phoneme_ids:
-         raise gr.Error("Failed to convert text to phonemes")
-
-     # Load ONNX model
-     sess_options = onnxruntime.SessionOptions()
-     sess_options.inter_op_num_threads = 1
-     sess_options.intra_op_num_threads = 1
-
-     try:
-         model = onnxruntime.InferenceSession(
-             model_info["path"],
-             sess_options=sess_options,
-             providers=["CPUExecutionProvider"],
-         )
-     except Exception as e:
-         logger.error(f"Failed to load model: {e}")
-         raise gr.Error(f"Failed to load model: {str(e)}") from e
-
-     # Prepare inputs
-     text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
-     text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
-     scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)
-
-     # Handle speaker ID for multi-speaker models
-     sid = None
-     if config.get("num_speakers", 1) > 1:
-         sid = np.array([speaker_id], dtype=np.int64)
-
-     # Run inference
-     try:
-         inputs = {
-             "input": text_array,
-             "input_lengths": text_lengths,
-             "scales": scales,
-         }
-
-         if sid is not None:
-             inputs["sid"] = sid
-
-         audio = model.run(None, inputs)[0]
-
-         # Remove batch and channel dimensions
-         audio = audio.squeeze()
-
-         # Convert to int16
-         audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
-
-         sample_rate = config.get("audio", {}).get("sample_rate", 22050)
-
-         return sample_rate, audio
-
-     except Exception as e:
-         logger.error(f"Inference failed: {e}")
-         raise gr.Error(f"Failed to generate speech: {str(e)}") from e


  def create_interface():
-     """Create Gradio interface"""

-     with gr.Blocks(title="Piper TTS Demo") as interface:
          gr.Markdown("""
-         # 🎙️ Piper TTS Demo

-         High-quality text-to-speech synthesis supporting Japanese and English.

-         This demo uses ONNX models for fast CPU inference.
          """)

          with gr.Row():
-             with gr.Column(scale=2):
                  model_dropdown = gr.Dropdown(
-                     choices=list(MODELS.keys()),
                      label="Select Model",
-                     value=list(MODELS.keys())[0],
                  )

                  text_input = gr.Textbox(
                      label="Text to synthesize",
                      placeholder="Enter text here...",
                      lines=3,
                  )

                  with gr.Accordion("Advanced Settings", open=False):
@@ -214,73 +65,42 @@ def create_interface():
                      label="Speaker ID",
                      value=0,
                      precision=0,
-                     minimum=0,
-                     maximum=10,
-                     info="For multi-speaker models only",
                  )

                  length_scale = gr.Slider(
-                     label="Speed",
                      minimum=0.5,
                      maximum=2.0,
                      value=1.0,
                      step=0.1,
-                     info="Lower = faster speech",
                  )

                  noise_scale = gr.Slider(
-                     label="Expressiveness",
                      minimum=0.0,
-                     maximum=1.0,
                      value=0.667,
                      step=0.01,
                  )

                  noise_w = gr.Slider(
-                     label="Phoneme Duration Variance",
                      minimum=0.0,
-                     maximum=1.0,
                      value=0.8,
                      step=0.01,
                  )

                  synthesize_btn = gr.Button("Generate Speech", variant="primary")

-             with gr.Column(scale=1):
-                 audio_output = gr.Audio(
-                     label="Generated Speech",
-                     type="numpy",
-                     autoplay=True,
                  )

-                 gr.Markdown("""
-                 ### Tips:
-                 - Japanese model expects hiragana/kanji text
-                 - English model works with standard text
-                 - Adjust speed for faster/slower speech
-                 - Higher expressiveness = more natural variation
-                 """)
-
-         # Examples
-         gr.Examples(
-             examples=[
-                 ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
-                 [
-                     "おはようございます。本日の会議は午後3時から始まります。",
-                     "Japanese (Medium)",
-                 ],
-                 ["Hello world! This is a text to speech demo.", "English (Test)"],
-                 [
-                     "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
-                     "English (Test)",
-                 ],
-             ],
-             inputs=[text_input, model_dropdown],
-         )
-
-         # Event handlers
          synthesize_btn.click(
-             fn=synthesize_speech,
              inputs=[
                  text_input,
                  model_dropdown,
@@ -289,7 +109,17 @@ def create_interface():
                  noise_scale,
                  noise_w,
              ],
-             outputs=audio_output,
          )

      return interface
 
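One note on the removed pipeline before the new file: phonemes_to_ids looked each phoneme up in the phoneme_id_map section of the voice's JSON config, where a phoneme maps to a list of integer IDs (hence ids.extend(...)), and unknown phonemes fell back to the pad ID 0. A minimal sketch of that config shape, with invented values purely for illustration:

# Illustrative Piper-style voice config; all values are invented for this sketch.
example_config = {
    "num_speakers": 1,                # single-speaker, so no "sid" input is needed
    "audio": {"sample_rate": 22050},  # the default rate synthesize_speech assumed
    "phoneme_id_map": {
        "^": [1],   # sentence-start marker prepended by text_to_phonemes
        "$": [2],   # sentence-end marker appended by text_to_phonemes
        "a": [7],
        "i": [11],
    },
}

# phonemes_to_ids(["^", "a", "i", "$"], example_config) -> [1, 7, 11, 2]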
New version of app.py (added lines marked with "+"):

@@ -1,212 +1,63 @@
  #!/usr/bin/env python3
  """
+ Simplified Piper TTS Demo - Lightweight version without models
  """

  import gradio as gr


+ def synthesize_speech_demo(
+     text, model_name, speaker_id, length_scale, noise_scale, noise_w
+ ):
+     """Demo function that returns a placeholder message"""
+     return f"""
+ 🎙️ Piper TTS Demo (Simplified Version)

+ Model: {model_name}
+ Text: "{text}"

+ Parameters:
+ - Speaker ID: {speaker_id}
+ - Length Scale: {length_scale}
+ - Noise Scale: {noise_scale}
+ - Noise W: {noise_w}

+ Note: This is a lightweight demo without actual TTS models.
+ For the full version with working TTS, please use the 'full' deployment mode.
+     """


  def create_interface():
+     """Create simplified Gradio interface"""

+     with gr.Blocks(title="Piper TTS Demo - Simplified") as interface:
          gr.Markdown("""
+         # 🎙️ Piper TTS Demo (Simplified Version)

+         This is a lightweight demo interface without model files.

+         Features in the full version:
+         - ✅ Japanese text-to-speech with OpenJTalk phonemization
+         - ✅ English text-to-speech synthesis
+         - ✅ Real-time audio generation
+         - ✅ Adjustable voice parameters
+
+         Repository: [piper-plus](https://github.com/ayutaz/piper-plus)
          """)

          with gr.Row():
+             with gr.Column():
                  model_dropdown = gr.Dropdown(
+                     choices=["Japanese (Medium)", "English (Test)"],
                      label="Select Model",
+                     value="Japanese (Medium)",
                  )

                  text_input = gr.Textbox(
                      label="Text to synthesize",
                      placeholder="Enter text here...",
                      lines=3,
+                     value="こんにちは、世界!",
                  )

                  with gr.Accordion("Advanced Settings", open=False):
@@ -214,73 +65,42 @@ def create_interface():
                      label="Speaker ID",
                      value=0,
                      precision=0,
                  )

                  length_scale = gr.Slider(
+                     label="Length Scale (speaking rate)",
                      minimum=0.5,
                      maximum=2.0,
                      value=1.0,
                      step=0.1,
                  )

                  noise_scale = gr.Slider(
+                     label="Noise Scale (expressiveness)",
                      minimum=0.0,
+                     maximum=2.0,
                      value=0.667,
                      step=0.01,
                  )

                  noise_w = gr.Slider(
+                     label="Noise W (phoneme duration variation)",
                      minimum=0.0,
+                     maximum=2.0,
                      value=0.8,
                      step=0.01,
                  )

                  synthesize_btn = gr.Button("Generate Speech", variant="primary")

+             with gr.Column():
+                 output_text = gr.Textbox(
+                     label="Demo Output",
+                     lines=15,
                  )

          synthesize_btn.click(
+             fn=synthesize_speech_demo,
              inputs=[
                  text_input,
                  model_dropdown,
@@ -289,7 +109,17 @@ def create_interface():
                  noise_scale,
                  noise_w,
              ],
+             outputs=output_text,
+         )
+
+         gr.Examples(
+             examples=[
+                 ["こんにちは、世界!", "Japanese (Medium)"],
+                 ["音声合成のデモンストレーションです。", "Japanese (Medium)"],
+                 ["Hello, world!", "English (Test)"],
+                 ["This is a text-to-speech demonstration.", "English (Test)"],
+             ],
+             inputs=[text_input, model_dropdown],
          )

      return interface
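
Both versions end at return interface, so the module's entry point (if any) lies outside these hunks. For local testing, a typical Gradio launch block (an assumed pattern, not part of this commit) would be:

# Assumed entry point for local testing; not shown in this commit's hunks.
if __name__ == "__main__":
    interface = create_interface()  # build the Blocks app defined above
    interface.launch()              # serve the demo on a local port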