ayousanz committed
Commit 4821af3 · verified · Parent: 62e3411

Update from GitHub Actions - 2025-07-31 15:25:59

Deployment mode: simple

Files changed (3):
  1. README.md +2 -2
  2. app.py +68 -240
  3. requirements.txt +2 -1
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 🎙️
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 4.44.0
-app_file: use_simple.py
+sdk_version: 4.44.1
+app_file: app.py
 pinned: false
 license: mit
 ---
app.py CHANGED
@@ -1,286 +1,104 @@
 #!/usr/bin/env python3
 """
-Piper TTS Gradio Demo for Hugging Face Spaces
-Supports Japanese and English text-to-speech using ONNX models
+Simplified Piper TTS Demo - Lightweight version without models
 """
 
-import json
-import logging
-from pathlib import Path
-
 import gradio as gr
-import numpy as np
-import onnxruntime
-from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE
-
-# Download models if not present
-from download_models import download_models
-
-# Ensure models are downloaded
-download_models()
-
-
-# Import optional dependencies
-if PYOPENJTALK_AVAILABLE:
-    import pyopenjtalk
-if ESPEAK_AVAILABLE:
-    from espeak_phonemizer import Phonemizer
-
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Model configurations
-MODELS = {
-    "Japanese (Medium)": {
-        "path": "models/ja_JP-test-medium.onnx",
-        "config": "models/ja_JP-test-medium.onnx.json",
-        "language": "ja",
-    },
-    "English (Test)": {
-        "path": "models/test_voice.onnx",
-        "config": "models/test_voice.onnx.json",
-        "language": "en",
-    },
-}
-
-
-def load_model_config(config_path: str) -> dict:
-    """Load model configuration from JSON file"""
-    with open(config_path, encoding="utf-8") as f:
-        return json.load(f)
-
-
-def text_to_phonemes(text: str, language: str) -> list[str]:
-    """Convert text to phoneme strings based on language"""
-
-    if language == "ja":
-        if PYOPENJTALK_AVAILABLE:
-            # Get phonemes from OpenJTalk
-            labels = pyopenjtalk.extract_fullcontext(text)
-            phonemes = []
-
-            for label in labels:
-                # Extract phoneme from label
-                if "-" in label and "+" in label:
-                    phoneme = label.split("-")[1].split("+")[0]
-                    if phoneme not in ["sil", "pau"]:
-                        phonemes.append(phoneme)
-
-            # Add sentence markers
-            phonemes = ["^"] + phonemes + ["$"]
-        else:
-            logger.warning("pyopenjtalk not available, using fallback")
-            # Simple fallback - just use dummy phonemes
-            phonemes = ["^"] + list("aiueo") * 5 + ["$"]
-
-    elif ESPEAK_AVAILABLE:  # English
-        phonemizer = Phonemizer("en-us")
-        phoneme_str = phonemizer.phonemize(text)
-        # Convert phoneme string to list
-        phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
-    else:
-        logger.warning("espeak_phonemizer not available, using character fallback")
-        # Character-based fallback - filter non-alphabetic characters
-        cleaned_text = "".join(c.lower() for c in text if c.isalpha() or c.isspace())
-        phonemes = ["^"] + list(cleaned_text) + ["$"]
-
-    return phonemes
-
-
-def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
-    """Convert phonemes to model input IDs"""
-    phoneme_id_map = config.get("phoneme_id_map", {})
-
-    ids = []
-    for phoneme in phonemes:
-        if phoneme in phoneme_id_map:
-            ids.extend(phoneme_id_map[phoneme])
-        else:
-            # Use pad token for unknown phonemes
-            ids.append(0)
-
-    return ids
-
-
-def synthesize_speech(
-    text: str,
-    model_name: str,
-    speaker_id: int = 0,
-    length_scale: float = 1.0,
-    noise_scale: float = 0.667,
-    noise_w: float = 0.8,
-) -> tuple[int, np.ndarray]:
-    """Generate speech from text using selected model"""
-
-    if not text.strip():
-        raise gr.Error("Please enter some text")
-
-    if model_name not in MODELS:
-        raise gr.Error("Invalid model selected")
-
-    model_info = MODELS[model_name]
-    config = load_model_config(model_info["config"])
-
-    # Convert text to phoneme IDs
-    phonemes = text_to_phonemes(text, model_info["language"])
-    phoneme_ids = phonemes_to_ids(phonemes, config)
-
-    if not phoneme_ids:
-        raise gr.Error("Failed to convert text to phonemes")
-
-    # Load ONNX model
-    sess_options = onnxruntime.SessionOptions()
-    sess_options.inter_op_num_threads = 1
-    sess_options.intra_op_num_threads = 1
-
-    try:
-        model = onnxruntime.InferenceSession(
-            model_info["path"],
-            sess_options=sess_options,
-            providers=["CPUExecutionProvider"],
-        )
-    except Exception as e:
-        logger.error(f"Failed to load model: {e}")
-        raise gr.Error(f"Failed to load model: {str(e)}") from e
-
-    # Prepare inputs
-    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
-    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
-    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)
-
-    # Handle speaker ID for multi-speaker models
-    sid = None
-    if config.get("num_speakers", 1) > 1:
-        sid = np.array([speaker_id], dtype=np.int64)
 
-    # Run inference
-    try:
-        inputs = {
-            "input": text_array,
-            "input_lengths": text_lengths,
-            "scales": scales,
-        }
 
-        if sid is not None:
-            inputs["sid"] = sid
-
-        audio = model.run(None, inputs)[0]
-
-        # Remove batch and channel dimensions
-        audio = audio.squeeze()
-
-        # Convert to int16
-        audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
-
-        sample_rate = config.get("audio", {}).get("sample_rate", 22050)
-
-        return sample_rate, audio
-
-    except Exception as e:
-        logger.error(f"Inference failed: {e}")
-        raise gr.Error(f"Failed to generate speech: {str(e)}") from e
+def synthesize_speech_demo(text, model_name, speaker_id, length_scale, noise_scale, noise_w):
+    """Demo function that returns a placeholder message"""
+    return f"""
+🎙️ Piper TTS Demo (Simplified Version)
+
+Model: {model_name}
+Text: "{text}"
+
+Parameters:
+- Speaker ID: {speaker_id}
+- Length Scale: {length_scale}
+- Noise Scale: {noise_scale}
+- Noise W: {noise_w}
+
+Note: This is a lightweight demo without actual TTS models.
+For the full version with working TTS, please use the 'full' deployment mode.
+"""
 
 
 def create_interface():
-    """Create Gradio interface"""
-
-    with gr.Blocks(title="Piper TTS Demo") as interface:
+    """Create simplified Gradio interface"""
+
+    with gr.Blocks(title="Piper TTS Demo - Simplified") as interface:
         gr.Markdown("""
-        # 🎙️ Piper TTS Demo
-
-        High-quality text-to-speech synthesis supporting Japanese and English.
-
-        This demo uses ONNX models for fast CPU inference.
+        # 🎙️ Piper TTS Demo (Simplified Version)
+
+        This is a lightweight demo interface without model files.
+
+        Features in the full version:
+        - ✅ Japanese text-to-speech with OpenJTalk phonemization
+        - ✅ English text-to-speech synthesis
+        - ✅ Real-time audio generation
+        - ✅ Adjustable voice parameters
+
+        Repository: [piper-plus](https://github.com/ayutaz/piper-plus)
         """)
-
+
         with gr.Row():
-            with gr.Column(scale=2):
+            with gr.Column():
                 model_dropdown = gr.Dropdown(
-                    choices=list(MODELS.keys()),
+                    choices=["Japanese (Medium)", "English (Test)"],
                     label="Select Model",
-                    value=list(MODELS.keys())[0],
+                    value="Japanese (Medium)",
                 )
-
+
                 text_input = gr.Textbox(
                     label="Text to synthesize",
                     placeholder="Enter text here...",
                     lines=3,
+                    value="こんにちは、世界!",
                 )
-
+
                 with gr.Accordion("Advanced Settings", open=False):
                     speaker_id = gr.Number(
                         label="Speaker ID",
                         value=0,
                         precision=0,
-                        minimum=0,
-                        maximum=10,
-                        info="For multi-speaker models only",
                     )
-
+
                     length_scale = gr.Slider(
-                        label="Speed",
+                        label="Length Scale (speaking rate)",
                         minimum=0.5,
                         maximum=2.0,
                         value=1.0,
                         step=0.1,
-                        info="Lower = faster speech",
                     )
-
+
                     noise_scale = gr.Slider(
-                        label="Expressiveness",
+                        label="Noise Scale (expressiveness)",
                         minimum=0.0,
-                        maximum=1.0,
+                        maximum=2.0,
                         value=0.667,
                         step=0.01,
                     )
-
+
                     noise_w = gr.Slider(
-                        label="Phoneme Duration Variance",
+                        label="Noise W (phoneme duration variation)",
                         minimum=0.0,
-                        maximum=1.0,
+                        maximum=2.0,
                         value=0.8,
                         step=0.01,
                     )
-
+
                 synthesize_btn = gr.Button("Generate Speech", variant="primary")
-
-            with gr.Column(scale=1):
-                audio_output = gr.Audio(
-                    label="Generated Speech",
-                    type="numpy",
-                    autoplay=True,
+
+            with gr.Column():
+                output_text = gr.Textbox(
+                    label="Demo Output",
+                    lines=15,
                 )
-
-        gr.Markdown("""
-        ### Tips:
-        - Japanese model expects hiragana/kanji text
-        - English model works with standard text
-        - Adjust speed for faster/slower speech
-        - Higher expressiveness = more natural variation
-        """)
-
-        # Examples
-        gr.Examples(
-            examples=[
-                ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
-                [
-                    "おはようございます。本日の会議は午後3時から始まります。",
-                    "Japanese (Medium)",
-                ],
-                ["Hello world! This is a text to speech demo.", "English (Test)"],
-                [
-                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
-                    "English (Test)",
-                ],
-            ],
-            inputs=[text_input, model_dropdown],
-        )
-
-        # Event handlers
+
         synthesize_btn.click(
-            fn=synthesize_speech,
+            fn=synthesize_speech_demo,
             inputs=[
                 text_input,
                 model_dropdown,
@@ -289,9 +107,19 @@ def create_interface():
                 noise_scale,
                 noise_w,
             ],
-            outputs=audio_output,
+            outputs=output_text,
         )
-
+
+        gr.Examples(
+            examples=[
+                ["こんにちは、世界!", "Japanese (Medium)"],
+                ["音声合成のデモンストレーションです。", "Japanese (Medium)"],
+                ["Hello, world!", "English (Test)"],
+                ["This is a text-to-speech demonstration.", "English (Test)"],
+            ],
+            inputs=[text_input, model_dropdown],
+        )
+
         return interface
 
 
@@ -299,4 +127,4 @@ def create_interface():
 interface = create_interface()
 
 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(server_name="0.0.0.0", server_port=7860)
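Note on the app.py change: the deleted synthesize_speech() was the actual ONNX inference path that the 'full' deployment mode still uses. For anyone comparing the two modes, here is a minimal standalone sketch of that path, distilled from the deleted code. The model and config paths come from the old MODELS table; the hard-coded phoneme list is only an illustrative stand-in for what pyopenjtalk or espeak would produce.

```python
# Minimal sketch of the inference path this commit removes (distilled from the
# deleted synthesize_speech); the phoneme list below is an illustrative
# placeholder, not real pyopenjtalk output.
import json

import numpy as np
import onnxruntime

MODEL_PATH = "models/ja_JP-test-medium.onnx"  # from the old MODELS table
CONFIG_PATH = "models/ja_JP-test-medium.onnx.json"

with open(CONFIG_PATH, encoding="utf-8") as f:
    config = json.load(f)

# Map phonemes (wrapped in "^" ... "$" sentence markers) to model IDs,
# falling back to the pad ID 0 for unknown symbols, as the deleted code did.
phonemes = ["^", "k", "o", "N", "n", "i", "ch", "i", "w", "a", "$"]
id_map = config.get("phoneme_id_map", {})
phoneme_ids = [i for p in phonemes for i in id_map.get(p, [0])]

session = onnxruntime.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
inputs = {
    "input": np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0),
    "input_lengths": np.array([len(phoneme_ids)], dtype=np.int64),
    # Order matters and matches the deleted code: noise_scale, length_scale, noise_w.
    "scales": np.array([0.667, 1.0, 0.8], dtype=np.float32),
}
audio = session.run(None, inputs)[0].squeeze()

# Convert float output to int16 PCM, as the deleted code did.
audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
sample_rate = config.get("audio", {}).get("sample_rate", 22050)
```

Multi-speaker models additionally take a "sid" input; the deleted code only passed it when the config reported num_speakers > 1.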
requirements.txt CHANGED
@@ -1,7 +1,8 @@
 # Piper TTS Demo Requirements
-gradio>=4.0.0
+gradio==4.44.1
 numpy>=1.19.0
 onnxruntime>=1.16.0
 pyopenjtalk>=0.3.0
+onnx>=1.14.0
 # Note: espeak-phonemizer requires system espeak-ng library
 # For simplified deployment, using character-based fallback for English
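On the espeak-ng note: the old app.py imported ESPEAK_AVAILABLE and PYOPENJTALK_AVAILABLE from an app_imports module that this diff does not show, so the following is an assumption about how such flags are typically defined, not that module's actual contents.

```python
# Hypothetical sketch of app_imports-style availability flags (the real
# app_imports module is not part of this diff): probe each optional backend
# and let the app fall back gracefully when one is missing.
try:
    import pyopenjtalk  # noqa: F401
    PYOPENJTALK_AVAILABLE = True
except ImportError:
    PYOPENJTALK_AVAILABLE = False

try:
    from espeak_phonemizer import Phonemizer  # noqa: F401  requires system espeak-ng
    ESPEAK_AVAILABLE = True
except ImportError:
    ESPEAK_AVAILABLE = False
```

When ESPEAK_AVAILABLE is False, the deleted text_to_phonemes() treated lowercase characters as phonemes, which is the "character-based fallback" the comment above refers to.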