Update from GitHub Actions - 2025-07-31 15:25:59

Deployment mode: simple
- README.md +2 -2
- app.py +68 -240
- requirements.txt +2 -1
README.md
CHANGED

```diff
@@ -4,8 +4,8 @@ emoji: 🎙️
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 4.44.…
-app_file: …
+sdk_version: 4.44.1
+app_file: app.py
 pinned: false
 license: mit
 ---
```
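A pinned `sdk_version` that disagrees with the `gradio` pin in requirements.txt can leave a Space's declared SDK and its installed runtime out of sync, which fits the "Build error" status this commit addresses. A minimal consistency-check sketch, assuming only that both files sit at the Space root:

```python
# Sketch: verify README front matter and requirements.txt agree on the
# Gradio version (both should read 4.44.1 after this commit).
import re

readme = open("README.md", encoding="utf-8").read()
reqs = open("requirements.txt", encoding="utf-8").read()

sdk = re.search(r"sdk_version:\s*([\d.]+)", readme).group(1)
pin = re.search(r"gradio==([\d.]+)", reqs).group(1)
assert sdk == pin, f"README sdk_version {sdk} != requirements.txt pin {pin}"
```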
app.py
CHANGED

```diff
@@ -1,286 +1,104 @@
 #!/usr/bin/env python3
 """
-Piper TTS
-Supports Japanese and English text-to-speech using ONNX models
+Simplified Piper TTS Demo - Lightweight version without models
 """
 
-import json
-import logging
-from pathlib import Path
-
 import gradio as gr
-import numpy as np
-import onnxruntime
-from app_imports import ESPEAK_AVAILABLE, PYOPENJTALK_AVAILABLE
-
-# Download models if not present
-from download_models import download_models
-
-# Ensure models are downloaded
-download_models()
-
-
-# Import optional dependencies
-if PYOPENJTALK_AVAILABLE:
-    import pyopenjtalk
-if ESPEAK_AVAILABLE:
-    from espeak_phonemizer import Phonemizer
-
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Model configurations
-MODELS = {
-    "Japanese (Medium)": {
-        "path": "models/ja_JP-test-medium.onnx",
-        "config": "models/ja_JP-test-medium.onnx.json",
-        "language": "ja",
-    },
-    "English (Test)": {
-        "path": "models/test_voice.onnx",
-        "config": "models/test_voice.onnx.json",
-        "language": "en",
-    },
-}
-
-
-def load_model_config(config_path: str) -> dict:
-    """Load model configuration from JSON file"""
-    with open(config_path, encoding="utf-8") as f:
-        return json.load(f)
-
-
-def text_to_phonemes(text: str, language: str) -> list[str]:
-    """Convert text to phoneme strings based on language"""
-
-    if language == "ja":
-        if PYOPENJTALK_AVAILABLE:
-            # Get phonemes from OpenJTalk
-            labels = pyopenjtalk.extract_fullcontext(text)
-            phonemes = []
-
-            for label in labels:
-                # Extract phoneme from label
-                if "-" in label and "+" in label:
-                    phoneme = label.split("-")[1].split("+")[0]
-                    if phoneme not in ["sil", "pau"]:
-                        phonemes.append(phoneme)
-
-            # Add sentence markers
-            phonemes = ["^"] + phonemes + ["$"]
-        else:
-            logger.warning("pyopenjtalk not available, using fallback")
-            # Simple fallback - just use dummy phonemes
-            phonemes = ["^"] + list("aiueo") * 5 + ["$"]
-
-    elif ESPEAK_AVAILABLE:  # English
-        phonemizer = Phonemizer("en-us")
-        phoneme_str = phonemizer.phonemize(text)
-        # Convert phoneme string to list
-        phonemes = ["^"] + list(phoneme_str.replace(" ", "")) + ["$"]
-    else:
-        logger.warning("espeak_phonemizer not available, using character fallback")
-        # Character-based fallback - filter non-alphabetic characters
-        cleaned_text = "".join(c.lower() for c in text if c.isalpha() or c.isspace())
-        phonemes = ["^"] + list(cleaned_text) + ["$"]
-
-    return phonemes
-
-
-def phonemes_to_ids(phonemes: list[str], config: dict) -> list[int]:
-    """Convert phonemes to model input IDs"""
-    phoneme_id_map = config.get("phoneme_id_map", {})
-
-    ids = []
-    for phoneme in phonemes:
-        if phoneme in phoneme_id_map:
-            ids.extend(phoneme_id_map[phoneme])
-        else:
-            # Use pad token for unknown phonemes
-            ids.append(0)
-
-    return ids
-
-
-def synthesize_speech(
-    text: str,
-    model_name: str,
-    speaker_id: int = 0,
-    length_scale: float = 1.0,
-    noise_scale: float = 0.667,
-    noise_w: float = 0.8,
-) -> tuple[int, np.ndarray]:
-    """Generate speech from text using selected model"""
-
-    if not text.strip():
-        raise gr.Error("Please enter some text")
-
-    if model_name not in MODELS:
-        raise gr.Error("Invalid model selected")
-
-    model_info = MODELS[model_name]
-    config = load_model_config(model_info["config"])
-
-    # Convert text to phoneme IDs
-    phonemes = text_to_phonemes(text, model_info["language"])
-    phoneme_ids = phonemes_to_ids(phonemes, config)
-
-    if not phoneme_ids:
-        raise gr.Error("Failed to convert text to phonemes")
-
-    # Load ONNX model
-    sess_options = onnxruntime.SessionOptions()
-    sess_options.inter_op_num_threads = 1
-    sess_options.intra_op_num_threads = 1
-
-    try:
-        model = onnxruntime.InferenceSession(
-            model_info["path"],
-            sess_options=sess_options,
-            providers=["CPUExecutionProvider"],
-        )
-    except Exception as e:
-        logger.error(f"Failed to load model: {e}")
-        raise gr.Error(f"Failed to load model: {str(e)}") from e
-
-    # Prepare inputs
-    text_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
-    text_lengths = np.array([text_array.shape[1]], dtype=np.int64)
-    scales = np.array([noise_scale, length_scale, noise_w], dtype=np.float32)
-
-    # Handle speaker ID for multi-speaker models
-    sid = None
-    if config.get("num_speakers", 1) > 1:
-        sid = np.array([speaker_id], dtype=np.int64)
 
-    # Run inference
-    try:
-        inputs = {
-            "input": text_array,
-            "input_lengths": text_lengths,
-            "scales": scales,
-        }
 
-        … (old lines 166-182 were collapsed in the rendered diff)
-        raise gr.Error(f"Failed to generate speech: {str(e)}") from e
+
+def synthesize_speech_demo(text, model_name, speaker_id, length_scale, noise_scale, noise_w):
+    """Demo function that returns a placeholder message"""
+    return f"""
+    🎙️ Piper TTS Demo (Simplified Version)
+
+    Model: {model_name}
+    Text: "{text}"
+
+    Parameters:
+    - Speaker ID: {speaker_id}
+    - Length Scale: {length_scale}
+    - Noise Scale: {noise_scale}
+    - Noise W: {noise_w}
+
+    Note: This is a lightweight demo without actual TTS models.
+    For the full version with working TTS, please use the 'full' deployment mode.
+    """
 
 
 def create_interface():
-    """Create Gradio interface"""
-
-    with gr.Blocks(title="Piper TTS Demo") as interface:
+    """Create simplified Gradio interface"""
+
+    with gr.Blocks(title="Piper TTS Demo - Simplified") as interface:
         gr.Markdown("""
-        # 🎙️ Piper TTS Demo
-
-        … (old lines 193-195 were collapsed in the rendered diff)
+        # 🎙️ Piper TTS Demo (Simplified Version)
+
+        This is a lightweight demo interface without model files.
+
+        Features in the full version:
+        - ✅ Japanese text-to-speech with OpenJTalk phonemization
+        - ✅ English text-to-speech synthesis
+        - ✅ Real-time audio generation
+        - ✅ Adjustable voice parameters
+
+        Repository: [piper-plus](https://github.com/ayutaz/piper-plus)
         """)
-
+
         with gr.Row():
-            with gr.Column(…
+            with gr.Column():
                 model_dropdown = gr.Dropdown(
-                    choices=…
+                    choices=["Japanese (Medium)", "English (Test)"],
                     label="Select Model",
-                    value=…
+                    value="Japanese (Medium)",
                 )
-
+
                 text_input = gr.Textbox(
                     label="Text to synthesize",
                     placeholder="Enter text here...",
                     lines=3,
+                    value="こんにちは、世界！",
                 )
-
+
                 with gr.Accordion("Advanced Settings", open=False):
                     speaker_id = gr.Number(
                         label="Speaker ID",
                         value=0,
                         precision=0,
-                        minimum=0,
-                        maximum=10,
-                        info="For multi-speaker models only",
                     )
-
+
                     length_scale = gr.Slider(
-                        label="…
+                        label="Length Scale (speaking rate)",
                         minimum=0.5,
                         maximum=2.0,
                         value=1.0,
                         step=0.1,
-                        info="Lower = faster speech",
                     )
-
+
                     noise_scale = gr.Slider(
-                        label="…
+                        label="Noise Scale (expressiveness)",
                         minimum=0.0,
-                        maximum=…
+                        maximum=2.0,
                         value=0.667,
                         step=0.01,
                     )
-
+
                     noise_w = gr.Slider(
-                        label="…
+                        label="Noise W (phoneme duration variation)",
                         minimum=0.0,
-                        maximum=…
+                        maximum=2.0,
                         value=0.8,
                         step=0.01,
                     )
-
+
                 synthesize_btn = gr.Button("Generate Speech", variant="primary")
-
-            with gr.Column(…
-                … (old line 250 was collapsed in the rendered diff)
-                    label="…
-                    … (old line 252 was collapsed in the rendered diff)
-                    autoplay=True,
+
+            with gr.Column():
+                output_text = gr.Textbox(
+                    label="Demo Output",
+                    lines=15,
                 )
-
-            gr.Markdown("""
-            ### Tips:
-            - Japanese model expects hiragana/kanji text
-            - English model works with standard text
-            - Adjust speed for faster/slower speech
-            - Higher expressiveness = more natural variation
-            """)
-
-        # Examples
-        gr.Examples(
-            examples=[
-                ["こんにちは、世界!今日はいい天気ですね。", "Japanese (Medium)"],
-                [
-                    "おはようございます。本日の会議は午後3時から始まります。",
-                    "Japanese (Medium)",
-                ],
-                ["Hello world! This is a text to speech demo.", "English (Test)"],
-                [
-                    "Welcome to Piper TTS. Enjoy high quality speech synthesis.",
-                    "English (Test)",
-                ],
-            ],
-            inputs=[text_input, model_dropdown],
-        )
-
-        # Event handlers
+
         synthesize_btn.click(
-            fn=…
+            fn=synthesize_speech_demo,
             inputs=[
                 text_input,
                 model_dropdown,
@@ -289,9 +107,19 @@ def create_interface():
                 noise_scale,
                 noise_w,
             ],
-            outputs=…
+            outputs=output_text,
         )
-
+
+        gr.Examples(
+            examples=[
+                ["こんにちは、世界!", "Japanese (Medium)"],
+                ["音声合成のデモンストレーションです。", "Japanese (Medium)"],
+                ["Hello, world!", "English (Test)"],
+                ["This is a text-to-speech demonstration.", "English (Test)"],
+            ],
+            inputs=[text_input, model_dropdown],
+        )
+
     return interface
 
 
@@ -299,4 +127,4 @@ def create_interface():
 interface = create_interface()
 
 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(server_name="0.0.0.0", server_port=7860)
```
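Because the new `synthesize_speech_demo` is a pure function that only formats its inputs into a placeholder report, the simplified app can be smoke-tested without starting a server. A minimal sketch, assuming only that this app.py is importable as the module `app` from the Space root:

```python
# Sketch: exercise the simplified handler directly, then launch the UI.
from app import create_interface, synthesize_speech_demo

# Calling the handler verifies the click-event wiring end to end
# without any ONNX models or audio output.
print(synthesize_speech_demo("こんにちは、世界！", "Japanese (Medium)",
                             speaker_id=0, length_scale=1.0,
                             noise_scale=0.667, noise_w=0.8))

# Launch the same interface locally; the Space itself binds 0.0.0.0:7860.
create_interface().launch()
```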
requirements.txt
CHANGED

```diff
@@ -1,7 +1,8 @@
 # Piper TTS Demo Requirements
-gradio
+gradio==4.44.1
 numpy>=1.19.0
 onnxruntime>=1.16.0
 pyopenjtalk>=0.3.0
+onnx>=1.14.0
 # Note: espeak-phonemizer requires system espeak-ng library
 # For simplified deployment, using character-based fallback for English
```
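The comment about the character-based English fallback refers to the branch that the full app.py used (removed above) when `espeak-phonemizer` and its system `espeak-ng` library are unavailable: text is reduced to lowercase letters and spaces, then wrapped in Piper's `^`/`$` sentence markers. A standalone sketch mirroring that removed branch:

```python
# Character-based English fallback (mirrors the branch removed from
# app.py above): needs no espeak-ng, at the cost of phonetic accuracy.
def fallback_phonemes(text: str) -> list[str]:
    # Keep alphabetic characters and spaces only, lowercased.
    cleaned = "".join(c.lower() for c in text if c.isalpha() or c.isspace())
    # Wrap with the "^" (start) and "$" (end) sentence markers.
    return ["^"] + list(cleaned) + ["$"]

print(fallback_phonemes("Hello, world!"))
# ['^', 'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '$']
```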