eaglelandsonce committed on
Commit
e583c33
·
verified ·
1 Parent(s): b6ab8f2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import uuid
4
+ import re
5
+ import tempfile
6
+ from typing import Optional, List
7
+
8
+ import gradio as gr
9
+
10
+ # --- File reading ---
11
def read_text_from_file(file_obj) -> str:
    """Extract plain text from an uploaded ``.txt`` or ``.docx`` file.

    Args:
        file_obj: ``None``, a filesystem path (``str`` / ``os.PathLike``), or
            a file-like object exposing a ``name`` attribute. The object's
            own ``read()`` is only used when ``name`` does not point at a
            file on disk.

    Returns:
        The extracted text, or ``""`` when no usable file was supplied.

    Raises:
        gr.Error: If the extension is neither ``.txt`` nor ``.docx``.
    """
    if file_obj is None:
        return ""

    # NOTE(review): depending on the Gradio version / File `type`, an upload
    # may arrive as a path string or as a temp-file wrapper with `.name`.
    # Both are accepted so a path string is no longer silently ignored.
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = getattr(file_obj, "name", "") or ""
    if not path:
        return ""

    on_disk = os.path.isfile(path)
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        if on_disk:
            # Reading by path is independent of the handle's mode/position.
            with open(path, "rb") as fh:
                raw = fh.read()
        else:
            raw = file_obj.read()
        # A text-mode handle yields str, which needs no decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if ext == ".docx":
        # lazy import to keep startup snappy
        import docx

        document = docx.Document(path if on_disk else file_obj)
        return "\n".join(p.text for p in document.paragraphs).strip()

    raise gr.Error("Unsupported file type. Please upload .txt or .docx")
28
+
29
# --- Chunking utility (keeps sentences intact, at most max_len chars each) ---
# Split on whitespace that follows sentence-ending punctuation or a newline.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def _split_long_sentence(sentence: str, max_len: int) -> List[str]:
    """Break one over-long sentence into pieces of at most ``max_len`` chars.

    Pieces end on word boundaries where possible; only a single word longer
    than ``max_len`` is cut mid-word.
    """
    pieces: List[str] = []
    current = ""
    for word in sentence.split():
        if len(word) > max_len:
            if current:
                pieces.append(current)
                current = ""
            pieces.extend(
                word[i:i + max_len] for i in range(0, len(word), max_len)
            )
        elif not current:
            current = word
        elif len(current) + 1 + len(word) <= max_len:
            current = f"{current} {word}"
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text: str, max_len: int = 450) -> List[str]:
    """Split ``text`` into chunks no longer than ``max_len`` characters.

    Whole sentences are packed greedily into each chunk, joined by a single
    space. A sentence that is itself longer than ``max_len`` is split on word
    boundaries instead of being cut mid-word.

    Args:
        text: The text to split.
        max_len: Maximum length of each chunk; must be positive.

    Returns:
        The non-empty chunks in order; ``[]`` for empty or whitespace-only
        input.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        # A non-positive limit would otherwise drop text silently or fail
        # with an unrelated range() error.
        raise ValueError("max_len must be a positive integer")

    # Fast path: short input is a single chunk (or nothing, if blank).
    if len(text) <= max_len:
        stripped = text.strip()
        return [stripped] if stripped else []

    sentences = [s.strip() for s in _SENT_SPLIT.split(text) if s.strip()]
    chunks: List[str] = []
    cur = ""
    for s in sentences:
        if len(cur) + 1 + len(s) <= max_len:
            cur = f"{cur} {s}" if cur else s
            continue
        if cur:
            chunks.append(cur)
        if len(s) > max_len:
            # very long single sentence fallback
            chunks.extend(_split_long_sentence(s, max_len))
            cur = ""
        else:
            cur = s
    if cur:
        chunks.append(cur)
    return chunks
55
+
56
# --- Lazy TTS loader (Coqui XTTS v2) ---
_TTS = None
_SR = 22050  # placeholder; get_tts() overwrites it on first load


def _resolve_sample_rate(tts_obj, fallback: int = 24000) -> int:
    """Return the output sample rate advertised by ``tts_obj``.

    The attribute is looked up on the object itself first, then on its
    ``synthesizer``. ``fallback`` is returned when neither provides a truthy
    value.
    """
    # NOTE(review): Coqui's TTS wrapper appears to keep `output_sample_rate`
    # on `.synthesizer` rather than on itself -- confirm against the
    # installed TTS version.
    for holder in (tts_obj, getattr(tts_obj, "synthesizer", None)):
        rate = getattr(holder, "output_sample_rate", None)
        if rate:
            return int(rate)
    return fallback


def get_tts():
    """Return the shared XTTS v2 model, loading it on first use.

    The first call imports ``TTS.api`` and constructs the model, then stores
    the model in ``_TTS`` and its sample rate in ``_SR``. Later calls return
    the cached instance.
    """
    global _TTS, _SR
    if _TTS is None:
        # lazy import: the TTS package is only needed once synthesis starts
        from TTS.api import TTS

        # Multilingual, high-quality, supports voice cloning via reference audio
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        _SR = _resolve_sample_rate(model)
        # Publish the model last so a failure above is retried next call.
        _TTS = model
    return _TTS
71
+
72
+ # --- Synthesis core ---
73
def _reference_audio_path(voice_ref) -> Optional[str]:
    """Return the filesystem path of the optional voice reference, or None.

    Accepts a path (``str`` / ``os.PathLike``) or an object with a ``name``
    attribute holding the path.
    """
    if voice_ref is None:
        return None
    if isinstance(voice_ref, (str, os.PathLike)):
        return os.fspath(voice_ref) or None
    return getattr(voice_ref, "name", None) or None


def synthesize(
    text_input: str,
    file_input,
    language: str,
    voice_ref,  # optional reference audio for cloning
) -> str:
    """Synthesize speech for the given text and return the WAV file path.

    The typed text and the uploaded file's text are concatenated, whitespace
    is collapsed, the result is capped at 20000 characters, and it is then
    synthesized chunk by chunk into one mono 16-bit WAV in the system temp
    directory.

    Args:
        text_input: Text typed or pasted by the user (may be empty/None).
        file_input: Optional uploaded ``.txt``/``.docx`` file.
        language: Language code passed through to the TTS model.
        voice_ref: Optional reference audio (path or object with ``name``)
            used for voice cloning.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If neither input provides any text.
    """
    # Collect text from inputs
    user_text = (text_input or "").strip()
    file_text = read_text_from_file(file_input) if file_input else ""
    final_text = (user_text + "\n" + file_text).strip()

    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Clean + limit length to something reasonable for demo
    final_text = re.sub(r"\s+", " ", final_text).strip()
    if len(final_text) > 20000:
        final_text = final_text[:20000] + " ..."

    # Prepare chunks; never hand an empty chunk to the model
    chunks = [c for c in chunk_text(final_text, max_len=480) if c]

    # TTS model (also refreshes the module-level sample rate _SR)
    tts = get_tts()

    # The voice selection does not vary per chunk, so resolve it once.
    speaker_wav = _reference_audio_path(voice_ref)
    voice_kwargs = {"speaker_wav": speaker_wav}
    if speaker_wav is None:
        # NOTE(review): Coqui's API appears to reject a multi-speaker model
        # called with neither `speaker` nor `speaker_wav`; fall back to the
        # first built-in speaker when the model lists any -- confirm against
        # the installed TTS version.
        builtin_speakers = getattr(tts, "speakers", None)
        if builtin_speakers:
            voice_kwargs["speaker"] = next(iter(builtin_speakers))

    # Target WAV path
    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")

    # lazy imports: only needed once synthesis actually runs
    import numpy as np
    import soundfile as sf

    try:
        # Create/overwrite the file and append every chunk to it
        with sf.SoundFile(
            out_path, mode="w", samplerate=_SR, channels=1, subtype="PCM_16"
        ) as wav_file:
            for chunk in chunks:
                audio = tts.tts(text=chunk, language=language, **voice_kwargs)
                # Flatten to a 1-D array for the mono stream
                audio = np.asarray(audio).flatten()
                if audio.dtype != np.float32 and audio.dtype != np.float64:
                    audio = audio.astype("float32")
                wav_file.write(audio)
    except Exception:
        # Do not leave a truncated WAV behind when synthesis fails.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise

    return out_path
131
+
132
+ # --- Gradio UI ---
133
# (display name, language code) pairs offered in the language dropdown.
# NOTE(review): Czech, Hungarian and Hindi are appended per the XTTS-v2 model
# card's supported-language list -- confirm against the installed model.
LANG_OPTIONS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Chinese", "zh-cn"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
    ("Arabic", "ar"),
    ("Czech", "cs"),
    ("Hungarian", "hu"),
    ("Hindi", "hi"),
]
149
+
150
with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    # Intro / usage notes shown at the top of the page.
    gr.Markdown(
        """
        # 🔊 High-Quality Text-to-Speech
        - **Upload** a `.docx` or `.txt`, **or** paste/type text.
        - Optionally **clone a voice** by uploading a short (10–30s) reference `.wav`.
        - Choose a **language**, then click **Generate Audio**.
        """
    )

    # Inputs: free text, an optional document, then voice reference + language.
    with gr.Row():
        text_in = gr.Textbox(
            label="Type or paste text",
            lines=8,
            placeholder="Paste text here… (you can also upload a .docx/.txt below)",
        )
    with gr.Row():
        file_in = gr.File(
            label="Drag & drop .docx or .txt (optional)",
            file_types=[".docx", ".txt"],
        )
    with gr.Row():
        voice_ref = gr.File(
            label="Optional: Voice reference (.wav, 10–30s) for cloning",
            file_types=[".wav"],
            visible=True,
        )
        lang = gr.Dropdown(
            # Only the language codes are offered as choices.
            choices=[code for _name, code in LANG_OPTIONS],
            value="en",
            label="Language",
            info="XTTS v2 is multilingual; pick what fits your input.",
        )

    # Action button and the two views of the generated file.
    btn = gr.Button("🎙️ Generate Audio", variant="primary")
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")

    def run(text_input, file_input, language, voice_ref_file):
        """Synthesize once and feed the same WAV path to both outputs."""
        wav_path = synthesize(text_input, file_input, language, voice_ref_file)
        return wav_path, wav_path

    btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download],
    )

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()