eaglelandsonce committed on
Commit
287050f
·
verified ·
1 Parent(s): e00db2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -141
app.py CHANGED
@@ -1,197 +1,176 @@
1
- import os
2
- import io
3
- import uuid
4
- import re
5
- import tempfile
6
- from typing import Optional, List
7
 
8
- import gradio as gr
 
 
9
 
10
- # --- File reading ---
11
def read_text_from_file(file_obj) -> str:
    """Extract plain text from an uploaded ``.txt`` or ``.docx`` file.

    Args:
        file_obj: Upload handle exposing a ``name`` attribute that holds the
            file's path. It may additionally be readable via ``read()``.
            ``None``, or an object without a usable ``name``, yields ``""``.

    Returns:
        The decoded contents for ``.txt`` files, or the document's paragraphs
        joined by newlines (and stripped) for ``.docx`` files.

    Raises:
        gr.Error: If the extension is neither ``.txt`` nor ``.docx``.
    """
    if file_obj is None:
        return ""
    name = getattr(file_obj, "name", "")
    if not name:
        return ""

    ext = os.path.splitext(name)[1].lower()
    # Prefer the copy on disk: an upload handle is not guaranteed to be
    # readable, opened in binary mode, or positioned at offset 0.
    on_disk = os.path.isfile(name)

    if ext == ".txt":
        if on_disk:
            with open(name, "rb") as fh:
                raw = fh.read()
        else:
            raw = file_obj.read()
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw  # a text-mode handle already produced str
    if ext == ".docx":
        # lazy import to keep startup snappy
        import docx

        document = docx.Document(name if on_disk else file_obj)
        return "\n".join(p.text for p in document.paragraphs).strip()
    raise gr.Error("Unsupported file type. Please upload .txt or .docx")
28
 
29
# --- Chunking utility (keeps sentences intact, ~350-500 chars each) ---
# Split on whitespace that follows sentence-ending punctuation or a newline.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def _split_long_sentence(sentence: str, max_len: int) -> List[str]:
    """Break one over-long sentence into pieces of at most ``max_len`` chars.

    Pieces end on word boundaries whenever possible; only a single token that
    is itself longer than ``max_len`` is sliced mid-word.
    """
    pieces: List[str] = []
    buf = ""
    for word in sentence.split():
        while len(word) > max_len:
            # A token that can never fit: flush what we have, then hard-slice.
            if buf:
                pieces.append(buf)
                buf = ""
            pieces.append(word[:max_len])
            word = word[max_len:]
        if not buf:
            buf = word
        elif len(buf) + 1 + len(word) <= max_len:
            buf = f"{buf} {word}"
        else:
            pieces.append(buf)
            buf = word
    if buf:
        pieces.append(buf)
    return pieces


def chunk_text(text: str, max_len: int = 450) -> List[str]:
    """Split ``text`` into chunks no longer than ``max_len`` characters.

    Whole sentences are packed greedily into each chunk. A sentence longer
    than ``max_len`` is broken at word boundaries, so words are not cut in
    half unless a single word exceeds the limit.

    Args:
        text: Text to split.
        max_len: Maximum chunk length in characters; must be at least 1.

    Returns:
        The chunks in reading order; an empty list for blank input.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    text = text.strip()
    if not text:
        return []
    # Fast path
    if len(text) <= max_len:
        return [text]

    chunks: List[str] = []
    cur = ""
    for sentence in (part.strip() for part in _SENT_SPLIT.split(text)):
        if not sentence:
            continue
        if cur and len(cur) + 1 + len(sentence) <= max_len:
            cur = f"{cur} {sentence}"
            continue
        if cur:
            chunks.append(cur)
            cur = ""
        if len(sentence) > max_len:
            # very long single sentence fallback
            chunks.extend(_split_long_sentence(sentence, max_len))
        else:
            cur = sentence
    if cur:
        chunks.append(cur)
    return chunks
55
 
56
# --- Lazy TTS loader (Coqui XTTS v2) ---
_TTS = None  # cached model; stays None until get_tts() has loaded it
_SR = 22050  # default; will be overwritten after first load if available


def get_tts():
    """Return the shared XTTS v2 model, loading it on the first call.

    The first call imports Coqui TTS, builds the model and caches it in the
    module-level ``_TTS``; later calls return the cached object. ``_SR`` is
    updated during that first load.

    Returns:
        The cached ``TTS`` instance.
    """
    global _TTS, _SR
    if _TTS is None:
        from TTS.api import TTS

        # Multilingual, high-quality, supports voice cloning via reference audio
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

        # NOTE(review): the output rate appears to be exposed on
        # ``model.synthesizer`` rather than on the API wrapper itself, so both
        # are checked - confirm against the installed Coqui TTS version.
        rate = None
        for holder in (getattr(model, "synthesizer", None), model):
            rate = getattr(holder, "output_sample_rate", None)
            if rate:
                break
        _SR = int(rate) if rate else 24000

        # Publish the model only once setup has fully succeeded.
        _TTS = model
    return _TTS
71
 
72
- # --- Synthesis core ---
73
def synthesize(
    text_input: str,
    file_input,
    language: str,
    voice_ref,  # optional reference audio for cloning
) -> str:
    """Turn typed and/or uploaded text into a single WAV file.

    Args:
        text_input: Text typed or pasted by the user; may be empty or None.
        file_input: Optional uploaded ``.txt``/``.docx`` handle.
        language: Language code handed to the TTS model.
        voice_ref: Optional reference-audio upload (a path string or an
            object with a ``name`` path attribute) used for voice cloning.

    Returns:
        Path of the generated WAV file inside the system temp directory.

    Raises:
        gr.Error: If neither input contains any text.
    """
    # Collect text from inputs
    user_text = (text_input or "").strip()
    file_text = read_text_from_file(file_input) if file_input else ""
    final_text = (user_text + "\n" + file_text).strip()

    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Clean + limit length to something reasonable for demo
    final_text = re.sub(r"\s+", " ", final_text).strip()
    if len(final_text) > 20000:
        final_text = final_text[:20000] + " ..."

    # Prepare chunks
    chunks = chunk_text(final_text, max_len=480)

    # TTS model
    tts = get_tts()

    # Voice selection is the same for every chunk, so resolve it once.
    if isinstance(voice_ref, str):
        speaker_wav = voice_ref
    else:
        speaker_wav = getattr(voice_ref, "name", None)
    if speaker_wav:
        voice_kwargs = {"speaker_wav": speaker_wav}
    else:
        # NOTE(review): XTTS v2 is understood to reject calls that name
        # neither a speaker nor a reference wav; fall back to the first
        # built-in speaker when the model lists any - confirm against the
        # installed Coqui TTS version.
        builtin = getattr(tts, "speakers", None)
        voice_kwargs = {"speaker": builtin[0]} if builtin else {"speaker_wav": None}

    # Target WAV path
    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")

    # Synthesize and append to a single WAV
    import soundfile as sf
    import numpy as np

    try:
        # Create/overwrite file
        with sf.SoundFile(out_path, mode="w", samplerate=_SR, channels=1, subtype="PCM_16") as f:
            for chunk in chunks:
                audio = tts.tts(text=chunk, language=language, **voice_kwargs)
                # Flatten to a 1-D sample array
                audio = np.asarray(audio).flatten()
                # Non-float samples are converted to float32 before writing
                if audio.dtype != np.float32 and audio.dtype != np.float64:
                    audio = audio.astype("float32")
                f.write(audio)
    except Exception:
        # Do not leave a truncated WAV behind in the temp directory.
        if os.path.exists(out_path):
            os.remove(out_path)
        raise

    return out_path
 
 
 
 
 
 
 
 
131
 
132
# --- Gradio UI ---
# (display name, language code) pairs. Only the codes are passed to the
# dropdown below; the display names are not used by the visible code.
LANG_OPTIONS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Chinese", "zh-cn"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
    ("Arabic", "ar"),
]

# NOTE(review): nesting of the components inside the rows was reconstructed
# from statement order - confirm against the original layout.
with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    # Usage instructions shown on the page.
    gr.Markdown(
        """
# 🔊 High-Quality Text-to-Speech
- **Upload** a `.docx` or `.txt`, **or** paste/type text.
- Optionally **clone a voice** by uploading a short (10–30s) reference `.wav`.
- Choose a **language**, then click **Generate Audio**.
        """
    )

    # Inputs: free text, an optional document, an optional voice sample and
    # the language code.
    with gr.Row():
        text_in = gr.Textbox(
            label="Type or paste text",
            lines=8,
            placeholder="Paste text here… (you can also upload a .docx/.txt below)",
        )
    with gr.Row():
        file_in = gr.File(
            label="Drag & drop .docx or .txt (optional)",
            file_types=[".docx", ".txt"],
        )
    with gr.Row():
        voice_ref = gr.File(
            label="Optional: Voice reference (.wav, 10–30s) for cloning",
            file_types=[".wav"],
            visible=True,
        )
        lang = gr.Dropdown(
            choices=[v for _, v in LANG_OPTIONS],
            value="en",
            label="Language",
            info="XTTS v2 is multilingual; pick what fits your input.",
        )

    btn = gr.Button("🎙️ Generate Audio", variant="primary")
    # The same generated file feeds both the player and the download link.
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")

    def run(text_input, file_input, language, voice_ref_file):
        """Click handler: synthesize once and return the path for both outputs."""
        path = synthesize(text_input, file_input, language, voice_ref_file)
        return path, path

    btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download],
    )
196
 
197
  if __name__ == "__main__":
 
1
+ import os, io, uuid, re, tempfile, traceback
2
+ from typing import List
 
 
 
 
3
 
4
# ---- Make Spaces happy: default to CPU & avoid MPS/CUDA surprises ----
# setdefault only fills in a value when the variable is not already set, so a
# value supplied by the hosting environment still takes precedence.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

# These imports come after the environment defaults above.
import numpy as np
import gradio as gr

# Lazy flags: both are assigned by get_tts() the first time the model is loaded.
_TTS = None  # cached model instance; None until first use
_SR = 24000  # XTTS v2 typical output rate
 
 
 
 
 
 
 
14
 
15
# ---------- Utilities ----------
# Split on whitespace that follows sentence-ending punctuation or a newline.
_SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")


def _wrap_words(sentence: str, max_len: int) -> List[str]:
    """Wrap one over-long sentence into pieces of at most ``max_len`` chars.

    Breaks fall between words; a single word longer than ``max_len`` is the
    only thing that gets sliced mid-word.
    """
    pieces: List[str] = []
    line = ""
    for word in sentence.split():
        while len(word) > max_len:
            # This token can never fit on its own: flush, then hard-slice it.
            if line:
                pieces.append(line)
                line = ""
            pieces.append(word[:max_len])
            word = word[max_len:]
        if not line:
            line = word
        elif len(line) + 1 + len(word) <= max_len:
            line = f"{line} {word}"
        else:
            pieces.append(line)
            line = word
    if line:
        pieces.append(line)
    return pieces


def chunk_text(text: str, max_len: int = 480) -> List[str]:
    """Normalize whitespace and split ``text`` into chunks of <= ``max_len`` chars.

    Runs of whitespace are collapsed to single spaces first. Whole sentences
    are then packed greedily into each chunk; a sentence longer than
    ``max_len`` is wrapped at word boundaries instead of being cut mid-word.

    Args:
        text: Text to split.
        max_len: Maximum chunk length in characters; must be at least 1.

    Returns:
        The chunks in reading order; an empty list when ``text`` is blank.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    chunks: List[str] = []
    buf = ""
    for sent in (part.strip() for part in _SENT_SPLIT.split(text)):
        if not sent:
            continue
        if buf and len(buf) + 1 + len(sent) <= max_len:
            buf = f"{buf} {sent}"
            continue
        if buf:
            chunks.append(buf)
            buf = ""
        if len(sent) > max_len:  # very long single sentence
            chunks.extend(_wrap_words(sent, max_len))
        else:
            buf = sent
    if buf:
        chunks.append(buf)
    return chunks
41
 
42
def read_text_from_file(file_obj) -> str:
    """Read the text of an uploaded ``.txt`` or ``.docx`` file.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object whose ``name`` attribute holds the path. Falsy
            values, and paths that do not exist, yield ``""``.

    Returns:
        The decoded file contents for ``.txt`` (a leading UTF-8 byte-order
        mark is dropped), or the paragraphs joined by newlines for ``.docx``.

    Raises:
        gr.Error: If ``python-docx`` cannot be imported for a ``.docx`` file,
            or if the extension is neither ``.txt`` nor ``.docx``.
    """
    if not file_obj:
        return ""
    # Accept a bare path as well as an upload object carrying the path in .name
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = getattr(file_obj, "name", None)
    if not path or not os.path.exists(path):
        return ""

    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "rb") as f:
            # utf-8-sig strips a byte-order mark so it is not passed on as text
            return f.read().decode("utf-8-sig", errors="ignore")
    if ext == ".docx":
        try:
            import docx
        except ImportError as exc:
            raise gr.Error("python-docx not installed. Check requirements.txt") from exc
        d = docx.Document(path)
        return "\n".join(p.text for p in d.paragraphs).strip()
    raise gr.Error("Unsupported file type. Please upload .txt or .docx")
62
 
def get_tts():
    """Return the shared XTTS v2 model, creating it on the first call.

    The first call imports Coqui TTS, builds the model with ``gpu=False`` and
    caches it in the module-level ``_TTS``; ``_SR`` is updated during that
    same load. Later calls return the cached object.

    Returns:
        The cached ``TTS`` instance.

    Raises:
        gr.Error: If ``TTS.api`` cannot be imported.
    """
    global _TTS, _SR
    if _TTS is None:
        try:
            from TTS.api import TTS
        except Exception as e:
            # Broad on purpose: any failure while importing is reported the same way.
            raise gr.Error(
                "Coqui TTS is not installed or failed to import. "
                "Make sure your Space installed requirements.txt.\n\n" + str(e)
            ) from e
        # CPU-safe init
        model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)

        # sample rate if exposed
        # NOTE(review): the rate appears to be exposed on ``model.synthesizer``
        # rather than on the API wrapper itself, so both are checked - confirm
        # against the installed Coqui TTS version.
        rate = None
        for holder in (getattr(model, "synthesizer", None), model):
            rate = getattr(holder, "output_sample_rate", None)
            if rate:
                break
        _SR = int(rate) if rate else 24000

        # Publish the model only once setup has fully succeeded.
        _TTS = model
    return _TTS
78
 
79
def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
    """Write the given audio segments back-to-back into one mono 16-bit WAV.

    Args:
        chunks_audio: Audio segments, written in the order given.
        sr: Sample rate recorded in the file.
        out_path: Destination path; the file is opened in write mode.

    Returns:
        ``out_path``, unchanged.
    """
    import soundfile as sf

    with sf.SoundFile(out_path, mode="w", samplerate=sr, channels=1, subtype="PCM_16") as wav:
        for segment in chunks_audio:
            samples = np.asarray(segment).reshape(-1).astype(np.float32)
            # Non-finite samples become silence, then everything is limited to [-1, 1].
            cleaned = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)
            wav.write(np.clip(cleaned, -1.0, 1.0))
    return out_path
90
+
91
+ # ---------- Core pipeline ----------
92
def synthesize_pipeline(text_input, file_input, language, voice_ref):
    """Turn typed and/or uploaded text into a single WAV file.

    Args:
        text_input: Text typed or pasted by the user; may be empty or None.
        file_input: Optional uploaded ``.txt``/``.docx`` file.
        language: Language code handed to the TTS model.
        voice_ref: Optional reference-audio upload (a path string or an
            object with a ``name`` path attribute) used for voice cloning.

    Returns:
        Path of the generated WAV file inside the system temp directory.

    Raises:
        gr.Error: If no text was supplied, or none survives chunking.
    """
    # Gather text
    user = (text_input or "").strip()
    from_file = read_text_from_file(file_input) if file_input else ""
    final_text = (user + ("\n" if user and from_file else "") + from_file).strip()

    if not final_text:
        raise gr.Error("Please paste/type text or upload a .txt/.docx file.")

    # Limit very long inputs so Spaces don't OOM
    if len(final_text) > 20000:
        final_text = final_text[:20000] + " ..."

    chunks = chunk_text(final_text, max_len=480)
    if not chunks:
        raise gr.Error("No readable text found.")

    tts = get_tts()

    # Optional voice clone: accept a bare path or an upload object with .name
    if isinstance(voice_ref, str):
        speaker_wav = voice_ref
    else:
        speaker_wav = getattr(voice_ref, "name", None)

    if speaker_wav:
        voice_kwargs = {"speaker_wav": speaker_wav}
    else:
        # NOTE(review): XTTS v2 is understood to reject calls that name
        # neither a speaker nor a reference wav; fall back to the first
        # built-in speaker when the model lists any - confirm against the
        # installed Coqui TTS version.
        builtin = getattr(tts, "speakers", None)
        voice_kwargs = {"speaker": builtin[0]} if builtin else {"speaker_wav": None}

    # Synthesize
    audios = [tts.tts(text=ch, language=language, **voice_kwargs) for ch in chunks]

    # Write single WAV
    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
    return safe_concat_wav(audios, _SR, out_path)
128
 
129
# ---------- Gradio UI ----------
# (display name, language code) pairs, used directly as dropdown choices.
LANG_OPTIONS = [
    ("English", "en"), ("Spanish", "es"), ("French", "fr"), ("German", "de"),
    ("Italian", "it"), ("Portuguese", "pt"), ("Polish", "pl"), ("Turkish", "tr"),
    ("Russian", "ru"), ("Dutch", "nl"), ("Chinese (Simplified)", "zh-cn"),
    ("Japanese", "ja"), ("Korean", "ko"), ("Arabic", "ar"),
]

# NOTE(review): which components sit inside the row was reconstructed from
# statement order - confirm against the original layout.
with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
    gr.Markdown(
        """
# 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)
- **Type/paste** text or **upload** `.docx` / `.txt`
- Optional: upload a short **.wav** (10–30s) to clone voice
- Click **Generate Audio**
        """
    )
    text_in = gr.Textbox(label="Type or paste text", lines=8, placeholder="Paste text here…")
    file_in = gr.File(label="Drag & drop .docx / .txt (optional)", file_types=[".docx", ".txt"])
    with gr.Row():
        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)", file_types=[".wav"])
        lang = gr.Dropdown(
            # (name, value) tuples: the user sees the language name while the
            # handler still receives the code, so the value stays "en" etc.
            choices=LANG_OPTIONS,
            value="en",
            label="Language",
        )
    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")
    audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
    download = gr.File(label="Download WAV")
    err_box = gr.Markdown("", elem_id="error_box")

    def run(text_input, file_input, language, voice_ref_file):
        """Click handler: return (audio path, download path, error markdown)."""
        try:
            path = synthesize_pipeline(text_input, file_input, language, voice_ref_file)
            return path, path, ""  # clear errors
        except gr.Error as e:
            # Expected, user-facing validation problems: message only, no traceback.
            return None, None, f"**Error:** {getattr(e, 'message', None) or e}"
        except Exception as e:
            tb = traceback.format_exc()
            # Show a compact, readable error in the UI
            msg = f"**Error:** {e}\n\n```\n{tb[-1500:]}\n```"
            return None, None, msg

    run_btn.click(
        run,
        inputs=[text_in, file_in, lang, voice_ref],
        outputs=[audio_out, download, err_box],
    )
175
 
176
  if __name__ == "__main__":