Spaces:

eaglelandsonce
/

Text_Audio_Reader

Sleeping

App Files Community

eaglelandsonce committed 9 days ago

Commit

287050f

verified ·

1 Parent(s): e00db2c

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -141

app.py CHANGED Viewed

@@ -1,197 +1,176 @@
-import os
-import io
-import uuid
-import re
-import tempfile
-from typing import Optional, List
-import gradio as gr
-# --- File reading ---
-def read_text_from_file(file_obj) -> str:
-    if file_obj is None:
-        return ""
-    name = getattr(file_obj, "name", "")
-    if not name:
-        return ""
-    ext = os.path.splitext(name)[1].lower()
-    if ext == ".txt":
-        return file_obj.read().decode("utf-8", errors="ignore")
-    elif ext == ".docx":
-        # lazy import to keep startup snappy
-        import docx
-        d = docx.Document(file_obj)
-        return "\n".join([p.text for p in d.paragraphs]).strip()
-    else:
-        raise gr.Error("Unsupported file type. Please upload .txt or .docx")
-# --- Chunking utility (keeps sentences intact, ~350-500 chars each) ---
 _SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")
-def chunk_text(text: str, max_len: int = 450) -> List[str]:
-    # Fast path
     if len(text) <= max_len:
-        return [text.strip()]
-    sentences = [s.strip() for s in _SENT_SPLIT.split(text) if s.strip()]
-    chunks, cur = [], ""
-    for s in sentences:
-        if len(cur) + 1 + len(s) <= max_len:
-            cur = f"{cur} {s}".strip() if cur else s
         else:
-            if cur:
-                chunks.append(cur)
-            # very long single sentence fallback
-            if len(s) > max_len:
                 for i in range(0, len(s), max_len):
                     chunks.append(s[i:i+max_len])
-                cur = ""
             else:
-                cur = s
-    if cur:
-        chunks.append(cur)
     return chunks
-# --- Lazy TTS loader (Coqui XTTS v2) ---
-_TTS = None
-_SR = 22050  # default; will be overwritten after first load if available
 def get_tts():
     global _TTS, _SR
     if _TTS is None:
-        from TTS.api import TTS
-        # Multilingual, high-quality, supports voice cloning via reference audio
-        _TTS = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
         try:
-            _SR = getattr(_TTS, "output_sample_rate", 24000) or 24000
-        except Exception:
-            _SR = 24000
     return _TTS
-# --- Synthesis core ---
-def synthesize(
-    text_input: str,
-    file_input,
-    language: str,
-    voice_ref,  # optional reference audio for cloning
-) -> str:
-    # Collect text from inputs
-    user_text = (text_input or "").strip()
-    file_text = read_text_from_file(file_input) if file_input else ""
-    final_text = (user_text + "\n" + file_text).strip()
     if not final_text:
         raise gr.Error("Please paste/type text or upload a .txt/.docx file.")
-    # Clean + limit length to something reasonable for demo
-    final_text = re.sub(r"\s+", " ", final_text).strip()
     if len(final_text) > 20000:
         final_text = final_text[:20000] + " ..."
-    # Prepare chunks
     chunks = chunk_text(final_text, max_len=480)
-    # TTS model
     tts = get_tts()
-    # Target WAV path
-    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
-    # Synthesize and append to a single WAV
-    import soundfile as sf
-    import numpy as np
-    # Create/overwrite file
-    with sf.SoundFile(out_path, mode="w", samplerate=_SR, channels=1, subtype="PCM_16") as f:
-        for i, chunk in enumerate(chunks, start=1):
-            # If a reference voice is provided, use it
             speaker_wav = None
-            if voice_ref is not None:
-                try:
-                    speaker_wav = voice_ref.name  # temp file path provided by Gradio
-                except Exception:
-                    speaker_wav = None
-            # Generate audio as numpy array
-            audio = tts.tts(
-                text=chunk,
-                language=language,
-                speaker_wav=speaker_wav,  # None => default voice
-            )
-            # Ensure mono float32/float64 -> int16
-            audio = np.asarray(audio).flatten()
-            # Normalize if needed
-            if audio.dtype != np.float32 and audio.dtype != np.float64:
-                audio = audio.astype("float32")
-            # write chunk
-            f.write(audio)
-    return out_path
-# --- Gradio UI ---
 LANG_OPTIONS = [
-    ("English", "en"),
-    ("Spanish", "es"),
-    ("French", "fr"),
-    ("German", "de"),
-    ("Italian", "it"),
-    ("Portuguese", "pt"),
-    ("Polish", "pl"),
-    ("Turkish", "tr"),
-    ("Russian", "ru"),
-    ("Dutch", "nl"),
-    ("Chinese", "zh-cn"),
-    ("Japanese", "ja"),
-    ("Korean", "ko"),
-    ("Arabic", "ar"),
 ]
 with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
     gr.Markdown(
         """
-        # 🔊 High-Quality Text-to-Speech
-        - **Upload** a `.docx` or `.txt`, **or** paste/type text.
-        - Optionally **clone a voice** by uploading a short (10–30s) reference `.wav`.
-        - Choose a **language**, then click **Generate Audio**.
         """
     )
     with gr.Row():
-        text_in = gr.Textbox(
-            label="Type or paste text",
-            lines=8,
-            placeholder="Paste text here… (you can also upload a .docx/.txt below)",
-        )
-    with gr.Row():
-        file_in = gr.File(
-            label="Drag & drop .docx or .txt (optional)",
-            file_types=[".docx", ".txt"],
-        )
-    with gr.Row():
-        voice_ref = gr.File(
-            label="Optional: Voice reference (.wav, 10–30s) for cloning",
-            file_types=[".wav"],
-            visible=True,
-        )
         lang = gr.Dropdown(
-            choices=[v for _, v in LANG_OPTIONS],
             value="en",
             label="Language",
-            info="XTTS v2 is multilingual; pick what fits your input.",
         )
-    btn = gr.Button("🎙️ Generate Audio", variant="primary")
     audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
     download = gr.File(label="Download WAV")
     def run(text_input, file_input, language, voice_ref_file):
-        path = synthesize(text_input, file_input, language, voice_ref_file)
-        return path, path
-    btn.click(
         run,
         inputs=[text_in, file_in, lang, voice_ref],
-        outputs=[audio_out, download],
     )
 if __name__ == "__main__":

+import os, io, uuid, re, tempfile, traceback
+from typing import List
+# ---- Make Spaces happy: force CPU & avoid MPS/CUDA surprises ----
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
+os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
+import numpy as np
+import gradio as gr
+# Lazy flags
+_TTS = None
+_SR = 24000  # XTTS v2 typical output rate
+# ---------- Utilities ----------
 _SENT_SPLIT = re.compile(r"(?<=[\.\!\?\:\;\n])\s+")
+def chunk_text(text: str, max_len: int = 480) -> List[str]:
+    text = re.sub(r"\s+", " ", text).strip()
+    if not text:
+        return []
     if len(text) <= max_len:
+        return [text]
+    sents = [s.strip() for s in _SENT_SPLIT.split(text) if s.strip()]
+    chunks, buf = [], ""
+    for s in sents:
+        if len(buf) + 1 + len(s) <= max_len:
+            buf = f"{buf} {s}".strip() if buf else s
         else:
+            if buf:
+                chunks.append(buf)
+            if len(s) > max_len:  # very long single sentence
                 for i in range(0, len(s), max_len):
                     chunks.append(s[i:i+max_len])
+                buf = ""
             else:
+                buf = s
+    if buf:
+        chunks.append(buf)
     return chunks
+def read_text_from_file(file_obj) -> str:
+    if not file_obj:
+        return ""
+    # gr.File in v4 gives a TempFile with .name path string
+    path = getattr(file_obj, "name", None)
+    if not path or not os.path.exists(path):
+        return ""
+    ext = os.path.splitext(path)[1].lower()
+    if ext == ".txt":
+        with open(path, "rb") as f:
+            return f.read().decode("utf-8", errors="ignore")
+    elif ext == ".docx":
+        try:
+            import docx
+        except Exception:
+            raise gr.Error("python-docx not installed. Check requirements.txt")
+        d = docx.Document(path)
+        return "\n".join(p.text for p in d.paragraphs).strip()
+    else:
+        raise gr.Error("Unsupported file type. Please upload .txt or .docx")
 def get_tts():
     global _TTS, _SR
     if _TTS is None:
         try:
+            from TTS.api import TTS
+        except Exception as e:
+            raise gr.Error(
+                "Coqui TTS is not installed or failed to import. "
+                "Make sure your Space installed requirements.txt.\n\n" + str(e)
+            )
+        # CPU-safe init
+        _TTS = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)
+        # sample rate if exposed
+        _SR = int(getattr(_TTS, "output_sample_rate", 24000) or 24000)
     return _TTS
+def safe_concat_wav(chunks_audio: List[np.ndarray], sr: int, out_path: str) -> str:
+    import soundfile as sf
+    with sf.SoundFile(out_path, mode="w", samplerate=sr, channels=1, subtype="PCM_16") as f:
+        for a in chunks_audio:
+            a = np.asarray(a).flatten().astype("float32")
+            # guard against NaNs/Infs
+            a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)
+            # clamp to [-1, 1]
+            a = np.clip(a, -1.0, 1.0)
+            f.write(a)
+    return out_path
+# ---------- Core pipeline ----------
+def synthesize_pipeline(text_input, file_input, language, voice_ref):
+    # Gather text
+    user = (text_input or "").strip()
+    from_file = read_text_from_file(file_input) if file_input else ""
+    final_text = (user + ("\n" if user and from_file else "") + from_file).strip()
     if not final_text:
         raise gr.Error("Please paste/type text or upload a .txt/.docx file.")
+    # Limit very long inputs so Spaces don't OOM
     if len(final_text) > 20000:
         final_text = final_text[:20000] + " ..."
     chunks = chunk_text(final_text, max_len=480)
+    if not chunks:
+        raise gr.Error("No readable text found.")
     tts = get_tts()
+    # Optional voice clone
+    speaker_wav = None
+    if voice_ref is not None:
+        try:
+            speaker_wav = getattr(voice_ref, "name", None)
+        except Exception:
             speaker_wav = None
+    # Synthesize
+    audios = []
+    for i, ch in enumerate(chunks, 1):
+        audio = tts.tts(text=ch, language=language, speaker_wav=speaker_wav)
+        audios.append(audio)
+    # Write single WAV
+    out_path = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
+    return safe_concat_wav(audios, _SR, out_path)
+# ---------- Gradio UI ----------
 LANG_OPTIONS = [
+    ("English", "en"), ("Spanish", "es"), ("French", "fr"), ("German", "de"),
+    ("Italian", "it"), ("Portuguese", "pt"), ("Polish", "pl"), ("Turkish", "tr"),
+    ("Russian", "ru"), ("Dutch", "nl"), ("Chinese (Simplified)", "zh-cn"),
+    ("Japanese", "ja"), ("Korean", "ko"), ("Arabic", "ar"),
 ]
 with gr.Blocks(title="High-Quality TTS (XTTS v2)") as demo:
     gr.Markdown(
         """
+        # 🔊 High-Quality Text-to-Speech (Coqui XTTS v2)
+        - **Type/paste** text or **upload** `.docx` / `.txt`
+        - Optional: upload a short **.wav** (10–30s) to clone voice
+        - Click **Generate Audio**
         """
     )
+    text_in = gr.Textbox(label="Type or paste text", lines=8, placeholder="Paste text here…")
+    file_in = gr.File(label="Drag & drop .docx / .txt (optional)", file_types=[".docx", ".txt"])
     with gr.Row():
+        voice_ref = gr.File(label="Optional voice reference (.wav, 10–30s)", file_types=[".wav"])
         lang = gr.Dropdown(
+            choices=[code for (_, code) in LANG_OPTIONS],
             value="en",
             label="Language",
         )
+    run_btn = gr.Button("🎙️ Generate Audio", variant="primary")
     audio_out = gr.Audio(label="Result", type="filepath", autoplay=True)
     download = gr.File(label="Download WAV")
+    err_box = gr.Markdown("", elem_id="error_box")
     def run(text_input, file_input, language, voice_ref_file):
+        try:
+            path = synthesize_pipeline(text_input, file_input, language, voice_ref_file)
+            return path, path, ""  # clear errors
+        except Exception as e:
+            tb = traceback.format_exc()
+            # Show a compact, readable error in the UI
+            msg = f"**Error:** {e}\n\n```\n{tb[-1500:]}\n```"
+            return None, None, msg
+    run_btn.click(
         run,
         inputs=[text_in, file_in, lang, voice_ref],
+        outputs=[audio_out, download, err_box],
     )
 if __name__ == "__main__":