Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
 
-# Set cache dirs
+# Set cache dirs BEFORE imports for permission fix
 os.environ["HF_HOME"] = "/tmp"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["TORCH_HOME"] = "/tmp"
@@ -19,7 +19,7 @@ from transformers import VitsModel, AutoTokenizer
 
 app = FastAPI()
 
-# Load model
+# Load model and tokenizer ONCE at startup
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
 
@@ -82,16 +82,31 @@ def normalize_text(text: str) -> str:
     text = text.replace("ZamZam", "SamSam")
     return text
 
+def waveform_to_wav_bytes(waveform: torch.Tensor, sample_rate: int = 22050) -> bytes:
+    np_waveform = waveform.cpu().numpy()
+
+    if np_waveform.ndim == 3:
+        np_waveform = np_waveform[0]
+    if np_waveform.ndim == 2:
+        np_waveform = np_waveform.mean(axis=0)
+
+    np_waveform = np.clip(np_waveform, -1.0, 1.0).astype(np.float32)
+    pcm_waveform = (np_waveform * 32767).astype(np.int16)
+
+    buf = io.BytesIO()
+    scipy.io.wavfile.write(buf, rate=sample_rate, data=pcm_waveform)
+    buf.seek(0)
+    return buf.read()
+
 class TextIn(BaseModel):
     inputs: str
 
 @app.post("/synthesize")
-async def synthesize(data: TextIn, test: bool = Query(False, description="Set true to
+async def synthesize(data: TextIn, test: bool = Query(False, description="Set true to return a test tone")):
     if test:
-        # Generate 2-second 440Hz sine wave for testing playback
         duration_s = 2.0
         sample_rate = 22050
-        t = np.linspace(0, duration_s, int(sample_rate*duration_s), endpoint=False)
+        t = np.linspace(0, duration_s, int(sample_rate * duration_s), endpoint=False)
         freq = 440
         waveform = 0.5 * np.sin(2 * math.pi * freq * t).astype(np.float32)
         pcm_waveform = (waveform * 32767).astype(np.int16)
@@ -101,45 +116,30 @@ async def synthesize(data: TextIn, test: bool = Query(False, description="Set tr
         buf.seek(0)
 
         print(f"[TEST MODE] Generated test tone: {pcm_waveform.shape[0]} samples, Sample rate: {sample_rate}")
-
         return StreamingResponse(buf, media_type="audio/wav")
 
-    # Normalize input text
     text = normalize_text(data.inputs)
-
-    # Tokenize and move to device
     inputs = tokenizer(text, return_tensors="pt").to(device)
 
-    # Generate waveform
     with torch.no_grad():
         output = model(**inputs)
 
-    print("
-
-
-
-
-
-
-
-
-
-    print("Processed waveform shape:", waveform.shape)
-    print("Waveform min/max before clip:", waveform.min(), waveform.max())
-
-    waveform = waveform.astype(np.float32)
-    waveform = np.clip(waveform, -1.0, 1.0)
-
-    pcm_waveform = (waveform * 32767).astype(np.int16)
+    print("Model output type:", type(output))
+    # Try to extract waveform safely:
+    if hasattr(output, "waveform"):
+        waveform = output.waveform
+    elif isinstance(output, dict) and "waveform" in output:
+        waveform = output["waveform"]
+    elif isinstance(output, (tuple, list)):
+        waveform = output[0]
+    else:
+        return {"error": "Waveform not found in model output"}
 
-    print("
-    print("PCM waveform min/max:", pcm_waveform.min(), pcm_waveform.max())
+    print("Extracted waveform shape:", waveform.shape)
 
-    buf = io.BytesIO()
     sample_rate = getattr(model.config, "sampling_rate", 22050)
     print("Sample rate:", sample_rate)
 
-
-    buf.seek(0)
+    wav_bytes = waveform_to_wav_bytes(waveform, sample_rate=sample_rate)
 
-    return StreamingResponse(
+    return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav")
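A note on the waveform extraction chain added in the handler: transformers' VitsModel returns an output object that carries a waveform attribute, so the hasattr branch is the one expected to fire; the dict and tuple branches are fallbacks for other return styles. A minimal sketch of the same branch order, pulled into a hypothetical extract_waveform helper so it can be exercised with dummy outputs:

# Hypothetical helper restating the handler's inline fallback logic.
from types import SimpleNamespace
import torch

def extract_waveform(output):
    if hasattr(output, "waveform"):          # ModelOutput-style (the usual VITS case)
        return output.waveform
    elif isinstance(output, dict) and "waveform" in output:
        return output["waveform"]
    elif isinstance(output, (tuple, list)):  # plain tuple/list returns
        return output[0]
    return None                              # handler maps this to an error response

wf = torch.zeros(1, 100)
assert extract_waveform(SimpleNamespace(waveform=wf)) is wf
assert extract_waveform({"waveform": wf}) is wf
assert extract_waveform([wf]) is wf
assert extract_waveform(object()) is None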
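The new waveform_to_wav_bytes helper calls scipy.io.wavfile.write, which only resolves if the file imports scipy.io.wavfile somewhere above (a bare import scipy does not pull in the submodule). A quick standalone round-trip check, assuming app.py is importable (note that importing it also loads the model) and that numpy, scipy, and torch are installed; the (1, samples) tensor mimics the batch dimension VITS outputs carry:

import io
import numpy as np
import scipy.io.wavfile
import torch

from app import waveform_to_wav_bytes  # assumption: app.py imports cleanly

# One second of 440 Hz sine, shaped (batch, samples).
sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
fake = torch.from_numpy(0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)).unsqueeze(0)

wav_bytes = waveform_to_wav_bytes(fake, sample_rate=sr)
assert wav_bytes[:4] == b"RIFF" and wav_bytes[8:12] == b"WAVE"

# scipy should read back the same rate, length, and dtype.
rate, data = scipy.io.wavfile.read(io.BytesIO(wav_bytes))
assert rate == sr and data.shape == (sr,) and data.dtype == np.int16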
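On the sample rate: MMS-style VITS checkpoints normally expose config.sampling_rate (commonly 16000), so the hard-coded 22050 in the helper signature and in the getattr fallback should only matter when the config lacks the field; writing the WAV at the wrong rate shifts pitch and speed. A one-line check of what this checkpoint actually reports, assuming Hub access:

from transformers import VitsModel

model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
print("config sampling_rate:", getattr(model.config, "sampling_rate", 22050))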