from fastapi import FastAPI, Query, HTTPException from fastapi.responses import StreamingResponse from fastapi.responses import FileResponse from TTS.api import TTS import os from io import BytesIO from typing import Generator import torch from torch.serialization import add_safe_globals from TTS.tts.configs.xtts_config import XttsConfig # Allow this class to be unpickled (safe if you're sure the model is trusted) add_safe_globals([XttsConfig]) app = FastAPI() import os # By using XTTS you agree to CPML license https://coqui.ai/cpml os.environ["COQUI_TOS_AGREED"] = "1" # Initialize the TTS model tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False) # Set gpu=True if you have GPU support # Predefined path to the sample voice clone FIXED_SPEAKER_WAV = "Bible Verses About Community.wav" # Function to split text into chunks def split_text(text: str, words_per_chunk: int = 20): words = text.split() return [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)] @app.post("/generate-audio/") async def generate_audio( text: str = Query(..., description="The input text to convert to speech."), language: str = Query("en", description="Language code for TTS (e.g., 'en' for English).")): if not os.path.exists(FIXED_SPEAKER_WAV): raise HTTPException(status_code=400, detail="Fixed speaker WAV file not found.") if tts.is_multi_lingual and not language: raise ValueError("Language must be specified for multi-lingual models.") # Generate audio for each chunk and yield as bytes output_file = f"out.wav" try: tts.tts_to_file( text=text, file_path=output_file, speaker_wav=FIXED_SPEAKER_WAV, language=language ) print(output_file) # Return the generated audio file as a response return FileResponse(output_file, media_type="audio/wav") except Exception as e: raise HTTPException(status_code=500, detail=f"Error generating audio: {str(e)}") # finally: # # Clean up the generated file after the response is sent # if os.path.exists(output_file): # os.remove(output_file)