# voicecloneapi / main.py
from fastapi import FastAPI, Query, HTTPException
from fastapi.responses import StreamingResponse
from TTS.api import TTS
import os
from io import BytesIO
from typing import Generator

app = FastAPI()

# Initialize the TTS model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)  # Set gpu=True if you have GPU support

# Predefined path to the sample voice clone
FIXED_SPEAKER_WAV = "Bible Verses About Community.wav"

# Function to split text into chunks
def split_text(text: str, words_per_chunk: int = 20):
    words = text.split()
    return [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
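
# Illustrative example (not part of the original file) of how split_text chunks its input:
#   split_text("In the beginning God created the heaven and the earth", words_per_chunk=4)
#   -> ['In the beginning God', 'created the heaven and', 'the earth']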

# Function to generate audio chunks
def generate_audio_chunks(text: str, language: str, chunk_size: int = 20) -> Generator[bytes, None, None]:
    if tts.is_multi_lingual and not language:
        raise ValueError("Language must be specified for multi-lingual models.")
    text_chunks = split_text(text, chunk_size)
    for chunk in text_chunks:
        # Generate audio for each chunk into an in-memory buffer and yield it as bytes.
        # Note: each yielded chunk is a complete, self-contained WAV file (header included).
        audio_buffer = BytesIO()
        tts.tts_to_file(
            text=chunk,
            file_path=audio_buffer,
            speaker_wav=FIXED_SPEAKER_WAV,
            language=language
        )
        audio_buffer.seek(0)
        yield audio_buffer.read()
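
# Illustrative (assumed) direct use of the generator, writing each chunk to its own file,
# since every chunk is an independent WAV file; the text and filenames are placeholders:
#   for i, wav_bytes in enumerate(generate_audio_chunks("Some long passage of text ...", language="en")):
#       with open(f"chunk_{i}.wav", "wb") as f:
#           f.write(wav_bytes)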

@app.post("/generate-audio/")
async def generate_audio(
    text: str = Query(..., description="The input text to convert to speech."),
    language: str = Query("en", description="Language code for TTS (e.g., 'en' for English).")
):
    if not os.path.exists(FIXED_SPEAKER_WAV):
        raise HTTPException(status_code=400, detail="Fixed speaker WAV file not found.")

    # StreamingResponse to stream audio chunks as they are synthesized
    def audio_stream():
        try:
            for audio_chunk in generate_audio_chunks(text=text, language=language):
                yield audio_chunk
        except Exception as e:
            # Once streaming has started, an exception can only terminate the stream;
            # FastAPI cannot turn it into a separate error response at that point.
            raise HTTPException(status_code=500, detail=str(e))

    return StreamingResponse(audio_stream(), media_type="audio/wav")
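
# A minimal client sketch (assumed usage, not part of the original file). The endpoint is a
# POST whose inputs are query parameters, so the streamed response body can be written
# straight to disk; the URL, port, and "output.wav" filename below are assumptions.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/generate-audio/",
#       params={"text": "For God so loved the world that he gave his only begotten Son.", "language": "en"},
#       stream=True,
#   )
#   resp.raise_for_status()
#   with open("output.wav", "wb") as f:
#       for chunk in resp.iter_content(chunk_size=8192):
#           f.write(chunk)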