import logging

from fastapi import Depends, HTTPException, Query
from fastapi.responses import FileResponse, StreamingResponse
from pydantic import BaseModel

from modules.api import utils as api_utils
from modules.api.Api import APIManager
from modules.api.impl.handler.TTSHandler import TTSHandler
from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.speaker import Speaker

logger = logging.getLogger(__name__)


class TTSParams(BaseModel):
    """Query parameters accepted by the ``/v1/tts`` endpoint.

    Every field is declared through ``Query`` so it is read from the URL
    query string when the model is injected with ``Depends()``.
    """

    text: str = Query(..., description="Text to synthesize")
    spk: str = Query(
        "female2", description="Specific speaker by speaker name or speaker seed"
    )
    style: str = Query("chat", description="Specific style by style name")
    temperature: float = Query(
        0.3, description="Temperature for sampling (may be overridden by style or spk)"
    )
    top_p: float = Query(
        0.5, description="Top P for sampling (may be overridden by style or spk)"
    )
    top_k: int = Query(
        20, description="Top K for sampling (may be overridden by style or spk)"
    )
    seed: int = Query(
        42, description="Seed for generate (may be overridden by style or spk)"
    )
    format: str = Query("mp3", description="Response audio format: [mp3,wav]")
    prompt1: str = Query("", description="Text prompt for inference")
    prompt2: str = Query("", description="Text prompt for inference")
    prefix: str = Query("", description="Text prefix for inference")
    # bs and thr arrive as strings and are converted to int by the handler.
    bs: str = Query("8", description="Batch size for inference")
    thr: str = Query("100", description="Threshold for sentence spliter")
    eos: str = Query("[uv_break]", description="End of sentence str")
    enhance: bool = Query(False, description="Enable enhancer")
    denoise: bool = Query(False, description="Enable denoiser")
    speed: float = Query(1.0, description="Speed of the audio")
    pitch: float = Query(0, description="Pitch of the audio")
    volume_gain: float = Query(0, description="Volume gain of the audio")
    stream: bool = Query(False, description="Stream the audio")


def _parse_int_param(name: str, raw: str) -> int:
    """Convert a string query parameter to ``int`` or reject it with HTTP 422.

    Args:
        name: Parameter name, used only in the error message.
        raw: The raw string value received from the client.

    Returns:
        The parsed integer.

    Raises:
        HTTPException: 422 when ``raw`` is not a valid integer literal.
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        # A malformed number is a client error, not a server failure.
        raise HTTPException(
            status_code=422, detail=f"{name} must be an integer"
        ) from None


async def synthesize_tts(params: TTSParams = Depends()):
    """Synthesize speech for ``params.text`` and return it as an audio response.

    The request is validated, speaker/style presets are resolved through
    ``api_utils.calc_spk_style``, the model/infer/adjust/enhancer configs are
    assembled, and a ``TTSHandler`` produces the audio either as a stream or as
    a single buffer.

    Args:
        params: Query parameters of the request.

    Returns:
        A ``StreamingResponse`` whose media type is ``audio/mpeg`` for mp3 and
        ``audio/wav`` for wav.

    Raises:
        HTTPException: 422 for invalid input (empty text, out-of-range
            sampling values, unsupported format, unknown speaker, non-integer
            ``bs``/``thr``); 500 for any other failure, with the original
            error message as detail.
    """
    try:
        # Validate text
        if not params.text.strip():
            raise HTTPException(
                status_code=422, detail="Text parameter cannot be empty"
            )

        # Validate temperature
        if not (0 <= params.temperature <= 1):
            raise HTTPException(
                status_code=422, detail="Temperature must be between 0 and 1"
            )

        # Validate top_p
        if not (0 <= params.top_p <= 1):
            raise HTTPException(status_code=422, detail="top_p must be between 0 and 1")

        # Validate top_k
        if params.top_k <= 0:
            raise HTTPException(
                status_code=422, detail="top_k must be a positive integer"
            )
        if params.top_k > 100:
            raise HTTPException(
                status_code=422, detail="top_k must be less than or equal to 100"
            )

        # Validate format
        if params.format not in ["mp3", "wav"]:
            raise HTTPException(
                status_code=422,
                detail="Invalid format. Supported formats are mp3 and wav",
            )

        # Validate numeric string parameters up front so that bad input yields
        # a 422 instead of surfacing as a 500 from a bare int() call.
        batch_size = _parse_int_param("bs", params.bs)
        threshold = _parse_int_param("thr", params.thr)

        calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)

        spk = calc_params.get("spk", params.spk)
        if not isinstance(spk, Speaker):
            raise HTTPException(status_code=422, detail="Invalid speaker")

        style = calc_params.get("style", params.style)
        # Explicit request values win; a falsy request value (0 / "") falls
        # back to whatever the speaker/style preset supplies.
        seed = params.seed or calc_params.get("seed", params.seed)
        temperature = params.temperature or calc_params.get(
            "temperature", params.temperature
        )
        prefix = params.prefix or calc_params.get("prefix", params.prefix)
        prompt1 = params.prompt1 or calc_params.get("prompt1", params.prompt1)
        prompt2 = params.prompt2 or calc_params.get("prompt2", params.prompt2)
        eos = params.eos or ""

        tts_config = ChatTTSConfig(
            style=style,
            temperature=temperature,
            top_k=params.top_k,
            top_p=params.top_p,
            prefix=prefix,
            prompt1=prompt1,
            prompt2=prompt2,
        )
        infer_config = InferConfig(
            batch_size=batch_size,
            spliter_threshold=threshold,
            eos=eos,
            seed=seed,
        )
        adjust_config = AdjustConfig(
            pitch=params.pitch,
            speed_rate=params.speed,
            volume_gain_db=params.volume_gain,
        )
        enhancer_config = EnhancerConfig(
            # Denoising runs through the enhancer, so either flag enables it.
            enabled=params.enhance or params.denoise,
            lambd=0.9 if params.denoise else 0.1,
        )

        handler = TTSHandler(
            text_content=params.text,
            spk=spk,
            tts_config=tts_config,
            infer_config=infer_config,
            adjust_config=adjust_config,
            enhancer_config=enhancer_config,
        )

        # "audio/mp3" is not the registered MIME type; mp3 is served as audio/mpeg.
        media_type = "audio/mpeg" if params.format == "mp3" else f"audio/{params.format}"
        audio_format = AudioFormat(params.format)

        if params.stream:
            if infer_config.batch_size != 1:
                # Streaming generation only supports a batch size of 1; the
                # requested value will be ignored.
                # NOTE(review): nothing here actually resets batch_size --
                # presumably TTSHandler enforces it in streaming mode; confirm.
                logger.warning(
                    f"Batch size {infer_config.batch_size} is not supported in streaming mode, will set to 1"
                )

            buffer_gen = handler.enqueue_to_stream(format=audio_format)
            return StreamingResponse(buffer_gen, media_type=media_type)

        buffer = handler.enqueue_to_buffer(format=audio_format)
        return StreamingResponse(buffer, media_type=media_type)

    except Exception as e:
        logger.exception(e)

        if isinstance(e, HTTPException):
            # Already carries the intended status code; pass it through as-is.
            raise
        raise HTTPException(status_code=500, detail=str(e)) from e


def setup(api_manager: APIManager):
    """Register ``synthesize_tts`` as the GET handler for ``/v1/tts``."""
    api_manager.get("/v1/tts", response_class=FileResponse)(synthesize_tts)