# ChatTTS-Forge

# Running on Zero

# File size: 5,602 Bytes

from typing import Union

from fastapi import HTTPException
from pydantic import BaseModel

from modules.api import utils as api_utils
from modules.api.Api import APIManager
from modules.api.impl.handler.SSMLHandler import SSMLHandler
from modules.api.impl.handler.TTSHandler import TTSHandler
from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.speaker import Speaker, speaker_mgr


class SynthesisInput(BaseModel):
    # Content to synthesize. Both fields are optional at the schema level;
    # google_text_synthesize checks `text` first and only falls back to
    # `ssml` when `text` is empty or missing.
    text: Union[str, None] = None
    ssml: Union[str, None] = None


class VoiceSelectionParams(BaseModel):
    # Voice selection plus the sampling parameters used for inference.

    # Language tag of the request; kept for Google API compatibility.
    languageCode: str = "ZH-CN"

    # Speaker name; google_text_synthesize rejects names that
    # speaker_mgr.get_speaker() does not know.
    name: str = "female2"
    # Style passed to api_utils.calc_spk_style together with the name.
    style: str = ""
    # Sampling parameters forwarded to ChatTTSConfig
    # (temperature / top_p / top_k).
    temperature: float = 0.3
    topP: float = 0.7
    topK: int = 20
    # Inference seed forwarded to InferConfig.
    seed: int = 42

    # end_of_sentence marker forwarded to InferConfig.
    eos: str = "[uv_break]"


class AudioConfig(BaseModel):
    # Output encoding and post-processing options.

    # Output container/codec; also determines the MIME type placed in the
    # returned data URI.
    audioEncoding: AudioFormat = AudioFormat.mp3
    # Forwarded to AdjustConfig (speaking_rate / pitch / volume_gain_db).
    speakingRate: float = 1
    pitch: float = 0
    volumeGainDb: float = 0
    # Accepted for API compatibility; the handler in this file does not
    # currently apply it.
    sampleRateHertz: int = 24000
    # Forwarded to InferConfig (batch_size / spliter_threshold).
    batchSize: int = 4
    spliterThreshold: int = 100


class GoogleTextSynthesizeRequest(BaseModel):
    # Request body of the Google-compatible synthesize endpoint.
    input: SynthesisInput
    voice: VoiceSelectionParams
    audioConfig: AudioConfig
    # Optional enhancer settings. The annotation explicitly allows None so
    # that it agrees with the None default (an explicit `null` in the
    # request body then validates as well as an omitted field).
    enhancerConfig: Union[EnhancerConfig, None] = None


class GoogleTextSynthesizeResponse(BaseModel):
    # Response body: google_text_synthesize fills this with a data URI of the
    # form "data:<mime type>;base64,<payload>".
    audioContent: str


async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
    """Synthesize speech from plain text or SSML (Google TTS compatible).

    ``input.text`` takes precedence: ``input.ssml`` is only used when
    ``text`` is empty or missing. The SSML path does not receive the
    resolved speaker or the ChatTTS sampling config; only the infer,
    adjust and enhancer configs are passed to ``SSMLHandler``.

    Args:
        request: Validated request body (input, voice, audioConfig and an
            optional enhancerConfig).

    Returns:
        A dict ``{"audioContent": "data:<mime>;base64,<payload>"}``.

    Raises:
        HTTPException: 422 when the voice name is unknown to
            ``speaker_mgr``, when ``calc_spk_style`` does not resolve it to
            a ``Speaker``, or when neither text nor ssml is provided;
            500 (chained to the original error) for any other failure
            raised while building the handler or encoding the audio.
    """
    synthesis_input = request.input
    voice = request.voice
    audio_config = request.audioConfig
    enhancer_config = request.enhancerConfig

    # Extract parameters. Note that every `or` fallback below also replaces
    # falsy-but-present values (0, "") with the default.
    # TODO: voice.languageCode is not used yet; it should probably be
    # forwarded to the normalizer.
    voice_name = voice.name
    infer_seed = voice.seed or 42
    eos = voice.eos or "[uv_break]"

    audio_format = audio_config.audioEncoding
    # Coerce a plain string into the enum if validation left it as a str.
    if not isinstance(audio_format, AudioFormat) and isinstance(audio_format, str):
        audio_format = AudioFormat(audio_format)

    speaking_rate = audio_config.speakingRate or 1
    pitch = audio_config.pitch or 0
    volume_gain_db = audio_config.volumeGainDb or 0
    batch_size = audio_config.batchSize or 1
    spliter_threshold = audio_config.spliterThreshold or 100
    # TODO: audioConfig.sampleRateHertz is accepted but not applied yet.

    params = api_utils.calc_spk_style(spk=voice_name, style=voice.style)

    # calc_spk_style can also parse the seed form, but this endpoint only
    # intends to support speakers that exist in the speakers list.
    if speaker_mgr.get_speaker(voice_name) is None:
        raise HTTPException(
            status_code=422, detail="The specified voice name is not supported."
        )

    speaker = params.get("spk")
    if not isinstance(speaker, Speaker):
        raise HTTPException(
            status_code=422, detail="The specified voice name is not supported."
        )

    tts_config = ChatTTSConfig(
        style=params.get("style", ""),
        temperature=voice.temperature,
        top_k=voice.topK,
        top_p=voice.topP,
    )
    infer_config = InferConfig(
        batch_size=batch_size,
        spliter_threshold=spliter_threshold,
        eos=eos,
        seed=infer_seed,
    )
    adjust_config = AdjustConfig(
        speaking_rate=speaking_rate,
        pitch=pitch,
        volume_gain_db=volume_gain_db,
    )

    # mp3 uses the registered "audio/mpeg" type rather than "audio/mp3".
    mime_type = f"audio/{audio_format.value}"
    if audio_format == AudioFormat.mp3:
        mime_type = "audio/mpeg"

    # Reject empty input up front: this is a client error, not a synthesis
    # failure, so it should not be logged as one.
    if not synthesis_input.text and not synthesis_input.ssml:
        raise HTTPException(
            status_code=422, detail="Invalid input text or ssml specified."
        )

    try:
        if synthesis_input.text:
            handler = TTSHandler(
                text_content=synthesis_input.text,
                spk=speaker,
                tts_config=tts_config,
                infer_config=infer_config,
                adjust_config=adjust_config,
                enhancer_config=enhancer_config,
            )
        else:
            handler = SSMLHandler(
                ssml_content=synthesis_input.ssml,
                infer_config=infer_config,
                adjust_config=adjust_config,
                enhancer_config=enhancer_config,
            )

        base64_string = handler.enqueue_to_base64(format=audio_format)
        return {"audioContent": f"data:{mime_type};base64,{base64_string}"}

    except HTTPException:
        # Deliberate HTTP errors raised by the handlers pass through as-is.
        raise
    except Exception as e:
        import logging

        logging.exception(e)
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(status_code=500, detail=str(e)) from e


def setup(app: APIManager):
    """Register the Google-compatible synthesize route on ``app``.

    The route is a POST at ``/v1/text:synthesize`` served by
    ``google_text_synthesize`` with ``GoogleTextSynthesizeResponse`` as the
    response model.
    """
    # Endpoint description passed to the route registration; kept verbatim.
    endpoint_description = """
google api document: <br/>
[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)

- 多个属性在本系统中无用仅仅是为了兼容google api
- voice 中的 topP, topK, temperature 为本系统中的参数
- voice.name 即 speaker name （或者speaker seed）
- voice.seed 为 infer seed （可在webui中测试具体作用）

- 编码格式影响的是 audioContent 的二进制格式，所以所有format都是返回带有base64数据的json
        """

    # app.post(...) returns the decorator; apply it to the handler explicitly.
    register_route = app.post(
        "/v1/text:synthesize",
        response_model=GoogleTextSynthesizeResponse,
        description=endpoint_description,
    )
    register_route(google_text_synthesize)