Amadeus_Project / api.py
johntito
```
2ef7c17
raw
history blame
2.14 kB
import torch
from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
import base64
import io
from scipy.io.wavfile import write
import sounddevice as sd
# 自定义模块
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
# 检查 PyTorch 版本
print(torch.__version__)
# 检查 CUDA 是否可用
print(torch.cuda.is_available())
# 检查当前 CUDA 版本
print(torch.version.cuda)
# FastAPI 应用
app = FastAPI()
# 请求体模型
class TextRequest(BaseModel):
text: str
# 加载配置和模型
config_path = "configs/steins_gate_base.json"
checkpoint_path = "G_265000.pth"
hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
len(symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
).eval()
utils.load_checkpoint(checkpoint_path, net_g, None)
# 文本到语音合成
def text_to_speech(content):
stn_tst = text_to_sequence(content, hps.data.text_cleaners)
if hps.data.add_blank:
stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)
with torch.no_grad():
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)[0][0, 0].data.float().numpy()
return hps.data.sampling_rate, audio
# API 路由:文本转语音
@app.post("/synthesize")
def synthesize(request: TextRequest):
# 假设 text_to_speech 是生成音频的函数
sampling_rate, audio = text_to_speech(request.text)
# 将音频数据保存到 BytesIO 对象
wav_bytes = io.BytesIO()
write(wav_bytes, sampling_rate, (audio * 32767).astype(np.int16))
wav_bytes.seek(0) # 将指针移动到文件开头
# 将 WAV 文件编码为 Base64
audio_base64 = base64.b64encode(wav_bytes.read()).decode("utf-8")
return {"audio": audio_base64}
# 主函数
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=8000)