megatrump commited on
Commit
b310def
·
1 Parent(s): 792314e
Files changed (5) hide show
  1. Dockerfile +44 -0
  2. api.py +256 -0
  3. build.sh +30 -0
  4. requirements.txt +15 -0
  5. start.sh +7 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- Build stage --------------------------------------------------------------
# ("AS" is uppercased: lowercase "as" trips BuildKit's FromAsCasing warning.)
FROM python:3.12-slim AS builder

# Working directory for the build.
WORKDIR /app

# System packages needed to compile wheels; apt lists are removed to keep the
# layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy only the dependency manifest first so this layer caches well.
COPY requirements.txt .

# Install Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# --- Runtime stage ------------------------------------------------------------
FROM python:3.12-slim

# Create an unprivileged user to run the service.
RUN useradd -m -s /bin/bash app

# Application directory.
WORKDIR /app

# Bring in the installed packages from the build stage, then the app code.
COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
COPY . .

# Hand the tree to the unprivileged user.
RUN chown -R app:app /app

# Drop privileges.
USER app

# Unbuffered stdout/stderr so logs appear immediately.
ENV PYTHONUNBUFFERED=1

# Service port.
EXPOSE 8000

# NOTE(review): api.py initializes the model with device="cuda", but this
# CPU-only slim image has no CUDA runtime — confirm the deployment target.
# Start the API (python -m uvicorn, so the console script need not be copied).
CMD ["python", "-m", "uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
api.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# coding=utf-8

import os
import secrets
from io import BytesIO
from typing import Optional

import numpy as np
import torch
import torchaudio
from dotenv import load_dotenv
from fastapi import FastAPI, File, UploadFile, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from funasr import AutoModel
15
+
16
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Bearer token for the API; the service refuses to start without one.
API_TOKEN = os.getenv("API_TOKEN")
if not API_TOKEN:
    raise RuntimeError("API_TOKEN environment variable is not set")

# HTTP Bearer scheme consumed by the verify_token dependency.
security = HTTPBearer()

app = FastAPI(
    title="SenseVoice API",
    description="语音识别 API 服务",
    version="1.0.0"
)

# Allow cross-origin requests.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True under the CORS spec — confirm whether credentialed
# requests are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load SenseVoice plus an FSMN VAD front-end from the Hugging Face hub.
# NOTE(review): device="cuda" assumes a GPU-enabled torch build; the CPU-only
# python:3.12-slim Docker image in this repo cannot satisfy that — confirm.
model = AutoModel(
    model="FunAudioLLM/SenseVoiceSmall",
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    vad_kwargs={"max_single_segment_time": 30000},
    hub="hf",
    device="cuda"
)
50
+
51
# Rendering tables for SenseVoice's special output tokens (reused from the
# original demo code).

# Emotion token -> emoji appended to a segment.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Event token -> emoji prefixed to a segment.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# Flat token -> replacement table used by format_str and for counting in
# format_str_v2; must contain every key of emo_dict and event_dict.
# NOTE(review): "<|Cough|>" renders as 🤧 in event_dict but 😷 here — both
# tables come from the upstream demo; confirm before unifying.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Every language token is collapsed into a single segment separator.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Emoji classes used when cleaning whitespace and deduplicating markers.
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}


def format_str(s):
    """Replace every SenseVoice special token in *s* per emoji_dict (or drop it)."""
    for sptk, rendering in emoji_dict.items():
        s = s.replace(sptk, rendering)
    return s


def format_str_v2(s):
    """Strip special tokens from *s*, then prefix one marker per event type
    seen and append the dominant emotion's emoji."""
    # Count each special token, then remove it from the text.
    counts = {}
    for sptk in emoji_dict:
        counts[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    # Dominant emotion: strictly more occurrences wins; ties keep the earlier
    # token, defaulting to NEUTRAL (which renders as "").
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if counts[e] > counts[emo]:
            emo = e
    # Prefix one marker for every event type that occurred at all.
    for e in event_dict:
        if counts[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    # Remove stray spaces adjacent to the emoji markers.
    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def format_str_v3(s):
    """Split *s* on language tokens, format each segment via format_str_v2,
    and rejoin while deduplicating repeated event/emotion markers."""

    def get_emo(seg):
        # Trailing char if it is an emotion emoji, else None.
        # (Guarding empty segments fixes an IndexError in the original when a
        # segment was reduced to "" by the event-marker dedup below.)
        return seg[-1] if seg and seg[-1] in emo_set else None

    def get_event(seg):
        # Leading char if it is an event emoji, else None.
        return seg[0] if seg and seg[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        seg = s_list[i]
        if len(seg) == 0:
            continue
        # Drop a leading event marker that merely repeats the previous one.
        if get_event(seg) == cur_ent_event and cur_ent_event is not None:
            seg = seg[1:]
        cur_ent_event = get_event(seg)
        # Collapse a duplicated trailing emotion marker.
        if get_emo(seg) is not None and get_emo(seg) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += seg.strip()
    # Upstream hack carried over verbatim: scrub a spurious "The." artifact.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
168
+
169
+
170
async def process_audio(audio_data: bytes, language: str = "auto") -> str:
    """Decode an in-memory audio file, run SenseVoice, and return formatted text.

    Args:
        audio_data: Raw bytes of an audio file in any format torchaudio can decode.
        language: Language hint for the model ("auto", "zh", "en", ...).

    Returns:
        The recognized text with special tokens rendered via format_str_v3.

    Raises:
        HTTPException: 500 when decoding or inference fails.
    """
    try:
        # Decode to a (channels, samples) float tensor.
        waveform, sample_rate = torchaudio.load(BytesIO(audio_data))

        # Normalize to a 1-D mono waveform: average channels, or squeeze the
        # singleton channel axis. (The original fed the model a 2-D array for
        # mono inputs and a 1-D array for multi-channel ones; a consistent 1-D
        # input is what AutoModel.generate receives here either way —
        # NOTE(review): confirm funasr accepts 1-D float32 input, as presumed.)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0)
        else:
            waveform = waveform.squeeze(0)

        # Resample to the 16 kHz rate the model expects while still a tensor,
        # avoiding the original's numpy -> tensor -> numpy round-trip.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        input_wav = waveform.numpy().astype(np.float32)

        # Model inference.
        text = model.generate(
            input=input_wav,
            cache={},
            language=language,
            use_itn=True,
            batch_size_s=500,
            merge_vad=True
        )

        # Render special tokens in the first (only) result.
        return format_str_v3(text[0]["text"])

    except HTTPException:
        # Don't re-wrap an already-classified HTTP error.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"音频处理失败:{str(e)}")
207
+
208
+
209
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the Bearer token against API_TOKEN.

    Uses secrets.compare_digest (on UTF-8 bytes, since the str form requires
    ASCII) so the comparison is constant-time and does not leak token contents
    through timing differences, unlike the original `!=` check.

    Raises:
        HTTPException: 401 when the presented token does not match.
    """
    supplied = credentials.credentials.encode("utf-8")
    expected = API_TOKEN.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(
            status_code=401,
            detail="Invalid authentication token",
            headers={"WWW-Authenticate": "Bearer"}
        )
    return credentials
218
+
219
@app.post("/v1/audio/transcriptions")
async def transcribe_audio(
    file: UploadFile = File(...),
    model: Optional[str] = "FunAudioLLM/SenseVoiceSmall",
    language: Optional[str] = "auto",
    token: HTTPAuthorizationCredentials = Depends(verify_token)
):
    """Audio transcription endpoint.

    Args:
        file: Audio file (common audio formats are supported).
        model: Model name; only FunAudioLLM/SenseVoiceSmall is accepted.
            (The parameter shadows the module-level AutoModel instance, which
            is safe here because inference happens inside process_audio.)
        language: Language code: auto/zh/en/yue/ja/ko/nospeech.

    Returns:
        {"text": "<recognition result>"}
    """
    # Guard against a missing filename before calling .lower() — the original
    # crashed with AttributeError (-> 500) when no filename was supplied.
    if not file.filename or not file.filename.lower().endswith(
        (".mp3", ".wav", ".flac", ".ogg", ".m4a")
    ):
        raise HTTPException(status_code=400, detail="不支持的音频格式")

    if model != "FunAudioLLM/SenseVoiceSmall":
        raise HTTPException(status_code=400, detail="不支持的模型")

    if language not in ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]:
        raise HTTPException(status_code=400, detail="不支持的语言")

    try:
        content = await file.read()
        text = await process_audio(content, language)
        return {"text": text}

    except HTTPException:
        # Preserve the status code raised downstream instead of collapsing
        # every HTTPException into a generic 500 as the original did.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
252
+
253
+
254
# Development entry point: run an in-process uvicorn server when executed
# directly (the Docker image launches uvicorn via CMD instead).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
build.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Build the Docker image and (re)start the API container.

set -ex

# Configuration.
IMAGE_NAME="sensevoice-api"
CONTAINER_NAME="sensevoice-api-container"
PORT=8000

# Stop and remove a previous container with the same name, if any.
# (Variables are quoted to be safe against word splitting.)
if [ -n "$(docker ps -aq -f name="$CONTAINER_NAME")" ]; then
    echo "停止并删除已存在的容器..."
    docker stop "$CONTAINER_NAME"
    docker rm "$CONTAINER_NAME"
fi

# Build the image.
echo "构建Docker镜像..."
docker build -t "$IMAGE_NAME" .

# Run the container. API_TOKEN can now be supplied from the caller's
# environment; the old hard-coded placeholder remains the fallback.
# SECURITY: replace the placeholder — never commit a real secret here.
echo "启动容器..."
docker run -d \
    --name "$CONTAINER_NAME" \
    -p "$PORT":8000 \
    -e API_TOKEN="${API_TOKEN:-your-secret-token-here}" \
    -e PYTHONUNBUFFERED=1 \
    "$IMAGE_NAME"

echo "容器启动成功!服务运行在 http://localhost:$PORT"
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework and server
fastapi==0.104.1  # modern, fast web API framework
uvicorn==0.24.0  # lightweight ASGI server
python-multipart==0.0.6  # multipart form data parsing for file uploads

# Data processing and scientific computing
numpy==1.26.2  # foundational n-dimensional array library

# Deep learning and audio processing
torch==2.1.1  # PyTorch deep learning framework
torchaudio==2.1.1  # PyTorch audio I/O and transforms
funasr==0.8.1  # speech recognition model toolkit

# Utilities
python-dotenv==1.0.0  # load environment variables from a .env file
start.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch the FastAPI service directly (non-Docker development entry point).

# Honor an API_TOKEN already set in the environment; fall back to the old
# hard-coded placeholder for backward compatibility.
# SECURITY: replace the placeholder — never commit a real secret here.
export API_TOKEN="${API_TOKEN:-your-secret-token-here}"

# exec replaces this shell so uvicorn receives signals (e.g. SIGTERM) directly.
exec python -m uvicorn api:app --host 0.0.0.0 --port 8000