Merge branch 'vad'
* vad:
[fix]: update parameter.
[fix]: requirements.
[fix]: update parameter.
fix np array copy error
add vad update_silence_ms adapter
[fix]: parameter.
update
filter [] words
Disable FunASR pbar.
[fix]: remove unused file.
fix bug of lost segments
update text threshold
[fix]: test dynamic vad.
update to vad streaming
[fix]: update web.
update pipelines launch wait
ignore write to wav in assets
update config of save data to save flag
fix words missing
Integrate FunASR.
- config.py +4 -5
- pyproject.toml +1 -0
- requirements.txt +4 -0
- transcribe/helpers/funasr.py +37 -0
- transcribe/helpers/vadprocessor.py +301 -5
- transcribe/helpers/whisper.py +1 -1
- transcribe/pipelines/__init__.py +3 -2
- transcribe/pipelines/base.py +1 -0
- transcribe/pipelines/pipe_funasr.py +73 -0
- transcribe/pipelines/pipe_vad.py +96 -18
- transcribe/pipelines/pipe_whisper.py +2 -5
- transcribe/translatepipes.py +29 -34
- transcribe/utils.py +45 -0
- transcribe/whisper_llm_serve.py +123 -65
- uv.lock +0 -0
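
Not part of the diff: a rough sketch of how the merged pieces are meant to be wired together, using only names that appear in the changes below. The 100 ms chunk and the bare constructor calls are illustrative, not the project's actual client code.

# Hypothetical wiring sketch; assumes the TranslatePipes/VadPipe/FunASRPipe interfaces shown in this merge.
import numpy as np
from transcribe.translatepipes import TranslatePipes

tp = TranslatePipes()        # launches WhisperPipe, FunASRPipe, Translate7BPipe and VadPipe worker processes
tp.wait_ready()              # new in this merge: waits on every registered pipe

chunk = np.zeros(1600, dtype=np.float32)       # 100 ms of 16 kHz float32 audio, stand-in for mic input
vad_item = tp.voice_detect(chunk.tobytes())    # VadPipe returns trimmed audio plus speech_status ("START"/"END")

if vad_item.speech_status == "END" and vad_item.audio:
    asr_item = tp.transcrible(vad_item.audio, "zh")   # Chinese is routed to FunASRPipe, other languages to Whisper
    print(asr_item.transcribe_content)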
config.py
CHANGED
@@ -3,17 +3,16 @@ import re
 import logging
 
 DEBUG = True
-TEST = False
-logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
-
 
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
     filename='translator.log',
     datefmt="%H:%M:%S"
 )
-
+# save pipelines data to disk
+SAVE_DATA_SAVE = False
 # Add terminal log
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
@@ -22,7 +21,7 @@ console_handler.setFormatter(console_formatter)
 logging.getLogger().addHandler(console_handler)
 
 # 文字输出长度阈值
-TEXT_THREHOLD =
+TEXT_THREHOLD = 6
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
pyproject.toml
CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "av>=14.2.0",
     "fastapi>=0.115.12",
+    "funasr>=1.2.6",
     "librosa>=0.11.0",
     "numpy>=2.1.3",
     "onnxruntime>=1.21.0",
requirements.txt
CHANGED
@@ -154,6 +154,9 @@ torch==2.6.0
     # silero-vad
     # torchaudio
 torchaudio==2.6.0
+ane_transformers
+openai-whisper
+coremltools
     # via silero-vad
 tqdm==4.67.1
     # via
@@ -184,3 +187,4 @@ websockets==15.0.1
     # via trans (pyproject.toml)
 wordninja==2.0.0
     # via trans (pyproject.toml)
+funasr==1.2.6
transcribe/helpers/funasr.py
ADDED
@@ -0,0 +1,37 @@
+import time
+import uuid
+from logging import getLogger
+
+import numpy as np
+from funasr import AutoModel
+import soundfile as sf
+
+import config
+
+logger = getLogger(__name__)
+
+
+class FunASR:
+    def __init__(self, source_lange: str = 'en', warmup=True) -> None:
+        self.source_lange = source_lange
+
+        self.model = AutoModel(
+            model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc"
+        )
+        if warmup:
+            self.warmup()
+
+    def warmup(self, warmup_steps=1):
+        warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
+        for _ in range(warmup_steps):
+            self.model.generate(input=warmup_soundfile)
+
+    def transcribe(self, audio_buffer: bytes, language):
+        audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
+        # sf.write(f'{config.ASSERT_DIR}/{time.time()}.wav', audio_frames, samplerate=16000)
+        try:
+            output = self.model.generate(input=audio_frames, disable_pbar=True)
+            return output
+        except Exception as e:
+            logger.error(e)
+            return []
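
Not part of the diff: a minimal usage sketch for the new helper above. The model names come straight from the diff; treating the output as FunASR's usual list of dicts with a 'text' key is an assumption.

# Sketch only: exercising transcribe/helpers/funasr.py with synthetic audio.
import numpy as np
from transcribe.helpers.funasr import FunASR

asr = FunASR(warmup=False)                      # skip the jfk.flac warmup for a quick check
audio = np.zeros(16000, dtype=np.float32)       # one second of float32 PCM at 16 kHz
result = asr.transcribe(audio.tobytes(), "zh")  # the helper expects raw float32 bytes
for item in result:                             # FunASR typically yields a list of dicts
    print(item.get("text", ""))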
transcribe/helpers/vadprocessor.py
CHANGED
@@ -2,10 +2,47 @@ from copy import deepcopy
 from queue import Queue, Empty
 from time import time
 from config import VAD_MODEL_PATH
-
+from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
-
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment
+from collections import deque
+
+class AdaptiveSilenceController:
+    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
+        self.base = base_silence_ms
+        self.min = min_ms
+        self.max = max_ms
+        self.recent_silences = deque(maxlen=20)
+        self.recent_speeches = deque(maxlen=20)
+
+    def update_silence(self, duration_ms):
+        self.recent_silences.append(duration_ms)
+
+    def update_speech(self, duration_ms):
+        self.recent_speeches.append(duration_ms)
+
+    def get_adaptive_silence_ms(self):
+        # 1. 快速说话特征:平均语音段长度短(如 < 250ms)
+        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
+        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
+
+        # 2. 快速语音则缩短 silence 阈值
+        speed_factor = 1.0
+        if avg_speech < 300:
+            speed_factor = 0.5
+        elif avg_speech < 600:
+            speed_factor = 0.8
+
+        # 3. silence 的变化趋势也考虑进去
+        adaptive = self.base * speed_factor + 0.3 * avg_silence
+
+        return int(max(self.min, min(self.max, adaptive)))
+
+
 class OnnxWrapper():
 
     def __init__(self, path, force_onnx_cpu=False):
@@ -108,6 +145,7 @@ class VADIteratorOnnx:
                  sampling_rate: int = 16000,
                  min_silence_duration_ms: int = 100,
                  max_speech_duration_s: float = float('inf'),
+                 speech_pad_ms: int = 30
                  ):
         self.model = OnnxWrapper(VAD_MODEL_PATH, True)
         self.threshold = threshold
@@ -118,7 +156,7 @@ class VADIteratorOnnx:
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
-
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
 
     def reset_states(self):
@@ -153,7 +191,8 @@ class VADIteratorOnnx:
 
         if (speech_prob >= self.threshold) and not self.triggered:
             self.triggered = True
-            speech_start = max(0, self.current_sample - window_size_samples)
+            # speech_start = max(0, self.current_sample - window_size_samples)
+            speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
 
@@ -169,7 +208,8 @@
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return None
             else:
-                speech_end = self.temp_end - window_size_samples
+                # speech_end = self.temp_end - window_size_samples
+                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
                 self.temp_end = 0
                 self.triggered = False
                 return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
@@ -178,6 +218,33 @@
 
 
 
+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
 class VadV2:
     def __init__(self,
                  threshold: float = 0.5,
@@ -269,6 +336,235 @@ class VadV2:
         return None
 
 
+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio: np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+        """
+        Save timestamps in both second and sample indices formats.
+        Args:
+            timestamps (list): List of (start, end) tuples
+            output_prefix (str): Prefix for output files
+        """
+        # Save timestamps in seconds (VTT format)
+        seconds_path = f"{output_prefix}_timestamps_second.txt"
+        with open(seconds_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in seconds format")
+            for start, end in timestamps:
+                s_time = self.format_time(start)
+                e_time = self.format_time(end)
+                line = f"{s_time} --> {e_time}\n"
+                file.write(line)
+
+        # Save timestamps in sample indices
+        indices_path = f"{output_prefix}_timestamps_indices.txt"
+        with open(indices_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in indices format")
+            for start, end in timestamps:
+                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+                file.write(line)
+
+        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array: np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+
 
 class VadProcessor:
     def __init__(
transcribe/helpers/whisper.py
CHANGED
@@ -52,7 +52,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             # token_timestamps=True,
-
+            split_on_word=True,
             # max_len=max_len
         )
         return output
transcribe/pipelines/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 
+from .base import MetaItem
 from .pipe_translate import TranslatePipe, Translate7BPipe
-from .pipe_whisper import WhisperPipe, WhisperChinese
 from .pipe_vad import VadPipe
-from .
+from .pipe_whisper import WhisperPipe, WhisperChinese
+from .pipe_funasr import FunASRPipe
transcribe/pipelines/base.py
CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
     translate_content: str = ''
     source_language: str = 'zh'
     destination_language: str = 'en'
+    speech_status: str = 'END'  # "END", "START"
 
 
 class BasePipe(Process):
transcribe/pipelines/pipe_funasr.py
ADDED
@@ -0,0 +1,73 @@
+import unicodedata
+
+from .base import MetaItem, BasePipe, Segment
+from ..helpers.funasr import FunASR
+
+
+class FunASRPipe(BasePipe):
+    funasr = None
+
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR()
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        audio_data = in_data.audio
+        source_language = in_data.source_language
+        result = self.funasr.transcribe(audio_data, source_language)
+
+        # 处理 FunASR 的输出结果
+        if result and isinstance(result, list) and 'text' in result[0]:
+            # FunASR 输出格式为包含文本和时间戳的字典列表
+            segments = []
+            texts = []
+
+            for item in result:
+                text = item.get('text', '')
+                start = item.get('start', 0)
+                end = item.get('end', 0)
+                segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                texts.append(text)
+
+            in_data.segments = segments
+            in_data.transcribe_content = "".join(texts)
+        else:
+            # 如果 FunASR 返回的是单个文本字符串或其他格式
+            if isinstance(result, str):
+                in_data.transcribe_content = result
+                in_data.segments = [Segment(t0=0, t1=0, text=self.filter_chinese_printable(result))]
+            elif result and hasattr(result[0], 'text'):
+                # 如果是对象列表
+                segments = []
+                texts = []
+                for item in result:
+                    text = item.text
+                    start = getattr(item, 'start', 0) or getattr(item, 't0', 0)
+                    end = getattr(item, 'end', 0) or getattr(item, 't1', 0)
+                    segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                    texts.append(text)
+
+                in_data.segments = segments
+                in_data.transcribe_content = "".join(texts)
+            else:
+                in_data.transcribe_content = ""
+                in_data.segments = []
+
+        in_data.audio = b""
+        return in_data
+
+    def filter_chinese_printable(self, s):
+        printable = []
+        bytearray_chars = s.encode('utf-8')
+        for char in bytearray_chars.decode('utf-8', errors='replace'):
+            if unicodedata.category(char) != 'Cc':  # 不可打印字符的分类为 'Cc'
+                printable.append(char)
+        return ''.join(printable).strip()
+
+
+class FunASRChinese(FunASRPipe):
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR(source_lange='zh')
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -1,41 +1,119 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import
+from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
+
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
 import logging
-
+import time
 # import noisereduce as nr
 
 
 class VadPipe(BasePipe):
     vac = None
     sample_rate = 16000
-
-
-
-
-
+
+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0  # 处理的frame size offset
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
 
 
+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+        self.vac.reset_states()
 
     @classmethod
     def init(cls):
         if cls.vac is None:
-            cls.vac =
-
-
-
-
-
-
-
-
-            return in_data
+            cls.vac = FixedVADIterator(
+                threshold=0.5,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms = 100,
+                # speech_pad_ms = 30,
+                max_speech_duration_s=20.0,
+            )
+            cls.vac.reset_states()
 
 
     # def reduce_noise(self, data):
     #     return nr.reduce_noise(y=data, sr=self.sample_rate)
 
-
+    def _process_speech_chunk(self, source_audio: np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = max(0, end_frame - self._offset)
+            return relative_start_frame, relative_end_frame
+
+    def update_silence_ms(self):
+        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
+        min_silence_samples = self.sample_rate * min_silence / 1000
+        self.vac.min_silence_samples = min_silence_samples
+        logging.warning(f"🫠 update_silence_ms :{min_silence} => current: {self.vac.min_silence_samples} ")
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        if self._offset == 0:
+            self.vac.reset_states()
+
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data:  # 表示有音频的变化点出现
+            # self.update_silence_ms()
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame is not None and rel_end_frame is None:
+                self._status = "START"  # 语音开始
+                target_audio = source_audio[rel_start_frame:]
+
+                # 计算上一段静音长度
+                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_silence(silence_len)
+                self.last_state_change_offset = self._offset + rel_start_frame
+
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif rel_start_frame is None and rel_end_frame is not None:
+                self._status = "END"  # 音频结束
+                target_audio = source_audio[:rel_end_frame]
+
+                speech_len = (rel_end_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(speech_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+
+                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(seg_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+            # logging.debug("❌ No valid speech segment detected, setting status to END")
+        else:
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else:  # end
+                target_audio = np.array([], dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")
+
+        self._offset += len(source_audio)
+
+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
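
Not part of the diff: a small consumer-side sketch of the speech_status field VadPipe now emits, mirroring the buffering logic added to whisper_llm_serve.py further down; on_vad_result and the module-level buffer are illustrative.

# Sketch only: turn VadPipe's START/END statuses into finished utterances.
import numpy as np

buffer = np.array([], dtype=np.float32)
segments = []

def on_vad_result(item):
    """item is a MetaItem coming back from VadPipe.process."""
    global buffer
    chunk = np.frombuffer(item.audio, dtype=np.float32)
    if len(chunk):
        buffer = np.append(buffer, chunk)          # keep accumulating while speech is ongoing
    if item.speech_status == "END" and len(buffer):
        segments.append(buffer.copy())             # a finished utterance, ready for ASR
        buffer = np.array([], dtype=np.float32)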
transcribe/pipelines/pipe_whisper.py
CHANGED
@@ -1,19 +1,17 @@
-
 import unicodedata
+
 from .base import MetaItem, BasePipe, Segment
 from ..helpers.whisper import WhisperCPP
 
+
 class WhisperPipe(BasePipe):
     whisper = None
 
-
-
     @classmethod
     def init(cls):
         if cls.whisper is None:
             # cls.zh_whisper = WhisperCPP(source_lange='zh')
             cls.whisper = WhisperCPP()
-
 
     def process(self, in_data: MetaItem) -> MetaItem:
         audio_data = in_data.audio
@@ -32,7 +30,6 @@ class WhisperPipe(BasePipe):
             if unicodedata.category(char) != 'Cc':  # 不可打印字符的分类为 'Cc'
                 printable.append(char)
         return ''.join(printable).strip()
-
 
 
 class WhisperChinese(WhisperPipe):
transcribe/translatepipes.py
CHANGED
@@ -1,86 +1,81 @@
-from transcribe.pipelines import WhisperPipe,
-import multiprocessing as mp
-import config
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe
 
 
 class TranslatePipes:
     def __init__(self) -> None:
-
-        # self.whisper_input_q = mp.Queue()
+        # self.whisper_input_q = mp.Queue()
         # self.translate_input_q = mp.Queue()
         # self.result_queue = mp.Queue()
-
+        self._process = []
         # whisper 转录
         self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
-
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        self._funasr_pipe = self._launch_process(FunASRPipe())
+
         # llm 翻译
         # self._translate_pipe = self._launch_process(TranslatePipe())
 
         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-
-
+        self._vad_pipe = self._launch_process(VadPipe())
+
     # def reset(self):
     #     self._vad_pipe.reset()
-
+
     def _launch_process(self, process_obj):
         process_obj.daemon = True
         process_obj.start()
+        self._process.append(process_obj)
         return process_obj
 
     def wait_ready(self):
-        self.
-
-
-        # self._vad_pipe.wait()
-        self._translate_7b_pipe.wait()
-
+        for p in self._process:
+            p.wait()
+
     def translate(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
-            source_language=src_lang,
+            source_language=src_lang,
             destination_language=dst_lang)
         self._translate_pipe.input_queue.put(item)
         return self._translate_pipe.output_queue.get()
-
 
     def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
-            source_language=src_lang,
+            source_language=src_lang,
             destination_language=dst_lang)
         self._translate_7b_pipe.input_queue.put(item)
         return self._translate_7b_pipe.output_queue.get()
-
-    def get_whisper_model(self, lang:str='en'):
+
+    def get_whisper_model(self, lang: str = 'en'):
         if lang == 'zh':
             return self._whisper_pipe_zh
         return self._whisper_pipe_en
-
 
-    def
-
+    def get_transcription_model(self, lang: str = 'en'):
+        if lang == 'zh':
+            return self._funasr_pipe
+        return self._whisper_pipe_en
+
+    def transcrible(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
+        transcription_model = self.get_transcription_model(src_lang)
         item = MetaItem(audio=audio_buffer, source_language=src_lang)
-
-        return
-
-    def voice_detect(self, audio_buffer:bytes) -> MetaItem:
+        transcription_model.input_queue.put(item)
+        return transcription_model.output_queue.get()
+
+    def voice_detect(self, audio_buffer: bytes) -> MetaItem:
         item = MetaItem(source_audio=audio_buffer)
         self._vad_pipe.input_queue.put(item)
         return self._vad_pipe.output_queue.get()
 
-
 
 if __name__ == "__main__":
     import soundfile
+
     tp = TranslatePipes()
     # result = tp.translate("你好,今天天气怎么样?", src_lang="zh", dst_lang="en")
     mel, _, = soundfile.read("assets/jfk.flac")
     # result = tp.transcrible(mel, 'en')
     result = tp.voice_detect(mel)
     print(result)
-
-
-
-
transcribe/utils.py
CHANGED
@@ -7,6 +7,51 @@ from scipy.io.wavfile import write
 import config
 import csv
 import av
+import re
+
+# Compile regex patterns once outside the loop for better performance
+p_pattern = re.compile(r"(\s*\[.*?\])")
+p_start_pattern = re.compile(r"(\s*\[.*)")
+p_end_pattern = re.compile(r"(\s*.*\])")
+
+
+def filter_words(res_word):
+    """
+    Filter words according to specific bracket patterns.
+
+    Args:
+        res_word: Iterable of word objects with a 'text' attribute
+
+    Returns:
+        List of filtered word objects
+    """
+    asr_results = []
+    skip_word = False
+
+    for word in res_word:
+        # Skip words that completely match the pattern
+        if p_pattern.match(word.text):
+            continue
+
+        # Mark the start of a section to skip
+        if p_start_pattern.match(word.text):
+            skip_word = True
+            continue
+
+        # Mark the end of a section to skip
+        if p_end_pattern.match(word.text) and skip_word:
+            skip_word = False
+            continue
+
+        # Skip words if we're in a skip section
+        if skip_word:
+            continue
+
+        # Add the word to results if it passed all filters
+        asr_results.append(word)
+
+    return asr_results
+
 def log_block(key: str, value, unit=''):
     if config.DEBUG:
         return
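
Not part of the diff: a quick check of the new filter_words helper; Word is a stand-in for whatever word/segment objects carry a .text attribute.

# Sketch only: filter_words drops bracketed tokens such as "[BLANK_AUDIO]",
# including ones split across consecutive words.
from dataclasses import dataclass
from transcribe.utils import filter_words

@dataclass
class Word:
    text: str

words = [Word("hello"), Word(" [BLANK"), Word("_AUDIO]"), Word("world")]
kept = filter_words(words)
print([w.text for w in kept])   # -> ['hello', 'world']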
transcribe/whisper_llm_serve.py
CHANGED
@@ -8,14 +8,16 @@ from typing import List, Optional, Iterator, Tuple, Any
 import asyncio
 import numpy as np
 import config
-
+import collections
 from api_model import TransResult, Message, DebugResult
 
-from .utils import log_block, save_to_wave, TestDataWriter
+from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
 from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vad_dynamic import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem
 
 logger = getLogger("TranscriptionService")
@@ -43,13 +45,19 @@ class WhisperTranscriptionService:
         self.sample_rate = 16000
 
         self.lock = threading.Lock()
-
-        self._vad_frame_queue = queue.Queue()
+
 
         # 文本分隔符,根据语言设置
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
         # 发送就绪状态
+        # 原始音频队列
+        self._frame_queue = queue.Queue()
+        # 音频队列缓冲区
+        self.frames_np = None
+        # 完整音频队列
+        self.segments_queue = collections.deque()
+        self._temp_string = ""
 
         self._transcrible_analysis = None
         # 启动处理线程
@@ -58,25 +66,26 @@ class WhisperTranscriptionService:
 
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-
-        else:
-
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
         self._translate_time_cost = 0.
-
-
-        self.
-        self.
+
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop = threading.Event()
+            self._save_queue = queue.Queue()
+            self._save_thread = self._start_thread(self.save_data_loop)
 
         # self._c = 0
 
-    def
+    def save_data_loop(self):
         writer = TestDataWriter()
-        while not self.
-            test_data = self.
+        while not self._save_task_stop.is_set():
+            test_data = self._save_queue.get()
             writer.write(test_data)  # Save test_data to CSV
 
 
@@ -110,23 +119,108 @@
         """添加音频帧到处理队列"""
         self._frame_queue.put(frame_np)
 
+    def _apply_voice_activity_detection(self, frame_np: np.array):
+        """应用语音活动检测来优化音频缓冲区"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
     def _frame_processing_loop(self) -> None:
         """从队列获取音频帧并合并到缓冲区"""
         while not self._frame_processing_thread_stop.is_set():
             try:
-
-
-
-
-
-
-
-
-
-                self.
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None or len(frame_np) == 0:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
             except queue.Empty:
                 pass
 
+    def _process_transcription_results_2(self, seg_text: str, partial):
+
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """主转录处理循环"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.01)
+                continue
+
+
+            if len(self.segments_queue) > 0:
+                audio_buffer = self.segments_queue.pop()
+                partial = False
+            else:
+                with self.lock:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()  # 获取 1.5s * epoch 个音频长度
+                partial = True
+
+            if len(audio_buffer) == 0:
+                time.sleep(0.01)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            segments = filter_words(segments)
+            if len(segments):
+                seg_text = self.text_separator.join(seg.text for seg in segments)
+                if self._temp_string:
+                    seg_text = self._temp_string + seg_text
+
+                if partial == False:
+                    if len(seg_text) < config.TEXT_THREHOLD:
+                        partial = True
+                        self._temp_string = seg_text
+                    else:
+                        self._temp_string = ""
+
+
+                result = self._process_transcription_results_2(seg_text, partial)
+                self._send_result_to_client(result)
+                time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # 处理转录结果并发送到客户端
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")
+
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> MetaItem:
         """转录音频并返回转录片段"""
@@ -175,43 +269,7 @@
         self._translate_time_cost = round(time_diff, 3)
         return translated_text
 
-    def _transcription_processing_loop(self) -> None:
-        """主转录处理循环"""
 
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None or len(audio_buffer) < int(self.sample_rate):
-                time.sleep(0.2)
-                continue
-
-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-                time.sleep(0.1)
-            # 处理转录结果并发送到客户端
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item
 
     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
@@ -243,8 +301,8 @@
             )
             current_time = time.perf_counter()
             time_diff = current_time - start_time
-            if config.
-                self.
+            if config.SAVE_DATA_SAVE:
+                self._save_queue.put(DebugResult(
                     seg_id=ana_result.seg_id,
                     transcrible_time=self._transcrible_time_cost,
                     translate_time=self._translate_time_cost,
@@ -273,6 +331,6 @@
         """停止所有处理线程并清理资源"""
         self._translate_thread_stop.set()
         self._frame_processing_thread_stop.set()
-        if config.
-            self.
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop.set()
         logger.info(f"Stopping transcription service for client: {self.client_uid}")
uv.lock
CHANGED
The diff for this file is too large to render. See raw diff.