Xin Zhang committed on
Commit 38a440e · 2 Parent(s): 5a5007c e6f9f7a

Merge branch 'vad'


* vad:
[fix]: update parameter.
[fix]: requirements.
[fix]: update parameter.
fix np array copy error
add vad update_silence_ms adapter
[fix]: parameter.
update
filter [] words
Disable FunASR pbar.
[fix]: remove unused file.
fix bug of lost segments
update text threshold
[fix]: test dynamic vad.
update to vad streaming
[fix]: update web.
update pipelines launch wait
ignore write to wav in assets
update config of save data to save flag
fix words missing
Integrate FunASR.

config.py CHANGED
@@ -3,17 +3,16 @@ import re
 import logging
 
 DEBUG = True
-TEST = False
-logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
-
 
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
     filename='translator.log',
     datefmt="%H:%M:%S"
 )
-
+# save pipelines data to disk
+SAVE_DATA_SAVE = False
 # Add terminal log
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
@@ -22,7 +21,7 @@ console_handler.setFormatter(console_formatter)
 logging.getLogger().addHandler(console_handler)
 
 # Text output length threshold
-TEXT_THREHOLD = 16
+TEXT_THREHOLD = 6
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "av>=14.2.0",
     "fastapi>=0.115.12",
+    "funasr>=1.2.6",
     "librosa>=0.11.0",
     "numpy>=2.1.3",
     "onnxruntime>=1.21.0",
requirements.txt CHANGED
@@ -154,6 +154,9 @@ torch==2.6.0
     # silero-vad
     # torchaudio
 torchaudio==2.6.0
+ane_transformers
+openai-whisper
+coremltools
     # via silero-vad
 tqdm==4.67.1
     # via
@@ -184,3 +187,4 @@ websockets==15.0.1
     # via trans (pyproject.toml)
 wordninja==2.0.0
     # via trans (pyproject.toml)
+funasr==1.2.6
transcribe/helpers/funasr.py ADDED
@@ -0,0 +1,37 @@
+import time
+import uuid
+from logging import getLogger
+
+import numpy as np
+from funasr import AutoModel
+import soundfile as sf
+
+import config
+
+logger = getLogger(__name__)
+
+
+class FunASR:
+    def __init__(self, source_lange: str = 'en', warmup=True) -> None:
+        self.source_lange = source_lange
+
+        self.model = AutoModel(
+            model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc"
+        )
+        if warmup:
+            self.warmup()
+
+    def warmup(self, warmup_steps=1):
+        warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
+        for _ in range(warmup_steps):
+            self.model.generate(input=warmup_soundfile)
+
+    def transcribe(self, audio_buffer: bytes, language):
+        audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
+        # sf.write(f'{config.ASSERT_DIR}/{time.time()}.wav', audio_frames, samplerate=16000)
+        try:
+            output = self.model.generate(input=audio_frames, disable_pbar=True)
+            return output
+        except Exception as e:
+            logger.error(e)
+            return []
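
A quick usage sketch of the new helper (not part of the diff). It assumes a 16 kHz float32 mono buffer and that config.ASSERT_DIR points at the assets directory; FunASR's AutoModel.generate normally returns a list of dicts carrying a 'text' field, and the helper returns [] on failure:

# Illustrative only: smoke-test FunASR.transcribe on one second of silence.
import numpy as np
from transcribe.helpers.funasr import FunASR

asr = FunASR(warmup=False)                  # skip the jfk.flac warmup pass
audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
result = asr.transcribe(audio.tobytes(), language='zh')
print(result)                               # e.g. [{'key': ..., 'text': ''}] or [] on error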
transcribe/helpers/vadprocessor.py CHANGED
@@ -2,10 +2,47 @@ from copy import deepcopy
 from queue import Queue, Empty
 from time import time
 from config import VAD_MODEL_PATH
-# from silero_vad import load_silero_vad
+from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
-
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment
+from collections import deque
+
+class AdaptiveSilenceController:
+    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
+        self.base = base_silence_ms
+        self.min = min_ms
+        self.max = max_ms
+        self.recent_silences = deque(maxlen=20)
+        self.recent_speeches = deque(maxlen=20)
+
+    def update_silence(self, duration_ms):
+        self.recent_silences.append(duration_ms)
+
+    def update_speech(self, duration_ms):
+        self.recent_speeches.append(duration_ms)
+
+    def get_adaptive_silence_ms(self):
+        # 1. Fast-speech signature: average speech segments are short (e.g. < 250 ms)
+        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
+        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
+
+        # 2. For fast speech, shorten the silence threshold
+        speed_factor = 1.0
+        if avg_speech < 300:
+            speed_factor = 0.5
+        elif avg_speech < 600:
+            speed_factor = 0.8
+
+        # 3. Also take the trend of recent silences into account
+        adaptive = self.base * speed_factor + 0.3 * avg_silence
+
+        return int(max(self.min, min(self.max, adaptive)))
+
+
 class OnnxWrapper():
 
     def __init__(self, path, force_onnx_cpu=False):
@@ -108,6 +145,7 @@ class VADIteratorOnnx:
                  sampling_rate: int = 16000,
                  min_silence_duration_ms: int = 100,
                  max_speech_duration_s: float = float('inf'),
+                 speech_pad_ms: int = 30
                  ):
         self.model = OnnxWrapper(VAD_MODEL_PATH, True)
         self.threshold = threshold
@@ -118,7 +156,7 @@
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
-        # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
 
     def reset_states(self):
@@ -153,7 +191,8 @@
 
         if (speech_prob >= self.threshold) and not self.triggered:
             self.triggered = True
-            speech_start = max(0, self.current_sample - window_size_samples)
+            # speech_start = max(0, self.current_sample - window_size_samples)
+            speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
@@ -169,7 +208,8 @@
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return None
             else:
-                speech_end = self.temp_end - window_size_samples
+                # speech_end = self.temp_end - window_size_samples
+                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
                 self.temp_end = 0
                 self.triggered = False
                 return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
@@ -178,6 +218,33 @@
 
 
 
+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
 class VadV2:
     def __init__(self,
                  threshold: float = 0.5,
@@ -269,6 +336,235 @@ class VadV2:
         return None
 
 
+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio: np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+        """
+        Save timestamps in both second and sample indices formats.
+        Args:
+            timestamps (list): List of (start, end) tuples
+            output_prefix (str): Prefix for output files
+        """
+        # Save timestamps in seconds (VTT format)
+        seconds_path = f"{output_prefix}_timestamps_second.txt"
+        with open(seconds_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in seconds format")
+            for start, end in timestamps:
+                s_time = self.format_time(start)
+                e_time = self.format_time(end)
+                line = f"{s_time} --> {e_time}\n"
+                file.write(line)
+
+        # Save timestamps in sample indices
+        indices_path = f"{output_prefix}_timestamps_indices.txt"
+        with open(indices_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in indices format")
+            for start, end in timestamps:
+                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+                file.write(line)
+
+        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array: np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+
 
 class VadProcessor:
     def __init__(
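
For reviewers, a minimal sketch of how the new FixedVADIterator is meant to be driven (assumptions: 16 kHz float32 input, chunks of arbitrary length; internally it consumes 512-sample windows and reports absolute sample offsets):

# Illustrative only: stream arbitrary-length chunks through FixedVADIterator.
import numpy as np
from transcribe.helpers.vadprocessor import FixedVADIterator

vad = FixedVADIterator(threshold=0.5, sampling_rate=16000,
                       min_silence_duration_ms=100, max_speech_duration_s=20.0)
vad.reset_states()

for chunk in (np.zeros(1234, dtype=np.float32) for _ in range(10)):  # any chunk length works
    event = vad(chunk, return_seconds=False)   # {'start': n}, {'end': n}, or None
    if event and 'start' in event:
        print("speech starts at sample", event['start'])
    if event and 'end' in event:
        print("speech ends at sample", event['end'])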
transcribe/helpers/whisper.py CHANGED
@@ -52,7 +52,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             # token_timestamps=True,
-            # split_on_word=True,
+            split_on_word=True,
             # max_len=max_len
         )
         return output
transcribe/pipelines/__init__.py CHANGED
@@ -1,5 +1,6 @@
 
+from .base import MetaItem
 from .pipe_translate import TranslatePipe, Translate7BPipe
-from .pipe_whisper import WhisperPipe, WhisperChinese
 from .pipe_vad import VadPipe
-from .base import MetaItem
+from .pipe_whisper import WhisperPipe, WhisperChinese
+from .pipe_funasr import FunASRPipe
transcribe/pipelines/base.py CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
     translate_content: str = ''
     source_language: str = 'zh'
     destination_language: str = 'en'
+    speech_status: str = 'END'  # "END", "START"
 
 
 class BasePipe(Process):
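
The new field lets downstream stages tell an in-progress utterance from a finished one; a minimal sketch using only names from this diff:

# Illustrative only: after VadPipe.process(), each MetaItem carries a speech flag.
from transcribe.pipelines import MetaItem

item = MetaItem(source_audio=b"")   # raw float32 PCM bytes from the client would go here
# ... VadPipe.process(item) fills:
#   item.audio          -> voiced samples only (may be empty bytes)
#   item.speech_status  -> "START" while speech continues, "END" once a segment closes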
transcribe/pipelines/pipe_funasr.py ADDED
@@ -0,0 +1,73 @@
+import unicodedata
+
+from .base import MetaItem, BasePipe, Segment
+from ..helpers.funasr import FunASR
+
+
+class FunASRPipe(BasePipe):
+    funasr = None
+
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR()
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        audio_data = in_data.audio
+        source_language = in_data.source_language
+        result = self.funasr.transcribe(audio_data, source_language)
+
+        # Handle the FunASR output
+        if result and isinstance(result, list) and 'text' in result[0]:
+            # FunASR returns a list of dicts with text and timestamps
+            segments = []
+            texts = []
+
+            for item in result:
+                text = item.get('text', '')
+                start = item.get('start', 0)
+                end = item.get('end', 0)
+                segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                texts.append(text)
+
+            in_data.segments = segments
+            in_data.transcribe_content = "".join(texts)
+        else:
+            # FunASR returned a plain string or some other format
+            if isinstance(result, str):
+                in_data.transcribe_content = result
+                in_data.segments = [Segment(t0=0, t1=0, text=self.filter_chinese_printable(result))]
+            elif result and hasattr(result[0], 'text'):
+                # A list of objects
+                segments = []
+                texts = []
+                for item in result:
+                    text = item.text
+                    start = getattr(item, 'start', 0) or getattr(item, 't0', 0)
+                    end = getattr(item, 'end', 0) or getattr(item, 't1', 0)
+                    segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                    texts.append(text)
+
+                in_data.segments = segments
+                in_data.transcribe_content = "".join(texts)
+            else:
+                in_data.transcribe_content = ""
+                in_data.segments = []
+
+        in_data.audio = b""
+        return in_data
+
+    def filter_chinese_printable(self, s):
+        printable = []
+        bytearray_chars = s.encode('utf-8')
+        for char in bytearray_chars.decode('utf-8', errors='replace'):
+            if unicodedata.category(char) != 'Cc':  # non-printable control characters are category 'Cc'
+                printable.append(char)
+        return ''.join(printable).strip()
+
+
+class FunASRChinese(FunASRPipe):
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR(source_lange='zh')
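
A small illustration of the control-character filtering used by this pipe (standard library only; characters whose Unicode category is 'Cc' are dropped):

# Illustrative only: filter_chinese_printable keeps everything except 'Cc' characters.
import unicodedata

s = "你好\x00世界\n"
printable = ''.join(c for c in s if unicodedata.category(c) != 'Cc').strip()
print(printable)   # -> 你好世界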
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,41 +1,119 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import VadV2
+from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
+
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
 import logging
-
+import time
 # import noisereduce as nr
 
 
 class VadPipe(BasePipe):
     vac = None
     sample_rate = 16000
-    window_size_samples = 512
-    chunk_size = 512
-    prob_threshold=0.5,
-    silence_s=0.5,
-    cache_s=0.25,
+
+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0  # offset (in samples) of the frames processed so far
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
 
 
+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+        self.vac.reset_states()
 
     @classmethod
     def init(cls):
         if cls.vac is None:
-            cls.vac = VadV2(cls.prob_threshold, cls.sample_rate, cls.silence_s * 1000, cls.cache_s * 1000, max_speech_duration_s=15)
-
-    def process(self, in_data: MetaItem) -> MetaItem:
-        audio_buffer = np.frombuffer(in_data.source_audio)
-        vad_audio = self.vac(audio_buffer)
-        if vad_audio:
-            in_data.audio = vad_audio['audio']
-        else:
-            in_data.audio = b""
-        return in_data
+            cls.vac = FixedVADIterator(
+                threshold=0.5,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms=100,
+                # speech_pad_ms=30,
+                max_speech_duration_s=20.0,
+            )
+            cls.vac.reset_states()
 
 
     # def reduce_noise(self, data):
     #     return nr.reduce_noise(y=data, sr=self.sample_rate)
 
-
+    def _process_speech_chunk(self, source_audio: np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = max(0, end_frame - self._offset)
+            return relative_start_frame, relative_end_frame
+
+    def update_silence_ms(self):
+        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
+        min_silence_samples = self.sample_rate * min_silence / 1000
+        self.vac.min_silence_samples = min_silence_samples
+        logging.warning(f"🫠 update_silence_ms :{min_silence} => current: {self.vac.min_silence_samples} ")
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        if self._offset == 0:
+            self.vac.reset_states()
+
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data:  # a speech boundary was detected in this chunk
+            # self.update_silence_ms()
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame is not None and rel_end_frame is None:
+                self._status = "START"  # speech started
+                target_audio = source_audio[rel_start_frame:]
+
+                # length of the preceding silence
+                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_silence(silence_len)
+                self.last_state_change_offset = self._offset + rel_start_frame
+
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif rel_start_frame is None and rel_end_frame is not None:
+                self._status = "END"  # speech ended
+                target_audio = source_audio[:rel_end_frame]
+
+                speech_len = (rel_end_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(speech_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+
+                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(seg_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+            # logging.debug("❌ No valid speech segment detected, setting status to END")
+        else:
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else:  # end
+                target_audio = np.array([], dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")
+
+        self._offset += len(source_audio)
+
+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
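
To make the adaptive silence threshold concrete, a worked example with the defaults from this diff (base 120 ms, clamped to [50, 600] ms):

# Illustrative only: avg speech 250 ms (< 300 -> speed_factor 0.5), avg silence 200 ms
#   adaptive = 120 * 0.5 + 0.3 * 200 = 60 + 60 = 120 ms  (already within [50, 600])
from transcribe.helpers.vadprocessor import AdaptiveSilenceController

ctrl = AdaptiveSilenceController()      # base_silence_ms=120, min_ms=50, max_ms=600
ctrl.update_speech(250)
ctrl.update_silence(200)
print(ctrl.get_adaptive_silence_ms())   # -> 120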
transcribe/pipelines/pipe_whisper.py CHANGED
@@ -1,19 +1,17 @@
-
 import unicodedata
+
 from .base import MetaItem, BasePipe, Segment
 from ..helpers.whisper import WhisperCPP
 
+
 class WhisperPipe(BasePipe):
     whisper = None
 
-
-
     @classmethod
     def init(cls):
         if cls.whisper is None:
             # cls.zh_whisper = WhisperCPP(source_lange='zh')
             cls.whisper = WhisperCPP()
-
 
     def process(self, in_data: MetaItem) -> MetaItem:
         audio_data = in_data.audio
@@ -32,7 +30,6 @@ class WhisperPipe(BasePipe):
             if unicodedata.category(char) != 'Cc':  # non-printable control characters are category 'Cc'
                 printable.append(char)
         return ''.join(printable).strip()
-
 
 
 class WhisperChinese(WhisperPipe):
transcribe/translatepipes.py CHANGED
@@ -1,86 +1,81 @@
-from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem, WhisperChinese, Translate7BPipe
-import multiprocessing as mp
-import config
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe
 
 
 class TranslatePipes:
     def __init__(self) -> None:
-
         # self.whisper_input_q = mp.Queue()
         # self.translate_input_q = mp.Queue()
         # self.result_queue = mp.Queue()
-
+        self._process = []
         # whisper transcription
         self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
-
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        self._funasr_pipe = self._launch_process(FunASRPipe())
+
         # llm translation
         # self._translate_pipe = self._launch_process(TranslatePipe())
 
         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-        # self._vad_pipe = self._launch_process(VadPipe())
+        self._vad_pipe = self._launch_process(VadPipe())
 
     # def reset(self):
     #     self._vad_pipe.reset()
 
     def _launch_process(self, process_obj):
         process_obj.daemon = True
         process_obj.start()
+        self._process.append(process_obj)
         return process_obj
 
     def wait_ready(self):
-        self._whisper_pipe_zh.wait()
-        self._whisper_pipe_en.wait()
-        # self._translate_pipe.wait()
-        # self._vad_pipe.wait()
-        self._translate_7b_pipe.wait()
-
+        for p in self._process:
+            p.wait()
+
     def translate(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
             source_language=src_lang,
             destination_language=dst_lang)
         self._translate_pipe.input_queue.put(item)
         return self._translate_pipe.output_queue.get()
 
     def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
             source_language=src_lang,
             destination_language=dst_lang)
         self._translate_7b_pipe.input_queue.put(item)
         return self._translate_7b_pipe.output_queue.get()
-
-    def get_whisper_model(self, lang:str='en'):
+
+    def get_whisper_model(self, lang: str = 'en'):
         if lang == 'zh':
             return self._whisper_pipe_zh
         return self._whisper_pipe_en
 
-    def transcrible(self, audio_buffer:bytes, src_lang: str) -> MetaItem:
-        whisper_model = self.get_whisper_model(src_lang)
+    def get_transcription_model(self, lang: str = 'en'):
+        if lang == 'zh':
+            return self._funasr_pipe
+        return self._whisper_pipe_en
+
+    def transcrible(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
+        transcription_model = self.get_transcription_model(src_lang)
         item = MetaItem(audio=audio_buffer, source_language=src_lang)
-        whisper_model.input_queue.put(item)
-        return whisper_model.output_queue.get()
-
-    def voice_detect(self, audio_buffer:bytes) -> MetaItem:
+        transcription_model.input_queue.put(item)
+        return transcription_model.output_queue.get()
+
+    def voice_detect(self, audio_buffer: bytes) -> MetaItem:
         item = MetaItem(source_audio=audio_buffer)
         self._vad_pipe.input_queue.put(item)
         return self._vad_pipe.output_queue.get()
 
 
 if __name__ == "__main__":
     import soundfile
+
     tp = TranslatePipes()
     # result = tp.translate("你好,今天天气怎么样?", src_lang="zh", dst_lang="en")
     mel, _, = soundfile.read("assets/jfk.flac")
     # result = tp.transcrible(mel, 'en')
     result = tp.voice_detect(mel)
     print(result)
-
-
-
-
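
The net effect of this file's changes is language-based routing: Chinese audio now goes to FunASR, everything else stays on whisper.cpp. A sketch of the call path (the silent test buffer below is just a placeholder):

# Illustrative only: routing after this change.
import numpy as np
from transcribe.translatepipes import TranslatePipes

pcm_bytes = np.zeros(16000, dtype=np.float32).tobytes()   # 1 s of silence, float32 @ 16 kHz

tp = TranslatePipes()
tp.wait_ready()                         # now waits on every pipe collected in self._process
zh = tp.transcrible(pcm_bytes, 'zh')    # routed to FunASRPipe (paraformer-zh)
en = tp.transcrible(pcm_bytes, 'en')    # routed to WhisperPipe (whisper.cpp)
vad = tp.voice_detect(pcm_bytes)        # VadPipe: MetaItem with .audio and .speech_status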
 
 
 
 
transcribe/utils.py CHANGED
@@ -7,6 +7,51 @@ from scipy.io.wavfile import write
 import config
 import csv
 import av
+import re
+
+# Compile regex patterns once outside the loop for better performance
+p_pattern = re.compile(r"(\s*\[.*?\])")
+p_start_pattern = re.compile(r"(\s*\[.*)")
+p_end_pattern = re.compile(r"(\s*.*\])")
+
+
+def filter_words(res_word):
+    """
+    Filter words according to specific bracket patterns.
+
+    Args:
+        res_word: Iterable of word objects with a 'text' attribute
+
+    Returns:
+        List of filtered word objects
+    """
+    asr_results = []
+    skip_word = False
+
+    for word in res_word:
+        # Skip words that completely match the pattern
+        if p_pattern.match(word.text):
+            continue
+
+        # Mark the start of a section to skip
+        if p_start_pattern.match(word.text):
+            skip_word = True
+            continue
+
+        # Mark the end of a section to skip
+        if p_end_pattern.match(word.text) and skip_word:
+            skip_word = False
+            continue
+
+        # Skip words if we're in a skip section
+        if skip_word:
+            continue
+
+        # Add the word to results if it passed all filters
+        asr_results.append(word)
+
+    return asr_results
+
 def log_block(key: str, value, unit=''):
     if config.DEBUG:
         return
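
A short example of what filter_words removes, assuming word objects expose a .text attribute as the whisper.cpp segments do:

# Illustrative only: bracketed tokens such as " [MUSIC]" are dropped, including
# multi-word spans that open with "[..." and close with "...]".
from types import SimpleNamespace
from transcribe.utils import filter_words

words = [SimpleNamespace(text=t) for t in [" [MUSIC]", "hello", " [BLANK", "AUDIO]", "world"]]
print([w.text for w in filter_words(words)])   # -> ['hello', 'world']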
transcribe/whisper_llm_serve.py CHANGED
@@ -8,14 +8,16 @@ from typing import List, Optional, Iterator, Tuple, Any
 import asyncio
 import numpy as np
 import config
-
+import collections
 from api_model import TransResult, Message, DebugResult
 
-from .utils import log_block, save_to_wave, TestDataWriter
+from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
 from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vad_dynamic import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem
 
 logger = getLogger("TranscriptionService")
@@ -43,13 +45,19 @@ class WhisperTranscriptionService:
         self.sample_rate = 16000
 
         self.lock = threading.Lock()
-        self._frame_queue = queue.Queue()
-        self._vad_frame_queue = queue.Queue()
+
 
         # Text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
         # send ready state
+        # raw audio queue
+        self._frame_queue = queue.Queue()
+        # audio buffer
+        self.frames_np = None
+        # queue of completed speech segments
+        self.segments_queue = collections.deque()
+        self._temp_string = ""
 
         self._transcrible_analysis = None
         # start processing threads
@@ -58,25 +66,26 @@
 
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-            self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        else:
-            self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
         self._translate_time_cost = 0.
-        if config.TEST:
-            self._test_task_stop = threading.Event()
-            self._test_queue = queue.Queue()
-            self._test_thread = self._start_thread(self.test_data_loop)
+
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop = threading.Event()
+            self._save_queue = queue.Queue()
+            self._save_thread = self._start_thread(self.save_data_loop)
 
         # self._c = 0
 
-    def test_data_loop(self):
+    def save_data_loop(self):
         writer = TestDataWriter()
-        while not self._test_task_stop.is_set():
-            test_data = self._test_queue.get()
+        while not self._save_task_stop.is_set():
+            test_data = self._save_queue.get()
             writer.write(test_data)  # Save test_data to CSV
 
 
@@ -110,23 +119,108 @@ class WhisperTranscriptionService:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)
 
+    def _apply_voice_activity_detection(self, frame_np: np.array):
+        """Apply voice activity detection to refine the audio buffer"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer"""
         while not self._frame_processing_thread_stop.is_set():
             try:
-                audio = self._frame_queue.get(timeout=0.1)
-                # save_to_wave(f"{self._c}_before_vad.wav", audio)
-                processed_audio = self._vad.process_audio(audio)
-                if processed_audio.shape[0] > 0:
-                    # vad_processed_audio = processed_audio
-                    # save_to_wave(f"{self._c}_after_vad.wav", processed_audio)
-                    # vad_frame_obj = np.frombuffer(processed_audio.audio, dtype=np.float32)
-                    logger.debug(f"Vad frame: {processed_audio.shape[0]/self.sample_rate:.2f}")
-                    # apply vad speech check:
-                    self._vad_frame_queue.put(processed_audio)
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None or len(frame_np) == 0:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
             except queue.Empty:
                 pass
 
+    def _process_transcription_results_2(self, seg_text: str, partial):
+
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """Main transcription processing loop"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.01)
+                continue
+
+            if len(self.segments_queue) > 0:
+                audio_buffer = self.segments_queue.pop()
+                partial = False
+            else:
+                with self.lock:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()  # take 1.5 s * epoch of audio
+                partial = True
+
+            if len(audio_buffer) == 0:
+                time.sleep(0.01)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            segments = filter_words(segments)
+            if len(segments):
+                seg_text = self.text_separator.join(seg.text for seg in segments)
+                if self._temp_string:
+                    seg_text = self._temp_string + seg_text
+
+                if partial == False:
+                    if len(seg_text) < config.TEXT_THREHOLD:
+                        partial = True
+                        self._temp_string = seg_text
+                    else:
+                        self._temp_string = ""
+
+                result = self._process_transcription_results_2(seg_text, partial)
+                self._send_result_to_client(result)
+                time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # Process transcription results and send them to the client
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")
+
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> MetaItem:
         """Transcribe audio and return the transcription segments"""
@@ -175,43 +269,7 @@
         self._translate_time_cost = round(time_diff, 3)
         return translated_text
 
-    def _transcription_processing_loop(self) -> None:
-        """Main transcription processing loop"""
-
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None or len(audio_buffer) < int(self.sample_rate):
-                time.sleep(0.2)
-                continue
-
-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-            time.sleep(0.1)
-            # Process transcription results and send them to the client
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item
 
     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
@@ -243,8 +301,8 @@
         )
         current_time = time.perf_counter()
         time_diff = current_time - start_time
-        if config.TEST:
-            self._test_queue.put(DebugResult(
+        if config.SAVE_DATA_SAVE:
+            self._save_queue.put(DebugResult(
                 seg_id=ana_result.seg_id,
                 transcrible_time=self._transcrible_time_cost,
                 translate_time=self._translate_time_cost,
@@ -273,6 +331,6 @@
         """Stop all processing threads and clean up resources"""
         self._translate_thread_stop.set()
         self._frame_processing_thread_stop.set()
-        if config.TEST:
-            self._test_task_stop.set()
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop.set()
         logger.info(f"Stopping transcription service for client: {self.client_uid}")
uv.lock CHANGED
The diff for this file is too large to render. See raw diff