Xin Zhang committed on
Commit 38a440e · 2 Parent(s): 5a5007c e6f9f7a

Merge branch 'vad'


* vad:
[fix]: update parameter.
[fix]: requirements.
[fix]: update parameter.
fix np array copy error
add vad update_silence_ms adapter
[fix]: parameter.
update
filter [] words
Disable FunASR pbar.
[fix]: remove unused file.
fix bug of lost segments
update text threshold
[fix]: test dynamic vad.
update to vad streaming
[fix]: update web.
update pipelines launch wait
ignore write to wav in assets
update config of save data to save flag
fix words missing
Integrate FunASR.

config.py CHANGED
@@ -3,17 +3,16 @@ import re
 import logging
 
 DEBUG = True
-TEST = False
-logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
-
 
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
     filename='translator.log',
     datefmt="%H:%M:%S"
 )
-
+# save pipelines data to disk
+SAVE_DATA_SAVE = False
 # Add terminal log
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
@@ -22,7 +21,7 @@ console_handler.setFormatter(console_formatter)
 logging.getLogger().addHandler(console_handler)
 
 # Text output length threshold
-TEXT_THREHOLD = 16
+TEXT_THREHOLD = 6
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "av>=14.2.0",
     "fastapi>=0.115.12",
+    "funasr>=1.2.6",
     "librosa>=0.11.0",
     "numpy>=2.1.3",
     "onnxruntime>=1.21.0",
requirements.txt CHANGED
@@ -154,6 +154,9 @@ torch==2.6.0
     # silero-vad
     # torchaudio
 torchaudio==2.6.0
+ane_transformers
+openai-whisper
+coremltools
     # via silero-vad
 tqdm==4.67.1
     # via
@@ -184,3 +187,4 @@ websockets==15.0.1
     # via trans (pyproject.toml)
 wordninja==2.0.0
     # via trans (pyproject.toml)
+funasr==1.2.6
transcribe/helpers/funasr.py ADDED
@@ -0,0 +1,37 @@
+import time
+import uuid
+from logging import getLogger
+
+import numpy as np
+from funasr import AutoModel
+import soundfile as sf
+
+import config
+
+logger = getLogger(__name__)
+
+
+class FunASR:
+    def __init__(self, source_lange: str = 'en', warmup=True) -> None:
+        self.source_lange = source_lange
+
+        self.model = AutoModel(
+            model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc"
+        )
+        if warmup:
+            self.warmup()
+
+    def warmup(self, warmup_steps=1):
+        warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
+        for _ in range(warmup_steps):
+            self.model.generate(input=warmup_soundfile)
+
+    def transcribe(self, audio_buffer: bytes, language):
+        audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
+        # sf.write(f'{config.ASSERT_DIR}/{time.time()}.wav', audio_frames, samplerate=16000)
+        try:
+            output = self.model.generate(input=audio_frames, disable_pbar=True)
+            return output
+        except Exception as e:
+            logger.error(e)
+            return []
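
A quick usage sketch of the new helper (not part of the diff). It assumes a 16 kHz float32 mono buffer and that config.ASSERT_DIR points at the assets directory; FunASR's AutoModel.generate normally returns a list of dicts carrying a 'text' field, and the helper returns [] on failure:

# Illustrative only: smoke-test FunASR.transcribe on one second of silence.
import numpy as np
from transcribe.helpers.funasr import FunASR

asr = FunASR(warmup=False)                  # skip the jfk.flac warmup pass
audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
result = asr.transcribe(audio.tobytes(), language='zh')
print(result)                               # e.g. [{'key': ..., 'text': ''}] or [] on error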
transcribe/helpers/vadprocessor.py CHANGED
@@ -2,10 +2,47 @@ from copy import deepcopy
 from queue import Queue, Empty
 from time import time
 from config import VAD_MODEL_PATH
-# from silero_vad import load_silero_vad
+from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
-
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment
+from collections import deque
+
+class AdaptiveSilenceController:
+    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
+        self.base = base_silence_ms
+        self.min = min_ms
+        self.max = max_ms
+        self.recent_silences = deque(maxlen=20)
+        self.recent_speeches = deque(maxlen=20)
+
+    def update_silence(self, duration_ms):
+        self.recent_silences.append(duration_ms)
+
+    def update_speech(self, duration_ms):
+        self.recent_speeches.append(duration_ms)
+
+    def get_adaptive_silence_ms(self):
+        # 1. Fast-speech signature: average speech segments are short (e.g. < 250 ms)
+        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
+        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
+
+        # 2. For fast speech, shorten the silence threshold
+        speed_factor = 1.0
+        if avg_speech < 300:
+            speed_factor = 0.5
+        elif avg_speech < 600:
+            speed_factor = 0.8
+
+        # 3. Also take the trend of recent silences into account
+        adaptive = self.base * speed_factor + 0.3 * avg_silence
+
+        return int(max(self.min, min(self.max, adaptive)))
+
+
 class OnnxWrapper():
 
     def __init__(self, path, force_onnx_cpu=False):
@@ -108,6 +145,7 @@ class VADIteratorOnnx:
                  sampling_rate: int = 16000,
                  min_silence_duration_ms: int = 100,
                  max_speech_duration_s: float = float('inf'),
+                 speech_pad_ms: int = 30
                  ):
         self.model = OnnxWrapper(VAD_MODEL_PATH, True)
         self.threshold = threshold
@@ -118,7 +156,7 @@
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
-        # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
 
     def reset_states(self):
@@ -153,7 +191,8 @@
 
         if (speech_prob >= self.threshold) and not self.triggered:
             self.triggered = True
-            speech_start = max(0, self.current_sample - window_size_samples)
+            # speech_start = max(0, self.current_sample - window_size_samples)
+            speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
@@ -169,7 +208,8 @@
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return None
             else:
-                speech_end = self.temp_end - window_size_samples
+                # speech_end = self.temp_end - window_size_samples
+                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
                 self.temp_end = 0
                 self.triggered = False
                 return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
@@ -178,6 +218,33 @@
 
 
 
+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
 class VadV2:
     def __init__(self,
                  threshold: float = 0.5,
@@ -269,6 +336,235 @@ class VadV2:
         return None
 
 
+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio: np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+        """
+        Save timestamps in both second and sample indices formats.
+        Args:
+            timestamps (list): List of (start, end) tuples
+            output_prefix (str): Prefix for output files
+        """
+        # Save timestamps in seconds (VTT format)
+        seconds_path = f"{output_prefix}_timestamps_second.txt"
+        with open(seconds_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in seconds format")
+            for start, end in timestamps:
+                s_time = self.format_time(start)
+                e_time = self.format_time(end)
+                line = f"{s_time} --> {e_time}\n"
+                file.write(line)
+
+        # Save timestamps in sample indices
+        indices_path = f"{output_prefix}_timestamps_indices.txt"
+        with open(indices_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in indices format")
+            for start, end in timestamps:
+                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+                file.write(line)
+
+        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array: np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+
 
 class VadProcessor:
     def __init__(
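
For reviewers, a minimal sketch of how the new FixedVADIterator is meant to be driven (assumptions: 16 kHz float32 input, chunks of arbitrary length; internally it consumes 512-sample windows and reports absolute sample offsets):

# Illustrative only: stream arbitrary-length chunks through FixedVADIterator.
import numpy as np
from transcribe.helpers.vadprocessor import FixedVADIterator

vad = FixedVADIterator(threshold=0.5, sampling_rate=16000,
                       min_silence_duration_ms=100, max_speech_duration_s=20.0)
vad.reset_states()

for chunk in (np.zeros(1234, dtype=np.float32) for _ in range(10)):  # any chunk length works
    event = vad(chunk, return_seconds=False)   # {'start': n}, {'end': n}, or None
    if event and 'start' in event:
        print("speech starts at sample", event['start'])
    if event and 'end' in event:
        print("speech ends at sample", event['end'])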
transcribe/helpers/whisper.py CHANGED
@@ -52,7 +52,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             # token_timestamps=True,
-            # split_on_word=True,
+            split_on_word=True,
             # max_len=max_len
         )
         return output
transcribe/pipelines/__init__.py CHANGED
@@ -1,5 +1,6 @@
 
+from .base import MetaItem
 from .pipe_translate import TranslatePipe, Translate7BPipe
-from .pipe_whisper import WhisperPipe, WhisperChinese
 from .pipe_vad import VadPipe
-from .base import MetaItem
+from .pipe_whisper import WhisperPipe, WhisperChinese
+from .pipe_funasr import FunASRPipe
transcribe/pipelines/base.py CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
     translate_content: str = ''
     source_language: str = 'zh'
     destination_language: str = 'en'
+    speech_status: str = 'END'  # "END", "START"
 
 
 class BasePipe(Process):
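
The new field lets downstream stages tell an in-progress utterance from a finished one; a minimal sketch using only names from this diff:

# Illustrative only: after VadPipe.process(), each MetaItem carries a speech flag.
from transcribe.pipelines import MetaItem

item = MetaItem(source_audio=b"")   # raw float32 PCM bytes from the client would go here
# ... VadPipe.process(item) fills:
#   item.audio          -> voiced samples only (may be empty bytes)
#   item.speech_status  -> "START" while speech continues, "END" once a segment closes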
transcribe/pipelines/pipe_funasr.py ADDED
@@ -0,0 +1,73 @@
+import unicodedata
+
+from .base import MetaItem, BasePipe, Segment
+from ..helpers.funasr import FunASR
+
+
+class FunASRPipe(BasePipe):
+    funasr = None
+
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR()
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        audio_data = in_data.audio
+        source_language = in_data.source_language
+        result = self.funasr.transcribe(audio_data, source_language)
+
+        # Handle the FunASR output
+        if result and isinstance(result, list) and 'text' in result[0]:
+            # FunASR returns a list of dicts with text and timestamps
+            segments = []
+            texts = []
+
+            for item in result:
+                text = item.get('text', '')
+                start = item.get('start', 0)
+                end = item.get('end', 0)
+                segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                texts.append(text)
+
+            in_data.segments = segments
+            in_data.transcribe_content = "".join(texts)
+        else:
+            # FunASR returned a plain string or some other format
+            if isinstance(result, str):
+                in_data.transcribe_content = result
+                in_data.segments = [Segment(t0=0, t1=0, text=self.filter_chinese_printable(result))]
+            elif result and hasattr(result[0], 'text'):
+                # A list of objects
+                segments = []
+                texts = []
+                for item in result:
+                    text = item.text
+                    start = getattr(item, 'start', 0) or getattr(item, 't0', 0)
+                    end = getattr(item, 'end', 0) or getattr(item, 't1', 0)
+                    segments.append(Segment(t0=start, t1=end, text=self.filter_chinese_printable(text)))
+                    texts.append(text)
+
+                in_data.segments = segments
+                in_data.transcribe_content = "".join(texts)
+            else:
+                in_data.transcribe_content = ""
+                in_data.segments = []
+
+        in_data.audio = b""
+        return in_data
+
+    def filter_chinese_printable(self, s):
+        printable = []
+        bytearray_chars = s.encode('utf-8')
+        for char in bytearray_chars.decode('utf-8', errors='replace'):
+            if unicodedata.category(char) != 'Cc':  # non-printable control characters are category 'Cc'
+                printable.append(char)
+        return ''.join(printable).strip()
+
+
+class FunASRChinese(FunASRPipe):
+    @classmethod
+    def init(cls):
+        if cls.funasr is None:
+            cls.funasr = FunASR(source_lange='zh')
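
A small illustration of the control-character filtering used by this pipe (standard library only; characters whose Unicode category is 'Cc' are dropped):

# Illustrative only: filter_chinese_printable keeps everything except 'Cc' characters.
import unicodedata

s = "你好\x00世界\n"
printable = ''.join(c for c in s if unicodedata.category(c) != 'Cc').strip()
print(printable)   # -> 你好世界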
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,41 +1,119 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import VadV2
+from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
+
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
 import logging
-
+import time
 # import noisereduce as nr
 
 
 class VadPipe(BasePipe):
     vac = None
     sample_rate = 16000
-    window_size_samples = 512
-    chunk_size = 512
-    prob_threshold=0.5,
-    silence_s=0.5,
-    cache_s=0.25,
+
+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0  # offset (in samples) of the frames processed so far
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
 
 
+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+        self.vac.reset_states()
 
     @classmethod
     def init(cls):
         if cls.vac is None:
-            cls.vac = VadV2(cls.prob_threshold, cls.sample_rate, cls.silence_s * 1000, cls.cache_s * 1000, max_speech_duration_s=15)
-
-    def process(self, in_data: MetaItem) -> MetaItem:
-        audio_buffer = np.frombuffer(in_data.source_audio)
-        vad_audio = self.vac(audio_buffer)
-        if vad_audio:
-            in_data.audio = vad_audio['audio']
-        else:
-            in_data.audio = b""
-        return in_data
+            cls.vac = FixedVADIterator(
+                threshold=0.5,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms=100,
+                # speech_pad_ms=30,
+                max_speech_duration_s=20.0,
+            )
+            cls.vac.reset_states()
 
 
     # def reduce_noise(self, data):
     #     return nr.reduce_noise(y=data, sr=self.sample_rate)
 
-
+    def _process_speech_chunk(self, source_audio: np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = max(0, end_frame - self._offset)
+            return relative_start_frame, relative_end_frame
+
+    def update_silence_ms(self):
+        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
+        min_silence_samples = self.sample_rate * min_silence / 1000
+        self.vac.min_silence_samples = min_silence_samples
+        logging.warning(f"🫠 update_silence_ms :{min_silence} => current: {self.vac.min_silence_samples} ")
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        if self._offset == 0:
+            self.vac.reset_states()
+
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data:  # a speech boundary was detected in this chunk
+            # self.update_silence_ms()
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame is not None and rel_end_frame is None:
+                self._status = "START"  # speech started
+                target_audio = source_audio[rel_start_frame:]
+
+                # length of the preceding silence
+                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_silence(silence_len)
+                self.last_state_change_offset = self._offset + rel_start_frame
+
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif rel_start_frame is None and rel_end_frame is not None:
+                self._status = "END"  # speech ended
+                target_audio = source_audio[:rel_end_frame]
+
+                speech_len = (rel_end_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(speech_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+
+                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(seg_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
+            # logging.debug("❌ No valid speech segment detected, setting status to END")
+        else:
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else:  # end
+                target_audio = np.array([], dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")
+
+        self._offset += len(source_audio)
+
+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
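
To make the adaptive silence threshold concrete, a worked example with the defaults from this diff (base 120 ms, clamped to [50, 600] ms):

# Illustrative only: avg speech 250 ms (< 300 -> speed_factor 0.5), avg silence 200 ms
#   adaptive = 120 * 0.5 + 0.3 * 200 = 60 + 60 = 120 ms  (already within [50, 600])
from transcribe.helpers.vadprocessor import AdaptiveSilenceController

ctrl = AdaptiveSilenceController()      # base_silence_ms=120, min_ms=50, max_ms=600
ctrl.update_speech(250)
ctrl.update_silence(200)
print(ctrl.get_adaptive_silence_ms())   # -> 120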
transcribe/pipelines/pipe_whisper.py CHANGED
@@ -1,19 +1,17 @@
-
 import unicodedata
+
 from .base import MetaItem, BasePipe, Segment
 from ..helpers.whisper import WhisperCPP
 
+
 class WhisperPipe(BasePipe):
     whisper = None
 
-
-
     @classmethod
     def init(cls):
         if cls.whisper is None:
             # cls.zh_whisper = WhisperCPP(source_lange='zh')
             cls.whisper = WhisperCPP()
-
 
     def process(self, in_data: MetaItem) -> MetaItem:
         audio_data = in_data.audio
@@ -32,7 +30,6 @@ class WhisperPipe(BasePipe):
             if unicodedata.category(char) != 'Cc':  # non-printable control characters are category 'Cc'
                 printable.append(char)
         return ''.join(printable).strip()
-
 
 
 class WhisperChinese(WhisperPipe):
transcribe/translatepipes.py CHANGED
@@ -1,86 +1,81 @@
-from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem, WhisperChinese, Translate7BPipe
-import multiprocessing as mp
-import config
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe
 
 
 class TranslatePipes:
     def __init__(self) -> None:
-
         # self.whisper_input_q = mp.Queue()
         # self.translate_input_q = mp.Queue()
         # self.result_queue = mp.Queue()
-
+        self._process = []
         # whisper transcription
         self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
-
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        self._funasr_pipe = self._launch_process(FunASRPipe())
+
         # llm translation
         # self._translate_pipe = self._launch_process(TranslatePipe())
 
         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-        # self._vad_pipe = self._launch_process(VadPipe())
+        self._vad_pipe = self._launch_process(VadPipe())
 
     # def reset(self):
     #     self._vad_pipe.reset()
 
     def _launch_process(self, process_obj):
         process_obj.daemon = True
         process_obj.start()
+        self._process.append(process_obj)
         return process_obj
 
     def wait_ready(self):
-        self._whisper_pipe_zh.wait()
-        self._whisper_pipe_en.wait()
-        # self._translate_pipe.wait()
-        # self._vad_pipe.wait()
-        self._translate_7b_pipe.wait()
-
+        for p in self._process:
+            p.wait()
+
     def translate(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
             source_language=src_lang,
             destination_language=dst_lang)
         self._translate_pipe.input_queue.put(item)
         return self._translate_pipe.output_queue.get()
 
     def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
         item = MetaItem(
             transcribe_content=text,
             source_language=src_lang,
             destination_language=dst_lang)
         self._translate_7b_pipe.input_queue.put(item)
         return self._translate_7b_pipe.output_queue.get()
-
-    def get_whisper_model(self, lang:str='en'):
+
+    def get_whisper_model(self, lang: str = 'en'):
         if lang == 'zh':
             return self._whisper_pipe_zh
         return self._whisper_pipe_en
 
-    def transcrible(self, audio_buffer:bytes, src_lang: str) -> MetaItem:
-        whisper_model = self.get_whisper_model(src_lang)
+    def get_transcription_model(self, lang: str = 'en'):
+        if lang == 'zh':
+            return self._funasr_pipe
+        return self._whisper_pipe_en
+
+    def transcrible(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
+        transcription_model = self.get_transcription_model(src_lang)
         item = MetaItem(audio=audio_buffer, source_language=src_lang)
-        whisper_model.input_queue.put(item)
-        return whisper_model.output_queue.get()
-
-    def voice_detect(self, audio_buffer:bytes) -> MetaItem:
+        transcription_model.input_queue.put(item)
+        return transcription_model.output_queue.get()
+
+    def voice_detect(self, audio_buffer: bytes) -> MetaItem:
         item = MetaItem(source_audio=audio_buffer)
         self._vad_pipe.input_queue.put(item)
         return self._vad_pipe.output_queue.get()
 
 
 if __name__ == "__main__":
     import soundfile
+
     tp = TranslatePipes()
     # result = tp.translate("你好,今天天气怎么样?", src_lang="zh", dst_lang="en")
     mel, _, = soundfile.read("assets/jfk.flac")
     # result = tp.transcrible(mel, 'en')
     result = tp.voice_detect(mel)
     print(result)
-
-
-
-
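
The net effect of this file's changes is language-based routing: Chinese audio now goes to FunASR, everything else stays on whisper.cpp. A sketch of the call path (the silent test buffer below is just a placeholder):

# Illustrative only: routing after this change.
import numpy as np
from transcribe.translatepipes import TranslatePipes

pcm_bytes = np.zeros(16000, dtype=np.float32).tobytes()   # 1 s of silence, float32 @ 16 kHz

tp = TranslatePipes()
tp.wait_ready()                         # now waits on every pipe collected in self._process
zh = tp.transcrible(pcm_bytes, 'zh')    # routed to FunASRPipe (paraformer-zh)
en = tp.transcrible(pcm_bytes, 'en')    # routed to WhisperPipe (whisper.cpp)
vad = tp.voice_detect(pcm_bytes)        # VadPipe: MetaItem with .audio and .speech_status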
 
 
 
 
transcribe/utils.py CHANGED
@@ -7,6 +7,51 @@ from scipy.io.wavfile import write
 import config
 import csv
 import av
+import re
+
+# Compile regex patterns once outside the loop for better performance
+p_pattern = re.compile(r"(\s*\[.*?\])")
+p_start_pattern = re.compile(r"(\s*\[.*)")
+p_end_pattern = re.compile(r"(\s*.*\])")
+
+
+def filter_words(res_word):
+    """
+    Filter words according to specific bracket patterns.
+
+    Args:
+        res_word: Iterable of word objects with a 'text' attribute
+
+    Returns:
+        List of filtered word objects
+    """
+    asr_results = []
+    skip_word = False
+
+    for word in res_word:
+        # Skip words that completely match the pattern
+        if p_pattern.match(word.text):
+            continue
+
+        # Mark the start of a section to skip
+        if p_start_pattern.match(word.text):
+            skip_word = True
+            continue
+
+        # Mark the end of a section to skip
+        if p_end_pattern.match(word.text) and skip_word:
+            skip_word = False
+            continue
+
+        # Skip words if we're in a skip section
+        if skip_word:
+            continue
+
+        # Add the word to results if it passed all filters
+        asr_results.append(word)
+
+    return asr_results
+
 def log_block(key: str, value, unit=''):
     if config.DEBUG:
         return
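
A short example of what filter_words removes, assuming word objects expose a .text attribute as the whisper.cpp segments do:

# Illustrative only: bracketed tokens such as " [MUSIC]" are dropped, including
# multi-word spans that open with "[..." and close with "...]".
from types import SimpleNamespace
from transcribe.utils import filter_words

words = [SimpleNamespace(text=t) for t in [" [MUSIC]", "hello", " [BLANK", "AUDIO]", "world"]]
print([w.text for w in filter_words(words)])   # -> ['hello', 'world']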
transcribe/whisper_llm_serve.py CHANGED
@@ -8,14 +8,16 @@ from typing import List, Optional, Iterator, Tuple, Any
 import asyncio
 import numpy as np
 import config
-
+import collections
 from api_model import TransResult, Message, DebugResult
 
-from .utils import log_block, save_to_wave, TestDataWriter
+from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
 from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vad_dynamic import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem
 
 logger = getLogger("TranscriptionService")
@@ -43,13 +45,19 @@ class WhisperTranscriptionService:
         self.sample_rate = 16000
 
         self.lock = threading.Lock()
-        self._frame_queue = queue.Queue()
-        self._vad_frame_queue = queue.Queue()
+
 
         # Text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
         # send ready state
+        # raw audio queue
+        self._frame_queue = queue.Queue()
+        # audio buffer
+        self.frames_np = None
+        # queue of completed speech segments
+        self.segments_queue = collections.deque()
+        self._temp_string = ""
 
         self._transcrible_analysis = None
         # start processing threads
@@ -58,25 +66,26 @@
 
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-            self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        else:
-            self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
         self._translate_time_cost = 0.
-        if config.TEST:
-            self._test_task_stop = threading.Event()
-            self._test_queue = queue.Queue()
-            self._test_thread = self._start_thread(self.test_data_loop)
+
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop = threading.Event()
+            self._save_queue = queue.Queue()
+            self._save_thread = self._start_thread(self.save_data_loop)
 
         # self._c = 0
 
-    def test_data_loop(self):
+    def save_data_loop(self):
         writer = TestDataWriter()
-        while not self._test_task_stop.is_set():
-            test_data = self._test_queue.get()
+        while not self._save_task_stop.is_set():
+            test_data = self._save_queue.get()
             writer.write(test_data)  # Save test_data to CSV
 
 
@@ -110,23 +119,108 @@ class WhisperTranscriptionService:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)
 
+    def _apply_voice_activity_detection(self, frame_np: np.array):
+        """Apply voice activity detection to refine the audio buffer"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer"""
         while not self._frame_processing_thread_stop.is_set():
             try:
-                audio = self._frame_queue.get(timeout=0.1)
-                # save_to_wave(f"{self._c}_before_vad.wav", audio)
-                processed_audio = self._vad.process_audio(audio)
-                if processed_audio.shape[0] > 0:
-                    # vad_processed_audio = processed_audio
-                    # save_to_wave(f"{self._c}_after_vad.wav", processed_audio)
-                    # vad_frame_obj = np.frombuffer(processed_audio.audio, dtype=np.float32)
-                    logger.debug(f"Vad frame: {processed_audio.shape[0]/self.sample_rate:.2f}")
-                    # apply vad speech check:
-                    self._vad_frame_queue.put(processed_audio)
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None or len(frame_np) == 0:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
             except queue.Empty:
                 pass
 
+    def _process_transcription_results_2(self, seg_text: str, partial):
+
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """Main transcription processing loop"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.01)
+                continue
+
+            if len(self.segments_queue) > 0:
+                audio_buffer = self.segments_queue.pop()
+                partial = False
+            else:
+                with self.lock:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()  # take 1.5 s * epoch of audio
+                partial = True
+
+            if len(audio_buffer) == 0:
+                time.sleep(0.01)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            segments = filter_words(segments)
+            if len(segments):
+                seg_text = self.text_separator.join(seg.text for seg in segments)
+                if self._temp_string:
+                    seg_text = self._temp_string + seg_text
+
+                if partial == False:
+                    if len(seg_text) < config.TEXT_THREHOLD:
+                        partial = True
+                        self._temp_string = seg_text
+                    else:
+                        self._temp_string = ""
+
+                result = self._process_transcription_results_2(seg_text, partial)
+                self._send_result_to_client(result)
+                time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # Process transcription results and send them to the client
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")
+
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> MetaItem:
         """Transcribe audio and return the transcription segments"""
@@ -175,43 +269,7 @@
         self._translate_time_cost = round(time_diff, 3)
         return translated_text
 
-    def _transcription_processing_loop(self) -> None:
-        """Main transcription processing loop"""
-
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None or len(audio_buffer) < int(self.sample_rate):
-                time.sleep(0.2)
-                continue
-
-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-            time.sleep(0.1)
-            # Process transcription results and send them to the client
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item
 
     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
@@ -243,8 +301,8 @@
         )
         current_time = time.perf_counter()
         time_diff = current_time - start_time
-        if config.TEST:
-            self._test_queue.put(DebugResult(
+        if config.SAVE_DATA_SAVE:
+            self._save_queue.put(DebugResult(
                 seg_id=ana_result.seg_id,
                 transcrible_time=self._transcrible_time_cost,
                 translate_time=self._translate_time_cost,
@@ -273,6 +331,6 @@
         """Stop all processing threads and clean up resources"""
         self._translate_thread_stop.set()
         self._frame_processing_thread_stop.set()
-        if config.TEST:
-            self._test_task_stop.set()
+        if config.SAVE_DATA_SAVE:
+            self._save_task_stop.set()
         logger.info(f"Stopping transcription service for client: {self.client_uid}")
uv.lock CHANGED
The diff for this file is too large to render. See raw diff