daihui.zhang committed
Commit fca9809 · 1 Parent(s): 9150655

add vad pipeline

transcribe/helpers/translator.py CHANGED
@@ -1,6 +1,6 @@
 from logging import getLogger
 from llama_cpp import Llama
-import time
+from functools import lru_cache

 logger = getLogger(__name__)

@@ -19,7 +19,7 @@ class QwenTranslator:
             {"role": "user", "content": prompt},
         ]

-
+    @lru_cache(maxsize=10)
     def translate(self, prompt, src_lang, dst_lang) -> str:
         message = self.to_message(prompt, src_lang, dst_lang)
         output = self.llm.create_chat_completion(messages=message, temperature=0)
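
Note on the new decorator: functools.lru_cache on an instance method keys the cache on the whole argument tuple, including self, so repeated calls with the same (prompt, src_lang, dst_lang) on the same translator skip the llama.cpp completion. A minimal sketch of that behavior with a stand-in class (FakeTranslator is hypothetical, not part of this repo):

    from functools import lru_cache

    class FakeTranslator:
        def __init__(self):
            self.completions = 0  # counts "real" LLM calls

        @lru_cache(maxsize=10)
        def translate(self, prompt, src_lang, dst_lang):
            self.completions += 1
            return f"[{src_lang}->{dst_lang}] {prompt}"

    t = FakeTranslator()
    t.translate("hello", "en", "zh")
    t.translate("hello", "en", "zh")  # cache hit: no second completion
    assert t.completions == 1

One side effect to be aware of: the cache holds a strong reference to self, so translator instances stay alive while their entries remain cached.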
transcribe/helpers/vadprocessor.py CHANGED
@@ -4,10 +4,35 @@ import numpy as np
 import onnxruntime
 from datetime import timedelta
 from pydub import AudioSegment
-from silero_vad import load_silero_vad, get_speech_timestamps
+from silero_vad import load_silero_vad, get_speech_timestamps, VADIterator
 import os
 import logging

+class FixedVADIterator(VADIterator):
+    '''Fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once.
+    If the audio to be processed at once is long and multiple voiced segments are detected,
+    then __call__ returns the start of the first segment and the end (or middle, meaning no end yet) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # keep the latest end
+                if 'start' in r and 'end' in ret:  # there is an earlier start
+                    # Remove the end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None

 class SileroVADProcessor:
     """
transcribe/pipelines/base.py CHANGED
@@ -2,6 +2,10 @@
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from multiprocessing import Event
+from logging import getLogger
+
+logger = getLogger(__name__)
+

 @dataclass
 class Segment:
@@ -53,7 +57,9 @@ class BasePipe(Process):
         raise NotImplementedError

     def run(self):
+        logger.info(f"start initializing {self.__class__.__name__}")
         self.init()
+        logger.info(f"finished initializing {self.__class__.__name__}")
         self.set_ready()
         while True:
             item = self.input_queue.get()
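
The log lines bracket init() because that is where each pipe does its heavy per-process setup (model loading) before signaling readiness. A minimal, self-contained sketch of the same worker lifecycle (EchoPipe is an illustrative stand-in, not the repo's BasePipe):

    from multiprocessing import Process, Queue, Event

    class EchoPipe(Process):
        def __init__(self):
            super().__init__()
            self.input_queue, self.output_queue = Queue(), Queue()
            self.ready = Event()

        def init(self):
            """Heavy setup (e.g. model loading) runs here, inside the child process."""

        def run(self):
            self.init()
            self.ready.set()          # tell the parent this pipe is live
            while True:
                item = self.input_queue.get()
                self.output_queue.put(item)

    if __name__ == "__main__":
        pipe = EchoPipe()
        pipe.start()
        pipe.ready.wait()             # block until init() has finished
        pipe.input_queue.put("hello")
        print(pipe.output_queue.get())  # -> hello
        pipe.terminate()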
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,15 +1,18 @@

 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import SileroVADProcessor
+from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
 import numpy as np
-from silero_vad import read_audio, get_speech_timestamps, collect_chunks, VADIterator
+from silero_vad import get_speech_timestamps, collect_chunks
 import torch
+import noisereduce as nr
+

 class VadPipe(BasePipe):
     model = None
-    sample_rate=16000
+    sample_rate = 16000
     window_size_samples = 512

+
     @classmethod
     def init(cls):
         if cls.model is None:
@@ -21,50 +24,28 @@ class VadPipe(BasePipe):
                 min_silence_duration=250,
                 sample_rate=cls.sample_rate
             )
+            cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate)
+            cls.vac.reset_states()
+
+    def get_previous_buffer(self):
+        if len(self.previous_buffer) == 2:
+            return self.previous_buffer[-1]
+        return np.array([], dtype=np.float32)
+
+    def reduce_noise(self, data):
+        return nr.reduce_noise(y=data, sr=self.sample_rate)

-    @property
-    def vad_iterator(self):
-        return VADIterator(self.model.silero_vad, sampling_rate=self.sample_rate,)

     def process(self, in_data: MetaItem) -> MetaItem:
         source_audio = in_data.source_audio
         source_audio = np.frombuffer(source_audio, dtype=np.float32)
-        speech_segments = []
-        is_speech_active = False
-        # current_segment_end = len(source_audio)
-
-        for i in range(0, len(source_audio), self.window_size_samples):
-            window = source_audio[i:i + self.window_size_samples]
-            if len(window) < self.window_size_samples:
-                padded_window = np.zeros(self.window_size_samples, dtype=np.float32)
-                padded_window[:len(window)] = window
-                window = padded_window
-
-            speech_dict = self.vad_iterator(window, return_seconds=False)
-            if not speech_dict:
-                continue
-
-            # Compute the current offset.
-            if speech_dict and 'start' in speech_dict and not is_speech_active:
-                is_speech_active = True
-                # current_segment_start = speech_dict['start'] + i
-
-            if is_speech_active:
-                speech_segments.append(window)
-            # If the end of speech was detected:
-            # if speech_dict and 'end' in speech_dict and is_speech_active:
-            #     # Adjust the speech end time by adding the window offset.
-            #     current_segment_end = min(speech_dict['end'] + i, current_segment_end)
-            #     is_speech_active = False
-            #     speech_audio = source_audio[current_segment_start: current_segment_end]
-            #     speech_segments.append(speech_audio)
-
-        self.vad_iterator.reset_states()
-        combied_audio = np.concatenate(speech_segments, axis=0).tobytes() if len(speech_segments) else b""
-        in_data.audio = combied_audio
+        send_audio = b""
+        speech_timestamps = get_speech_timestamps(source_audio, self.model.silero_vad, sampling_rate=16000)
+        if speech_timestamps:
+            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = send_audio.numpy().tobytes()
+            # send_audio = self.reduce_noise(send_audio).tobytes()
+        in_data.audio = send_audio
         in_data.source_audio = b""
-
         return in_data
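
The rewritten process() replaces the manual 512-sample streaming loop with one offline pass: get_speech_timestamps finds the voiced regions of the whole chunk and collect_chunks concatenates just those regions. A standalone sketch of that flow (assuming the silero-vad pip package, where collect_chunks takes the timestamp list first and the waveform tensor second):

    import numpy as np
    import torch
    from silero_vad import load_silero_vad, get_speech_timestamps, collect_chunks

    model = load_silero_vad()
    audio = np.zeros(16000, dtype=np.float32)  # stand-in for in_data.source_audio
    ts = get_speech_timestamps(audio, model, sampling_rate=16000)
    voiced = collect_chunks(ts, torch.Tensor(audio)).numpy().tobytes() if ts else b""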
transcribe/pipelines/pipe_whisper.py CHANGED
@@ -7,9 +7,6 @@ from ..helpers.whisper import WhisperCPP
 class WhisperPipe(BasePipe):
     whisper = None

-    def __init__(self, in_queue=None, out_queue=None) -> None:
-        super().__init__(in_queue, out_queue)
-

     @classmethod
     def init(cls):
transcribe/strategy.py CHANGED
@@ -98,7 +98,7 @@ def segement_merge(segments):

     for seg in segments:
         temp_seq.append(seg)
-        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS]):
+        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS + config.PAUSE_END_MARKERS]):
             sequences.append(temp_seq.copy())
             temp_seq = []
     if temp_seq:
@@ -114,18 +114,18 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):

     if (len(audio_buffer) / sample_rate) < 12:
         # Under 12 s, use short-pause punctuation such as commas as the split criterion.
-        markers = config.PAUSE_END_MARKERS
+        markers = config.PAUSE_END_MARKERS + config.SENTENCE_END_MARKERS
     is_end = False

     for idx, seg in enumerate(segments):
         left_watch_sequences.append(seg)
         if seg.text in markers:
             seg_index = int(seg.t1 / 100 * sample_rate)
-            rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
+            # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
             # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
             right_watch_sequences = segments[min(idx+1, len(segments)):]
-            if rest_buffer_duration >= 1.5:
-                left_watch_idx = seg_index
+            # if rest_buffer_duration >= 1.5:
+            left_watch_idx = seg_index
             break
     return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end

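For intuition, a toy version of the merge rule this hunk loosens: in segement_merge, a pending sequence is now flushed when a segment contains any sentence-end or pause marker, not only a sentence-end one. The marker lists below are illustrative; the real ones live in config.

    SENTENCE_END_MARKERS = ["。", ".", "?", "!"]
    PAUSE_END_MARKERS = [",", ","]

    def merge(texts):
        sequences, temp_seq = [], []
        for text in texts:
            temp_seq.append(text)
            if any(mk in text for mk in SENTENCE_END_MARKERS + PAUSE_END_MARKERS):
                sequences.append(temp_seq.copy())
                temp_seq = []
        if temp_seq:
            sequences.append(temp_seq)
        return sequences

    print(merge(["hello", "world,", "again", "done."]))
    # -> [['hello', 'world,'], ['again', 'done.']]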
transcribe/whisper_llm_serve.py CHANGED
@@ -40,7 +40,6 @@ class PyWhiperCppServe(ServeClientBase):
         self.lock = threading.Lock()
         self.frames_np = None
         self._frame_queue = queue.Queue()
-        self._previous_frame_queue = collections.deque(maxlen=2)
         self.sample_rate = 16000

         self.send_ready_state()
@@ -69,24 +68,19 @@ class PyWhiperCppServe(ServeClientBase):
     def add_frames(self, frame_np):
         self._frame_queue.put(frame_np)

-    def get_prev_frame(self, ):
-        if len(self._previous_frame_queue) == 2:
-            return self._previous_frame_queue[-1]
-
+    def vad_merge(self):
+        with self.lock:
+            frame = self.frames_np.copy()
+            item = translate_pipes.voice_detect(frame.tobytes())
+            if item.audio != b'':
+                frame_np = np.frombuffer(item.audio, dtype=np.float32)
+                self.frames_np = frame_np.copy()
+

     def get_frame_from_queue(self,):
         while True:
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
-                # frame_np = item.source_audio
-                # self._previous_frame_queue.appendleft(frame_np.copy())
-                # prev_frame_np = self.get_prev_frame()
-                # if prev_frame_np is not None:
-                #     frame_np = np.concatenate([prev_frame_np[int(-0.05 * self.sample_rate):], frame_np], axis=0)
-                # item = translate_pipes.voice_detect(frame_np.tobytes())
-                # if item.audio == b"":
-                #     continue
-                # frame_np = np.frombuffer(item.audio, dtype=np.float32)
                 with self.lock:
                     if self.frames_np is None:
                         self.frames_np = frame_np.copy()
@@ -96,7 +90,6 @@ class PyWhiperCppServe(ServeClientBase):
             pass


-
     def update_audio_buffer(self, last_offset):
         with self.lock:
             self.frames_np = self.frames_np[last_offset:]
@@ -244,6 +237,7 @@ class PyWhiperCppServe(ServeClientBase):
     def get_audio_chunk_for_processing(self):
         if self.frames_np.shape[0] >= self.sample_rate * 1:
             return self.frames_np.copy()
+        self.vad_merge()
         # Compute how many samples of padding are needed.
         padding_length = self.sample_rate * 1 - len(self.frames_np)
         # Create silence padding (zeros).
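
The two translated comments mark the short-buffer path: when less than one second of audio is buffered, vad_merge() first trims the buffer to its voiced portion, and the remainder is then zero-padded up to one full second. A toy version of that padding arithmetic (16 kHz, as in the class):

    import numpy as np

    sample_rate = 16000
    frames_np = np.zeros(4000, dtype=np.float32)  # 0.25 s of buffered audio
    padding_length = sample_rate * 1 - len(frames_np)  # samples of silence to add
    padded = np.concatenate([frames_np, np.zeros(padding_length, dtype=np.float32)])
    assert len(padded) == sample_rate  # exactly one second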