daihui.zhang committed
Commit: fca9809
Parent(s): 9150655

add vad pipeline
transcribe/helpers/translator.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 from logging import getLogger
 from llama_cpp import Llama
-import
+from functools import lru_cache

 logger = getLogger(__name__)

@@ -19,7 +19,7 @@ class QwenTranslator:
         {"role": "user", "content": prompt},
     ]

-
+    @lru_cache(maxsize=10)
     def translate(self, prompt, src_lang, dst_lang) -> str:
         message = self.to_message(prompt, src_lang, dst_lang)
         output = self.llm.create_chat_completion(messages=message, temperature=0)
```
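The new `@lru_cache(maxsize=10)` memoizes whole translations. A caveat worth knowing: on an instance method the cache key includes `self`, so entries are per-instance and the cache keeps a strong reference to the instance while entries live. A minimal sketch of this behavior (stand-in `Demo` class, not from this repo):

```python
from functools import lru_cache

class Demo:
    def __init__(self):
        self.calls = 0

    @lru_cache(maxsize=10)
    def translate(self, prompt, src_lang, dst_lang):
        # Cache key is (self, prompt, src_lang, dst_lang); all must be hashable.
        self.calls += 1
        return f"{src_lang}->{dst_lang}: {prompt}"

d = Demo()
d.translate("hello", "en", "zh")
d.translate("hello", "en", "zh")  # served from the cache; body not re-run
assert d.calls == 1
```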
transcribe/helpers/vadprocessor.py
CHANGED

```diff
@@ -4,10 +4,35 @@ import numpy as np
 import onnxruntime
 from datetime import timedelta
 from pydub import AudioSegment
-from silero_vad import load_silero_vad, get_speech_timestamps
+from silero_vad import load_silero_vad, get_speech_timestamps, VADIterator
 import os
 import logging

+class FixedVADIterator(VADIterator):
+    '''Fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once.
+    If the audio to be processed at once is long and multiple voiced segments are detected,
+    then __call__ returns the start of the first segment and the end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start
+                    # Remove the end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None

 class SileroVADProcessor:
     """
```
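A usage sketch for the new `FixedVADIterator` (hypothetical driver code; assumes the silero-vad pip package and the repo's module path). Unlike the stock `VADIterator`, which expects exactly 512-sample windows at 16 kHz, the wrapper buffers whatever it is given and drains it 512 samples at a time:

```python
import numpy as np
from silero_vad import load_silero_vad
from transcribe.helpers.vadprocessor import FixedVADIterator

model = load_silero_vad()
vac = FixedVADIterator(model, sampling_rate=16000)
vac.reset_states()

chunk = np.zeros(1600, dtype=np.float32)  # 100 ms; not a multiple of 512
event = vac(chunk, return_seconds=False)  # consumes 3 x 512 samples, buffers the remaining 64
if event is not None and 'start' in event:
    print("speech starts at sample", event['start'])
```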
transcribe/pipelines/base.py
CHANGED

```diff
@@ -2,6 +2,10 @@
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from multiprocessing import Event
+from logging import getLogger
+
+logger = getLogger(__name__)
+

 @dataclass
 class Segment:
@@ -53,7 +57,9 @@ class BasePipe(Process):
         raise NotImplementedError

     def run(self):
+        logger.info(f"start initializing {self.__class__.__name__}")
         self.init()
+        logger.info(f"finished initializing {self.__class__.__name__}")
         self.set_ready()
         while True:
             item = self.input_queue.get()
```
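The `logger.info` calls added to `BasePipe.run` only become visible once logging is configured in the entrypoint. A minimal sketch of such a configuration (an assumption, not part of the commit):

```python
import logging

# Configure the root logger before the pipe processes are started; on
# fork-based platforms the child processes inherit this configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(processName)s %(name)s: %(message)s",
)
# BasePipe.run then emits lines such as:
#   <timestamp> VadPipe-1 transcribe.pipelines.base: start initializing VadPipe
```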
transcribe/pipelines/pipe_vad.py
CHANGED

```diff
@@ -1,15 +1,18 @@

 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import SileroVADProcessor
+from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
 import numpy as np
-from silero_vad import
+from silero_vad import get_speech_timestamps, collect_chunks
 import torch
+import noisereduce as nr
+

 class VadPipe(BasePipe):
     model = None
-    sample_rate=16000
+    sample_rate = 16000
     window_size_samples = 512

+
     @classmethod
     def init(cls):
         if cls.model is None:
@@ -21,50 +24,28 @@ class VadPipe(BasePipe):
             min_silence_duration=250,
             sample_rate=cls.sample_rate
         )
+        cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate)
+        cls.vac.reset_states()
+
+
+    def get_previous_buffer(self):
+        if len(self.previous_buffer) == 2:
+            return self.previous_buffer[-1]
+        return np.array([], dtype=np.float32)
+

-
-
-
+    def reduce_noise(self, data):
+        return nr.reduce_noise(y=data, sr=self.sample_rate)
+

     def process(self, in_data: MetaItem) -> MetaItem:
         source_audio = in_data.source_audio
         source_audio = np.frombuffer(source_audio, dtype=np.float32)
-
-
-
-
-
-
-            window = source_audio[i:i+self.window_size_samples]
-
-            if len(window) < self.window_size_samples:
-                padded_window = np.zeros(self.window_size_samples, dtype=np.float32)
-                padded_window[:len(window)] = window
-                window = padded_window
-
-            speech_dict = self.vad_iterator(window, return_seconds=False)
-            if not speech_dict:
-                continue
-
-            # compute the current offset
-
-            if speech_dict and 'start' in speech_dict and not is_speech_active:
-                is_speech_active = True
-                # current_segment_start = speech_dict['start'] + i
-
-            if is_speech_active:
-                speech_segments.append(window)
-            # # if the end of speech is detected
-            # if speech_dict and 'end' in speech_dict and is_speech_active:
-            #     # adjust the speech end time by adding the window offset
-            #     current_segment_end = min(speech_dict['end'] + i, current_segment_end)
-            #     is_speech_active = False
-            #     speech_audio = source_audio[current_segment_start: current_segment_end]
-            #     speech_segments.append(speech_audio)
-
-        self.vad_iterator.reset_states()
-        combied_audio = np.concatenate(speech_segments, axis=0).tobytes() if len(speech_segments) else b""
-        in_data.audio = combied_audio
+        send_audio = b""
+        speech_timestamps = get_speech_timestamps(source_audio, self.model.silero_vad, sampling_rate=16000)
+        if speech_timestamps:
+            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = send_audio.numpy()
+        # send_audio = self.reduce_noise(send_audio).tobytes()
         in_data.source_audio = b""
-
         return in_data
```
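The rewritten `process` replaces the windowed iterator loop with offline VAD over the whole chunk. A sketch of that `get_speech_timestamps` + `collect_chunks` pattern (assumes the silero-vad pip package; low-level noise used as stand-in audio):

```python
import numpy as np
import torch
from silero_vad import load_silero_vad, get_speech_timestamps, collect_chunks

model = load_silero_vad()
audio = (np.random.randn(16000) * 0.01).astype(np.float32)  # 1 s at 16 kHz

ts = get_speech_timestamps(audio, model, sampling_rate=16000)
if ts:
    # Concatenate only the voiced regions into one contiguous tensor.
    voiced = collect_chunks(ts, torch.Tensor(audio))
    payload = voiced.numpy().tobytes()
else:
    payload = b""
```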
transcribe/pipelines/pipe_whisper.py
CHANGED

```diff
@@ -7,9 +7,6 @@ from ..helpers.whisper import WhisperCPP
 class WhisperPipe(BasePipe):
     whisper = None

-    def __init__(self, in_queue=None, out_queue=None) -> None:
-        super().__init__(in_queue, out_queue)
-

     @classmethod
     def init(cls):
```
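The deleted `__init__` only forwarded its arguments to the parent, so removing it is behavior-preserving: Python falls back to the inherited constructor. A toy illustration (hypothetical classes, not from this repo):

```python
class Base:
    def __init__(self, in_queue=None, out_queue=None):
        self.in_queue, self.out_queue = in_queue, out_queue

class Child(Base):  # no __init__ of its own: Base.__init__ is inherited
    pass

c = Child(in_queue="q1", out_queue="q2")
assert c.in_queue == "q1" and c.out_queue == "q2"
```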
transcribe/strategy.py
CHANGED

```diff
@@ -98,7 +98,7 @@ def segement_merge(segments):

     for seg in segments:
         temp_seq.append(seg)
-        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS]):
+        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS + config.PAUSE_END_MARKERS]):
             sequences.append(temp_seq.copy())
             temp_seq = []
     if temp_seq:
@@ -114,18 +114,18 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):

     if (len(audio_buffer) / sample_rate) < 12:
         # under 12 s, use short-pause markers such as commas as the split criterion
-        markers = config.PAUSE_END_MARKERS
+        markers = config.PAUSE_END_MARKERS + config.SENTENCE_END_MARKERS
     is_end = False

     for idx, seg in enumerate(segments):
         left_watch_sequences.append(seg)
         if seg.text in markers:
             seg_index = int(seg.t1 / 100 * sample_rate)
-            rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
+            # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
             # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
             right_watch_sequences = segments[min(idx+1, len(segments)):]
-            if rest_buffer_duration >= 1.5:
-                left_watch_idx = seg_index
+            # if rest_buffer_duration >= 1.5:
+            left_watch_idx = seg_index
             break
     return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
```
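Both changes widen the marker set: a sequence is now flushed on pause markers (e.g. commas) as well as sentence-end markers. A toy sketch of the grouping rule, with stand-in segments and marker lists in place of the repo's `config`:

```python
from dataclasses import dataclass

@dataclass
class Seg:
    text: str

SENTENCE_END_MARKERS = ["。", ".", "?", "!"]
PAUSE_END_MARKERS = [",", ","]

def merge(segments):
    sequences, temp = [], []
    for seg in segments:
        temp.append(seg)
        # Flush the running sequence on any end-of-sentence or pause marker.
        if any(mk in seg.text for mk in SENTENCE_END_MARKERS + PAUSE_END_MARKERS):
            sequences.append(temp.copy())
            temp = []
    if temp:
        sequences.append(temp)
    return sequences

groups = merge([Seg("hello,"), Seg("world."), Seg("bye")])
print(len(groups))  # 3 with pause markers; only 2 if commas did not flush
```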
transcribe/whisper_llm_serve.py
CHANGED

```diff
@@ -40,7 +40,6 @@ class PyWhiperCppServe(ServeClientBase):
         self.lock = threading.Lock()
         self.frames_np = None
         self._frame_queue = queue.Queue()
-        self._previous_frame_queue = collections.deque(maxlen=2)
         self.sample_rate = 16000

         self.send_ready_state()
@@ -69,24 +68,19 @@ class PyWhiperCppServe(ServeClientBase):
     def add_frames(self, frame_np):
        self._frame_queue.put(frame_np)

-    def
-
-
-
+    def vad_merge(self):
+        with self.lock:
+            frame = self.frames_np.copy()
+            item = translate_pipes.voice_detect(frame.tobytes())
+            if item.audio != b'':
+                frame_np = np.frombuffer(item.audio, dtype=np.float32)
+                self.frames_np = frame_np.copy()
+

     def get_frame_from_queue(self,):
         while True:
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
-                # frame_np = item.source_audio
-                # self._previous_frame_queue.appendleft(frame_np.copy())
-                # prev_frame_np = self.get_prev_frame()
-                # if prev_frame_np is not None:
-                #     frame_np = np.concatenate([prev_frame_np[int(-0.05 * self.sample_rate):],frame_np], axis=0)
-                # item = translate_pipes.voice_detect(frame_np.tobytes())
-                # if item.audio == b"":
-                #     continue
-                # frame_np = np.frombuffer(item.audio, dtype=np.float32)
                 with self.lock:
                     if self.frames_np is None:
                         self.frames_np = frame_np.copy()
@@ -96,7 +90,6 @@ class PyWhiperCppServe(ServeClientBase):
                 pass


-
     def update_audio_buffer(self, last_offset):
         with self.lock:
             self.frames_np = self.frames_np[last_offset:]
@@ -244,6 +237,7 @@ class PyWhiperCppServe(ServeClientBase):
     def get_audio_chunk_for_processing(self):
         if self.frames_np.shape[0] >= self.sample_rate * 1:
             return self.frames_np.copy()
+        self.vad_merge()
         # compute the number of samples to pad
         padding_length = self.sample_rate * 1 - len(self.frames_np)
         # create silence padding (zeros)
```
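Around the new `self.vad_merge()` call, `get_audio_chunk_for_processing` still guarantees at least one second of audio by zero-padding short buffers. A standalone re-expression of that padding step (hypothetical helper name `pad_to_one_second`):

```python
import numpy as np

SAMPLE_RATE = 16000  # matches self.sample_rate in the server

def pad_to_one_second(frames: np.ndarray) -> np.ndarray:
    # Buffers of >= 1 s pass through unchanged.
    if frames.shape[0] >= SAMPLE_RATE:
        return frames.copy()
    # Otherwise append silence (zeros) up to exactly one second.
    padding = np.zeros(SAMPLE_RATE - len(frames), dtype=np.float32)
    return np.concatenate([frames, padding])

chunk = pad_to_one_second(np.ones(4000, dtype=np.float32))
assert chunk.shape[0] == SAMPLE_RATE
```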