# Author: Xin Zhang
# Merge branch 'vad' (commit b8873f5)
from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import FixedVADIterator
import numpy as np
import logging
# import noisereduce as nr
class VadPipe(BasePipe):
    """Pipeline stage that runs voice-activity detection over audio chunks.

    Reads float32 PCM bytes from ``MetaItem.source_audio``, feeds them to a
    class-shared :class:`FixedVADIterator`, and writes only the voiced
    portion back to ``MetaItem.audio`` together with ``speech_status`` set
    to ``"START"`` (inside an ongoing speech segment) or ``"END"``.
    """

    # Shared VAD iterator, lazily created by init(); reused by all instances.
    vac = None
    # Expected input sample rate in Hz.
    sample_rate = 16000

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0      # absolute frame offset of the next chunk (frames processed so far)
        self._status = 'END'  # 'START' while inside a speech segment, else 'END'

    def reset(self):
        """Reset the stream position, speech status, and shared VAD state."""
        self._offset = 0
        self._status = 'END'
        self.vac.reset_states()

    @classmethod
    def init(cls):
        """Create the shared VAD iterator once (idempotent across calls)."""
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.6,
                sampling_rate=cls.sample_rate,
                min_silence_duration_ms=100,
            )
            cls.vac.reset_states()

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Run the VAD on one chunk; return chunk-relative boundary frames.

        Returns a ``(relative_start_frame, relative_end_frame)`` tuple where
        either element may be ``None``, or ``None`` when the VAD reports no
        boundary in this chunk.  Relative frames are the VAD's absolute frame
        indices minus ``self._offset``.
        """
        speech_dict = self.vac(source_audio, return_seconds=False)
        if speech_dict:
            relative_start_frame = None
            relative_end_frame = None
            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
            # BUGFIX: compare against None explicitly — frame index 0 is a
            # valid boundary but falsy, so `if start_frame:` would silently
            # drop a segment that starts at the very first frame.
            if start_frame is not None:
                relative_start_frame = start_frame - self._offset
            if end_frame is not None:
                relative_end_frame = end_frame - self._offset
            return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
        """Extract the voiced portion of ``in_data`` and tag its speech status.

        Consumes ``in_data.source_audio`` (cleared to ``b''`` afterwards) and
        fills ``in_data.audio`` / ``in_data.speech_status`` in place.
        """
        if self._offset == 0:
            # First chunk of a stream: make sure the shared VAD starts clean.
            self.vac.reset_states()
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:  # the VAD reported a speech boundary in this chunk
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                # Speech starts here and continues past this chunk; keep up to
                # 100 frames of pre-roll so the onset is not clipped.
                self._status = "START"
                target_audio = source_audio[max(rel_start_frame-100, 0):]
                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
            elif rel_start_frame is None and rel_end_frame is not None:
                # Speech ends inside this chunk.
                self._status = "END"
                target_audio = source_audio[:rel_end_frame]
                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
            else:
                # Both boundaries fall inside this chunk: a complete segment.
                self._status = 'END'
                target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
        else:
            if self._status == 'START':
                # No boundary reported but we are inside speech: pass through.
                target_audio = source_audio
            else:
                # Silence outside any speech segment: emit nothing.
                target_audio = np.array([], dtype=np.float32)

        self._offset += len(source_audio)
        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''  # consumed; prevent downstream re-processing
        in_data.speech_status = self._status
        return in_data