from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import FixedVADIterator

import numpy as np
import logging

|
class VadPipe(BasePipe): |
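    """Pipeline stage that runs streaming voice activity detection (VAD)
    over incoming audio chunks, trims each chunk to the detected speech,
    and tags the MetaItem with a speech status ('START' while speech is
    ongoing, 'END' otherwise)."""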
|
    vac = None              # class-level VAD iterator shared by all instances
    sample_rate = 16000

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0      # absolute frame index of the start of the next chunk
        self._status = 'END'  # 'START' while inside a speech segment, else 'END'

    def reset(self):
        """Reset streaming state before a new stream; assumes init() has
        already been called so that the shared VAD iterator exists."""
        self._offset = 0
        self._status = 'END'
        self.vac.reset_states()

    @classmethod
    def init(cls):
        """Build the shared VAD iterator once per process."""
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.6,
                sampling_rate=cls.sample_rate,
                min_silence_duration_ms=100,
            )
            cls.vac.reset_states()
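
    # NOTE (assumption): FixedVADIterator is expected to accept chunks of
    # arbitrary length and buffer them internally, since the underlying
    # Silero VAD model consumes fixed-size windows (512 samples at 16 kHz).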

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Run the VAD on one chunk and translate the absolute frame indices
        it reports into indices relative to this chunk. Returns None when
        the VAD reports no speech boundary."""
        speech_dict = self.vac(source_audio, return_seconds=False)
        if speech_dict:
            relative_start_frame = None
            relative_end_frame = None
            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
            # Compare with None explicitly: a boundary at absolute frame 0 is
            # falsy, so a plain truthiness test would silently drop it.
            if start_frame is not None:
                relative_start_frame = start_frame - self._offset
            if end_frame is not None:
                relative_end_frame = end_frame - self._offset
            return relative_start_frame, relative_end_frame
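
    # Offset bookkeeping example: if 16000 frames arrived in earlier chunks
    # (self._offset == 16000) and the VAD reports an absolute start frame of
    # 16080, the speech starts 80 frames into the current chunk.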

    def process(self, in_data: MetaItem) -> MetaItem:
        if self._offset == 0:
            # First chunk of a new stream: start the VAD from a clean state.
            self.vac.reset_states()

        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                # Speech starts in this chunk and is still ongoing; keep up to
                # 100 frames of pre-roll so the onset is not clipped.
                self._status = 'START'
                target_audio = source_audio[max(rel_start_frame - 100, 0):]
                logging.debug("🫸 Speech start frame: %s", rel_start_frame)
            elif rel_start_frame is None and rel_end_frame is not None:
                # Speech that began in an earlier chunk ends here.
                self._status = 'END'
                target_audio = source_audio[:rel_end_frame]
                logging.debug("🫷 Speech ended, capturing audio up to frame: %s", rel_end_frame)
            else:
                # A complete speech segment falls within this single chunk.
                self._status = 'END'
                target_audio = source_audio[max(rel_start_frame - 100, 0):rel_end_frame]
                logging.debug("🔄 Speech segment captured from frame %s to frame %s",
                              rel_start_frame, rel_end_frame)
        else:
            if self._status == 'START':
                # No boundary reported but we are inside speech: pass through.
                target_audio = source_audio
            else:
                # Still in silence: emit an empty chunk.
                target_audio = np.array([], dtype=np.float32)

        self._offset += len(source_audio)

        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''
        in_data.speech_status = self._status
        return in_data
|
|
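# Minimal usage sketch (illustrative only: it assumes MetaItem accepts raw
# float32 PCM bytes via `source_audio` and that process() can be called
# directly, without the BasePipe queue wiring):
#
#     VadPipe.init()                            # build the shared VAD model once
#     pipe = VadPipe()
#     chunk = np.zeros(512, dtype=np.float32)   # 32 ms of silence at 16 kHz
#     item = MetaItem(source_audio=chunk.tobytes())
#     out = pipe.process(item)
#     print(out.speech_status)                  # 'END' until speech is detected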