File size: 3,516 Bytes
485d8e3 9be4c60 ca5d527 c0447ed 6696134 98c9c23 9bdac3d fca9809 d8ef700 c0447ed 31ad35a fca9809 c0447ed ca5d527 ea1c85a fca9809 ca5d527 750e8d5 ca5d527 9be4c60 aca5e0b 750e8d5 c0447ed 31ad35a ca5d527 70b1d55 750e8d5 ca5d527 6485991 ca5d527 750e8d5 ca5d527 aca5e0b ca5d527 9be4c60 ca5d527 ea1c85a c0447ed ca5d527 ea1c85a ca5d527 750e8d5 ca5d527 ea1c85a bdb9da4 ca5d527 f14a125 ca5d527 bdb9da4 ca5d527 e19aebc ca5d527 83ea845 ca5d527 f5bdb50 ca5d527 f5bdb50 ca5d527 f5bdb50 ca5d527 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import FixedVADIterator
import numpy as np
import logging
# import noisereduce as nr
class VadPipe(BasePipe):
    """Voice-activity-detection stage of the pipeline.

    Consumes raw float32 PCM from ``in_data.source_audio``, runs it through a
    shared Silero-style VAD iterator, and forwards only the speech portion in
    ``in_data.audio`` together with a ``speech_status`` of ``'START'`` (inside
    a speech segment) or ``'END'`` (outside / segment just closed).
    """

    # Shared VAD iterator for all instances; created lazily by init().
    vac = None
    # Input sample rate in Hz; must match what the VAD iterator expects.
    sample_rate = 16000

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0        # absolute number of samples consumed so far
        self._status = 'END'    # 'START' while inside speech, else 'END'

    def reset(self):
        """Return the pipe (and the shared VAD state) to its initial state."""
        self._offset = 0
        self._status = 'END'
        self.vac.reset_states()

    @classmethod
    def init(cls):
        """Lazily build the shared VAD iterator. Idempotent."""
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.6,
                sampling_rate=cls.sample_rate,
                min_silence_duration_ms=100,
            )
            cls.vac.reset_states()

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Feed one chunk to the VAD and localize any boundary it reports.

        Returns ``(relative_start_frame, relative_end_frame)`` — chunk-local
        sample indices, either of which may be ``None`` — when the VAD reports
        a transition in this chunk, or ``None`` when nothing changed.
        """
        speech_dict = self.vac(source_audio, return_seconds=False)
        if not speech_dict:
            return None
        relative_start_frame = None
        relative_end_frame = None
        start_frame = speech_dict.get("start")
        end_frame = speech_dict.get("end")
        # BUGFIX: compare against None, not truthiness — an absolute start
        # frame of 0 (speech at the very beginning of the stream) is falsy
        # and was previously dropped, which then crashed process() with a
        # TypeError in the start+end branch.
        if start_frame is not None:
            relative_start_frame = start_frame - self._offset
        if end_frame is not None:
            relative_end_frame = end_frame - self._offset
        return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
        """Extract the speech portion of one audio chunk.

        Side effects: advances ``self._offset`` by the chunk length, updates
        ``self._status``, clears ``in_data.source_audio`` and fills
        ``in_data.audio`` / ``in_data.speech_status``.
        """
        if self._offset == 0:
            # Fresh stream: make sure the shared VAD carries no old state.
            self.vac.reset_states()
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:  # a speech boundary occurred inside this chunk
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                self._status = "START"  # speech started, still ongoing
                # Back off up to 100 samples so the onset is not clipped.
                target_audio = source_audio[max(rel_start_frame - 100, 0):]
                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
            elif rel_start_frame is None and rel_end_frame is not None:
                self._status = "END"  # speech ended in this chunk
                target_audio = source_audio[:rel_end_frame]
                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
            else:
                # Whole segment contained in this chunk (start and end both
                # reported). Guard against a missing start defensively.
                self._status = 'END'
                begin = 0 if rel_start_frame is None else max(rel_start_frame - 100, 0)
                target_audio = source_audio[begin:rel_end_frame]
                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
        else:
            if self._status == 'START':
                # Mid-segment silence from the VAD's view: keep forwarding.
                target_audio = source_audio
            else:
                # Outside speech: emit nothing.
                target_audio = np.array([], dtype=np.float32)

        self._offset += len(source_audio)
        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''
        in_data.speech_status = self._status
        return in_data
|