david
committed on
Commit
·
d8ef700
1
Parent(s):
37262f1
add custom vad silence
Browse files
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -24,6 +24,34 @@ def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000)
|
|
24 |
|
25 |
return torch.cat(chunks)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
class VadPipe(BasePipe):
|
28 |
model = None
|
29 |
sample_rate = 16000
|
@@ -63,7 +91,7 @@ class VadPipe(BasePipe):
|
|
63 |
speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
|
64 |
|
65 |
if speech_timestamps:
|
66 |
-
send_audio =
|
67 |
send_audio = send_audio.numpy()
|
68 |
in_data.audio = send_audio
|
69 |
# send_audio = self.reduce_noise(send_audio).tobytes()
|
|
|
24 |
|
25 |
return torch.cat(chunks)
|
26 |
|
27 |
+
def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Join the speech segments of *wav*, inserting short silences between them.

    Compared with plain concatenation, gaps between segments longer than
    100 ms are represented by a stretch of zeros (half the gap, capped at
    300 ms), and segments separated by less than 20 ms are merged first.

    Args:
        tss: speech timestamps — each a dict with 'start'/'end' sample
             indices into *wav* (as produced by silero-vad's
             ``get_speech_timestamps``).
        wav: 1-D audio tensor.
        sample_rate: samples per second of *wav*; defaults to 16 kHz.

    Returns:
        A 1-D tensor of the joined speech (same dtype as *wav*).
        An empty tensor when *tss* is empty.

    Note:
        The input *tss* list and its dicts are never mutated.
    """
    # Guard the empty case: torch.cat([]) raises on an empty chunk list.
    if not tss:
        return torch.zeros(0, dtype=wav.dtype)

    max_silence_samples = int(0.3 * sample_rate)   # cap inserted silence at 300 ms
    min_gap_samples = int(0.1 * sample_rate)       # only pad gaps longer than 100 ms
    merge_gap_samples = int(0.02 * sample_rate)    # gaps under 20 ms count as continuous

    # Smooth the timestamps: merge segments whose gap is below 20 ms.
    # Each dict is copied so the merge write never mutates the caller's list.
    smoothed_tss: List[dict] = []
    for ts in tss:
        if smoothed_tss and ts['start'] - smoothed_tss[-1]['end'] < merge_gap_samples:
            smoothed_tss[-1]['end'] = ts['end']  # extend the previous segment
        else:
            smoothed_tss.append(dict(ts))

    chunks = []
    for i, ts in enumerate(smoothed_tss):
        chunks.append(wav[ts['start']:ts['end']])
        # Between segments (not after the last one), insert silence when the
        # gap exceeds the 100 ms threshold.
        if i < len(smoothed_tss) - 1:
            gap = smoothed_tss[i + 1]['start'] - ts['end']
            if gap > min_gap_samples:
                # Scale the silence with the gap size, capped at 300 ms;
                # match wav's dtype so torch.cat does not upcast/mix dtypes.
                silence_length = min(gap // 2, max_silence_samples)
                chunks.append(torch.zeros(silence_length, dtype=wav.dtype))

    return torch.cat(chunks)
|
54 |
+
|
55 |
class VadPipe(BasePipe):
|
56 |
model = None
|
57 |
sample_rate = 16000
|
|
|
91 |
speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
|
92 |
|
93 |
if speech_timestamps:
|
94 |
+
send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
|
95 |
send_audio = send_audio.numpy()
|
96 |
in_data.audio = send_audio
|
97 |
# send_audio = self.reduce_noise(send_audio).tobytes()
|