david committed
Commit d8ef700 · Parent: 37262f1

add custom vad silence
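This commit swaps the plain collect_chunks call in VadPipe for a new collect_chunks_improved helper: speech segments separated by less than 20 ms are merged into one, and gaps longer than 100 ms get up to 300 ms of synthetic silence inserted between the segments rather than being dropped entirely when the chunks are concatenated.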

Files changed (1):
  1. transcribe/pipelines/pipe_vad.py (+29 -1)
transcribe/pipelines/pipe_vad.py CHANGED

@@ -24,6 +24,34 @@ def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000)

     return torch.cat(chunks)

+def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
+    chunks = []
+    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
+    silence = torch.zeros(silent_samples)  # create 300 ms of silence
+    min_gap_samples = int(0.1 * sample_rate)  # minimum gap threshold (100 ms)
+
+    # lightly smooth the timestamps
+    smoothed_tss = []
+    for i, ts in enumerate(tss):
+        if i > 0 and ts['start'] - tss[i-1]['end'] < 0.02 * sample_rate:  # gaps under 20 ms count as continuous speech
+            smoothed_tss[-1]['end'] = ts['end']  # merge into the previous segment
+        else:
+            smoothed_tss.append(ts)
+
+    for i in range(len(smoothed_tss)):
+        # append the current speech segment
+        chunks.append(wav[smoothed_tss[i]['start']: smoothed_tss[i]['end']])
+
+        # if this is not the last segment and the gap to the next one exceeds the threshold, insert silence
+        if i < len(smoothed_tss) - 1:
+            gap = smoothed_tss[i+1]['start'] - smoothed_tss[i]['end']
+            if gap > min_gap_samples:
+                # scale the inserted silence with the gap size, capped at 300 ms
+                silence_length = min(gap // 2, silent_samples)
+                chunks.append(torch.zeros(silence_length))
+
+    return torch.cat(chunks)
+
 class VadPipe(BasePipe):
     model = None
     sample_rate = 16000
@@ -63,7 +91,7 @@ class VadPipe(BasePipe):
         speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)

         if speech_timestamps:
-            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
             send_audio = send_audio.numpy()
             in_data.audio = send_audio
             # send_audio = self.reduce_noise(send_audio).tobytes()
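For context, a minimal hand-checked sketch of how the new helper behaves; the waveform and timestamps below are invented for illustration (silero-vad would normally produce the tss list), and it assumes the repo root is on PYTHONPATH:

import torch

from transcribe.pipelines.pipe_vad import collect_chunks_improved

sample_rate = 16000
wav = torch.randn(3 * sample_rate)  # 3 s of fake audio

# silero-vad style timestamps, expressed in samples (made up for this sketch)
tss = [
    {'start': 0,     'end': 8000},   # 0.0-0.5 s
    {'start': 8100,  'end': 16000},  # ~6 ms gap -> merged into the previous segment
    {'start': 24000, 'end': 40000},  # 500 ms gap -> bridged with silence
]

out = collect_chunks_improved(tss, wav, sample_rate)

# The first two segments are merged (gap < 20 ms); the 500 ms gap becomes
# min(8000 // 2, 4800) = 4000 samples (250 ms) of zeros.
assert out.shape[0] == 16000 + 4000 + (40000 - 24000)

Two details worth noting about the committed code: the preallocated silence tensor is never actually used (each gap gets a fresh torch.zeros(silence_length) instead), and the smoothing loop writes through the shared dicts, so the caller's speech_timestamps entries are mutated in place.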