david committed
Commit d8ef700 · Parent: 37262f1

add custom vad silence
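This commit swaps the plain collect_chunks call in VadPipe for a new collect_chunks_improved helper: speech segments separated by less than 20 ms are merged into one, and gaps longer than 100 ms get up to 300 ms of synthetic silence inserted between the segments rather than being dropped entirely when the chunks are concatenated.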

Files changed (1):
  1. transcribe/pipelines/pipe_vad.py (+29 -1)
transcribe/pipelines/pipe_vad.py CHANGED

@@ -24,6 +24,34 @@ def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000)

     return torch.cat(chunks)

+def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
+    chunks = []
+    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
+    silence = torch.zeros(silent_samples)  # create 300 ms of silence
+    min_gap_samples = int(0.1 * sample_rate)  # minimum gap threshold (100 ms)
+
+    # lightly smooth the timestamps
+    smoothed_tss = []
+    for i, ts in enumerate(tss):
+        if i > 0 and ts['start'] - tss[i-1]['end'] < 0.02 * sample_rate:  # gaps under 20 ms count as continuous speech
+            smoothed_tss[-1]['end'] = ts['end']  # merge into the previous segment
+        else:
+            smoothed_tss.append(ts)
+
+    for i in range(len(smoothed_tss)):
+        # append the current speech segment
+        chunks.append(wav[smoothed_tss[i]['start']: smoothed_tss[i]['end']])
+
+        # if this is not the last segment and the gap to the next one exceeds the threshold, insert silence
+        if i < len(smoothed_tss) - 1:
+            gap = smoothed_tss[i+1]['start'] - smoothed_tss[i]['end']
+            if gap > min_gap_samples:
+                # scale the inserted silence with the gap size, capped at 300 ms
+                silence_length = min(gap // 2, silent_samples)
+                chunks.append(torch.zeros(silence_length))
+
+    return torch.cat(chunks)
+
 class VadPipe(BasePipe):
     model = None
     sample_rate = 16000
@@ -63,7 +91,7 @@ class VadPipe(BasePipe):
         speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)

         if speech_timestamps:
-            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
             send_audio = send_audio.numpy()
             in_data.audio = send_audio
             # send_audio = self.reduce_noise(send_audio).tobytes()
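For context, a minimal hand-checked sketch of how the new helper behaves; the waveform and timestamps below are invented for illustration (silero-vad would normally produce the tss list), and it assumes the repo root is on PYTHONPATH:

import torch

from transcribe.pipelines.pipe_vad import collect_chunks_improved

sample_rate = 16000
wav = torch.randn(3 * sample_rate)  # 3 s of fake audio

# silero-vad style timestamps, expressed in samples (made up for this sketch)
tss = [
    {'start': 0,     'end': 8000},   # 0.0-0.5 s
    {'start': 8100,  'end': 16000},  # ~6 ms gap -> merged into the previous segment
    {'start': 24000, 'end': 40000},  # 500 ms gap -> bridged with silence
]

out = collect_chunks_improved(tss, wav, sample_rate)

# The first two segments are merged (gap < 20 ms); the 500 ms gap becomes
# min(8000 // 2, 4800) = 4000 samples (250 ms) of zeros.
assert out.shape[0] == 16000 + 4000 + (40000 - 24000)

Two details worth noting about the committed code: the preallocated silence tensor is never actually used (each gap gets a fresh torch.zeros(silence_length) instead), and the smoothing loop writes through the shared dicts, so the caller's speech_timestamps entries are mutated in place.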