cdactvm committed · verified
Commit 06dde59 · Parent(s): cd1b576

Update applyVad.py

Files changed (1):
1. applyVad.py +247 -212

applyVad.py CHANGED
@@ -1,212 +1,247 @@
- #!/usr/bin/env python
- # coding: utf-8
-
- # In[ ]:
-
-
- # import webrtcvad
- # import numpy as np
- # import librosa
- # def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
- #     '''
- #     Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech.
- #     This is useful in noisy environments where you want to filter out non-speech parts of the audio.
- #     webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.
- #     It helps detect speech in small chunks of audio.
- #     '''
- #     vad = webrtcvad.Vad()
- #     audio_int16 = np.int16(audio * 32767)
- #     frame_size = int(sr * frame_duration / 1000)
- #     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
- #     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
- #     voiced_audio = np.float32(voiced_audio) / 32767
- #     return voiced_audio
-
-
- # In[1]:
-
-
- # import webrtcvad
- # import numpy as np
- # import librosa
-
- # def apply_vad(audio, sr):
- #     # Ensure that sample rate is supported by webrtcvad
- #     if sr not in [8000, 16000, 32000, 48000]:
- #         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")
-
- #     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
- #     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only
-
- #     # Convert to PCM 16-bit and calculate frame length
- #     audio_pcm16 = (audio * 32767).astype(np.int16)
- #     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM
-
- #     # Create frames ensuring correct frame size
- #     frames = [
- #         audio_pcm16[i:i + frame_length].tobytes()
- #         for i in range(0, len(audio_pcm16) - frame_length, frame_length)
- #     ]
-
- #     # Apply VAD
- #     voiced_frames = []
- #     for frame in frames:
- #         try:
- #             if vad.is_speech(frame, sample_rate=sr):
- #                 voiced_frames.append(frame)
- #         except Exception as e:
- #             print(f"Error during VAD frame processing: {e}")
-
- #     if not voiced_frames:
- #         raise Exception("No voiced frames detected.")
-
- #     # Concatenate voiced frames
- #     voiced_audio = b''.join(voiced_frames)
- #     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0
-
-
- # In[ ]:
-
-
- # import webrtcvad
- # import numpy as np
- # import librosa
-
- # def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
- #     '''
- #     Voice Activity Detection (VAD): Detects speech in audio.
- #     '''
- #     vad = webrtcvad.Vad(aggressiveness)
-
- #     # Resample to 16000 Hz if not already (recommended for better compatibility)
- #     if sr != 16000:
- #         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
- #         sr = 16000
-
- #     # Convert to 16-bit PCM format expected by webrtcvad
- #     audio_int16 = np.int16(audio * 32767)
-
- #     # Ensure frame size matches WebRTC's expected lengths
- #     frame_size = int(sr * frame_duration / 1000)
- #     if frame_size % 2 != 0:
- #         frame_size -= 1  # Make sure it's even to avoid processing issues
-
- #     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
-
- #     # Filter out non-speech frames
- #     voiced_frames = []
- #     for frame in frames:
- #         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
- #             voiced_frames.append(frame)
-
- #     # Concatenate the voiced frames
- #     voiced_audio = np.concatenate(voiced_frames)
- #     voiced_audio = np.float32(voiced_audio) / 32767
-
- #     return voiced_audio
-
-
- # In[3]:
-
-
- # import webrtcvad
- # import numpy as np
- # import librosa
-
- # def frame_generator(frame_duration_ms, audio, sample_rate):
- #     """
- #     Generates audio frames from PCM audio data.
- #     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
- #     """
- #     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
- #     offset = 0
- #     while offset + n < len(audio):
- #         yield audio[offset:offset + n]
- #         offset += n
-
- # def apply_vad(audio, sample_rate):
- #     vad = webrtcvad.Vad()
- #     vad.set_mode(1)
- #     print("Applying VAD with mode:", 1)
- #     print("Audio length:", len(audio), "bytes")
- #     print("Sample rate:", sample_rate)
-
- #     # Ensure mono and correct sample rate
- #     if sample_rate != 16000:
- #         print("Sample rate issue detected.")
- #         raise ValueError("Sample rate must be 16000 Hz")
-
- #     frames = frame_generator(30, audio, sample_rate)
- #     frames = list(frames)
-
- #     print("Number of frames:", len(frames))
- #     try:
- #         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]
-
- #         if not segments:
- #             raise Exception("No voiced frames detected.")
-
- #         return b''.join(segments)
-
- #     except Exception as e:
- #         print(f"Error during VAD frame processing: {e}")
- #         raise
-
-
- # In[5]:
-
-
- import torch
- import torchaudio
- from silero_vad import get_speech_timestamps, read_audio, save_audio
-
- def apply_silero_vad(audio_file_path):
-     """
-     Applies Silero VAD to an audio file and returns the processed audio
-     containing only the voiced segments.
-     """
-     # Load the Silero VAD model
-     model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
-
-     # Define helper utilities manually
-     def read_audio(path, sampling_rate=16000):
-         wav, sr = torchaudio.load(path)
-         if sr != sampling_rate:
-             wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
-         return wav.squeeze(0)
-
-     def save_audio(path, tensor, sampling_rate=16000):
-         torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)
-
-     # Read the audio file
-     wav = read_audio(audio_file_path, sampling_rate=16000)
-
-     # Get timestamps for speech segments
-     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
-
-     # If no speech detected, raise an exception
-     if not speech_timestamps:
-         raise Exception("No voiced frames detected using Silero VAD.")
-
-     # Combine the voiced segments
-     voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])
-
-     # Save the processed audio if needed
-     save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)
-
-     # Convert to numpy bytes for further processing
-     return voiced_audio.numpy().tobytes()
-
- # Example usage
- try:
-     processed_audio = apply_silero_vad("path_to_your_audio.wav")
-     print("VAD completed successfully!")
- except Exception as e:
-     print(f"Error during Silero VAD processing: {e}")
-
-
- # In[ ]:
-
-
-
-
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[ ]:
+
+
+ # import webrtcvad
+ # import numpy as np
+ # import librosa
+ # def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
+ #     '''
+ #     Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech.
+ #     This is useful in noisy environments where you want to filter out non-speech parts of the audio.
+ #     webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.
+ #     It helps detect speech in small chunks of audio.
+ #     '''
+ #     vad = webrtcvad.Vad()
+ #     audio_int16 = np.int16(audio * 32767)
+ #     frame_size = int(sr * frame_duration / 1000)
+ #     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
+ #     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
+ #     voiced_audio = np.float32(voiced_audio) / 32767
+ #     return voiced_audio
+
+
+ # In[1]:
+
+
+ # import webrtcvad
+ # import numpy as np
+ # import librosa
+
+ # def apply_vad(audio, sr):
+ #     # Ensure that sample rate is supported by webrtcvad
+ #     if sr not in [8000, 16000, 32000, 48000]:
+ #         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")
+
+ #     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
+ #     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only
+
+ #     # Convert to PCM 16-bit and calculate frame length
+ #     audio_pcm16 = (audio * 32767).astype(np.int16)
+ #     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM
+
+ #     # Create frames ensuring correct frame size
+ #     frames = [
+ #         audio_pcm16[i:i + frame_length].tobytes()
+ #         for i in range(0, len(audio_pcm16) - frame_length, frame_length)
+ #     ]
+
+ #     # Apply VAD
+ #     voiced_frames = []
+ #     for frame in frames:
+ #         try:
+ #             if vad.is_speech(frame, sample_rate=sr):
+ #                 voiced_frames.append(frame)
+ #         except Exception as e:
+ #             print(f"Error during VAD frame processing: {e}")
+
+ #     if not voiced_frames:
+ #         raise Exception("No voiced frames detected.")
+
+ #     # Concatenate voiced frames
+ #     voiced_audio = b''.join(voiced_frames)
+ #     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0
+
+
+ # In[ ]:
+
+
+ # import webrtcvad
+ # import numpy as np
+ # import librosa
+
+ # def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
+ #     '''
+ #     Voice Activity Detection (VAD): Detects speech in audio.
+ #     '''
+ #     vad = webrtcvad.Vad(aggressiveness)
+
+ #     # Resample to 16000 Hz if not already (recommended for better compatibility)
+ #     if sr != 16000:
+ #         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+ #         sr = 16000
+
+ #     # Convert to 16-bit PCM format expected by webrtcvad
+ #     audio_int16 = np.int16(audio * 32767)
+
+ #     # Ensure frame size matches WebRTC's expected lengths
+ #     frame_size = int(sr * frame_duration / 1000)
+ #     if frame_size % 2 != 0:
+ #         frame_size -= 1  # Make sure it's even to avoid processing issues
+
+ #     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
+
+ #     # Filter out non-speech frames
+ #     voiced_frames = []
+ #     for frame in frames:
+ #         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
+ #             voiced_frames.append(frame)
+
+ #     # Concatenate the voiced frames
+ #     voiced_audio = np.concatenate(voiced_frames)
+ #     voiced_audio = np.float32(voiced_audio) / 32767
+
+ #     return voiced_audio
+
+
+ # In[3]:
+
+
+ # import webrtcvad
+ # import numpy as np
+ # import librosa
+
+ # def frame_generator(frame_duration_ms, audio, sample_rate):
+ #     """
+ #     Generates audio frames from PCM audio data.
+ #     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
+ #     """
+ #     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
+ #     offset = 0
+ #     while offset + n < len(audio):
+ #         yield audio[offset:offset + n]
+ #         offset += n
+
+ # def apply_vad(audio, sample_rate):
+ #     vad = webrtcvad.Vad()
+ #     vad.set_mode(1)
+ #     print("Applying VAD with mode:", 1)
+ #     print("Audio length:", len(audio), "bytes")
+ #     print("Sample rate:", sample_rate)
+
+ #     # Ensure mono and correct sample rate
+ #     if sample_rate != 16000:
+ #         print("Sample rate issue detected.")
+ #         raise ValueError("Sample rate must be 16000 Hz")
+
+ #     frames = frame_generator(30, audio, sample_rate)
+ #     frames = list(frames)
+
+ #     print("Number of frames:", len(frames))
+ #     try:
+ #         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]
+
+ #         if not segments:
+ #             raise Exception("No voiced frames detected.")
+
+ #         return b''.join(segments)
+
+ #     except Exception as e:
+ #         print(f"Error during VAD frame processing: {e}")
+ #         raise
+
+
+ # In[5]:
+
+
+ # import torch
+ # import torchaudio
+ # from silero_vad import get_speech_timestamps, read_audio, save_audio
+
+ # def apply_silero_vad(audio_file_path):
+ #     """
+ #     Applies Silero VAD to an audio file and returns the processed audio
+ #     containing only the voiced segments.
+ #     """
+ #     # Load the Silero VAD model
+ #     model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
+
+ #     # Define helper utilities manually
+ #     def read_audio(path, sampling_rate=16000):
+ #         wav, sr = torchaudio.load(path)
+ #         if sr != sampling_rate:
+ #             wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
+ #         return wav.squeeze(0)
+
+ #     def save_audio(path, tensor, sampling_rate=16000):
+ #         torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)
+
+ #     # Read the audio file
+ #     wav = read_audio(audio_file_path, sampling_rate=16000)
+
+ #     # Get timestamps for speech segments
+ #     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
+
+ #     # If no speech detected, raise an exception
+ #     if not speech_timestamps:
+ #         raise Exception("No voiced frames detected using Silero VAD.")
+
+ #     # Combine the voiced segments
+ #     voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])
+
+ #     # Save the processed audio if needed
+ #     save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)
+
+ #     # Convert to numpy bytes for further processing
+ #     return voiced_audio.numpy().tobytes()
+
+ # # Example usage
+ # try:
+ #     processed_audio = apply_silero_vad("path_to_your_audio.wav")
+ #     print("VAD completed successfully!")
+ # except Exception as e:
+ #     print(f"Error during Silero VAD processing: {e}")
+
+
+ import webrtcvad
+ import numpy as np
+ import librosa
+
+ def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
+     '''
+     Voice Activity Detection (VAD): Detects speech in audio.
+     '''
+     vad = webrtcvad.Vad(aggressiveness)
+
+     # Resample to 16000 Hz if not already (recommended for better compatibility)
+     if sr != 16000:
+         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+         sr = 16000
+
+     # Convert to 16-bit PCM format expected by webrtcvad
+     audio_int16 = np.int16(audio * 32767)
+
+     # Ensure frame size matches WebRTC's expected lengths
+     frame_size = int(sr * frame_duration / 1000)
+     if frame_size % 2 != 0:
+         frame_size -= 1  # Make sure it's even to avoid processing issues
+
+     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
+
+     # Filter out non-speech frames
+     voiced_frames = []
+     for frame in frames:
+         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
+             voiced_frames.append(frame)
+
+     # Concatenate the voiced frames
+     voiced_audio = np.concatenate(voiced_frames)
+     voiced_audio = np.float32(voiced_audio) / 32767
+
+     return voiced_audio
+
+
+
+
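
The newly active apply_vad expects float audio in [-1, 1] and a sample rate, which is exactly what librosa.load returns. A minimal usage sketch (not part of the commit; the file names are hypothetical, and soundfile is assumed only for writing the result). Note that np.concatenate inside apply_vad raises ValueError when no frame is classified as speech, so the call is guarded:

import librosa
import soundfile as sf

# Hypothetical input path; any mono speech recording works.
# sr=16000 matches the rate apply_vad resamples to internally.
audio, sr = librosa.load("example_speech.wav", sr=16000, mono=True)

try:
    voiced = apply_vad(audio, sr, frame_duration=30, aggressiveness=3)
    # apply_vad returns float32 audio at 16 kHz containing only voiced frames.
    sf.write("example_speech_voiced.wav", voiced, 16000)
except ValueError:
    # Raised by np.concatenate when voiced_frames is empty (silent or music-only clip).
    print("No voiced frames detected; nothing written.")

aggressiveness=3 is webrtcvad's strictest mode and keeps only frames the detector is most confident contain speech; lowering it toward 0 retains more borderline audio.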