Knight-coderr committed on
Commit 49464b8 · verified · 1 Parent(s): c9bae39

Upload 8 files

updated logic to perform speech segmentation

models/label_encoder.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a72ac54ec6b1b3294cbc778433b4604ff31217e34e5e16d00f3746bdd742ada7
-size 140069973
+oid sha256:a3b7be97ecc414067f82ca5d67d82d063b2358eac185326e9ea28403cfed6654
+size 547
models/noise_classifier.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a72ac54ec6b1b3294cbc778433b4604ff31217e34e5e16d00f3746bdd742ada7
-size 140069973
+oid sha256:eb7bb84f65e96c46b49ab6c306d9eeca2911f71a08394f66c2a93db6d2979411
+size 5906474
requirements.txt CHANGED
@@ -4,4 +4,10 @@ librosa
 speechbrain
 pyannote.audio
 torchaudio
-scikit-learn
+scikit-learn
+numpy
+torchaudio
+scipy
+joblib
+tensorflow
+tensorflow-hub
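
This commit pulls TensorFlow (for YAMNet via tensorflow-hub) into a stack that already uses PyTorch/torchaudio, and requirements.txt now lists torchaudio twice, which pip tolerates. A quick import check along these lines would confirm the expanded dependency set resolves side by side (a minimal sketch, not part of the commit):

    # Smoke test: verify the updated dependencies import together
    import numpy, scipy, joblib, sklearn
    import torchaudio
    import tensorflow as tf
    import tensorflow_hub as hub

    print("tf", tf.__version__, "| torchaudio", torchaudio.__version__)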
utils/noise_classification.py CHANGED
@@ -1,15 +1,63 @@
 import numpy as np
-import librosa
+import torchaudio
+import torchaudio.transforms as T
 import joblib
+from scipy.stats import skew, kurtosis
+import tensorflow_hub as hub
 
-# Load your trained model + label encoder
+# Load classifier and label encoder
 clf = joblib.load("models/noise_classifier.pkl")
 label_encoder = joblib.load("models/label_encoder.pkl")
 
-def classify_noise(audio_path):
-    y, sr = librosa.load(audio_path, sr=None)
-    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-    feature = np.mean(mfcc.T, axis=0).reshape(1, -1)
+# Load YAMNet model
+yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")
+
+def get_yamnet_embedding(audio_path):
+    """
+    Extract YAMNet embeddings with statistical pooling from a WAV file.
+    """
+    try:
+        waveform, sr = torchaudio.load(audio_path)
+        if sr != 16000:
+            resampler = T.Resample(orig_freq=sr, new_freq=16000)
+            waveform = resampler(waveform)
+        if waveform.size(0) > 1:
+            waveform = waveform.mean(dim=0)
+        else:
+            waveform = waveform.squeeze(0)
+
+        waveform_np = waveform.numpy()
+        _, embeddings, _ = yamnet_model(waveform_np)
+
+        # Statistical features
+        mean = np.mean(embeddings, axis=0)
+        std = np.std(embeddings, axis=0)
+        min_val = np.min(embeddings, axis=0)
+        max_val = np.max(embeddings, axis=0)
+        skewness = skew(embeddings, axis=0)
+        kurt = kurtosis(embeddings, axis=0)
+
+        return np.concatenate([mean, std, min_val, max_val, skewness, kurt])
+    except Exception as e:
+        print(f"Failed to process {audio_path}: {e}")
+        return None
+
+def classify_noise(audio_path, threshold=0.6):
+    """
+    Classify noise with rejection threshold for 'Unknown' label.
+    """
+    feature = get_yamnet_embedding(audio_path)
+    if feature is None:
+        return [("Unknown", 0.0)]
+
+    feature = feature.reshape(1, -1)
     probs = clf.predict_proba(feature)[0]
-    top_idx = np.argsort(probs)[::-1][:5]
-    return [(label_encoder.inverse_transform([i])[0], probs[i]) for i in top_idx]
+
+    top_idx = np.argmax(probs)
+    top_prob = probs[top_idx]
+
+    if top_prob < threshold:
+        return [("Unknown", top_prob)]
+
+    top_indices = np.argsort(probs)[::-1][:5]
+    return [(label_encoder.inverse_transform([i])[0], probs[i]) for i in top_indices]
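
For reference, the rewritten classifier would be invoked like this (a hypothetical usage sketch; "sample.wav" is a placeholder path, not part of the commit):

    from utils.noise_classification import classify_noise

    # Returns up to five (label, probability) pairs, or [("Unknown", p)]
    # when the top class falls below the 0.6 rejection threshold.
    for label, prob in classify_noise("sample.wav"):
        print(f"{label}: {prob:.3f}")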
utils/speaker_diarization.py CHANGED
@@ -1,6 +1,8 @@
 from pyannote.audio import Pipeline
 
-diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
+diarization_pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization-3.1"
+)
 
 def diarize_speakers(audio_path):
     return diarization_pipeline(audio_path)
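
diarize_speakers returns a pyannote Annotation, so speaker turns can be read off with itertracks (a sketch; "meeting.wav" is a placeholder, and the gated pyannote/speaker-diarization-3.1 checkpoint normally requires a Hugging Face auth token to download):

    from utils.speaker_diarization import diarize_speakers

    diarization = diarize_speakers("meeting.wav")
    # Each track is a (segment, track_id, speaker_label) triple
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{speaker}: {turn.start:.2f}s - {turn.end:.2f}s")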
utils/vad_segmentation.py CHANGED
@@ -1,8 +1,17 @@
-import torchaudio
-from pyannote.audio import Pipeline
+from pyannote.audio import Model
+from pyannote.audio.pipelines import VoiceActivityDetection
 
-pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
+model = Model.from_pretrained("pyannote/segmentation",
+                              use_auth_token="")
 
-def detect_speech_segments(audio_path):
-    vad_result = pipeline(audio_path)
-    return [(segment.start, segment.end) for segment in vad_result.get_timeline().support()]
+
+vad_pipeline = VoiceActivityDetection(segmentation=model)
+HYPER_PARAMETERS = {
+    "onset": 0.5, "offset": 0.5,
+    "min_duration_on": 0.0,
+    "min_duration_off": 0.0
+}
+
+vad_pipeline.instantiate(HYPER_PARAMETERS)
+def vad_segmentation(input_path, output_path, aggressiveness=2):
+    return vad_pipeline(input_path)
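
The new vad_segmentation returns the raw pyannote Annotation (output_path and aggressiveness are accepted but unused in this version). Flattening it into (start, end) tuples, as the old detect_speech_segments did, would look like this sketch with a placeholder filename:

    from utils.vad_segmentation import vad_segmentation

    vad = vad_segmentation("sample.wav", output_path=None)
    # support() merges overlapping regions into maximal speech segments
    segments = [(seg.start, seg.end) for seg in vad.get_timeline().support()]
    print(segments)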