Spaces:

Knight-coderr
/

Forensic-Noise-Classifier

Running

App Files Files Community

Knight-coderr committed 10 days ago

Commit

49464b8

verified ·

1 Parent(s): c9bae39

Upload 8 files

Browse files

updated logic to perform speech segmentation

Files changed (6) hide show

models/label_encoder.pkl +2 -2
models/noise_classifier.pkl +2 -2
requirements.txt +7 -1
utils/noise_classification.py +56 -8
utils/speaker_diarization.py +3 -1
utils/vad_segmentation.py +15 -6

models/label_encoder.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a72ac54ec6b1b3294cbc778433b4604ff31217e34e5e16d00f3746bdd742ada7
-size 140069973

 version https://git-lfs.github.com/spec/v1
+oid sha256:a3b7be97ecc414067f82ca5d67d82d063b2358eac185326e9ea28403cfed6654
+size 547

models/noise_classifier.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a72ac54ec6b1b3294cbc778433b4604ff31217e34e5e16d00f3746bdd742ada7
-size 140069973

 version https://git-lfs.github.com/spec/v1
+oid sha256:eb7bb84f65e96c46b49ab6c306d9eeca2911f71a08394f66c2a93db6d2979411
+size 5906474

requirements.txt CHANGED Viewed

@@ -4,4 +4,10 @@ librosa
 speechbrain
 pyannote.audio
 torchaudio
-scikit-learn

 speechbrain
 pyannote.audio
 torchaudio
+scikit-learn
+numpy
+torchaudio
+scipy
+joblib
+tensorflow
+tensorflow-hub

utils/noise_classification.py CHANGED Viewed

@@ -1,15 +1,63 @@
 import numpy as np
-import librosa
 import joblib
-# Load your trained model + label encoder
 clf = joblib.load("models/noise_classifier.pkl")
 label_encoder = joblib.load("models/label_encoder.pkl")
-def classify_noise(audio_path):
-    y, sr = librosa.load(audio_path, sr=None)
-    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-    feature = np.mean(mfcc.T, axis=0).reshape(1, -1)
     probs = clf.predict_proba(feature)[0]
-    top_idx = np.argsort(probs)[::-1][:5]
-    return [(label_encoder.inverse_transform([i])[0], probs[i]) for i in top_idx]

 import numpy as np
+import torchaudio
+import torchaudio.transforms as T
 import joblib
+from scipy.stats import skew, kurtosis
+import tensorflow_hub as hub
+# Load the trained classifier and its label encoder once, at import time.
+# NOTE(review): joblib.load unpickles these files, so they must only ever
+# come from a trusted source.
 clf = joblib.load("models/noise_classifier.pkl")
 label_encoder = joblib.load("models/label_encoder.pkl")
+# Load the YAMNet model from TensorFlow Hub, also at import time.
+yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")
+def get_yamnet_embedding(audio_path):
+    """
+    Build a fixed-length feature vector for one audio file using YAMNet.
+
+    The file is loaded with torchaudio, resampled to 16 kHz when its sample
+    rate differs, reduced to a single channel, and passed to the module-level
+    ``yamnet_model``. Six statistics of the returned embeddings (mean, std,
+    min, max, skewness, kurtosis) are computed along axis 0 and concatenated
+    in that order.
+
+    Returns the concatenated array, or ``None`` (after printing a message)
+    if any step raises.
+    """
+    target_sr = 16000
+    try:
+        audio, sample_rate = torchaudio.load(audio_path)
+        if sample_rate != target_sr:
+            audio = T.Resample(orig_freq=sample_rate, new_freq=target_sr)(audio)
+        # Average across the first axis when it holds more than one entry;
+        # otherwise just drop that axis.
+        has_many_channels = audio.size(0) > 1
+        mono = audio.mean(dim=0) if has_many_channels else audio.squeeze(0)
+        # Only the second output of the model call is used here.
+        _, frames, _ = yamnet_model(mono.numpy())
+        # NOTE(review): skew/kurtosis of a zero-variance column may be NaN
+        # depending on the SciPy version -- confirm the classifier copes.
+        pooled = [
+            np.mean(frames, axis=0),
+            np.std(frames, axis=0),
+            np.min(frames, axis=0),
+            np.max(frames, axis=0),
+            skew(frames, axis=0),
+            kurtosis(frames, axis=0),
+        ]
+        return np.concatenate(pooled)
+    except Exception as e:
+        print(f"Failed to process {audio_path}: {e}")
+        return None
+def classify_noise(audio_path, threshold=0.6):
+    """
+    Predict noise labels for an audio file, rejecting low-confidence results.
+
+    Returns a list of ``(label, probability)`` tuples:
+    ``[("Unknown", 0.0)]`` when feature extraction returns ``None``,
+    ``[("Unknown", p)]`` when the highest probability ``p`` is below
+    ``threshold``, and otherwise up to five labels ordered from most to
+    least probable.
+    """
+    feature = get_yamnet_embedding(audio_path)
+    if feature is None:
+        return [("Unknown", 0.0)]
+    # predict_proba is given a single row, so row 0 of its output is taken.
+    feature = feature.reshape(1, -1)
     probs = clf.predict_proba(feature)[0]
+    best_prob = probs[np.argmax(probs)]
+    if best_prob < threshold:
+        return [("Unknown", best_prob)]
+    ranked = []
+    for idx in np.argsort(probs)[::-1][:5]:
+        label = label_encoder.inverse_transform([idx])[0]
+        ranked.append((label, probs[idx]))
+    return ranked

utils/speaker_diarization.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from pyannote.audio import Pipeline
-diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
 def diarize_speakers(audio_path):
     return diarization_pipeline(audio_path)

 from pyannote.audio import Pipeline
+# Load the pretrained diarization pipeline once, at import time.
+# NOTE(review): no access token is passed here -- confirm this checkpoint can
+# be fetched without one in the deployment environment.
+diarization_pipeline = Pipeline.from_pretrained(
+      "pyannote/speaker-diarization-3.1"
+)
 def diarize_speakers(audio_path):
+    """Run the module-level diarization pipeline on ``audio_path`` and return its result."""
     return diarization_pipeline(audio_path)

utils/vad_segmentation.py CHANGED Viewed

@@ -1,8 +1,17 @@
-import torchaudio
-from pyannote.audio import Pipeline
-pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
-def detect_speech_segments(audio_path):
-    vad_result = pipeline(audio_path)
-    return [(segment.start, segment.end) for segment in vad_result.get_timeline().support()]

+import os
+from pyannote.audio import Model
+from pyannote.audio.pipelines import VoiceActivityDetection
+# Access token for the segmentation checkpoint, read from the HF_TOKEN
+# environment variable so that no credential has to live in the source.
+# When the variable is unset, the previous empty-string value is used.
+_HF_AUTH_TOKEN = os.environ.get("HF_TOKEN", "")
+model = Model.from_pretrained("pyannote/segmentation",
+                              use_auth_token=_HF_AUTH_TOKEN)
+# Wrap the segmentation model in a voice-activity-detection pipeline.
+vad_pipeline = VoiceActivityDetection(segmentation=model)
+# Pipeline settings passed to instantiate() below.
+# NOTE(review): presumably onset/offset are activation thresholds and the
+# min_duration_* values are in seconds -- confirm against the pyannote docs.
+HYPER_PARAMETERS = {
+  "onset": 0.5, "offset": 0.5,
+  "min_duration_on": 0.0,
+  "min_duration_off": 0.0
+}
+vad_pipeline.instantiate(HYPER_PARAMETERS)
+def vad_segmentation(input_path, output_path, aggressiveness=2):
+    """
+    Run the module-level VAD pipeline on ``input_path`` and return its result.
+
+    ``output_path`` and ``aggressiveness`` are accepted but are not used by
+    this function.
+    """
+    result = vad_pipeline(input_path)
+    return result