Upload 6 files
- app.py +44 -0
- requirements.txt +7 -0
- utils/noise_classification.py +15 -0
- utils/noise_removal.py +8 -0
- utils/speaker_diarization.py +6 -0
- utils/vad_segmentation.py +8 -0
app.py
ADDED
@@ -0,0 +1,44 @@
import streamlit as st
import librosa
import soundfile as sf
import tempfile
import os
from utils.noise_removal import remove_noise
from utils.vad_segmentation import detect_speech_segments
from utils.speaker_diarization import diarize_speakers
from utils.noise_classification import classify_noise

st.set_page_config(page_title="Audio Analyzer", layout="wide")
st.title("Audio Analysis Pipeline")

uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

if uploaded_file:
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name

    st.audio(tmp_path, format="audio/wav")

    st.subheader("1️⃣ Noise Removal")
    denoised_path = tmp_path.replace(".wav", "_denoised.wav")
    remove_noise(tmp_path, denoised_path)
    st.audio(denoised_path, format="audio/wav")

    st.subheader("2️⃣ Speech Segmentation")
    speech_segments = detect_speech_segments(denoised_path)
    st.write(f"Detected {len(speech_segments)} speech segments.")
    for i, (start, end) in enumerate(speech_segments[:5]):
        st.write(f"Segment {i+1}: {start:.2f}s to {end:.2f}s")

    st.subheader("3️⃣ Speaker Diarization")
    diarization = diarize_speakers(denoised_path)
    st.text("Speakers detected:")
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        st.write(f"{turn.start:.2f}s - {turn.end:.2f}s: {speaker}")

    st.subheader("4️⃣ Noise Classification")
    noise_predictions = classify_noise(denoised_path)
    st.write("Top predicted noise classes:")
    for cls, prob in noise_predictions:
        st.write(f"{cls}: {prob:.2f}")
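Once the requirements below are installed, the app starts with the standard Streamlit CLI:

streamlit run app.py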
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit
soundfile
librosa
speechbrain
pyannote.audio
torchaudio
scikit-learn
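A caveat not covered by the upload: the dependencies are unpinned, and newer releases moved the entry points this code uses. SpeechBrain 1.0 renamed speechbrain.pretrained to speechbrain.inference, and pyannote.audio 3.x expects the pyannote/speaker-diarization-3.1 checkpoint rather than the 2.x pyannote/speaker-diarization one. If model loading fails, pinning to the lines the code was written against is one option, e.g.:

speechbrain<1.0
pyannote.audio<3.0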
utils/noise_classification.py
ADDED
@@ -0,0 +1,15 @@
import numpy as np
import librosa
import joblib

# Load your trained model + label encoder
clf = joblib.load("models/noise_classifier.pkl")
label_encoder = joblib.load("models/label_encoder.pkl")

def classify_noise(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    feature = np.mean(mfcc.T, axis=0).reshape(1, -1)
    probs = clf.predict_proba(feature)[0]
    top_idx = np.argsort(probs)[::-1][:5]
    return [(label_encoder.inverse_transform([i])[0], probs[i]) for i in top_idx]
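The pickles under models/ are assumed to exist but are not part of this upload. A minimal sketch of how they could be produced, using the same 13-MFCC-mean feature the classifier expects; the data/<class>/*.wav dataset layout and the RandomForest choice are assumptions, not from the source:

import os
import glob
import joblib
import librosa
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def mfcc_mean(path):
    # same feature as classify_noise: mean of 13 MFCCs over time
    y, sr = librosa.load(path, sr=None)
    return np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0)

X, labels = [], []
for wav in glob.glob("data/*/*.wav"):  # hypothetical layout: one folder per class
    X.append(mfcc_mean(wav))
    labels.append(os.path.basename(os.path.dirname(wav)))  # folder name = class label

le = LabelEncoder()
y = le.fit_transform(labels)

clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(np.array(X), y)

os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/noise_classifier.pkl")
joblib.dump(le, "models/label_encoder.pkl")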
utils/noise_removal.py
ADDED
@@ -0,0 +1,8 @@
from speechbrain.pretrained import SpectralMaskEnhancement
import torchaudio

model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

def remove_noise(input_path, output_path):
    # enhance_file returns the enhanced waveform only, not a (waveform, rate)
    # tuple; MetricGAN+ (VoiceBank) operates at 16 kHz
    enhanced = model.enhance_file(input_path)
    if enhanced.dim() == 1:
        enhanced = enhanced.unsqueeze(0)
    torchaudio.save(output_path, enhanced.cpu(), 16000)
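MetricGAN+ was trained on 16 kHz VoiceBank data. SpeechBrain's loader normally resamples on load, but resampling uploads explicitly avoids surprises with unusual sample rates; a minimal sketch using standard torchaudio calls (the helper name is illustrative):

import torchaudio
import torchaudio.functional as F

def to_16k(input_path, output_path):
    # resample to the 16 kHz rate the enhancement model was trained on
    waveform, sr = torchaudio.load(input_path)
    if sr != 16000:
        waveform = F.resample(waveform, sr, 16000)
    torchaudio.save(output_path, waveform, 16000)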
utils/speaker_diarization.py
ADDED
@@ -0,0 +1,6 @@
from pyannote.audio import Pipeline

# pyannote/speaker-diarization is gated: accept its license on the Hub and
# authenticate (e.g. `huggingface-cli login`), or pass a token explicitly
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=True
)

def diarize_speakers(audio_path):
    return diarization_pipeline(audio_path)
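diarize_speakers returns a pyannote Annotation, which app.py iterates turn by turn. A small sketch of a helper (the name is illustrative, not from the source) that aggregates it into total speaking time per speaker:

from collections import defaultdict

def speaking_time(diarization):
    # sum turn durations per speaker label
    totals = defaultdict(float)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        totals[speaker] += turn.end - turn.start
    return dict(totals)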
utils/vad_segmentation.py
ADDED
@@ -0,0 +1,8 @@
from pyannote.audio import Pipeline

# pyannote/voice-activity-detection is also gated; authenticate as for diarization
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection", use_auth_token=True
)

def detect_speech_segments(audio_path):
    vad_result = pipeline(audio_path)
    # support() merges overlapping and adjacent regions into a clean timeline
    return [(segment.start, segment.end) for segment in vad_result.get_timeline().support()]
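Raw VAD output can be fragmented on noisy recordings. A hedged post-processing sketch (the 0.3 s max_gap default is an arbitrary choice, not from the source) that merges segments separated by short pauses:

def merge_segments(segments, max_gap=0.3):
    # join consecutive (start, end) pairs whose gap is under max_gap seconds
    merged = []
    for start, end in segments:
        if merged and start - merged[-1][1] <= max_gap:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged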