# Speech Detection System — Streamlit app.
# Speech-to-text (Vosk), noise reduction (noisereduce), spectrogram/MFCC
# visualization (librosa + Plotly), and emotion analysis (transformers).
# (Web-scraper metadata that preceded this header has been removed.)
import json
import os
import subprocess
import urllib.request
import wave
import zipfile

import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import soundfile as sf
import streamlit as st
from huggingface_hub import snapshot_download
from pydub import AudioSegment
from transformers import pipeline
from vosk import KaldiRecognizer, Model
# Apply custom dark-mode CSS to the whole Streamlit app.
# Injected via st.markdown with unsafe_allow_html=True; the selectors
# target Streamlit's generated widget classes (.stApp, .stButton, ...).
st.markdown(
"""
<style>
.stApp {
background-color: #121212;
color: white;
}
.title {
font-size: 32px;
text-align: center;
color: #4CAF50;
font-weight: bold;
}
.subheader {
font-size: 20px;
font-weight: bold;
color: #BB86FC;
}
.stButton>button {
background-color: #BB86FC !important;
color: black !important;
font-size: 18px !important;
padding: 10px 24px !important;
border-radius: 10px !important;
border: none !important;
}
.stAudio {
width: 100% !important;
}
.stMarkdown {
font-size: 16px;
color: #E0E0E0;
}
.stTextInput>div>div>input {
background-color: #1E1E1E !important;
color: white !important;
border-radius: 10px !important;
}
</style>
""",
unsafe_allow_html=True
)
# Auto-download the Vosk speech-to-text model on first run.
# The original shelled out to wget/unzip/rm without checking exit codes,
# which silently fails on hosts lacking those tools; use the stdlib
# (urllib + zipfile) instead so the download works everywhere.
VOSK_MODEL = "vosk-model-small-en-us-0.15"
VOSK_MODEL_URL = f"https://alphacephei.com/vosk/models/{VOSK_MODEL}.zip"

if not os.path.exists(VOSK_MODEL):
    st.write("Downloading Vosk Model...")
    archive_path = f"{VOSK_MODEL}.zip"
    try:
        urllib.request.urlretrieve(VOSK_MODEL_URL, archive_path)
        # The archive contains a top-level VOSK_MODEL directory, so
        # extracting into the CWD yields the path checked above.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall()
    finally:
        # Always remove the (possibly partial) archive.
        if os.path.exists(archive_path):
            os.remove(archive_path)

# Load the Vosk model once at startup; reused by the recognizer below.
model = Model(VOSK_MODEL)
# Auto-download the Wav2Vec2 model used for emotion detection.
# NOTE(review): "facebook/wav2vec2-large-xlsr-53" is a pretrained
# acoustic model without a fine-tuned emotion-classification head —
# confirm the audio-classification pipeline yields meaningful labels,
# or swap in a checkpoint fine-tuned for speech emotion recognition.
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"

if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"π₯ Downloading {WAV2VEC_MODEL}...")
    # Snapshot is stored under a local directory named after the repo id,
    # so the exists() check above skips re-downloading on later runs.
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load the emotion-detection pipeline from the local snapshot.
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
# Streamlit UI: page title, subtitle, and the audio-file uploader.
# (Runtime strings are kept exactly as in the original, including their
# mis-encoded emoji — fixing them would change the rendered text.)
st.markdown("<div class='title'>ποΈ Speech Detection System</div>", unsafe_allow_html=True)
st.markdown("<div class='subheader'>π Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)

# Returns an UploadedFile (or None until the user picks a file).
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
# ------------------------------------------------------------------
# Main analysis pipeline — runs once an audio file has been uploaded.
# (Indentation reconstructed: the scraped source had lost all of it.)
# ------------------------------------------------------------------
if uploaded_file:
    # Persist the upload under temp/; convert MP3 -> WAV because the
    # downstream readers (wave/Vosk, soundfile) expect WAV.
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load the audio resampled to 16 kHz (librosa returns mono float32).
    y, sr = librosa.load(file_path, sr=16000)

    # --- Interactive waveform plot (Plotly) ---
    st.markdown("<div class='subheader'>πΌ Interactive Audio Waveform:</div>", unsafe_allow_html=True)
    time_axis = np.linspace(0, len(y) / sr, num=len(y))
    fig_waveform = go.Figure()
    fig_waveform.add_trace(go.Scatter(
        x=time_axis,
        y=y,
        mode='lines',
        line=dict(color='cyan'),
        name="Waveform"
    ))
    fig_waveform.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (seconds)",
        yaxis_title="Amplitude",
        template="plotly_dark"
    )
    st.plotly_chart(fig_waveform)

    # --- Noise reduction (spectral gating via noisereduce) ---
    st.markdown("<div class='subheader'>π Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)

    # --- Spectrogram (STFT magnitude in dB) ---
    st.markdown("<div class='subheader'>π€ Spectrogram (Frequency Analysis):</div>", unsafe_allow_html=True)
    S = librosa.stft(y)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    fig_spectrogram = px.imshow(
        S_db,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "Frequency (bins)", "color": "Intensity (dB)"},
        color_continuous_scale="plasma"
    )
    fig_spectrogram.update_layout(
        title="Spectrogram",
        template="plotly_dark"
    )
    st.plotly_chart(fig_spectrogram)

    # --- MFCC feature heatmap ---
    st.markdown("<div class='subheader'>π΅ MFCC Feature Extraction:</div>", unsafe_allow_html=True)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    fig_mfcc = px.imshow(
        mfccs,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "MFCC Coefficients", "color": "Magnitude"},
        color_continuous_scale="viridis"
    )
    fig_mfcc.update_layout(
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
        template="plotly_dark"
    )
    st.plotly_chart(fig_mfcc)

    # --- Speech-to-text (Vosk) ---
    def transcribe_audio(audio_path):
        """Transcribe a WAV file with Vosk and return the full text.

        Bug fix: the original returned from inside the read loop on the
        FIRST finalized segment, discarding the rest of the audio, never
        called FinalResult(), left the wave handle open, and returned
        None for clips too short to finalize a segment. This version
        collects every segment plus the final one and joins them.

        NOTE(review): Vosk expects 16-bit mono PCM; the uploaded WAV is
        passed through unconverted — confirm inputs match that format.
        """
        pieces = []
        with wave.open(audio_path, "rb") as wf:
            rec = KaldiRecognizer(model, wf.getframerate())
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    pieces.append(json.loads(rec.Result()).get("text", ""))
            # Flush whatever audio remains buffered in the recognizer.
            pieces.append(json.loads(rec.FinalResult()).get("text", ""))
        return " ".join(p for p in pieces if p)

    transcription = transcribe_audio(file_path)
    st.markdown("<div class='subheader'>π Transcribed Text:</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='stMarkdown'>{transcription}</div>", unsafe_allow_html=True)

    # --- Emotion detection ---
    st.markdown("<div class='subheader'>π Emotion Analysis:</div>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    # Map raw classifier labels to human-readable emotion names.
    # NOTE(review): this mapping assumes a 5-class head — verify it
    # matches the labels the loaded checkpoint actually emits.
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(
        f"""
        <div style="font-size:24px; color:#4CAF50; font-weight:bold;">
        {emotion_name} ({emotion_score:.2%} confidence)
        </div>
        """,
        unsafe_allow_html=True
    )

    # --- Playback: original and denoised audio ---
    st.audio(file_path, format="audio/wav")
    st.audio(denoised_path, format="audio/wav")