File size: 6,742 Bytes
a9f8ee6
 
9e5fc19
 
 
 
 
 
 
 
 
 
3867db1
9e5fc19
 
9f79df9
 
9e5fc19
d6a1faf
3867db1
 
 
 
bfc2d97
 
3867db1
 
 
 
bfc2d97
3867db1
 
 
 
 
bfc2d97
3867db1
 
bfc2d97
 
3867db1
 
 
 
 
 
 
 
 
 
bfc2d97
 
 
 
 
 
3867db1
 
 
 
 
656615f
a9f8ee6
 
 
3867db1
a9f8ee6
 
 
2ee5365
a9f8ee6
 
2ee5365
a9f8ee6
656615f
a9f8ee6
3867db1
a9f8ee6
2ee5365
a9f8ee6
 
9e5fc19
a9f8ee6
3867db1
 
9e5fc19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f79df9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e5fc19
a9f8ee6
3867db1
9e5fc19
 
 
 
9f79df9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9f8ee6
9e5fc19
 
 
 
 
 
 
 
 
 
 
 
 
3867db1
 
 
9e5fc19
9f79df9
3867db1
d6a1faf
9e5fc19
d6a1faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e5fc19
a9f8ee6
9f79df9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os
import subprocess
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
import plotly.graph_objects as go
import plotly.express as px

# Dark-theme stylesheet for the whole page. It restyles the app container,
# the custom `.title` / `.subheader` divs emitted further down, buttons,
# audio players, markdown text and text inputs.
_DARK_MODE_CSS = """
    <style>
        .stApp {
            background-color: #121212;
            color: white;
        }
        .title {
            font-size: 32px;
            text-align: center;
            color: #4CAF50;
            font-weight: bold;
        }
        .subheader {
            font-size: 20px;
            font-weight: bold;
            color: #BB86FC;
        }
        .stButton>button {
            background-color: #BB86FC !important;
            color: black !important;
            font-size: 18px !important;
            padding: 10px 24px !important;
            border-radius: 10px !important;
            border: none !important;
        }
        .stAudio {
            width: 100% !important;
        }
        .stMarkdown {
            font-size: 16px;
            color: #E0E0E0;
        }
        .stTextInput>div>div>input {
            background-color: #1E1E1E !important;
            color: white !important;
            border-radius: 10px !important;
        }
    </style>
    """

# unsafe_allow_html lets the <style> element through as raw HTML.
st.markdown(_DARK_MODE_CSS, unsafe_allow_html=True)

# Auto-download the Vosk model (speech-to-text) the first time the app runs.
# The archive is fetched and unpacked into the working directory with the
# external `wget` and `unzip` tools.
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("πŸ“₯ Downloading Vosk Model...")
    try:
        # check=True turns a failed download or extraction into an immediate
        # CalledProcessError instead of a confusing Model() failure below.
        subprocess.run(
            ["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"],
            check=True,
        )
        subprocess.run(["unzip", "vosk.zip"], check=True)
    finally:
        # Always drop the archive, including a partial one left by a failed
        # download, without shelling out to `rm`.
        if os.path.exists("vosk.zip"):
            os.remove("vosk.zip")

# Load Vosk model
# NOTE(review): this runs at module top level, so the model is presumably
# rebuilt on every Streamlit rerun - consider caching it.
model = Model(VOSK_MODEL)

# Auto-download the Wav2Vec2 checkpoint (emotion detection). The snapshot is
# stored in a local directory named after the repo id, which is also the path
# the existence check looks for.
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"πŸ“₯ Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load emotion detection model
# NOTE(review): confirm this checkpoint actually ships a fine-tuned
# audio-classification head whose labels match the emotion mapping used later.
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)

# Streamlit UI: page heading, description and the audio uploader.
st.markdown("<div class='title'>πŸŽ™οΈ Speech Detection System</div>", unsafe_allow_html=True)
st.markdown("<div class='subheader'>πŸ” Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Persist the upload under ./temp. Only the final path component of the
    # client-supplied name is used, so a name containing directory parts
    # cannot place the file outside temp/.
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{os.path.basename(uploaded_file.name)}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed. The extension is compared
    # case-insensitively (so "clip.MP3" is converted too) and only the real
    # extension is swapped, not every ".mp3" substring in the name.
    path_root, path_ext = os.path.splitext(file_path)
    if path_ext.lower() == ".mp3":
        wav_path = path_root + ".wav"
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio, resampled to 16 kHz
    y, sr = librosa.load(file_path, sr=16000)

    # Interactive waveform: amplitude of the loaded signal against time.
    st.markdown("<div class='subheader'>🎼 Interactive Audio Waveform:</div>", unsafe_allow_html=True)

    # One timestamp per sample, spanning 0 .. duration in seconds.
    sample_count = len(y)
    time_axis = np.linspace(0, sample_count / sr, num=sample_count)

    waveform_trace = go.Scatter(
        x=time_axis,
        y=y,
        mode="lines",
        line={"color": "cyan"},
        name="Waveform",
    )
    fig_waveform = go.Figure()
    fig_waveform.add_trace(waveform_trace)

    waveform_layout = {
        "title": "Audio Waveform",
        "xaxis_title": "Time (seconds)",
        "yaxis_title": "Amplitude",
        "template": "plotly_dark",
    }
    fig_waveform.update_layout(**waveform_layout)

    st.plotly_chart(fig_waveform)

    # Noise reduction: denoise the loaded signal and write it next to the
    # source file as "<name>_denoised.wav".
    st.markdown("<div class='subheader'>πŸ”‡ Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    # Build the output name from the split path rather than str.replace:
    # replace(".wav", ...) is a no-op for names without a lowercase ".wav"
    # (e.g. "clip.WAV"), which made denoised_path equal file_path and
    # overwrote the original audio.
    denoised_root, _ = os.path.splitext(file_path)
    denoised_path = f"{denoised_root}_denoised.wav"
    sf.write(denoised_path, y_denoised, sr)

    # Spectrogram: STFT magnitude in dB (relative to the peak), drawn as a
    # Plotly heatmap.
    st.markdown("<div class='subheader'>🎀 Spectrogram (Frequency Analysis):</div>", unsafe_allow_html=True)

    S = librosa.stft(y)
    stft_magnitude = np.abs(S)
    S_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

    spectrogram_labels = {
        "x": "Time (frames)",
        "y": "Frequency (bins)",
        "color": "Intensity (dB)",
    }
    fig_spectrogram = px.imshow(
        S_db,
        labels=spectrogram_labels,
        color_continuous_scale="plasma",
        origin="lower",
        aspect="auto",
    )
    fig_spectrogram.update_layout(template="plotly_dark", title="Spectrogram")

    st.plotly_chart(fig_spectrogram)

    # MFCC: compute 13 mel-frequency cepstral coefficients and draw them as a
    # Plotly heatmap.
    st.markdown("<div class='subheader'>🎡 MFCC Feature Extraction:</div>", unsafe_allow_html=True)

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    mfcc_labels = {
        "x": "Time (frames)",
        "y": "MFCC Coefficients",
        "color": "Magnitude",
    }
    fig_mfcc = px.imshow(
        mfccs,
        labels=mfcc_labels,
        color_continuous_scale="viridis",
        origin="lower",
        aspect="auto",
    )
    fig_mfcc.update_layout(
        template="plotly_dark",
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
    )

    st.plotly_chart(fig_mfcc)

    # Speech-to-Text using Vosk
    def transcribe_audio(audio_path):
        """Transcribe a whole WAV file with the module-level Vosk ``model``.

        The file is fed to a ``KaldiRecognizer`` in 4000-frame chunks. Every
        utterance the recognizer completes is collected, and the recognizer
        is flushed at the end so trailing audio is included as well.

        Args:
            audio_path: Path to a WAV file readable by the ``wave`` module.

        Returns:
            The recognized text of all utterances joined with single spaces,
            or an empty string when nothing was recognized.

        Raises:
            wave.Error: If ``audio_path`` is not a WAV file ``wave`` can read.
        """
        # NOTE(review): the audio is passed to Vosk as-is; presumably it has
        # to be mono 16-bit PCM - confirm, or convert before calling.
        segments = []
        # The context manager closes the file handle even if recognition fails.
        with wave.open(audio_path, "rb") as wf:
            rec = KaldiRecognizer(model, wf.getframerate())

            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                # AcceptWaveform returning true means an utterance is complete;
                # keep it and go on reading instead of returning early.
                if rec.AcceptWaveform(data):
                    segments.append(json.loads(rec.Result()).get("text", ""))

            # Flush whatever is still buffered after the last chunk.
            segments.append(json.loads(rec.FinalResult()).get("text", ""))

        return " ".join(segment for segment in segments if segment)

    # Run speech-to-text on the (possibly converted) upload and show the
    # result under its own heading.
    transcription = transcribe_audio(file_path)

    st.markdown("<div class='subheader'>πŸ“ Transcribed Text:</div>", unsafe_allow_html=True)
    transcription_html = f"<div class='stMarkdown'>{transcription}</div>"
    st.markdown(transcription_html, unsafe_allow_html=True)

    # Emotion detection: classify the audio file and report the
    # highest-scoring label with its confidence.
    st.markdown("<div class='subheader'>😊 Emotion Analysis:</div>", unsafe_allow_html=True)

    emotion_result = emotion_model(file_path)

    # Map the classifier's generic LABEL_<n> ids to display names.
    # NOTE(review): this mapping is hard-coded here; confirm it matches the
    # label ids the loaded checkpoint really emits.
    emotion_names = ("Neutral", "Happy", "Sad", "Angry", "Surprised")
    emotion_labels = {
        f"LABEL_{index}": name for index, name in enumerate(emotion_names)
    }

    def _prediction_score(prediction):
        return prediction["score"]

    top_emotion = max(emotion_result, key=_prediction_score)
    emotion_score = top_emotion["score"]
    # Labels missing from the mapping are shown as "Unknown".
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")

    emotion_html = f"""
        <div style="font-size:24px; color:#4CAF50; font-weight:bold;">
            {emotion_name} ({emotion_score:.2%} confidence)
        </div>
        """
    st.markdown(emotion_html, unsafe_allow_html=True)

    # Play the original recording first, then the denoised version.
    for playback_path in (file_path, denoised_path):
        st.audio(playback_path, format="audio/wav")