import os
import subprocess
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
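# NOTE: pydub decodes MP3 by shelling out to ffmpeg (or libav), so the ffmpeg
# binary must be available on PATH for the MP3 branch below to work; it is a
# runtime dependency, not a pip package.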
# 🎨 Apply Custom Dark Mode CSS
st.markdown(
    """
    <style>
    .stApp {
        background-color: #121212;
        color: white;
    }
    .title {
        font-size: 32px;
        text-align: center;
        color: #4CAF50;
        font-weight: bold;
    }
    .subheader {
        font-size: 20px;
        font-weight: bold;
        color: #BB86FC;
    }
    .stButton>button {
        background-color: #BB86FC !important;
        color: black !important;
        font-size: 18px !important;
        padding: 10px 24px !important;
        border-radius: 10px !important;
        border: none !important;
    }
    .stAudio {
        width: 100% !important;
    }
    .stMarkdown {
        font-size: 16px;
        color: #E0E0E0;
    }
    .stTextInput>div>div>input {
        background-color: #1E1E1E !important;
        color: white !important;
        border-radius: 10px !important;
    }
    </style>
    """,
    unsafe_allow_html=True
)
# ✅ Auto-Download Vosk Model (Speech-to-Text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk Model...")
    # check=True makes a failed download/extract raise instead of silently
    # falling through to Model() on a missing directory.
    subprocess.run(["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"], check=True)
    subprocess.run(["unzip", "vosk.zip"], check=True)
    subprocess.run(["rm", "vosk.zip"], check=True)
# Load Vosk model
model = Model(VOSK_MODEL)
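# Streamlit re-runs this whole script on every interaction, so the model loads
# in this file repeat on each rerun. A minimal caching sketch, assuming
# Streamlit >= 1.18 (which provides st.cache_resource); shown as an optional
# drop-in, not wired into the direct load above:
@st.cache_resource
def load_vosk_cached(path: str) -> Model:
    # Cached across reruns; the same Model instance is reused.
    return Model(path)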
# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)
# Load emotion detection model
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
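# CAVEAT: facebook/wav2vec2-large-xlsr-53 is a self-supervised encoder without
# a fine-tuned classification head, so the pipeline attaches a randomly
# initialized head and the LABEL_n scores below are not meaningful emotion
# predictions. A checkpoint actually fine-tuned for emotion recognition (e.g.,
# one of the SUPERB emotion-recognition models on the Hub) would be needed for
# real labels.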
# ✅ Streamlit UI
st.markdown("<div class='title'>🎙️ Speech Detection System</div>", unsafe_allow_html=True)
st.markdown("<div class='subheader'>📂 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
if uploaded_file:
    # Save the upload under temp/, converting MP3 to WAV if needed
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path
    # Load audio (resampled to 16 kHz mono, which the models below expect)
    y, sr = librosa.load(file_path, sr=16000)

    # 🎵 Display waveform
    st.markdown("<div class='subheader'>🎼 Audio Waveform:</div>", unsafe_allow_html=True)
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax, color="cyan")
    ax.set_facecolor("#121212")         # Dark background for the plot area
    fig.patch.set_facecolor("#121212")  # ...and for the figure margin
    st.pyplot(fig)
    plt.close(fig)  # Free the figure so reruns don't accumulate memory
    # ✅ Noise Reduction
    st.markdown("<div class='subheader'>🔇 Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
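    # nr.reduce_noise defaults to non-stationary spectral gating. If the noise
    # floor is steady (fan hum, hiss), the stationary variant can work better;
    # the keyword arguments below exist in noisereduce 2.x, and the values are
    # illustrative, not tuned:
    # y_denoised = nr.reduce_noise(y=y, sr=sr, stationary=True, prop_decrease=0.9)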
    # ✅ Speech-to-Text using Vosk
    def transcribe_audio(audio_path):
        # Vosk expects mono 16-bit PCM; the recognizer is created at the
        # file's own sample rate.
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        pieces = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # Collect every finalized segment instead of returning on the
                # first one, which dropped the rest of the file.
                pieces.append(json.loads(rec.Result()).get("text", ""))
        # Flush whatever audio is still buffered in the recognizer
        pieces.append(json.loads(rec.FinalResult()).get("text", ""))
        wf.close()
        return " ".join(p for p in pieces if p)
    # Transcribe the denoised file: sf.write produced mono 16-bit PCM at 16 kHz,
    # which is what Vosk expects (the raw upload may be stereo or another rate).
    transcription = transcribe_audio(denoised_path)
    st.markdown("<div class='subheader'>📝 Transcribed Text:</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='stMarkdown'>{transcription}</div>", unsafe_allow_html=True)
    # ✅ Emotion Detection (Formatted Output)
    st.markdown("<div class='subheader'>🎭 Emotion Analysis:</div>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
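    # This LABEL_n -> name mapping is a placeholder: it only holds for a
    # checkpoint trained with exactly this label order (see the caveat where
    # emotion_model is created above).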
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(
        f"""
        <div style="font-size:24px; color:#4CAF50; font-weight:bold;">
            {emotion_name} ({emotion_score:.2%} confidence)
        </div>
        """,
        unsafe_allow_html=True
    )
    # ✅ Play Original & Denoised Audio
    st.markdown("<div class='subheader'>🔊 Play Audio:</div>", unsafe_allow_html=True)
    st.audio(file_path, format="audio/wav", start_time=0)
    st.markdown("<div class='subheader'>🔇 Denoised Audio:</div>", unsafe_allow_html=True)
    st.audio(denoised_path, format="audio/wav", start_time=0)
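    # Optional: let the user keep the cleaned file. st.download_button is part
    # of the stable Streamlit API; the label and file name here are arbitrary.
    with open(denoised_path, "rb") as f:
        st.download_button("Download denoised WAV", f, file_name="denoised.wav", mime="audio/wav")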