Spaces:

ojas121
/

speech_emotion_project

Running

App Files Files Community

ojas121 committed on Mar 11

Commit

9e5fc19

verified ·

1 Parent(s): 50a7bb0

Create app.py

Browse files

Files changed (1) hide show

app.py +81 -0

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import streamlit as st
+import librosa
+import librosa.display
+import numpy as np
+import matplotlib.pyplot as plt
+import soundfile as sf
+import wave
+import json
+from vosk import Model, KaldiRecognizer
+from transformers import pipeline
+import os
+from pydub import AudioSegment
+import noisereduce as nr
+# Load Vosk model
+MODEL_PATH = "vosk-model-small-en-us-0.15"
+if not os.path.exists(MODEL_PATH):
+    st.error("Vosk model not found! Please download and extract it.")
+    st.stop()
+model = Model(MODEL_PATH)
+# Streamlit UI
+st.title("🎙️ Speech Detection System using Mozilla Common Voice")
+st.write("Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis.")
+uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
+if uploaded_file:
+    # Convert MP3 to WAV if needed
+    file_path = f"temp/{uploaded_file.name}"
+    os.makedirs("temp", exist_ok=True)
+    with open(file_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    if file_path.endswith(".mp3"):
+        wav_path = file_path.replace(".mp3", ".wav")
+        audio = AudioSegment.from_mp3(file_path)
+        audio.export(wav_path, format="wav")
+        file_path = wav_path
+    # Load audio
+    y, sr = librosa.load(file_path, sr=16000)
+    # Display waveform
+    fig, ax = plt.subplots(figsize=(10, 4))
+    librosa.display.waveshow(y, sr=sr, ax=ax)
+    st.pyplot(fig)
+    # Noise Reduction
+    y_denoised = nr.reduce_noise(y=y, sr=sr)
+    denoised_path = file_path.replace(".wav", "_denoised.wav")
+    sf.write(denoised_path, y_denoised, sr)
+    # Speech-to-Text using Vosk
+    def transcribe_audio(audio_path):
+        wf = wave.open(audio_path, "rb")
+        rec = KaldiRecognizer(model, wf.getframerate())
+        while True:
+            data = wf.readframes(4000)
+            if len(data) == 0:
+                break
+            if rec.AcceptWaveform(data):
+                result = json.loads(rec.Result())
+                return result["text"]
+    transcription = transcribe_audio(file_path)
+    st.subheader("📝 Transcribed Text:")
+    st.write(transcription)
+    # Emotion Detection
+    emotion_model = pipeline("audio-classification", model="superb/wav2vec2-large-xlsr-53")
+    emotion_result = emotion_model(file_path)
+    st.subheader("😊 Emotion Analysis:")
+    st.write(emotion_result)
+    # Play original and denoised audio
+    st.audio(file_path, format="audio/wav", start_time=0)
+    st.subheader("🔊 Denoised Audio:")
+    st.audio(denoised_path, format="audio/wav", start_time=0)