ojas121 committed on
Commit
9e5fc19
·
verified ·
1 Parent(s): 50a7bb0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: upload an audio file, then transcribe, denoise and classify it.

Pipeline for an uploaded MP3/WAV file:
  1. Save the upload under ``temp/`` (MP3 is converted to WAV with pydub).
  2. Load it with librosa as 16 kHz mono and plot the waveform.
  3. Run noisereduce and write the denoised signal next to the input.
  4. Transcribe a 16-bit PCM mono copy with Vosk.
  5. Run a transformers audio-classification pipeline on the WAV file.
  6. Offer the original and denoised audio for playback.
"""
import json
import os
import wave
from pathlib import Path

import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import soundfile as sf
import streamlit as st
from pydub import AudioSegment
from transformers import pipeline
from vosk import KaldiRecognizer, Model

MODEL_PATH = "vosk-model-small-en-us-0.15"
# NOTE(review): this id is kept from the original code; confirm it resolves to
# a checkpoint fine-tuned for audio classification (emotion recognition).
EMOTION_MODEL_ID = "superb/wav2vec2-large-xlsr-53"
TARGET_SAMPLE_RATE = 16000
FRAMES_PER_CHUNK = 4000
TEMP_DIR = Path("temp")


@st.cache_resource
def load_speech_model(model_path):
    """Load the Vosk model once and reuse it across Streamlit reruns."""
    return Model(model_path)


@st.cache_resource
def load_emotion_model(model_id):
    """Build the audio-classification pipeline once and reuse it across reruns."""
    return pipeline("audio-classification", model=model_id)


def transcribe_audio(audio_path):
    """Transcribe a WAV file with Vosk and return the full recognized text.

    Args:
        audio_path: Path to an uncompressed mono 16-bit PCM WAV file.

    Returns:
        All recognized utterances joined by single spaces; an empty string
        when nothing was recognized.

    Raises:
        ValueError: If the file is not mono 16-bit PCM.
        wave.Error: If the file cannot be parsed as a WAV file.
    """
    segments = []
    # The context manager guarantees the file handle is released.
    with wave.open(str(audio_path), "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise ValueError("Audio file must be WAV format mono 16-bit PCM.")

        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(FRAMES_PER_CHUNK)
            if not data:
                break
            # AcceptWaveform returns True at the end of each utterance; collect
            # every utterance instead of returning after the first one.
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))

        # Flush whatever is still buffered after the last chunk.
        segments.append(json.loads(rec.FinalResult()).get("text", ""))

    return " ".join(segment for segment in segments if segment)


# Load Vosk model
if not os.path.exists(MODEL_PATH):
    st.error("Vosk model not found! Please download and extract it.")
    st.stop()
model = load_speech_model(MODEL_PATH)

# Streamlit UI
st.title("🎙️ Speech Detection System using Mozilla Common Voice")
st.write("Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis.")

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Keep only the final path component: the name is supplied by the client.
    TEMP_DIR.mkdir(exist_ok=True)
    source_path = TEMP_DIR / Path(uploaded_file.name).name
    with open(source_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed (suffix check is case-insensitive).
    if source_path.suffix.lower() == ".mp3":
        wav_path = source_path.with_suffix(".wav")
        AudioSegment.from_mp3(str(source_path)).export(str(wav_path), format="wav")
    else:
        wav_path = source_path

    # Load audio as 16 kHz mono floats.
    y, sr = librosa.load(str(wav_path), sr=TARGET_SAMPLE_RATE)
    if y.size == 0:
        st.error("The uploaded file contains no audio samples.")
        st.stop()

    # Display waveform; close the figure so reruns do not accumulate figures.
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # Noise Reduction. with_name() builds a distinct output path even when the
    # input suffix is upper-case, so the input is never overwritten.
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = wav_path.with_name(f"{wav_path.stem}_denoised.wav")
    sf.write(str(denoised_path), y_denoised, sr)

    # Speech-to-Text using Vosk. The recognizer needs mono 16-bit PCM, which an
    # arbitrary upload does not guarantee, so transcribe a normalized copy.
    pcm_path = wav_path.with_name(f"{wav_path.stem}_16k_mono.wav")
    sf.write(str(pcm_path), y, sr, subtype="PCM_16")
    transcription = transcribe_audio(pcm_path)
    st.subheader("📝 Transcribed Text:")
    st.write(transcription or "No speech recognized.")

    # Emotion Detection
    emotion_model = load_emotion_model(EMOTION_MODEL_ID)
    emotion_result = emotion_model(str(wav_path))

    st.subheader("😊 Emotion Analysis:")
    st.write(emotion_result)

    # Play original and denoised audio
    st.audio(str(wav_path), format="audio/wav", start_time=0)
    st.subheader("🔊 Denoised Audio:")
    st.audio(str(denoised_path), format="audio/wav", start_time=0)