omsandeeppatil committed
Commit eb64d62 · verified · 1 Parent(s): 26f158e

Update app.py

Files changed (1):
  1. app.py +46 -105
app.py CHANGED
@@ -1,13 +1,9 @@
 import gradio as gr
 import torch
-import torchaudio
 import numpy as np
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
-from queue import Queue
-from threading import Thread
-import time

-# Initialize device and model
+# Initialize model and processor
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Hatman/audio-emotion-detection"
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
@@ -25,34 +21,32 @@ EMOTION_LABELS = {
     6: "surprise"
 }

-CHUNK_DURATION = 3 # seconds
-SAMPLE_RATE = 16000
-CHUNK_SIZE = SAMPLE_RATE * CHUNK_DURATION
-
-class AudioProcessor:
-    def __init__(self):
-        self.audio_queue = Queue()
-        self.results_queue = Queue()
-        self.is_running = False
-        self.current_emotions = []
+def process_audio(audio):
+    """Process audio chunk and return emotion"""
+    if audio is None:
+        return ""

-    def process_chunk(self, audio_chunk):
-        """Process a single chunk of audio"""
-        # Ensure the chunk is the right length
-        if len(audio_chunk) < CHUNK_SIZE:
-            # Pad with zeros if too short
-            audio_chunk = np.pad(audio_chunk, (0, CHUNK_SIZE - len(audio_chunk)))
-        elif len(audio_chunk) > CHUNK_SIZE:
-            # Take only the first CHUNK_SIZE samples
-            audio_chunk = audio_chunk[:CHUNK_SIZE]
-
+    # Get the audio data
+    if isinstance(audio, tuple):
+        audio = audio[1]
+
+    # Convert to numpy array if needed
+    audio = np.array(audio)
+
+    # Ensure we have mono audio
+    if len(audio.shape) > 1:
+        audio = audio.mean(axis=1)
+
+    try:
         # Prepare input for the model
         inputs = feature_extractor(
-            audio_chunk,
-            sampling_rate=SAMPLE_RATE,
+            audio,
+            sampling_rate=16000,
             return_tensors="pt",
             padding=True
         )
+
+        # Move to appropriate device
         inputs = {k: v.to(device) for k, v in inputs.items()}

         # Get prediction
@@ -61,84 +55,31 @@ class AudioProcessor:
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()

-        return EMOTION_LABELS[predicted_id]
-
-    def process_audio_stream(self):
-        """Continuously process audio chunks from the queue"""
-        while self.is_running:
-            if not self.audio_queue.empty():
-                audio_chunk = self.audio_queue.get()
-                emotion = self.process_chunk(audio_chunk)
-                self.current_emotions.append(emotion)
-                # Keep only the last 5 emotions
-                self.current_emotions = self.current_emotions[-5:]
-                self.results_queue.put(self.current_emotions.copy())
-            time.sleep(0.1)
-
-    def start(self):
-        """Start the processing thread"""
-        self.is_running = True
-        self.process_thread = Thread(target=self.process_audio_stream)
-        self.process_thread.start()
-
-    def stop(self):
-        """Stop the processing thread"""
-        self.is_running = False
-        if hasattr(self, 'process_thread'):
-            self.process_thread.join()
-
-audio_processor = AudioProcessor()
-
-def process_audio(audio, state):
-    """Process incoming audio stream"""
-    if state is None or not state.get('is_running', False):
-        audio_processor.start()
-        state = {'is_running': True}
-
-    # Convert audio to numpy array if it's not already
-    if isinstance(audio, tuple):
-        audio = audio[1] # Get the actual audio data
-    audio = np.array(audio)
-
-    # Add to processing queue
-    audio_processor.audio_queue.put(audio)
-
-    # Get latest results
-    if not audio_processor.results_queue.empty():
-        emotions = audio_processor.results_queue.get()
-        return gr.update(value=", ".join(emotions)), state
+        emotion = EMOTION_LABELS[predicted_id]
+        return emotion

-    return gr.update(), state
+    except Exception as e:
+        print(f"Error processing audio: {e}")
+        return "Error processing audio"

-# Define event handler for cleanup
-def on_close():
-    audio_processor.stop()
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Real-time Audio Emotion Detection")
-    gr.Markdown("Speak into your microphone. Emotions are detected in 3-second chunks.")
-
-    state = gr.State({'is_running': False})
-    output = gr.Textbox(label="Detected Emotions (Last 5 chunks)", lines=2)
-
-    audio_input = gr.Audio(
-        sources=["microphone"],
-        type="numpy",
-        streaming=True,
-        label="Microphone Input",
-        show_label=True
-    )
-
-    audio_input.stream(
-        process_audio,
-        inputs=[audio_input, state],
-        outputs=[output, state],
-        show_progress=False
-    )
-
-# Launch with cleanup handling
-demo.queue(max_size=10).launch(share=True, prevent_thread_lock=True)
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(
+            sources=["microphone"],
+            type="numpy",
+            streaming=True,
+            label="Speak into your microphone",
+            show_label=True
+        )
+    ],
+    outputs=gr.Textbox(label="Detected Emotion"),
+    title="Live Emotion Detection",
+    description="Speak into your microphone to detect emotions in real-time.",
+    live=True,
+    allow_flagging=False
+)

-# Register cleanup
-import atexit
-atexit.register(on_close)
+# Launch with a small queue for better real-time performance
+demo.queue(max_size=1).launch(share=True)
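As a quick local sanity check of the updated process_audio outside the Gradio UI, the function can be called directly with the (sample_rate, samples) tuple that Gradio's numpy microphone component delivers. A minimal sketch, assuming the new file is saved as app.py, the model weights download successfully, and using a purely illustrative synthetic tone:

# Hypothetical smoke test, not part of the commit above
import numpy as np
from app import process_audio  # assumes the updated file is saved as app.py

sr = 16000  # sampling rate expected by the feature extractor in app.py
t = np.linspace(0, 1.0, sr, endpoint=False)
samples = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s of a 440 Hz tone

print(process_audio((sr, samples)))  # expected: one of the EMOTION_LABELS strings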