Audio-Emotion-Recognition

Running

App Files Files Community

omsandeeppatil committed on Jan 16

Commit

787c2bc

verified ·

1 Parent(s): 0a54d22

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -57

app.py CHANGED Viewed

@@ -1,13 +1,18 @@
 import gradio as gr
 import torch
 import torchaudio
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
 # Initialize device and model
 device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU whenever torch can see one
 model_name = "Hatman/audio-emotion-detection"  # checkpoint id handed to both from_pretrained calls below
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)  # NOTE(review): never moved to `device` in this version, although inference() moves its inputs there
 # Define emotion labels
 EMOTION_LABELS = {
@@ -20,69 +25,121 @@ EMOTION_LABELS = {
     6: "surprise"
 }
-def preprocess_audio(audio):
-    """Preprocess audio file for model input"""
-    waveform, sampling_rate = torchaudio.load(audio)  # decode the file at `audio` into a tensor plus its native sample rate
-    resampled_waveform = torchaudio.transforms.Resample(
-        orig_freq=sampling_rate,
-        new_freq=16000
-    )(waveform)  # build the resampler and apply it immediately: native rate -> 16 kHz
-    return {
-        'speech': resampled_waveform.numpy().flatten(),  # NOTE(review): flatten() lays multiple channels end-to-end rather than mixing them down
-        'sampling_rate': 16000
-    }
-def inference(audio):
-    """Full inference function returning emotion, logits, and predicted IDs"""
-    example = preprocess_audio(audio)  # dict with 'speech' (flattened samples) and 'sampling_rate'
-    inputs = feature_extractor(
-        example['speech'],
-        sampling_rate=16000,
-        return_tensors="pt",
-        padding=True
-    )
-    # Move inputs to appropriate device
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    with torch.no_grad():  # inference only, so skip gradient tracking
-        outputs = model(**inputs)
-        logits = outputs.logits
-        predicted_ids = torch.argmax(logits, dim=-1)  # index of the highest-scoring class
-    predicted_emotion = EMOTION_LABELS[predicted_ids.item()]  # .item() only works when there is exactly one prediction
-    return predicted_emotion, logits.tolist(), predicted_ids.tolist()  # logits are returned raw; no softmax is applied
-def inference_label(audio):
-    """Simplified inference function returning only the emotion label"""
-    emotion, _, _ = inference(audio)  # discard the logits and the class ids
-    return emotion
-# Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Emotion Detection")
-    with gr.Tab("Quick Analysis"):
-        gr.Interface(
-            fn=inference_label,  # returns only the label string
-            inputs=gr.Audio(type="filepath"),  # handler receives a file path, which preprocess_audio loads
-            outputs=gr.Label(label="Detected Emotion"),
-            title="Audio Emotion Analysis",
-            description="Upload or record audio to detect the emotional content."
-        )
-    with gr.Tab("Detailed Analysis"):
-        gr.Interface(
-            fn=inference,  # returns (label, logits list, class-id list), matched positionally to `outputs`
-            inputs=gr.Audio(type="filepath"),
-            outputs=[
-                gr.Label(label="Detected Emotion"),
-                gr.JSON(label="Confidence Scores"),  # receives inference()'s raw logits, not probabilities
-                gr.JSON(label="Internal IDs")  # receives the predicted class-id list
-            ],
-            title="Audio Emotion Analysis (Detailed)",
-            description="Get detailed analysis including confidence scores for each emotion."
-        )
-# Launch the app
-demo.launch(share=True)

 import gradio as gr
 import torch
 import torchaudio
+import numpy as np
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
+from queue import Queue
+from threading import Thread
+import time
 # Initialize device and model
 device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU whenever torch can see one
 model_name = "Hatman/audio-emotion-detection"  # checkpoint id handed to both from_pretrained calls below
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
+model.to(device)  # keep the weights on the same device that process_chunk moves its inputs to
 # Define emotion labels
 EMOTION_LABELS = {
     6: "surprise"
 }
+CHUNK_DURATION = 3  # seconds
+SAMPLE_RATE = 16000
+CHUNK_SIZE = SAMPLE_RATE * CHUNK_DURATION
+class AudioProcessor:
+    def __init__(self):
+        self.audio_queue = Queue()
+        self.results_queue = Queue()
+        self.is_running = False
+        self.current_emotions = []
+    def process_chunk(self, audio_chunk):
+        """Process a single chunk of audio"""
+        # Ensure the chunk is the right length
+        if len(audio_chunk) < CHUNK_SIZE:
+            # Pad with zeros if too short
+            audio_chunk = np.pad(audio_chunk, (0, CHUNK_SIZE - len(audio_chunk)))
+        elif len(audio_chunk) > CHUNK_SIZE:
+            # Take only the first CHUNK_SIZE samples
+            audio_chunk = audio_chunk[:CHUNK_SIZE]
+        # Prepare input for the model
+        inputs = feature_extractor(
+            audio_chunk,
+            sampling_rate=SAMPLE_RATE,
+            return_tensors="pt",
+            padding=True
+        )
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Get prediction
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+            predicted_id = torch.argmax(logits, dim=-1).item()
+        return EMOTION_LABELS[predicted_id]
+    def process_audio_stream(self):
+        """Continuously process audio chunks from the queue"""
+        while self.is_running:
+            if not self.audio_queue.empty():
+                audio_chunk = self.audio_queue.get()
+                emotion = self.process_chunk(audio_chunk)
+                self.current_emotions.append(emotion)
+                # Keep only the last 5 emotions
+                self.current_emotions = self.current_emotions[-5:]
+                self.results_queue.put(self.current_emotions.copy())
+            time.sleep(0.1)
+    def start(self):
+        """Start the processing thread"""
+        self.is_running = True
+        self.process_thread = Thread(target=self.process_audio_stream)
+        self.process_thread.start()
+    def stop(self):
+        """Stop the processing thread"""
+        self.is_running = False
+        if hasattr(self, 'process_thread'):
+            self.process_thread.join()
+audio_processor = AudioProcessor()
+def process_audio(audio, state):
+    """Process incoming audio stream"""
+    if state is None or not state.get('is_running', False):
+        audio_processor.start()
+        state = {'is_running': True}
+    # Convert audio to numpy array if it's not already
+    if isinstance(audio, tuple):
+        audio = audio[1]  # Get the actual audio data
+    audio = np.array(audio)
+    # Add to processing queue
+    audio_processor.audio_queue.put(audio)
+    # Get latest results
+    if not audio_processor.results_queue.empty():
+        emotions = audio_processor.results_queue.get()
+        return gr.update(value=", ".join(emotions)), state
+    return gr.update(), state
+def cleanup(state):
+    """Cleanup when the interface is closed"""
+    if state and state.get('is_running', False):
+        audio_processor.stop()
+        state['is_running'] = False
+    return state
 with gr.Blocks() as demo:
+    gr.Markdown("# Real-time Audio Emotion Detection")
+    gr.Markdown("Speak into your microphone. Emotions are detected in 3-second chunks.")
+    state = gr.State(None)
+    output = gr.Textbox(label="Detected Emotions (Last 5 chunks)", lines=2)
+    audio_input = gr.Audio(
+        source="microphone",
+        type="numpy",
+        streaming=True,
+        label="Microphone Input",
+        show_label=True
+    )
+    audio_input.stream(
+        process_audio,
+        inputs=[audio_input, state],
+        outputs=[output, state],
+        show_progress=False
+    )
+    demo.load(lambda: None, None, state)
+    demo.close(cleanup, state, state)
+demo.queue().launch(share=True)