Audio-Emotion-Recognition

Running

App Files Files Community

omsandeeppatil committed on Jan 16

Commit

1de8eea

verified ·

1 Parent(s): eb64d62

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -17

app.py CHANGED Viewed

@@ -26,18 +26,26 @@ def process_audio(audio):
     if audio is None:
         return ""
-    # Get the audio data
-    if isinstance(audio, tuple):
-        audio = audio[1]
-    # Convert to numpy array if needed
-    audio = np.array(audio)
-    # Ensure we have mono audio
-    if len(audio.shape) > 1:
-        audio = audio.mean(axis=1)
     try:
         # Prepare input for the model
         inputs = feature_extractor(
             audio,
@@ -46,8 +54,8 @@ def process_audio(audio):
             padding=True
         )
-        # Move to appropriate device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         # Get prediction
         with torch.no_grad():
@@ -55,12 +63,16 @@ def process_audio(audio):
             logits = outputs.logits
             predicted_id = torch.argmax(logits, dim=-1).item()
         emotion = EMOTION_LABELS[predicted_id]
-        return emotion
     except Exception as e:
-        print(f"Error processing audio: {e}")
-        return "Error processing audio"
 # Create Gradio interface
 demo = gr.Interface(
@@ -82,4 +94,5 @@ demo = gr.Interface(
 )
 # Launch with a small queue for better real-time performance
-demo.queue(max_size=1).launch(share=True)

     if audio is None:
         return ""
     try:
+        # Get the audio data
+        if isinstance(audio, tuple):
+            audio = audio[1]
+        # Convert to numpy array and ensure float32 type
+        audio = np.array(audio, dtype=np.float32)
+        # Ensure we have mono audio
+        if len(audio.shape) > 1:
+            audio = audio.mean(axis=1)
+        # Normalize audio if needed
+        if audio.max() > 1.0 or audio.min() < -1.0:
+            audio = audio / max(abs(audio.max()), abs(audio.min()))
+        # Ensure we have non-zero audio
+        if len(audio) == 0 or np.all(audio == 0):
+            return "No audio detected"
         # Prepare input for the model
         inputs = feature_extractor(
             audio,
             padding=True
         )
+        # Ensure all tensors are float32
+        inputs = {k: v.to(device, dtype=torch.float32) for k, v in inputs.items()}
         # Get prediction
         with torch.no_grad():
             logits = outputs.logits
             predicted_id = torch.argmax(logits, dim=-1).item()
+            # Get probabilities
+            probs = torch.nn.functional.softmax(logits, dim=-1)
+            confidence = probs[0][predicted_id].item() * 100
         emotion = EMOTION_LABELS[predicted_id]
+        return f"{emotion} (confidence: {confidence:.1f}%)"
     except Exception as e:
+        print(f"Error in audio processing: {str(e)}")
+        return "Error processing audio. Please try again."
 # Create Gradio interface
 demo = gr.Interface(
 )
 # Launch with a small queue for better real-time performance
+if __name__ == "__main__":
+    demo.queue(max_size=1).launch(share=True)