Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import numpy as np
 import torch
 from transformers import pipeline
 import librosa
+import soundfile as sf
 
 class EmotionRecognizer:
     def __init__(self):
@@ -11,100 +12,113 @@ class EmotionRecognizer:
             model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
             device=0 if torch.cuda.is_available() else -1
         )
-        self.
+        self.target_sr = 16000  # Target sample rate for the model
+        self.max_duration = 10  # Max audio duration in seconds
 
-    def process_audio(self,
+    def process_audio(self, audio_path):
         try:
-            #
-
-
-            # Convert stereo to mono if necessary
-            if len(audio_data.shape) > 1:
-                audio_data = np.mean(audio_data, axis=1)
-
-            # Convert to float32 and normalize
-            audio_data = audio_data.astype(np.float32)
-            if np.max(np.abs(audio_data)) > 1.0:
-                audio_data = audio_data / np.max(np.abs(audio_data))
+            # Load audio file using soundfile (works better in Hugging Face Spaces)
+            audio, orig_sr = sf.read(audio_path)
 
+            # Convert stereo to mono if needed
+            if len(audio.shape) > 1:
+                audio = np.mean(audio, axis=1)
+
             # Resample if necessary
-            if
-
-                    y=
-                    orig_sr=
-                    target_sr=self.
+            if orig_sr != self.target_sr:
+                audio = librosa.resample(
+                    y=audio.astype(np.float32),
+                    orig_sr=orig_sr,
+                    target_sr=self.target_sr
                 )
+            else:
+                audio = audio.astype(np.float32)
+
+            # Normalize audio
+            audio = librosa.util.normalize(audio)
+
+            # Trim/pad audio to max duration
+            max_samples = self.max_duration * self.target_sr
+            if len(audio) > max_samples:
+                audio = audio[:max_samples]
+            else:
+                audio = np.pad(audio, (0, max(0, max_samples - len(audio))))
 
-            #
-
-
-
-            elif len(audio_data) > 10 * self.sample_rate:
-                # Take first 10 seconds if audio is too long
-                audio_data = audio_data[:10 * self.sample_rate]
+            # Run classification
+            results = self.classifier(
+                {"array": audio, "sampling_rate": self.target_sr}
+            )
 
-            #
-
+            # Format output
+            labels = [res["label"] for res in results]
+            scores = [res["score"] * 100 for res in results]
 
-
-
-
-                for pred in result
+            text_output = "\n".join([
+                f"{label}: {score:.2f}%"
+                for label, score in zip(labels, scores)
             ])
 
-            # Prepare plot data
             plot_data = {
-                "labels":
-                "values":
+                "labels": labels,
+                "values": scores
             }
 
-            return
+            return text_output, plot_data
 
         except Exception as e:
-
-
+            error_msg = f"Error processing audio: {str(e)}"
+            print(error_msg)
+            return error_msg, None
 
 def create_interface():
     recognizer = EmotionRecognizer()
 
-
-
-
-        return recognizer.process_audio(audio)
-
-    with gr.Blocks() as interface:
-        gr.Markdown("# Audio Emotion Recognition")
-        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")
+    with gr.Blocks(title="Audio Emotion Recognition") as interface:
+        gr.Markdown("# 🎙️ Audio Emotion Recognition")
+        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")
 
         with gr.Row():
             with gr.Column():
                 audio_input = gr.Audio(
-                    label="Upload or Record Audio",
-                    type="numpy",
                     sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Input Audio",
+                    waveform_options={"waveform_progress_color": "#FF0066"}
                 )
-
-
-
+                submit_btn = gr.Button("Analyze", variant="primary")
+
             with gr.Column():
-
-                    label="Results",
-
+                text_output = gr.Textbox(
+                    label="Emotion Analysis Results",
+                    interactive=False
                 )
-
-
-
-
+                plot_output = gr.BarPlot(
+                    label="Confidence Scores",
+                    x="labels",
+                    y="values",
+                    color="labels",
+                    height=300
                 )
 
-
-            fn=
-            inputs=
-            outputs=[
+        submit_btn.click(
+            fn=recognizer.process_audio,
+            inputs=audio_input,
+            outputs=[text_output, plot_output]
+        )
+
+        gr.Examples(
+            examples=[
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
+                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav"
+            ],
+            inputs=audio_input,
+            outputs=[text_output, plot_output],
+            fn=recognizer.process_audio,
+            cache_examples=True
        )
 
     return interface
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch(
+    demo.launch()
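A quick way to sanity-check the new file-path based process_audio() outside the Gradio UI is a local smoke test. The sketch below is not part of the commit: it assumes the Space's dependencies (transformers, torch, librosa, soundfile, gradio, plus pandas for inspecting the plot data) are installed, that it is run from the repo root so app.py is importable, and that "sample.wav" is a short speech clip you supply yourself (the filename is hypothetical).

# Local smoke test for the updated EmotionRecognizer.process_audio().
# Assumptions (not in the commit): run from the repo root, the model is
# downloaded on first use, and "sample.wav" is a short speech clip you provide.
import pandas as pd

from app import EmotionRecognizer

recognizer = EmotionRecognizer()
text_output, plot_data = recognizer.process_audio("sample.wav")

print(text_output)  # one "label: score%" line per emotion returned by the pipeline

# plot_data is a dict of lists: {"labels": [...], "values": [...]}.
# Depending on the installed Gradio version, gr.BarPlot may expect tabular
# (DataFrame-like) data rather than a plain dict, so this conversion is a
# useful check of what the bar plot would actually receive.
if plot_data is not None:
    print(pd.DataFrame(plot_data))

If the bar plot in the Space renders empty while the textbox works, returning pd.DataFrame(plot_data) from process_audio instead of the raw dict is worth trying; likewise, since the update adds "import soundfile as sf", the soundfile package presumably needs to be listed in the Space's requirements.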