omsandeeppatil committed
Commit eb64d62 · verified · 1 Parent(s): 26f158e

Update app.py

Files changed (1):
  1. app.py +46 -105
app.py CHANGED
@@ -1,13 +1,9 @@
 import gradio as gr
 import torch
-import torchaudio
 import numpy as np
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
-from queue import Queue
-from threading import Thread
-import time

-# Initialize device and model
+# Initialize model and processor
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Hatman/audio-emotion-detection"
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
@@ -25,34 +21,32 @@ EMOTION_LABELS = {
     6: "surprise"
 }

-CHUNK_DURATION = 3 # seconds
-SAMPLE_RATE = 16000
-CHUNK_SIZE = SAMPLE_RATE * CHUNK_DURATION
-
-class AudioProcessor:
-    def __init__(self):
-        self.audio_queue = Queue()
-        self.results_queue = Queue()
-        self.is_running = False
-        self.current_emotions = []
+def process_audio(audio):
+    """Process audio chunk and return emotion"""
+    if audio is None:
+        return ""

-    def process_chunk(self, audio_chunk):
-        """Process a single chunk of audio"""
-        # Ensure the chunk is the right length
-        if len(audio_chunk) < CHUNK_SIZE:
-            # Pad with zeros if too short
-            audio_chunk = np.pad(audio_chunk, (0, CHUNK_SIZE - len(audio_chunk)))
-        elif len(audio_chunk) > CHUNK_SIZE:
-            # Take only the first CHUNK_SIZE samples
-            audio_chunk = audio_chunk[:CHUNK_SIZE]
-
+    # Get the audio data
+    if isinstance(audio, tuple):
+        audio = audio[1]
+
+    # Convert to numpy array if needed
+    audio = np.array(audio)
+
+    # Ensure we have mono audio
+    if len(audio.shape) > 1:
+        audio = audio.mean(axis=1)
+
+    try:
         # Prepare input for the model
         inputs = feature_extractor(
-            audio_chunk,
-            sampling_rate=SAMPLE_RATE,
+            audio,
+            sampling_rate=16000,
             return_tensors="pt",
             padding=True
         )
+
+        # Move to appropriate device
         inputs = {k: v.to(device) for k, v in inputs.items()}

         # Get prediction
@@ -61,84 +55,31 @@ class AudioProcessor:
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()

-        return EMOTION_LABELS[predicted_id]
-
-    def process_audio_stream(self):
-        """Continuously process audio chunks from the queue"""
-        while self.is_running:
-            if not self.audio_queue.empty():
-                audio_chunk = self.audio_queue.get()
-                emotion = self.process_chunk(audio_chunk)
-                self.current_emotions.append(emotion)
-                # Keep only the last 5 emotions
-                self.current_emotions = self.current_emotions[-5:]
-                self.results_queue.put(self.current_emotions.copy())
-            time.sleep(0.1)
-
-    def start(self):
-        """Start the processing thread"""
-        self.is_running = True
-        self.process_thread = Thread(target=self.process_audio_stream)
-        self.process_thread.start()
-
-    def stop(self):
-        """Stop the processing thread"""
-        self.is_running = False
-        if hasattr(self, 'process_thread'):
-            self.process_thread.join()
-
-audio_processor = AudioProcessor()
-
-def process_audio(audio, state):
-    """Process incoming audio stream"""
-    if state is None or not state.get('is_running', False):
-        audio_processor.start()
-        state = {'is_running': True}
-
-    # Convert audio to numpy array if it's not already
-    if isinstance(audio, tuple):
-        audio = audio[1] # Get the actual audio data
-    audio = np.array(audio)
-
-    # Add to processing queue
-    audio_processor.audio_queue.put(audio)
-
-    # Get latest results
-    if not audio_processor.results_queue.empty():
-        emotions = audio_processor.results_queue.get()
-        return gr.update(value=", ".join(emotions)), state
+        emotion = EMOTION_LABELS[predicted_id]
+        return emotion

-    return gr.update(), state
+    except Exception as e:
+        print(f"Error processing audio: {e}")
+        return "Error processing audio"

-# Define event handler for cleanup
-def on_close():
-    audio_processor.stop()
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Real-time Audio Emotion Detection")
-    gr.Markdown("Speak into your microphone. Emotions are detected in 3-second chunks.")
-
-    state = gr.State({'is_running': False})
-    output = gr.Textbox(label="Detected Emotions (Last 5 chunks)", lines=2)
-
-    audio_input = gr.Audio(
-        sources=["microphone"],
-        type="numpy",
-        streaming=True,
-        label="Microphone Input",
-        show_label=True
-    )
-
-    audio_input.stream(
-        process_audio,
-        inputs=[audio_input, state],
-        outputs=[output, state],
-        show_progress=False
-    )
-
-# Launch with cleanup handling
-demo.queue(max_size=10).launch(share=True, prevent_thread_lock=True)
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(
+            sources=["microphone"],
+            type="numpy",
+            streaming=True,
+            label="Speak into your microphone",
+            show_label=True
+        )
+    ],
+    outputs=gr.Textbox(label="Detected Emotion"),
+    title="Live Emotion Detection",
+    description="Speak into your microphone to detect emotions in real-time.",
+    live=True,
+    allow_flagging=False
+)

-# Register cleanup
-import atexit
-atexit.register(on_close)
+# Launch with a small queue for better real-time performance
+demo.queue(max_size=1).launch(share=True)
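As a quick local sanity check of the updated process_audio outside the Gradio UI, the function can be called directly with the (sample_rate, samples) tuple that Gradio's numpy microphone component delivers. A minimal sketch, assuming the new file is saved as app.py, the model weights download successfully, and using a purely illustrative synthetic tone:

# Hypothetical smoke test, not part of the commit above
import numpy as np
from app import process_audio  # assumes the updated file is saved as app.py

sr = 16000  # sampling rate expected by the feature extractor in app.py
t = np.linspace(0, 1.0, sr, endpoint=False)
samples = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s of a 440 Hz tone

print(process_audio((sr, samples)))  # expected: one of the EMOTION_LABELS strings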