omsandeeppatil committed
Commit 787c2bc · verified · 1 Parent(s): 0a54d22

Update app.py

Files changed (1)
  1. app.py +114 -57
app.py CHANGED
@@ -1,13 +1,18 @@
 import gradio as gr
 import torch
 import torchaudio
+import numpy as np
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
+from queue import Queue
+from threading import Thread
+import time

 # Initialize device and model
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Hatman/audio-emotion-detection"
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
+model.to(device)

 # Define emotion labels
 EMOTION_LABELS = {
@@ -20,69 +25,121 @@ EMOTION_LABELS = {
     6: "surprise"
 }

-def preprocess_audio(audio):
-    """Preprocess audio file for model input"""
-    waveform, sampling_rate = torchaudio.load(audio)
-    resampled_waveform = torchaudio.transforms.Resample(
-        orig_freq=sampling_rate,
-        new_freq=16000
-    )(waveform)
-    return {
-        'speech': resampled_waveform.numpy().flatten(),
-        'sampling_rate': 16000
-    }
-
-def inference(audio):
-    """Full inference function returning emotion, logits, and predicted IDs"""
-    example = preprocess_audio(audio)
-    inputs = feature_extractor(
-        example['speech'],
-        sampling_rate=16000,
-        return_tensors="pt",
-        padding=True
-    )
-
-    # Move inputs to appropriate device
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-
-    predicted_emotion = EMOTION_LABELS[predicted_ids.item()]
-    return predicted_emotion, logits.tolist(), predicted_ids.tolist()
-
-def inference_label(audio):
-    """Simplified inference function returning only the emotion label"""
-    emotion, _, _ = inference(audio)
-    return emotion
-
-# Create Gradio interface
+CHUNK_DURATION = 3  # seconds
+SAMPLE_RATE = 16000
+CHUNK_SIZE = SAMPLE_RATE * CHUNK_DURATION
+
+class AudioProcessor:
+    def __init__(self):
+        self.audio_queue = Queue()
+        self.results_queue = Queue()
+        self.is_running = False
+        self.current_emotions = []
+
+    def process_chunk(self, audio_chunk):
+        """Process a single chunk of audio"""
+        # Ensure the chunk is the right length
+        if len(audio_chunk) < CHUNK_SIZE:
+            # Pad with zeros if too short
+            audio_chunk = np.pad(audio_chunk, (0, CHUNK_SIZE - len(audio_chunk)))
+        elif len(audio_chunk) > CHUNK_SIZE:
+            # Take only the first CHUNK_SIZE samples
+            audio_chunk = audio_chunk[:CHUNK_SIZE]
+
+        # Prepare input for the model
+        inputs = feature_extractor(
+            audio_chunk,
+            sampling_rate=SAMPLE_RATE,
+            return_tensors="pt",
+            padding=True
+        )
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Get prediction
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+            predicted_id = torch.argmax(logits, dim=-1).item()
+
+        return EMOTION_LABELS[predicted_id]
+
+    def process_audio_stream(self):
+        """Continuously process audio chunks from the queue"""
+        while self.is_running:
+            if not self.audio_queue.empty():
+                audio_chunk = self.audio_queue.get()
+                emotion = self.process_chunk(audio_chunk)
+                self.current_emotions.append(emotion)
+                # Keep only the last 5 emotions
+                self.current_emotions = self.current_emotions[-5:]
+                self.results_queue.put(self.current_emotions.copy())
+            time.sleep(0.1)
+
+    def start(self):
+        """Start the processing thread"""
+        self.is_running = True
+        self.process_thread = Thread(target=self.process_audio_stream)
+        self.process_thread.start()
+
+    def stop(self):
+        """Stop the processing thread"""
+        self.is_running = False
+        if hasattr(self, 'process_thread'):
+            self.process_thread.join()
+
+audio_processor = AudioProcessor()
+
+def process_audio(audio, state):
+    """Process incoming audio stream"""
+    if state is None or not state.get('is_running', False):
+        audio_processor.start()
+        state = {'is_running': True}
+
+    # Convert audio to numpy array if it's not already
+    if isinstance(audio, tuple):
+        audio = audio[1]  # Get the actual audio data
+    audio = np.array(audio)
+
+    # Add to processing queue
+    audio_processor.audio_queue.put(audio)
+
+    # Get latest results
+    if not audio_processor.results_queue.empty():
+        emotions = audio_processor.results_queue.get()
+        return gr.update(value=", ".join(emotions)), state
+
+    return gr.update(), state
+
+def cleanup(state):
+    """Cleanup when the interface is closed"""
+    if state and state.get('is_running', False):
+        audio_processor.stop()
+        state['is_running'] = False
+    return state
+
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Emotion Detection")
-
-    with gr.Tab("Quick Analysis"):
-        gr.Interface(
-            fn=inference_label,
-            inputs=gr.Audio(type="filepath"),
-            outputs=gr.Label(label="Detected Emotion"),
-            title="Audio Emotion Analysis",
-            description="Upload or record audio to detect the emotional content."
-        )
-
-    with gr.Tab("Detailed Analysis"):
-        gr.Interface(
-            fn=inference,
-            inputs=gr.Audio(type="filepath"),
-            outputs=[
-                gr.Label(label="Detected Emotion"),
-                gr.JSON(label="Confidence Scores"),
-                gr.JSON(label="Internal IDs")
-            ],
-            title="Audio Emotion Analysis (Detailed)",
-            description="Get detailed analysis including confidence scores for each emotion."
-        )
-
-# Launch the app
-demo.launch(share=True)
+    gr.Markdown("# Real-time Audio Emotion Detection")
+    gr.Markdown("Speak into your microphone. Emotions are detected in 3-second chunks.")
+
+    state = gr.State(None)
+    output = gr.Textbox(label="Detected Emotions (Last 5 chunks)", lines=2)
+
+    audio_input = gr.Audio(
+        source="microphone",
+        type="numpy",
+        streaming=True,
+        label="Microphone Input",
+        show_label=True
+    )
+
+    audio_input.stream(
+        process_audio,
+        inputs=[audio_input, state],
+        outputs=[output, state],
+        show_progress=False
+    )
+
+    demo.load(lambda: None, None, state)
+    demo.close(cleanup, state, state)
+
+demo.queue().launch(share=True)
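
For reference, a minimal sketch (not part of this commit) of the chunk-level inference path that the new AudioProcessor.process_chunk wraps, assuming the same Hatman/audio-emotion-detection checkpoint and a synthetic 3-second clip of silence; the label lookup here uses the checkpoint's id2label config rather than the app's hard-coded EMOTION_LABELS dict.

# Sketch only; assumes the Hatman/audio-emotion-detection checkpoint can be downloaded.
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

chunk = np.zeros(16000 * 3, dtype=np.float32)  # one 3-second chunk at 16 kHz
inputs = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
    predicted_id = model(**inputs).logits.argmax(dim=-1).item()
print(model.config.id2label.get(predicted_id, str(predicted_id)))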