jcvsalinas committed on
Commit 4f12294 · verified · 1 Parent(s): a46d411

Upload 3 files

Files changed (3)
  1. app.py +56 -14
  2. happy.jpg +0 -0
  3. surprise.jpg +0 -0
app.py CHANGED
@@ -3,7 +3,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 from PIL import Image
 import librosa
-
+import time
+from datetime import datetime
+import pandas as pd
 HOME_DIR = ""
 local_config_path = 'config.json'
 local_preprocessor_config_path = 'preprocessor_config.json'
@@ -22,15 +24,17 @@ id2label = {
     2: "fear",
     3: "happy",
     4: "neutral",
-    5: "sad"
+    5: "sad",
+    6: "surprise"
 }
 
 
 def predict(model, feature_extractor, data, max_length, id2label):
     # Extract features
+    print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Extracting features...")
     inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
     torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
-
+    print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Predicting...")
     # Forward pass
     outputs = model(input_values=torch_inputs)
@@ -44,8 +48,8 @@ def predict(model, feature_extractor, data, max_length, id2label):
     predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
     predicted_label = id2label[predicted_class_idx]
     #predicted_label = predicted_class_idx
-
-    return predicted_label
+
+    return predicted_label, probabilities
 
 from transformers import Wav2Vec2Config, Wav2Vec2Model
 import torch.nn as nn
@@ -127,9 +131,23 @@ def recognize_emotion(audio):
     audio_data = audio_data.astype(np.float32)
     # If you still want to process it with librosa, e.g., to change sample rate:
     if sample_rate != 16000:
+        print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Resampling audio...")
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-    emotion = predict(model, feature_extractor, audio_data, len(audio_data), id2label)
-    return emotion, get_emotion_image(emotion)
+    emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label)  # limit to 3 seconds
+    print(probabilities)
+    probs = probabilities.detach().numpy().flatten().tolist()
+    print(probs)
+    # Convert probabilities to percentages
+    percentages = [round(prob * 100, 2) for prob in probs]
+    print(percentages)
+    # Define the class labels (adjust to match your specific model's class labels)
+    labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
+    print(labels)
+    # Create a DataFrame
+    df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
+    df = df.sort_values(by="Probability (%)", ascending=False)
+    print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), df)
+    return emotion, get_emotion_image(emotion), df
 
 def get_emotion_image(emotion):
     # Here, you would have a dictionary or logic to map emotions to images
@@ -138,8 +156,9 @@ def get_emotion_image(emotion):
         "disgust": "disgust.jpeg",
         "fear": "fear.jpeg",
         "happy": "happy.jpeg",
-        "neutral": "neutral.png",
-        "sad": "sad.jpeg"
+        "neutral": "neutral.jpeg",
+        "sad": "sad.jpeg",
+        "surprise": "surprise.jpeg"
         # Add other emotions and their corresponding images
     }
 
@@ -150,15 +169,38 @@ def get_emotion_image(emotion):
 
 demo = gr.Blocks()
 with demo:
+    df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
     theme= gr.themes.Soft(),
     audio_input = gr.Audio(type="numpy",
                            sources=["microphone"],
-                           show_label=True
+                           show_label=True,
+                           streaming=True
                            )
     text_output = gr.Textbox(label="Recognized Emotion")
-    image_output = gr.Image(label="Emotion Image")
-
+    output_df = gr.DataFrame(label="Emotion Probabilities")
+    image_output = gr.Image(label="Emotion Image", scale = 1, interactive = False)
+    df_logs = gr.DataFrame(label="Output Logs", headers = ['Timestamp', 'Emotion'])
+    def process_audio(audio, emotion, image, state, df_probs, df_logs):
+
+        current_time = time.time()
+        if state is None or (current_time - state >= 10):
+            state = current_time
+            emotion, image, df_probs = recognize_emotion(audio)
+            # Sample prediction data
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+            # Create a dictionary for the new row
+            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
+
+            # Append the new row to the DataFrame
+            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
+            print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Predicted emotion: ", emotion)
+            return emotion, image, state, df_probs, df_logs
+        else:
+            print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Not yet time")
+            return emotion, image, state, df_probs, df_logs
+
     # Automatically call the recognize_emotion function when audio is recorded
+    state = gr.State(None)
+    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df, df_logs], outputs=[text_output, image_output, state, output_df, df_logs])
 demo.launch(share=True)
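
Note on the streaming change: with gr.Audio(streaming=True), Gradio calls the handler repeatedly while the microphone is open, so this commit keeps a gr.State timestamp and only re-runs the model about every 10 seconds. Below is a minimal, self-contained sketch of that throttling pattern; classify() is a hypothetical stand-in for recognize_emotion(), and the component wiring mirrors the calls used in the diff.

import time
import gradio as gr

def classify(audio):
    # hypothetical stand-in for recognize_emotion(); returns a label for the latest chunk
    return "neutral"

with gr.Blocks() as demo:
    audio_input = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text_output = gr.Textbox(label="Recognized Emotion")
    last_run = gr.State(None)  # time.time() of the last prediction, None before the first one

    def process_audio(audio, current_label, state):
        now = time.time()
        if state is None or now - state >= 10:
            # enough time has passed: run the model and remember when
            return classify(audio), now
        # too soon: leave the displayed label and the timestamp unchanged
        return current_label, state

    # every streamed chunk passes through process_audio; most calls return early
    audio_input.stream(fn=process_audio,
                       inputs=[audio_input, text_output, last_run],
                       outputs=[text_output, last_run])

demo.launch()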
happy.jpg ADDED
surprise.jpg ADDED
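
A further note on the probability table that recognize_emotion now returns: the diff does not show how probabilities is computed inside predict(), but assuming it is a softmax over the seven class logits, the DataFrame construction reduces to the transformation sketched below (the logits here are made up for illustration).

import torch
import pandas as pd

# made-up logits for one 7-class prediction; in app.py these come from the classifier head
logits = torch.tensor([[0.2, 0.1, 0.3, 2.5, 0.8, 0.4, 0.6]])
probabilities = torch.nn.functional.softmax(logits, dim=-1)  # assumption: predict() yields softmax probabilities

labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
percentages = [round(p * 100, 2) for p in probabilities.detach().numpy().flatten().tolist()]

# one row per emotion, sorted so the most likely class is listed first
df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
df = df.sort_values(by="Probability (%)", ascending=False)
print(df)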