jcvsalinas committed (verified)
Commit ab766bf · 1 Parent(s): 4c52933

Upload 3 files

Files changed (3):
  1. app.py +20 -6
  2. happy.jpg +0 -0
  3. surprise.jpg +0 -0
app.py CHANGED
@@ -31,6 +31,7 @@ id2label = {
 
 def predict(model, feature_extractor, data, max_length, id2label):
     # Extract features
+    print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Extracting features...")
     inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
     torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
     print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Predicting...")
@@ -130,6 +131,7 @@ def recognize_emotion(audio):
     audio_data = audio_data.astype(np.float32)
     # If you still want to process it with librosa, e.g., to change sample rate:
     if sample_rate != 16000:
+        print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Resampling audio...")
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label) # limit to 3seconds
     print(probabilities)
@@ -155,7 +157,8 @@ def get_emotion_image(emotion):
         "fear": "fear.jpeg",
         "happy": "happy.jpeg",
         "neutral": "neutral.jpeg",
-        "sad": "sad.jpeg"
+        "sad": "sad.jpeg",
+        "surprise": "surprise.jpeg"
         # Add other emotions and their corresponding images
     }
 
@@ -166,6 +169,7 @@ def get_emotion_image(emotion):
 
 demo = gr.Blocks()
 with demo:
+    df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
     theme= gr.themes.Soft(),
     audio_input = gr.Audio(type="numpy",
                            sources=["microphone"],
@@ -175,18 +179,28 @@ with demo:
     text_output = gr.Textbox(label="Recognized Emotion")
     output_df = gr.DataFrame(label="Emotion Probabilities")
     image_output = gr.Image(label="Emotion Image", scale = 1, interactive = False)
-    def process_audio(audio, emotion, image, state, df_probs):
+    df_logs = gr.DataFrame(label="Output Logs", headers = ['Timestamp', 'Emotion'])
+    def process_audio(audio, emotion, image, state, df_probs, df_logs):
+
         current_time = time.time()
-        if state is None or current_time - state >= 10:
+        if state is None or (current_time - state >= 10):
             state = current_time
             emotion, image, df_probs = recognize_emotion(audio)
+            # Sample prediction data
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+            # Create a dictionary for the new row
+            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
+
+            # Append the new row to the DataFrame
+            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
             print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Predicted emotion: ", emotion)
-            return emotion, image, state, df_probs
+            return emotion, image, state, df_probs, df_logs
         else:
             print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Not yet time")
-            return emotion, image, state, df_probs
+            return emotion, image, state, df_probs, df_logs
 
     # Automatically call the recognize_emotion function when audio is recorded
     state = gr.State(None)
-    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df], outputs=[text_output, image_output, state, output_df])
+    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df, df_logs], outputs=[text_output, image_output, state, output_df, df_logs])
     demo.launch(share=True)
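
process_audio already throttles the streaming callback to at most one prediction every 10 seconds (the last-run timestamp lives in gr.State); what this commit adds is a running prediction log kept in a pandas DataFrame. A minimal standalone sketch of that pattern, with a hypothetical classify() standing in for recognize_emotion and no Gradio dependency:

import time
from datetime import datetime
import pandas as pd

def classify(chunk):
    # Placeholder for recognize_emotion; returns a fixed label here.
    return "happy"

def process_chunk(chunk, state, df_logs, min_interval=10):
    # Run the classifier at most once every `min_interval` seconds.
    # `state` holds the timestamp of the last accepted run (or None).
    current_time = time.time()
    if state is None or current_time - state >= min_interval:
        state = current_time
        emotion = classify(chunk)
        new_row = {'Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                   'Emotion': emotion}
        # Append the prediction to the log, mirroring the pd.concat call in the diff.
        df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
    return state, df_logs

logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
state = None
state, logs = process_chunk(None, state, logs)
print(logs)

Since Gradio event handlers receive and return component values rather than mutating shared objects, the updated state and df_logs have to be returned by process_audio and wired through outputs=[...], which is why the stream() call above lists them in both inputs and outputs.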
happy.jpg ADDED
surprise.jpg ADDED
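
The two new log lines in predict and recognize_emotion bracket the audio preprocessing: microphone audio is resampled to 16 kHz before feature extraction, and max_length=48000 truncates the input to 48000 samples, i.e. 3 seconds at 16 kHz. A minimal sketch of that resampling step, using synthetic audio in place of the microphone stream (the 48 kHz source rate is just an assumed example):

import numpy as np
import librosa

sample_rate = 48000                  # assumed browser microphone rate
duration_s = 5
audio_data = np.random.uniform(-1.0, 1.0, sample_rate * duration_s).astype(np.float32)

if sample_rate != 16000:
    # Same call as in recognize_emotion: bring the clip to the 16 kHz the extractor expects.
    audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

print(len(audio_data))    # 80000 samples at 16 kHz
print(48000 / 16000)      # max_length of 48000 samples -> 3.0 seconds kept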