Upload 3 files
- app.py +56 -14
- happy.jpg +0 -0
- surprise.jpg +0 -0
app.py
CHANGED
@@ -3,7 +3,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 from PIL import Image
 import librosa
-
+import time
+from datetime import datetime
+import pandas as pd
 HOME_DIR = ""
 local_config_path = 'config.json'
 local_preprocessor_config_path = 'preprocessor_config.json'
@@ -22,15 +24,17 @@ id2label = {
     2: "fear",
     3: "happy",
     4: "neutral",
-    5: "sad"
+    5: "sad",
+    6: "surprise"
 }
 
 
 def predict(model, feature_extractor, data, max_length, id2label):
     # Extract features
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Extracting features...")
     inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
     torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
-
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Predicting...")
     # Forward pass
     outputs = model(input_values=torch_inputs)
 
@@ -44,8 +48,8 @@ def predict(model, feature_extractor, data, max_length, id2label):
     predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
     predicted_label = id2label[predicted_class_idx]
     #predicted_label = predicted_class_idx
-
-    return predicted_label
+
+    return predicted_label, probabilities
 
 from transformers import Wav2Vec2Config, Wav2Vec2Model
 import torch.nn as nn
@@ -127,9 +131,23 @@ def recognize_emotion(audio):
     audio_data = audio_data.astype(np.float32)
     # If you still want to process it with librosa, e.g., to change sample rate:
     if sample_rate != 16000:
+        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Resampling audio...")
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-    emotion = predict(model, feature_extractor, audio_data,
-                      48000, id2label)
+    emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label)  # limit to 3 seconds
+    print(probabilities)
+    probs = probabilities.detach().numpy().flatten().tolist()
+    print(probs)
+    # Convert probabilities to percentages
+    percentages = [round(prob * 100, 2) for prob in probs]
+    print(percentages)
+    # Define the class labels (adjust to match your specific model's class labels)
+    labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
+    print(labels)
+    # Create a DataFrame
+    df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
+    df = df.sort_values(by="Probability (%)", ascending=False)
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), df)
+    return emotion, get_emotion_image(emotion), df
 
 def get_emotion_image(emotion):
     # Here, you would have a dictionary or logic to map emotions to images
@@ -138,8 +156,9 @@ def get_emotion_image(emotion):
         "disgust": "disgust.jpeg",
         "fear": "fear.jpeg",
         "happy": "happy.jpeg",
-        "neutral": "neutral.
-        "sad": "sad.jpeg"
+        "neutral": "neutral.jpeg",
+        "sad": "sad.jpeg",
+        "surprise": "surprise.jpeg"
         # Add other emotions and their corresponding images
     }
 
@@ -150,15 +169,38 @@ def get_emotion_image(emotion):
 
 demo = gr.Blocks()
 with demo:
+    df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
     theme = gr.themes.Soft(),
     audio_input = gr.Audio(type="numpy",
                            sources=["microphone"],
-                           show_label=True
+                           show_label=True,
+                           streaming=True
                            )
     text_output = gr.Textbox(label="Recognized Emotion")
-
-
+    output_df = gr.DataFrame(label="Emotion Probabilities")
+    image_output = gr.Image(label="Emotion Image", scale=1, interactive=False)
+    df_logs = gr.DataFrame(label="Output Logs", headers=['Timestamp', 'Emotion'])
+    def process_audio(audio, emotion, image, state, df_probs, df_logs):
+
+        current_time = time.time()
+        if state is None or (current_time - state >= 10):
+            state = current_time
+            emotion, image, df_probs = recognize_emotion(audio)
+            # Sample prediction data
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+            # Create a dictionary for the new row
+            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
+
+            # Append the new row to the DataFrame
+            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
+            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Predicted emotion: ", emotion)
+            return emotion, image, state, df_probs, df_logs
+        else:
+            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Not yet time")
+            return emotion, image, state, df_probs, df_logs
+
     # Automatically call the recognize_emotion function when audio is recorded
-
-
+    state = gr.State(None)
+    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df, df_logs], outputs=[text_output, image_output, state, output_df, df_logs])
 demo.launch(share=True)
happy.jpg
ADDED
surprise.jpg
ADDED
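For reference, the probability table that recognize_emotion now returns reduces to the following standalone transformation. This is a minimal sketch: the probabilities tensor is a dummy softmax output standing in for the model's real one, and the label order is assumed to match the checkpoint's id2label mapping.

import torch
import pandas as pd

# Dummy softmax output standing in for the model's real probabilities (shape [1, 7]).
probabilities = torch.softmax(torch.randn(1, 7), dim=-1)
labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

# Flatten to a plain list, convert to percentages, and tabulate sorted by confidence.
probs = probabilities.detach().numpy().flatten().tolist()
percentages = [round(p * 100, 2) for p in probs]
df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
df = df.sort_values(by="Probability (%)", ascending=False)
print(df)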
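The new process_audio handler throttles inference with a timestamp held in gr.State: recognize_emotion only runs if at least 10 seconds have passed since the last prediction, otherwise the previous outputs are passed through unchanged. Below is a minimal sketch of the same pattern, with a hypothetical classify() placeholder standing in for the real model call.

import time
import gradio as gr

def classify(audio):
    # Hypothetical placeholder for the real recognize_emotion(audio) call.
    return "neutral"

def process_audio(audio, last_label, state):
    now = time.time()
    # Run the expensive step only if 10 s have passed since the last prediction.
    if state is None or now - state >= 10:
        return classify(audio), now
    # Otherwise return the previous output unchanged.
    return last_label, state

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="numpy", sources=["microphone"], streaming=True)
    text_output = gr.Textbox(label="Recognized Emotion")
    state = gr.State(None)
    audio_input.stream(fn=process_audio,
                       inputs=[audio_input, text_output, state],
                       outputs=[text_output, state])

demo.launch()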