Upload 3 files
- app.py +20 -6
- happy.jpg +0 -0
- surprise.jpg +0 -0
app.py
CHANGED
@@ -31,6 +31,7 @@ id2label = {
 
 def predict(model, feature_extractor, data, max_length, id2label):
     # Extract features
+    print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Extracting features...")
     inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
     torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
     print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Predicting...")
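The `max_length` passed to the feature extractor is measured in samples, so 48000 at a 16 kHz sampling rate is 48000 / 16000 = 3 seconds of audio. A minimal sketch of that behaviour, assuming a `transformers` `Wav2Vec2FeatureExtractor` (the Space loads its own extractor elsewhere, so the exact class is an assumption):

```python
# Minimal sketch, not part of app.py: longer clips are truncated to 3 s.
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(sampling_rate=16000)  # assumed extractor

five_seconds = np.random.randn(5 * 16000).astype(np.float32)  # 5 s clip at 16 kHz

inputs = feature_extractor(
    five_seconds,
    sampling_rate=16000,
    max_length=48000,      # 3 s at 16 kHz
    padding=True,
    truncation=True,
    return_tensors="np",
)
print(inputs["input_values"].shape)  # (1, 48000): the clip was cut to 3 s
```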
@@ -130,6 +131,7 @@ def recognize_emotion(audio):
     audio_data = audio_data.astype(np.float32)
     # If you still want to process it with librosa, e.g., to change sample rate:
     if sample_rate != 16000:
+        print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), ":Resampling audio...")
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label) # limit to 3seconds
     print(probabilities)
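The resampling branch only runs when the incoming audio is not already at the 16 kHz the model expects; Gradio's microphone source typically delivers 44.1 or 48 kHz audio. A small self-contained sketch of that branch (synthetic data, librosa >= 0.10 keyword arguments):

```python
# Minimal sketch, not part of app.py: resample a 48 kHz clip to 16 kHz.
import numpy as np
import librosa

sample_rate = 48000                                                 # typical mic rate
audio_data = np.random.randn(2 * sample_rate).astype(np.float32)   # 2 s of noise

if sample_rate != 16000:
    audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

print(audio_data.shape)  # (32000,) -> still 2 s, now at 16 kHz
```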
@@ -155,7 +157,8 @@ def get_emotion_image(emotion):
         "fear": "fear.jpeg",
         "happy": "happy.jpeg",
         "neutral": "neutral.jpeg",
-        "sad": "sad.jpeg"
+        "sad": "sad.jpeg",
+        "surprise": "surprise.jpeg"
         # Add other emotions and their corresponding images
     }
 
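The mapping now covers sad and surprise as well. A short sketch of how such a lookup can fall back when a predicted emotion has no entry; the neutral-image fallback is an illustration only, not something `get_emotion_image` is shown to do:

```python
# Hypothetical fallback lookup (illustration only).
emotion_to_image = {
    "fear": "fear.jpeg",
    "happy": "happy.jpeg",
    "neutral": "neutral.jpeg",
    "sad": "sad.jpeg",
    "surprise": "surprise.jpeg",
}

def image_for(emotion: str) -> str:
    return emotion_to_image.get(emotion, "neutral.jpeg")

print(image_for("surprise"))  # surprise.jpeg
print(image_for("disgust"))   # neutral.jpeg (fallback)
```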
@@ -166,6 +169,7 @@ def get_emotion_image(emotion):
 
 demo = gr.Blocks()
 with demo:
+    df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
     theme= gr.themes.Soft(),
     audio_input = gr.Audio(type="numpy",
                            sources=["microphone"],
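Note that `theme= gr.themes.Soft(),` inside the `with demo:` block only binds an unused tuple to a local name; a theme normally goes to the `gr.Blocks()` constructor. A minimal sketch of that pattern, assuming Gradio 4.x:

```python
# Minimal sketch, not part of app.py: theme passed to the Blocks constructor.
import gradio as gr

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    audio_input = gr.Audio(type="numpy", sources=["microphone"], label="Speak")
    text_output = gr.Textbox(label="Recognized Emotion")

if __name__ == "__main__":
    demo.launch()
```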
@@ -175,18 +179,28 @@ with demo:
     text_output = gr.Textbox(label="Recognized Emotion")
     output_df = gr.DataFrame(label="Emotion Probabilities")
     image_output = gr.Image(label="Emotion Image", scale = 1, interactive = False)
-
+    df_logs = gr.DataFrame(label="Output Logs", headers = ['Timestamp', 'Emotion'])
+    def process_audio(audio, emotion, image, state, df_probs, df_logs):
+
         current_time = time.time()
-        if state is None or current_time - state >= 10:
+        if state is None or (current_time - state >= 10):
             state = current_time
             emotion, image, df_probs = recognize_emotion(audio)
+            # Sample prediction data
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+            # Create a dictionary for the new row
+            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
+
+            # Append the new row to the DataFrame
+            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
             print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Predicted emotion: ", emotion)
+            return emotion, image, state, df_probs, df_logs
         else:
             print(datetime.now().strftime('%Y-%m-%d%H:%M:%S'), "Not yet time")
+            return emotion, image, state, df_probs, df_logs
 
     # Automatically call the recognize_emotion function when audio is recorded
     state = gr.State(None)
-    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df], outputs=[text_output, image_output, state, output_df])
+    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df, df_logs], outputs=[text_output, image_output, state, output_df, df_logs])
 demo.launch(share=True)
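`process_audio` now threads a log table through the stream callback: `gr.DataFrame` hands its current value to the function as a pandas DataFrame (the component's default type), the callback appends a timestamped row at most once every 10 seconds (the `gr.State` value holds the time of the last prediction), and the grown table is returned back to the component. A stripped-down sketch of just the log bookkeeping:

```python
# Minimal sketch, not part of app.py: append one timestamped row per
# prediction using the same pd.concat pattern as process_audio.
from datetime import datetime
import pandas as pd

df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])

def append_log(df_logs: pd.DataFrame, emotion: str) -> pd.DataFrame:
    new_row = {'Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
               'Emotion': emotion}
    return pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)

df_logs = append_log(df_logs, "happy")
df_logs = append_log(df_logs, "surprise")
print(df_logs)
```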
happy.jpg
ADDED
surprise.jpg
ADDED