Upload 3 files
- app.py +56 -14
- happy.jpg +0 -0
- surprise.jpg +0 -0
app.py
CHANGED
@@ -3,7 +3,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 from PIL import Image
 import librosa
-
+import time
+from datetime import datetime
+import pandas as pd
 HOME_DIR = ""
 local_config_path = 'config.json'
 local_preprocessor_config_path = 'preprocessor_config.json'
@@ -22,15 +24,17 @@ id2label = {
     2: "fear",
     3: "happy",
     4: "neutral",
-    5: "sad"
+    5: "sad",
+    6: "surprise"
 }
 
 
 def predict(model, feature_extractor, data, max_length, id2label):
     # Extract features
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Extracting features...")
     inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
     torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
-
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Predicting...")
     # Forward pass
     outputs = model(input_values=torch_inputs)
 
@@ -44,8 +48,8 @@ def predict(model, feature_extractor, data, max_length, id2label):
     predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
     predicted_label = id2label[predicted_class_idx]
     #predicted_label = predicted_class_idx
-
-    return predicted_label
+
+    return predicted_label, probabilities
 
 from transformers import Wav2Vec2Config, Wav2Vec2Model
 import torch.nn as nn
@@ -127,9 +131,23 @@ def recognize_emotion(audio):
     audio_data = audio_data.astype(np.float32)
     # If you still want to process it with librosa, e.g., to change sample rate:
     if sample_rate != 16000:
+        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Resampling audio...")
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-    emotion = predict(model, feature_extractor, audio_data,
-                      48000, id2label)
+    emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label)  # limit to 3 seconds
+    print(probabilities)
+    probs = probabilities.detach().numpy().flatten().tolist()
+    print(probs)
+    # Convert probabilities to percentages
+    percentages = [round(prob * 100, 2) for prob in probs]
+    print(percentages)
+    # Define the class labels (adjust to match your specific model's class labels)
+    labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
+    print(labels)
+    # Create a DataFrame
+    df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
+    df = df.sort_values(by="Probability (%)", ascending=False)
+    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), df)
+    return emotion, get_emotion_image(emotion), df
 
 def get_emotion_image(emotion):
     # Here, you would have a dictionary or logic to map emotions to images
@@ -138,8 +156,9 @@ def get_emotion_image(emotion):
         "disgust": "disgust.jpeg",
         "fear": "fear.jpeg",
         "happy": "happy.jpeg",
-        "neutral": "neutral.
-        "sad": "sad.jpeg"
+        "neutral": "neutral.jpeg",
+        "sad": "sad.jpeg",
+        "surprise": "surprise.jpeg"
         # Add other emotions and their corresponding images
     }
 
@@ -150,15 +169,38 @@ def get_emotion_image(emotion):
 
 demo = gr.Blocks()
 with demo:
+    df_logs = pd.DataFrame(columns=['Timestamp', 'Emotion'])
     theme = gr.themes.Soft(),
     audio_input = gr.Audio(type="numpy",
                            sources=["microphone"],
-                           show_label=True
+                           show_label=True,
+                           streaming=True
                            )
     text_output = gr.Textbox(label="Recognized Emotion")
-
-
+    output_df = gr.DataFrame(label="Emotion Probabilities")
+    image_output = gr.Image(label="Emotion Image", scale=1, interactive=False)
+    df_logs = gr.DataFrame(label="Output Logs", headers=['Timestamp', 'Emotion'])
+    def process_audio(audio, emotion, image, state, df_probs, df_logs):
+
+        current_time = time.time()
+        if state is None or (current_time - state >= 10):
+            state = current_time
+            emotion, image, df_probs = recognize_emotion(audio)
+            # Sample prediction data
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+            # Create a dictionary for the new row
+            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
+
+            # Append the new row to the DataFrame
+            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)
+            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Predicted emotion: ", emotion)
+            return emotion, image, state, df_probs, df_logs
+        else:
+            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Not yet time")
+            return emotion, image, state, df_probs, df_logs
+
     # Automatically call the recognize_emotion function when audio is recorded
-
-
+    state = gr.State(None)
+    audio_input.stream(fn=process_audio, inputs=[audio_input, text_output, image_output, state, output_df, df_logs], outputs=[text_output, image_output, state, output_df, df_logs])
 demo.launch(share=True)
happy.jpg
ADDED
surprise.jpg
ADDED
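For reference, the probability table that recognize_emotion now returns reduces to the following standalone transformation. This is a minimal sketch: the probabilities tensor is a dummy softmax output standing in for the model's real one, and the label order is assumed to match the checkpoint's id2label mapping.

import torch
import pandas as pd

# Dummy softmax output standing in for the model's real probabilities (shape [1, 7]).
probabilities = torch.softmax(torch.randn(1, 7), dim=-1)
labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

# Flatten to a plain list, convert to percentages, and tabulate sorted by confidence.
probs = probabilities.detach().numpy().flatten().tolist()
percentages = [round(p * 100, 2) for p in probs]
df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
df = df.sort_values(by="Probability (%)", ascending=False)
print(df)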
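The new process_audio handler throttles inference with a timestamp held in gr.State: recognize_emotion only runs if at least 10 seconds have passed since the last prediction, otherwise the previous outputs are passed through unchanged. Below is a minimal sketch of the same pattern, with a hypothetical classify() placeholder standing in for the real model call.

import time
import gradio as gr

def classify(audio):
    # Hypothetical placeholder for the real recognize_emotion(audio) call.
    return "neutral"

def process_audio(audio, last_label, state):
    now = time.time()
    # Run the expensive step only if 10 s have passed since the last prediction.
    if state is None or now - state >= 10:
        return classify(audio), now
    # Otherwise return the previous output unchanged.
    return last_label, state

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="numpy", sources=["microphone"], streaming=True)
    text_output = gr.Textbox(label="Recognized Emotion")
    state = gr.State(None)
    audio_input.stream(fn=process_audio,
                       inputs=[audio_input, text_output, state],
                       outputs=[text_output, state])

demo.launch()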