Spaces:

jcvsalinas
/

recorder

Sleeping

App Files Files Community

jcvsalinas committed on Aug 22, 2024

Commit

eb270a4

verified ·

1 Parent(s): 3b66fb1

Upload app.py

Browse files

Files changed (1) hide show

app.py +141 -24

app.py CHANGED Viewed

@@ -1,29 +1,146 @@
 import gradio as gr
 import numpy as np
 import matplotlib.pyplot as plt
-# This function returns the waveform data to be displayed
-def record_audio(audio):
-    if audio is not None:
-        data = audio[0]  # Extract the audio data from the tuple
-        sample_rate = audio[1]  # Extract the sample rate
-        # Create a mm plot
-        plt.figure(figsize=(10, 4))
-        plt.plot(data)
-        plt.title("Real-Time Audio Waveform")
-        plt.xlabel("Sample Number")
-        plt.ylabel("Amplitude")
-        plt.grid(True)
-        return plt
-# Define the Gradio interface around record_audio and launch it right away
-gr.Interface(
-    fn=record_audio,
     theme= gr.themes.Soft(),
-    inputs=gr.Audio(type="numpy"),  # Capture audio as numpy array
-    outputs="plot",  # Output the waveform plot
-    live=True,  # Enable real-time recording
-    title="Real-Time Audio Recording",
-    description="Record audio in real time and view the waveform."
-).launch(share = True)

 import gradio as gr
 import numpy as np
 import matplotlib.pyplot as plt
+import librosa
+# File names of the model artefacts, given relative to the working directory.
+# Only local_config_path and local_preprocessor_config_path are read by the
+# code shown in this file; the other names are not referenced in it.
+HOME_DIR = ""
+local_config_path = 'config.json'
+local_preprocessor_config_path = 'preprocessor_config.json'
+local_weights_path = 'pytorch_model.bin'
+local_training_args_path = 'training_args.bin'
+import torch
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+# Define the id2label mapping: class index predicted by the model -> emotion name.
+# NOTE(review): presumably this order matches the labels used when the
+# checkpoint was fine-tuned - confirm against the training configuration.
+id2label = {
+    0: "angry",
+    1: "disgust",
+    2: "fear",
+    3: "happy",
+    4: "neutral",
+    5: "sad"
+}
+def predict(model, feature_extractor, data, max_length, id2label):
+    # Extract features
+    inputs = feature_extractor(data, sampling_rate=16000, max_length=max_length, return_tensors='tf', padding=True, truncation=True)
+    torch_inputs = torch.tensor(inputs['input_values'].numpy(), dtype=torch.float32)
+    # Forward pass
+    outputs = model(input_values=torch_inputs)
+    # Extract logits from the output
+    logits = outputs
+    # Apply softmax to get probabilities
+    probabilities = F.softmax(logits, dim=-1)
+    # Get the predicted class index
+    predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
+    predicted_label = id2label[predicted_class_idx]
+    #predicted_label = predicted_class_idx
+    return predicted_label
+from transformers import Wav2Vec2Config, Wav2Vec2Model
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+config = Wav2Vec2Config.from_pretrained(local_config_path)
+class Wav2Vec2ForSpeechClassification(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, config):
+        super(Wav2Vec2ForSpeechClassification, self).__init__()
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.classifier = nn.ModuleDict({
+            'dense': nn.Linear(config.hidden_size, config.hidden_size),
+            'activation': nn.ReLU(),
+            'dropout': nn.Dropout(config.final_dropout),
+            'out_proj': nn.Linear(config.hidden_size, config.num_labels)
+        })
+    def forward(self, input_values):
+        outputs = self.wav2vec2(input_values)
+        hidden_states = outputs.last_hidden_state
+        x = self.classifier['dense'](hidden_states[:, 0, :])
+        x = self.classifier['activation'](x)
+        x = self.classifier['dropout'](x)
+        logits = self.classifier['out_proj'](x)
+        return logits
+import json
+from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
+# Load the preprocessor configuration from the local file
+with open(local_preprocessor_config_path, 'r') as file:
+    preprocessor_config = json.load(file)
+# Initialize the preprocessor using the loaded configuration
+feature_extractor = Wav2Vec2FeatureExtractor(
+    do_normalize=preprocessor_config["do_normalize"],
+    feature_extractor_type=preprocessor_config["feature_extractor_type"],
+    feature_size=preprocessor_config["feature_size"],
+    padding_side=preprocessor_config["padding_side"],
+    padding_value=preprocessor_config["padding_value"],
+    processor_class_from_name=preprocessor_config["processor_class"],
+    return_attention_mask=preprocessor_config["return_attention_mask"],
+    sampling_rate=preprocessor_config["sampling_rate"]
+)
+# load the newly finetuned model from huggingface repo
+from huggingface_hub import hf_hub_download
+model_path = hf_hub_download(
+    repo_id="kvilla/wav2vec-english-speech-emotion-recognition-finetuned",
+    filename="model_finetuned.pth"
+)
+# load the newly finetuned model! from local
+saved_model = torch.load(model_path, map_location=torch.device('cpu'))
+# Create the model with the loaded configuration
+model = Wav2Vec2ForSpeechClassification(config=config)
+# Load the state dictionary
+model.load_state_dict(saved_model, strict=False)
+print("Model initialized successfully.")
+model.eval()
+def recognize_emotion(audio):
+    # Load the audio file using librosa
+    #audio, _ = librosa.load(file_path, sr=16000)
+    sample_rate, audio_data = audio
+    print(audio_data)
+     # Ensure audio data is in floating-point format
+    if not np.issubdtype(audio_data.dtype, np.floating):
+        audio_data = audio_data.astype(np.float32)
+        print(audio_data)
+    # If you still want to process it with librosa, e.g., to change sample rate:
+    if sample_rate != 16000:
+        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+    return predict(model, feature_extractor, audio_data, len(audio_data), id2label)
+demo = gr.Blocks()
+with demo:
     theme= gr.themes.Soft(),
+    audio_input = gr.Audio(type="numpy",
+                            sources=["microphone"],
+                            show_label=True
+                            )
+    text_output = gr.Textbox(label="Recognized Emotion")
+    # Automatically call the recognize_emotion function when audio is recorded
+    audio_input.stop_recording(fn=recognize_emotion, inputs=audio_input, outputs=text_output)
+demo.launch(share=True)