tensorgirl committed on
Commit
8eee022
1 Parent(s): 36d4cfd

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +118 -0
  2. model.keras +0 -0
  3. requirements.txt +9 -0
  4. xgb.json +0 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import numpy as np
4
+ import librosa
5
+ import tensorflow as tf
6
+ from tensorflow import keras
7
+ from tensorflow.keras import layers
8
+ from transformers import AutoFeatureExtractor
9
+ from sklearnex import patch_sklearn, unpatch_sklearn
10
+ patch_sklearn()
11
+ import xgboost as xgb
12
+
13
+
14
# Longest clip duration, in seconds; used to derive MAX_SEQ_LENGTH.
MAX_DURATION = 2
# Sampling rate is the number of audio samples recorded every second.
SAMPLING_RATE = 16000
# Batch size for training and evaluating the model.
BATCH_SIZE = 2
# Number of classes in the dataset (8, one per entry of RAVDESS_CLASS_LABELS).
NUM_CLASSES = 8
# Dimension of the model output (768 in case of Wav2Vec 2.0 - Base).
HIDDEN_DIM = 768
# Maximum length of the input audio, expressed in samples.
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE
# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
MAX_FRAMES = 99
# Maximum number of training epochs.
MAX_EPOCHS = 5
RAVDESS_CLASS_LABELS = (
    "angry",
    "calm",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
    "surprise",
)
MODEL_CHECKPOINT = "facebook/wav2vec2-base"

labels = RAVDESS_CLASS_LABELS
# Lookup tables between class names and string ids; filled in further down.
label2id, id2label = {}, {}
29
+
30
+ from transformers import TFWav2Vec2Model
31
+
32
+
33
def mean_pool(hidden_states, feature_lengths):
    """Average each clip's encoder frames while ignoring padded frames.

    The mask is sized from ``hidden_states`` itself, so the function works
    for any batch size and any number of frames rather than only for the
    training-time ``(BATCH_SIZE, MAX_FRAMES, HIDDEN_DIM)`` shape.

    Args:
        hidden_states: Encoder output, indexed here as
            (batch, frames, hidden).
        feature_lengths: Per-clip count of valid (non-padding) frames,
            one entry per batch row.

    Returns:
        A (batch, hidden) tensor holding the mean of the valid frames of
        each clip. Rows whose length is zero yield zeros instead of NaN.
    """
    num_frames = tf.shape(hidden_states)[1]
    # True for the first `feature_lengths[i]` frames of row i, False after.
    # This is exactly the mask the reverse/cumsum round-trip used to rebuild.
    valid_frames = tf.sequence_mask(
        feature_lengths, maxlen=num_frames, dtype=tf.dtypes.bool
    )
    # Trailing axis of size 1 lets the mask broadcast over the hidden axis.
    frame_mask = tf.expand_dims(valid_frames, -1)
    # Zero out padded frames so they do not contribute to the sum.
    masked_states = tf.where(
        frame_mask, hidden_states, tf.zeros_like(hidden_states)
    )
    frame_counts = tf.math.reduce_sum(
        tf.cast(frame_mask, dtype=hidden_states.dtype), axis=1
    )
    # divide_no_nan guards against clips with no valid frame at all.
    pooled_state = tf.math.divide_no_nan(
        tf.math.reduce_sum(masked_states, axis=1), frame_counts
    )
    return pooled_state
53
+
54
+
55
class TFWav2Vec2ForAudioClassification(keras.Model):
    """Wav2Vec 2.0 encoder followed by pooling, dropout and flattening.

    No classification head is attached: ``call`` returns the pooled
    encoder features.
    """

    def __init__(self, model_checkpoint):
        """Load the pretrained encoder and create the pooling layers.

        Args:
            model_checkpoint: Checkpoint identifier forwarded to
                ``TFWav2Vec2Model.from_pretrained``.
        """
        super().__init__()
        # Wav2Vec 2.0 without the classification head, loaded from the
        # PyTorch weights with SpecAugment switched off.
        encoder_options = {"apply_spec_augment": False, "from_pt": True}
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, **encoder_options
        )
        self.pooling = layers.GlobalAveragePooling1D()
        self.flat = layers.Flatten()
        self.intermediate_layer_dropout = layers.Dropout(0.5)

    def call(self, inputs):
        """Encode the audio and pool the frames into one vector per clip.

        Args:
            inputs: Indexable pair; ``inputs[0]`` is fed to the encoder and
                ``inputs[1]`` is treated as the attention mask when it is a
                tensor.

        Returns:
            The pooled features after dropout and flattening.
        """
        encoded = self.wav2vec2(inputs[0])[0]
        mask = inputs[1]
        if not tf.is_tensor(mask):
            # No usable mask: plain average over every frame.
            pooled = self.pooling(encoded)
        else:
            # The last entry of the running sum is the per-clip mask total.
            sample_counts = tf.cumsum(mask, -1)[:, -1]
            frame_counts = (
                self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                    sample_counts
                )
            )
            pooled = mean_pool(encoded, frame_counts)

        return self.flat(self.intermediate_layer_dropout(pooled))
82
+
83
# Build the feature model and restore its weights from 'model.keras'.
wav2vec2_model = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT)
wav2vec2_model.load_weights('model.keras')

# Fill both lookup tables in one pass; class ids are stored as strings.
for class_index, class_name in enumerate(labels):
    class_id = str(class_index)
    label2id[class_name] = class_id
    id2label[class_id] = class_name
89
+
90
+
91
+
92
# The extractor is asked for an attention mask as well, because the model
# receives both the input values and the mask at prediction time.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT, return_attention_mask=True
)

xgb_params = {
    'objective': 'binary:logistic',
    'predictor': 'cpu_predictor',
    'disable_default_eval_metric': 'true',
}

model_xgb = xgb.XGBClassifier(**xgb_params)
# A freshly constructed classifier is unfitted and cannot predict; restore
# the trained booster that is uploaded next to this script.
model_xgb.load_model('xgb.json')
102
+
103
+ def greet(name):
104
+ inp = feature_extractor(
105
+ name[1],
106
+ sampling_rate=feature_extractor.sampling_rate,
107
+ max_length=MAX_SEQ_LENGTH,
108
+ truncation=True,
109
+ padding=True,
110
+ )
111
+ inp = np.array([y for x,y in inp.items()])
112
+ pred = wav2vec2_model.predict([inp[0],inp[1]])
113
+ pred = model_xgb.predict(pred)
114
+ lab = id2label[str(pred[0])]
115
+ return lab
116
+
117
+ iface = gr.Interface(fn=greet, inputs="audio", outputs="text")
118
+ iface.launch()
model.keras ADDED
Binary file (8.64 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ datasets
3
+ huggingface-hub
4
+ joblib
5
+ librosa
6
+ resampy
7
+ tensorflow
8
+ sklearnex
9
+ keras
xgb.json ADDED
The diff for this file is too large to render. See raw diff