Ariel committed on
Commit
c6cec04
·
1 Parent(s): ed3ae15

Update files

README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Spell Net
3
- emoji: 🐒
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 4.12.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
1
  ---
2
+ title: SpellNet
3
+ emoji: 👀
4
+ colorFrom: yellow
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import os
3
+ import json
4
+ from tensorflow.math import argmax
5
+ import gradio as gr
6
+ from tensorflow.keras.models import model_from_json
7
+ import src.video_with_landmarks
8
+ import src.video_to_landmark_coordinates
9
+ import src.preprocess_coordinates_data
10
+ from src.load_model import Embedding, Encoder, Decoder, LandmarkEmbedding, EncoderTransformerBlock, MultiHeadAttention, \
11
+ DecoderTransformerBlock
12
+ import src.predict_sequence
13
+
14
+ # Load the character to prediction index dictionary
15
+ character_to_prediction = 'src/character_to_prediction_index.json'
16
+ with open(character_to_prediction) as json_file:
17
+ ORD2CHAR = json.load(json_file)
18
+
19
+ # Load the variables from the JSON file
20
+ json_file_path = "src/variables.json"
21
+ with open(json_file_path, 'r') as json_file:
22
+ variables_dict = json.load(json_file)
23
+
24
+ # Load the model architecture from the JSON file
25
+ json_file = open('src/model_architecture.json', 'r')
26
+ loaded_model_json = json_file.read()
27
+ json_file.close()
28
+
29
+ # Import lips landmark indices
30
+ LIPS_LANDMARK_IDXS = np.array(variables_dict['LIPS_LANDMARK_IDXS'])
31
+
32
+ custom_objects = {'Embedding': Embedding,
33
+ 'Encoder': Encoder,
34
+ 'Decoder': Decoder,
35
+ 'LandmarkEmbedding': LandmarkEmbedding,
36
+ 'EncoderTransformerBlock': EncoderTransformerBlock,
37
+ 'MultiHeadAttention': MultiHeadAttention,
38
+ 'DecoderTransformerBlock': DecoderTransformerBlock}
39
+
40
+
41
+ def process_and_print_sequence(df):
42
+ """
43
+ Process the input DataFrame using specified data processing steps and print the shapes of the sequences.
44
+
45
+ Parameters:
46
+ df (pd.DataFrame): Input DataFrame containing tracking data.
47
+
48
+ Returns:
49
+ processed_sequence (np.ndarray): Processed sequence as a NumPy array.
50
+ """
51
+ LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['left_hand'], ['z'])
52
+ RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['right_hand'], ['z'])
53
+ LIPS_IDXS0, LIPS_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['face'], ['z'], idxs_pos=LIPS_LANDMARK_IDXS)
54
+ COLUMNS0 = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0))
55
+ N_COLS0 = len(COLUMNS0)
56
+
57
+ df = df[COLUMNS0] # select only columns of interest equal to N_COLS0
58
+ all_tracking_sequence = df.values.reshape(1, -1, N_COLS0).astype(
59
+ np.float32) # reshape after converting DataFrame to numpy array
60
+ preprocess_layer_instance = src.preprocess_coordinates_data.PreprocessLayer() # instantiate PreprocessLayer class
61
+ processed_sequence = preprocess_layer_instance(all_tracking_sequence) # call instance with data
62
+
63
+ print(f'input sequence shape: {all_tracking_sequence.shape}')
64
+ print(f'processed sequence shape: {processed_sequence.shape}')
65
+
66
+ return processed_sequence
67
+
68
+
69
+ def predict_final_sequence(processed_sequence, model):
70
+ """
71
+ This function makes a prediction on a given sequence using a pre-trained model.
72
+
73
+ The sequence is expanded along the 0th dimension to account for batch size.
74
+ The prediction is made using the `predict_phrase` function, which should return a one-hot encoded prediction.
75
+ This one-hot encoded prediction is then converted into index values using argmax.
76
+ Finally, these index values are converted into a string representation using the `outputs2phrase` function.
77
+
78
+ Args:
79
+ processed_sequence (numpy array): An array representing the sequence to make a prediction on.
80
+ This should be of shape (128,164).
81
+ model (tensorflow.python.keras.engine.training.Model): The pre-trained model to use for making predictions.
82
+
83
+ Returns:
84
+ final_prediction (str): The final prediction made by the model, represented as a string.
85
+ """
86
+ # change shape to (1,128,164)
87
+ sequence = np.expand_dims(processed_sequence, axis=0) # change shape to (1,128,164)
88
+
89
+ # Convert the one-hot encoded prediction to a string
90
+ predicted_phrase_one_hot = src.predict_sequence.predict_phrase(sequence, model)
91
+ predicted_phrase_one_hot = predicted_phrase_one_hot[0] # Remove the batch dimension
92
+ predicted_phrase = argmax(predicted_phrase_one_hot, axis=-1).numpy() # Convert one-hot encoding to index values
93
+ print(predicted_phrase)
94
+ final_prediction = src.predict_sequence.outputs2phrase(predicted_phrase, ORD2CHAR)
95
+ return final_prediction
96
+
97
+
98
+ def video_identity(video):
99
+ """
100
+ Processes a video, extracts landmarks, feeds them to a pre-trained model, and makes a prediction.
101
+
102
+ The processing pipeline consists of the following steps:
103
+ 1. Process the video with landmarks.
104
+ 2. Extract landmarks coordinates and save them into a DataFrame.
105
+ 3. Preprocess the landmarks.
106
+ 4. Load a pre-trained model.
107
+ 5. Feed the preprocessed landmarks to the model and get a prediction.
108
+
109
+ Parameters:
110
+ video (str): Path to the video file.
111
+
112
+ Returns:
113
+ tuple: The path to the processed video with landmarks and the predicted outcome.
114
+ """
115
+ # 1. load video and process it with landmarks
116
+ original_video_path = video
117
+ output_path = "src/video_landmarks.mp4"
118
+ src.video_with_landmarks.process_video_with_landmarks(original_video_path, output_path)
119
+
120
+ # 2. extract landmarks coordinates
121
+ df = src.video_to_landmark_coordinates.video_to_landmarks(output_path,
122
+ src.video_to_landmark_coordinates.generate_column_names())
123
+ # Save the DataFrame to a CSV file
124
+ # df.to_csv('landmarks.csv', index=False)
125
+
126
+ # 3. preprocess landmarks
127
+ # Read data from a CSV file
128
+ # df = pd.read_csv('landmarks2.csv')
129
+ # df.drop(['sequence_id'],axis = 1, inplace=True)
130
+ processed_sequence = process_and_print_sequence(df)
131
+
132
+ # 4. load model
133
+ # load model architecture from JSON file
134
+ model = model_from_json(loaded_model_json, custom_objects=custom_objects)
135
+
136
+ # load weights into the new model
137
+ model.load_weights("src/model.h5")
138
+
139
+ # 5. predict
140
+ prediction = predict_final_sequence(processed_sequence, model)
141
+ print(prediction)
142
+
143
+ return output_path, prediction
144
+
145
+
146
+ iface = gr.Interface(video_identity,
147
+ inputs=gr.Video(label="Upload your video"), # Adding a label to the input
148
+ outputs=[gr.Video(label="Processed video"), gr.Textbox(label="Predicted sequence")],
149
+ # Adding labels to the outputs
150
+ title="SpellNet", # Adding a title
151
+ # Adding a description
152
+ description="This application analyzes your video to interpret American Sign Language (ASL) gestures corresponding to letters, numbers, and other signs. It returns the original video with the detected landmarks overlaid, together with the predicted ASL sequence decoded as text.",
153
+ theme="gradio/monochrome", # Changing the theme
154
+ examples=[os.path.join(os.path.dirname(__file__), "src/videoplayback.mp4")],
155
+ cache_examples=False) # Disabling caching
156
+
157
+ if __name__ == "__main__":
158
+ iface.launch(share=False)
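
For a quick local check outside the Gradio UI, the same pipeline can be exercised directly. A minimal sketch, assuming this Space's files (the src/ package, model.h5 and the bundled example clip) are present in the working directory; note that importing app also builds the Gradio interface defined above:

# local_test.py -- hypothetical helper script, not part of this commit
from app import video_identity

if __name__ == "__main__":
    annotated_path, predicted_text = video_identity("src/videoplayback.mp4")
    print("annotated video:", annotated_path)        # src/video_landmarks.mp4
    print("predicted ASL sequence:", predicted_text)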
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+
2
+ opencv-python
3
+ mediapipe
4
+ tensorflow
5
+ numpy
6
+ gradio
src/character_to_prediction_index.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "0": " ",
3
+ "1": "!",
4
+ "2": "#",
5
+ "3": "$",
6
+ "4": "%",
7
+ "5": "&",
8
+ "6": "'",
9
+ "7": "(",
10
+ "8": ")",
11
+ "9": "*",
12
+ "10": "+",
13
+ "11": ",",
14
+ "12": "-",
15
+ "13": ".",
16
+ "14": "/",
17
+ "15": "0",
18
+ "16": "1",
19
+ "17": "2",
20
+ "18": "3",
21
+ "19": "4",
22
+ "20": "5",
23
+ "21": "6",
24
+ "22": "7",
25
+ "23": "8",
26
+ "24": "9",
27
+ "25": ":",
28
+ "26": ";",
29
+ "27": "=",
30
+ "28": "?",
31
+ "29": "@",
32
+ "30": "[",
33
+ "31": "_",
34
+ "32": "a",
35
+ "33": "b",
36
+ "34": "c",
37
+ "35": "d",
38
+ "36": "e",
39
+ "37": "f",
40
+ "38": "g",
41
+ "39": "h",
42
+ "40": "i",
43
+ "41": "j",
44
+ "42": "k",
45
+ "43": "l",
46
+ "44": "m",
47
+ "45": "n",
48
+ "46": "o",
49
+ "47": "p",
50
+ "48": "q",
51
+ "49": "r",
52
+ "50": "s",
53
+ "51": "t",
54
+ "52": "u",
55
+ "53": "v",
56
+ "54": "w",
57
+ "55": "x",
58
+ "56": "y",
59
+ "57": "z",
60
+ "58": "~"
61
+ }
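
This mapping is what outputs2phrase (in src/predict_sequence.py) uses to turn predicted token ids back into characters; ids 59 (PAD) and 60 (SOS) are intentionally absent from it. A small illustrative sketch with made-up ids:

import json

with open("src/character_to_prediction_index.json") as f:
    ord2char = {int(k): v for k, v in json.load(f).items()}

ids = [39, 36, 43, 43, 46]                        # hypothetical ids, not a real model prediction
print("".join(ord2char.get(i, "") for i in ids))  # -> "hello"; unknown ids (e.g. PAD=59) map to ""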
src/landmarks.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/load_model.py ADDED
@@ -0,0 +1,346 @@
1
+ import tensorflow as tf
2
+ import json
3
+ import numpy as np
4
+ from tensorflow.keras.models import model_from_json
5
+ import os
6
+
7
+ # Convert the variables to the correct data type
8
+ # Load the variables from the JSON file
9
+ # Get the directory of the current file (load_model.py)
10
+ current_directory = os.path.dirname(os.path.abspath(__file__))
11
+
12
+ # Construct the path to variables.json
13
+ json_file_path = os.path.join(current_directory, 'variables.json')
14
+ with open(json_file_path, 'r') as json_file:
15
+ variables_dict = json.load(json_file)
16
+
17
+ # Epsilon value for layer normalisation
18
+ LAYER_NORM_EPS = variables_dict['LAYER_NORM_EPS']
19
+
20
+ # final embedding and transformer embedding size
21
+ UNITS_ENCODER = variables_dict['UNITS_ENCODER']
22
+ UNITS_DECODER = variables_dict['UNITS_DECODER']
23
+
24
+ # Transformer
25
+ NUM_BLOCKS_ENCODER = variables_dict['NUM_BLOCKS_ENCODER']
26
+ NUM_BLOCKS_DECODER = variables_dict['NUM_BLOCKS_DECODER']
27
+ NUM_HEADS = variables_dict['NUM_HEADS']
28
+ MLP_RATIO = variables_dict['MLP_RATIO']
29
+
30
+ # Dropout
31
+ EMBEDDING_DROPOUT = variables_dict['EMBEDDING_DROPOUT']
32
+ MLP_DROPOUT_RATIO = variables_dict['MLP_DROPOUT_RATIO']
33
+ MHA_DROPOUT_RATIO = variables_dict['MHA_DROPOUT_RATIO']
34
+ CLASSIFIER_DROPOUT_RATIO = variables_dict['CLASSIFIER_DROPOUT_RATIO']
35
+
36
+ # Number of Frames to resize recording to
37
+ N_TARGET_FRAMES = variables_dict['N_TARGET_FRAMES']
38
+ N_UNIQUE_CHARACTERS = variables_dict['N_UNIQUE_CHARACTERS']
39
+ N_UNIQUE_CHARACTERS0 = variables_dict['N_UNIQUE_CHARACTERS0']
40
+ PAD_TOKEN = variables_dict['PAD_TOKEN']
41
+ SOS_TOKEN = variables_dict['SOS_TOKEN']
42
+
43
+ # Length of Phrase + EOS Token
44
+ MAX_PHRASE_LENGTH = variables_dict['MAX_PHRASE_LENGTH']
45
+
46
+ # Mean/Standard Deviations of data used for normalizing
47
+ MEANS = np.array(variables_dict['MEANS'], dtype=np.float32)
48
+ STDS = np.array(variables_dict['STDS'], dtype=np.float32)
49
+
50
+ # Initializers
51
+ INIT_HE_UNIFORM = tf.keras.initializers.he_uniform
52
+ INIT_GLOROT_UNIFORM = tf.keras.initializers.glorot_uniform
53
+ INIT_ZEROS = tf.keras.initializers.constant(0.0)
54
+ # Activations
55
+ GELU = tf.keras.activations.gelu
56
+
57
+
58
+ class Embedding(tf.keras.Model):
59
+ def __init__(self, **kwargs):
60
+ super(Embedding, self).__init__(**kwargs)
61
+ self.supports_masking = True
62
+
63
+ def build(self, input_shape):
64
+ self.positional_embedding = tf.Variable(
65
+ initial_value=tf.zeros([N_TARGET_FRAMES, UNITS_ENCODER], dtype=tf.float32),
66
+ trainable=True, name='embedding_positional_encoder')
67
+ self.dominant_hand_embedding = LandmarkEmbedding(UNITS_ENCODER, 'dominant_hand')
68
+
69
+ def call(self, x, training=False):
70
+ x = tf.where(tf.math.equal(x, 0.0), 0.0, (x - MEANS) / STDS)
71
+ x = self.dominant_hand_embedding(x)
72
+ x = x + self.positional_embedding
73
+ return x
74
+
75
+ def get_config(self):
76
+ return super().get_config()
77
+
78
+ @classmethod
79
+ def from_config(cls, config):
80
+ return cls(**config)
81
+
82
+
83
+ class Encoder(tf.keras.Model):
84
+ def __init__(self, num_blocks, **kwargs):
85
+ super(Encoder, self).__init__(**kwargs)
86
+ self.num_blocks = num_blocks
87
+ self.supports_masking = True
88
+ self.blocks = [
89
+ EncoderTransformerBlock(UNITS_ENCODER, NUM_HEADS, MLP_RATIO, MHA_DROPOUT_RATIO, MLP_DROPOUT_RATIO) for _ in
90
+ range(num_blocks)]
91
+
92
+ if UNITS_ENCODER != UNITS_DECODER:
93
+ self.dense_out = tf.keras.layers.Dense(UNITS_DECODER, kernel_initializer=INIT_GLOROT_UNIFORM,
94
+ use_bias=False)
95
+ self.apply_dense_out = True
96
+ else:
97
+ self.apply_dense_out = False
98
+
99
+ def call(self, x, x_inp, training=False):
100
+ attention_mask = tf.where(tf.math.reduce_sum(x_inp, axis=[2]) == 0.0, 0.0, 1.0)
101
+ attention_mask = tf.expand_dims(attention_mask, axis=1)
102
+ attention_mask = tf.repeat(attention_mask, repeats=N_TARGET_FRAMES, axis=1)
103
+
104
+ for block in self.blocks:
105
+ x = block(x, attention_mask=attention_mask, training=training)
106
+
107
+ if self.apply_dense_out:
108
+ x = self.dense_out(x)
109
+
110
+ return x, attention_mask
111
+
112
+ def get_config(self):
113
+ config = super().get_config()
114
+ config.update({"num_blocks": self.num_blocks})
115
+ return config
116
+
117
+ @classmethod
118
+ def from_config(cls, config):
119
+ return cls(**config)
120
+
121
+
122
+ class Decoder(tf.keras.Model):
123
+ def __init__(self, num_blocks, **kwargs):
124
+ super(Decoder, self).__init__(**kwargs)
125
+ self.num_blocks = num_blocks
126
+ self.supports_masking = True
127
+ self.positional_embedding = tf.Variable(
128
+ initial_value=tf.zeros([N_TARGET_FRAMES, UNITS_DECODER], dtype=tf.float32),
129
+ trainable=True, name='embedding_positional_encoder')
130
+ self.char_emb = tf.keras.layers.Embedding(N_UNIQUE_CHARACTERS, UNITS_DECODER, embeddings_initializer=INIT_ZEROS)
131
+ self.pos_emb_mha = MultiHeadAttention(UNITS_DECODER, NUM_HEADS, MHA_DROPOUT_RATIO)
132
+ self.pos_emb_ln = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
133
+ self.blocks = [
134
+ DecoderTransformerBlock(UNITS_DECODER, NUM_HEADS, MLP_RATIO, MHA_DROPOUT_RATIO, MLP_DROPOUT_RATIO) for _ in
135
+ range(num_blocks)]
136
+
137
+ def get_causal_attention_mask(self, B):
138
+ ones = tf.ones((N_TARGET_FRAMES, N_TARGET_FRAMES))
139
+ mask = tf.linalg.band_part(ones, 0, -1)
140
+ mask = tf.transpose(mask)
141
+ mask = tf.expand_dims(mask, axis=0)
142
+ mask = tf.tile(mask, [B, 1, 1])
143
+ mask = tf.cast(mask, tf.float32)
144
+ return mask
145
+
146
+ def call(self, encoder_outputs, attention_mask, phrase, training=False):
147
+ B = tf.shape(encoder_outputs)[0]
148
+ phrase = tf.cast(phrase, tf.int32)
149
+ phrase = tf.pad(phrase, [[0, 0], [1, 0]], constant_values=SOS_TOKEN, name='prepend_sos_token')
150
+ phrase = tf.pad(phrase, [[0, 0], [0, N_TARGET_FRAMES - MAX_PHRASE_LENGTH - 1]], constant_values=PAD_TOKEN,
151
+ name='append_pad_token')
152
+ causal_mask = self.get_causal_attention_mask(B)
153
+ x = self.positional_embedding + self.char_emb(phrase)
154
+ x = self.pos_emb_ln(x + self.pos_emb_mha(x, x, x, attention_mask=causal_mask))
155
+
156
+ for block in self.blocks:
157
+ x = block(x, encoder_outputs, attention_mask=attention_mask, training=training)
158
+
159
+ x = tf.slice(x, [0, 0, 0], [-1, MAX_PHRASE_LENGTH, -1])
160
+ return x
161
+
162
+ def get_config(self):
163
+ config = super().get_config()
164
+ config.update({"num_blocks": self.num_blocks})
165
+ return config
166
+
167
+ @classmethod
168
+ def from_config(cls, config):
169
+ return cls(**config)
170
+
171
+
172
+ # Embeds a landmark using fully connected layers
173
+ class LandmarkEmbedding(tf.keras.Model):
174
+ def __init__(self, units, name):
175
+ super(LandmarkEmbedding, self).__init__(name=f'{name}_embedding')
176
+ self.units = units
177
+ self.supports_masking = True
178
+
179
+ def build(self, input_shape):
180
+ # Embedding for missing landmark in frame, initialized with zeros
181
+ self.empty_embedding = self.add_weight(
182
+ name=f'{self.name}_empty_embedding',
183
+ shape=[self.units],
184
+ initializer=INIT_ZEROS,
185
+ )
186
+ # Embedding
187
+ self.dense = tf.keras.Sequential([
188
+ tf.keras.layers.Dense(self.units, name=f'{self.name}_dense_1', use_bias=False,
189
+ kernel_initializer=INIT_GLOROT_UNIFORM, activation=GELU), # Can change activation
190
+ tf.keras.layers.Dense(self.units, name=f'{self.name}_dense_2', use_bias=False,
191
+ kernel_initializer=INIT_HE_UNIFORM),
192
+ ], name=f'{self.name}_dense')
193
+
194
+ def call(self, x):
195
+ return tf.where(
196
+ # Checks whether landmark is missing in frame
197
+ tf.reduce_sum(x, axis=2, keepdims=True) == 0,
198
+ # If so, the empty embedding is used
199
+ self.empty_embedding,
200
+ # Otherwise the landmark data is embedded
201
+ self.dense(x),
202
+ )
203
+
204
+ def get_config(self):
205
+ config = super().get_config()
206
+ config.update({"units": self.units, "name": self.name})
207
+ return config
208
+
209
+ @classmethod
210
+ def from_config(cls, config):
211
+ return cls(**config)
212
+
213
+
214
+ class EncoderTransformerBlock(tf.keras.layers.Layer):
215
+ def __init__(self, units, num_heads, mlp_ratio, mha_dropout_ratio, mlp_dropout_ratio, **kwargs):
216
+ super(EncoderTransformerBlock, self).__init__(**kwargs)
+ # store constructor arguments so get_config() below can serialize them
+ self.units, self.num_heads, self.mlp_ratio = units, num_heads, mlp_ratio
+ self.mha_dropout_ratio, self.mlp_dropout_ratio = mha_dropout_ratio, mlp_dropout_ratio
217
+ self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
218
+ self.mha = MultiHeadAttention(units, num_heads, mha_dropout_ratio)
219
+ self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
220
+ self.mlp = tf.keras.Sequential([
221
+ tf.keras.layers.Dense(units * mlp_ratio, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM,
222
+ use_bias=False),
223
+ tf.keras.layers.Dropout(mlp_dropout_ratio),
224
+ tf.keras.layers.Dense(units, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
225
+ ])
226
+
227
+ def call(self, inputs, attention_mask, training=False):
228
+ x = self.layer_norm_1(inputs + self.mha(inputs, inputs, inputs, attention_mask=attention_mask))
229
+ x = self.layer_norm_2(x + self.mlp(x))
230
+ return x
231
+
232
+ def get_config(self):
233
+ config = super().get_config()
234
+ config.update({"units": self.units, "num_heads": self.num_heads, "mlp_ratio": self.mlp_ratio,
235
+ "mha_dropout_ratio": self.mha_dropout_ratio, "mlp_dropout_ratio": self.mlp_dropout_ratio})
236
+ return config
237
+
238
+ @classmethod
239
+ def from_config(cls, config):
240
+ return cls(**config)
241
+
242
+
243
+ # replaced softmax with softmax layer to support masked softmax
244
+ def scaled_dot_product(q, k, v, softmax, attention_mask):
245
+ # calculates Q . K(transpose)
246
+ qkt = tf.matmul(q, k, transpose_b=True)
247
+ # calculates scaling factor
248
+ dk = tf.math.sqrt(tf.cast(q.shape[-1], dtype=tf.float32))
249
+ scaled_qkt = qkt / dk
250
+ softmax = softmax(scaled_qkt, mask=attention_mask)
251
+ z = tf.matmul(softmax, v)
252
+ # shape: (m,Tx,depth), same shape as q,k,v
253
+ return z
254
+
255
+
256
+ class MultiHeadAttention(tf.keras.layers.Layer):
257
+ def __init__(self, d_model, num_of_heads, dropout, d_out=None):
258
+ super(MultiHeadAttention, self).__init__()
259
+ self.d_model = d_model
+ self.dropout = dropout  # stored so get_config() below can serialize it
260
+ self.num_of_heads = num_of_heads
261
+ self.depth = d_model // num_of_heads # Can change
262
+ self.wq = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in
263
+ range(num_of_heads)] # depth//2 isn't common, we can try different numbers
264
+ self.wk = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in range(num_of_heads)]
265
+ self.wv = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in range(num_of_heads)]
266
+ self.softmax = tf.keras.layers.Softmax()
267
+ self.do = tf.keras.layers.Dropout(dropout)
268
+ self.supports_masking = True
269
+ self.wo = tf.keras.layers.Dense(d_model if d_out is None else d_out, use_bias=False)
270
+
271
+ def call(self, q, k, v, attention_mask=None, training=False):
272
+ multi_attn = []
273
+ for i in range(self.num_of_heads):
274
+ Q = self.wq[i](q)
275
+ K = self.wk[i](k)
276
+ V = self.wv[i](v)
277
+ multi_attn.append(scaled_dot_product(Q, K, V, self.softmax, attention_mask))
278
+
279
+ multi_head = tf.concat(multi_attn, axis=-1)
280
+ multi_head_attention = self.wo(multi_head)
281
+ multi_head_attention = self.do(multi_head_attention, training=training)
282
+
283
+ return multi_head_attention
284
+
285
+ def get_config(self):
286
+ config = super().get_config()
287
+ config.update({"d_model": self.d_model, "num_of_heads": self.num_of_heads, "dropout": self.dropout})
288
+ return config
289
+
290
+ @classmethod
291
+ def from_config(cls, config):
292
+ return cls(**config)
293
+
294
+
295
+ class DecoderTransformerBlock(tf.keras.layers.Layer):
296
+ def __init__(self, units, num_heads, mlp_ratio, mha_dropout_ratio, mlp_dropout_ratio, **kwargs):
297
+ super(DecoderTransformerBlock, self).__init__(**kwargs)
+ # store constructor arguments so get_config() below can serialize them
+ self.units, self.num_heads, self.mlp_ratio = units, num_heads, mlp_ratio
+ self.mha_dropout_ratio, self.mlp_dropout_ratio = mha_dropout_ratio, mlp_dropout_ratio
298
+ self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
299
+ self.mha = MultiHeadAttention(units, num_heads, mha_dropout_ratio)
300
+ self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
301
+ self.mlp = tf.keras.Sequential([
302
+ tf.keras.layers.Dense(units * mlp_ratio, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM,
303
+ use_bias=False),
304
+ tf.keras.layers.Dropout(mlp_dropout_ratio),
305
+ tf.keras.layers.Dense(units, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
306
+ ])
307
+
308
+ def call(self, inputs, encoder_outputs, attention_mask, training=False):
309
+ x = self.layer_norm_1(
310
+ inputs + self.mha(inputs, encoder_outputs, encoder_outputs, attention_mask=attention_mask))
311
+ x = self.layer_norm_2(x + self.mlp(x))
312
+ return x
313
+
314
+ def get_config(self):
315
+ config = super().get_config()
316
+ config.update({"units": self.units, "num_heads": self.num_heads, "mlp_ratio": self.mlp_ratio,
317
+ "mha_dropout_ratio": self.mha_dropout_ratio, "mlp_dropout_ratio": self.mlp_dropout_ratio})
318
+ return config
319
+
320
+ @classmethod
321
+ def from_config(cls, config):
322
+ return cls(**config)
323
+
324
+
325
+ custom_objects = {'Embedding': Embedding,
326
+ 'Encoder': Encoder,
327
+ 'Decoder': Decoder,
328
+ 'LandmarkEmbedding': LandmarkEmbedding,
329
+ 'EncoderTransformerBlock': EncoderTransformerBlock,
330
+ 'MultiHeadAttention': MultiHeadAttention,
331
+ 'DecoderTransformerBlock': DecoderTransformerBlock}
332
+
333
+ # load json and create model
334
+ model_architecture_file_path = os.path.join(current_directory, 'model_architecture.json')
335
+ json_file = open(model_architecture_file_path, 'r')
336
+ loaded_model_json = json_file.read()
337
+ json_file.close()
338
+
339
+ # load model from JSON file
340
+ loaded_model = model_from_json(loaded_model_json, custom_objects=custom_objects)
341
+
342
+ # load weights into the new model
343
+ model_weights_file_path = os.path.join(current_directory, 'model.h5')
344
+ loaded_model.load_weights(model_weights_file_path)
345
+
346
+ # loaded_model.summary(expand_nested=True, show_trainable=True, )
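
Decoder.get_causal_attention_mask above builds a lower-triangular mask so each target position can only attend to itself and earlier positions. A worked sketch of the same construction, shrunk to 4 steps instead of N_TARGET_FRAMES for readability:

import tensorflow as tf

n = 4                                                    # illustration only; the model uses N_TARGET_FRAMES (128)
ones = tf.ones((n, n))
mask = tf.transpose(tf.linalg.band_part(ones, 0, -1))    # upper-triangular, then transposed -> lower-triangular
print(mask.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
# Row i attends only to positions <= i, which enforces left-to-right decoding.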
src/model.h5 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d18e0fb837407b4b8f45411e48758a71d7ff149e31fec29b5868bc5b60e032ef
3
+ size 27599904
src/model_architecture.json ADDED
@@ -0,0 +1 @@
1
+ {"class_name": "Functional", "config": {"name": "model", "trainable": true, "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 128, 164], "dtype": "float32", "sparse": false, "ragged": false, "name": "frames"}, "name": "frames", "inbound_nodes": []}, {"class_name": "Masking", "config": {"name": "masking", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 128, 164], "mask_value": 0.0}, "name": "masking", "inbound_nodes": [[["frames", 0, 0, {}]]]}, {"class_name": "Embedding", "config": {}, "name": "embedding", "inbound_nodes": [[["masking", 0, 0, {}]]]}, {"class_name": "Encoder", "config": {"num_blocks": 3}, "name": "encoder", "inbound_nodes": [[["embedding", 0, 0, {"x_inp": ["frames", 0, 0]}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": [null, 32], "dtype": "int32", "sparse": false, "ragged": false, "name": "phrase"}, "name": "phrase", "inbound_nodes": []}, {"class_name": "Decoder", "config": {"num_blocks": 2}, "name": "decoder", "inbound_nodes": [[["encoder", 0, 0, {"attention_mask": ["encoder", 0, 1], "phrase": ["phrase", 0, 0]}]]]}, {"class_name": "Sequential", "config": {"name": "classifier", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 32, 384], "dtype": "float32", "sparse": false, "ragged": false, "name": "dropout_11_input"}}, {"class_name": "Dropout", "config": {"name": "dropout_11", "trainable": true, "dtype": "float32", "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_88", "trainable": true, "dtype": "float32", "units": 62, "activation": "linear", "use_bias": false, "kernel_initializer": {"class_name": "HeUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "name": "classifier", "inbound_nodes": [[["decoder", 0, 0, {}]]]}], "input_layers": [["frames", 0, 0], ["phrase", 0, 0]], "output_layers": [["classifier", 1, 0]]}, "keras_version": "2.12.0", "backend": "tensorflow"}
src/predict_sequence.py ADDED
@@ -0,0 +1,89 @@
1
+ import json
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import os
5
+
6
+ # Convert the variables to the correct data type
7
+ # Load the variables from the JSON file
8
+ # Get the directory of the current file (predict_sequence.py)
9
+ current_directory = os.path.dirname(os.path.abspath(__file__))
10
+
11
+ # Construct the path to variables.json
12
+ json_file_path = os.path.join(current_directory, 'variables.json')
13
+ with open(json_file_path, 'r') as json_file:
14
+ variables_dict = json.load(json_file)
15
+
16
+ MAX_PHRASE_LENGTH = variables_dict['MAX_PHRASE_LENGTH']
17
+ PAD_TOKEN = variables_dict['PAD_TOKEN']
18
+ N_UNIQUE_CHARACTERS = variables_dict['N_UNIQUE_CHARACTERS']
19
+
20
+ # Read Character to Ordinal Encoding Mapping.
21
+ character_to_prediction = os.path.join(current_directory, 'character_to_prediction_index.json')
22
+ with open(character_to_prediction) as json_file:
23
+ ORD2CHAR = json.load(json_file)
24
+
25
+
26
+
27
+ # Output Predictions to string
28
+ def outputs2phrase(outputs, ORD2CHAR):
29
+ """
30
+ Convert output sequence to a human-readable phrase by mapping
31
+ each output to a corresponding character.
32
+
33
+ Parameters:
34
+ outputs (np.array): A sequence of model output,
35
+ can be 1D (sequence of character IDs)
36
+ or 2D (sequence of one-hot encodings).
37
+ ORD2CHAR (dict): A mapping from character IDs to characters.
38
+
39
+ Returns:
40
+ str: The converted phrase.
41
+ """
42
+ ORD2CHAR = {int(k): v for k, v in ORD2CHAR.items()} # Convert keys to integers
43
+ if outputs.ndim == 2:
44
+ outputs = np.argmax(outputs, axis=1)
45
+ return ''.join([ORD2CHAR.get(s, '') for s in outputs])
46
+
47
+
48
+ @tf.function()
49
+ def predict_phrase(batch_frames, model):
50
+ """
51
+ Use a pre-trained model to predict a phrase from a batch of frame sequences.
52
+
53
+ Parameters:
54
+ batch_frames (np.array): A batch of frame sequences.
55
+ model (tf.keras.Model): The pre-trained model to use for prediction.
56
+
57
+ Returns:
58
+ tf.Tensor: One-hot encoding of the predicted phrase.
59
+ """
60
+ batch_frames = tf.convert_to_tensor(batch_frames)
61
+ phrase = tf.fill([batch_frames.shape[0], MAX_PHRASE_LENGTH], PAD_TOKEN)
62
+ phrase = tf.cast(phrase, tf.int32) # Cast phrase to int32 initially
63
+ for idx in tf.range(MAX_PHRASE_LENGTH):
64
+ # Predict Next Token
65
+ outputs = model({
66
+ 'frames': batch_frames,
67
+ 'phrase': phrase,
68
+ })
69
+
70
+ phrase = tf.where(
71
+ tf.range(MAX_PHRASE_LENGTH)[None, :] < idx + 1,
72
+ tf.argmax(outputs, axis=-1, output_type=tf.int32),
73
+ phrase,
74
+ )
75
+ # one-hot encode the outputs
76
+ outputs_one_hot = tf.one_hot(phrase, depth=N_UNIQUE_CHARACTERS)
77
+ return outputs_one_hot
78
+
79
+ # # Assuming sequence is your array of shape (128, 164)
80
+ # sequence = processed_sequence.reshape(1, *processed_sequence.shape) # reshapes sequence to (1, 128, 164)
81
+ #
82
+ # # Now you can feed sequence to your prediction function
83
+ # pred_phrase_one_hot = predict_phrase(sequence)
84
+ #
85
+ # # Convert the one-hot encoded prediction to a string
86
+ # # Remember the output is one-hot encoded so we need to convert it to integers first
87
+ # pred_phrase = outputs2phrase(tf.argmax(pred_phrase_one_hot, axis=-1).numpy())
88
+ #
89
+ # print(pred_phrase)
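
The loop in predict_phrase decodes greedily: at step idx it runs the model on the partially filled phrase and uses tf.where to overwrite only positions 0..idx with the argmax predictions, leaving the remaining positions as PAD. A tiny sketch of just that masked update, with made-up values and a shortened phrase length:

import tensorflow as tf

MAX_LEN, PAD = 6, 59                                   # shortened length for illustration; PAD matches variables.json
phrase = tf.fill([1, MAX_LEN], PAD)
fake_argmax = tf.constant([[40, 32, 44, 0, 0, 0]], dtype=tf.int32)  # hypothetical per-position predictions
idx = 2
phrase = tf.where(tf.range(MAX_LEN)[None, :] < idx + 1, fake_argmax, phrase)
print(phrase.numpy())                                  # [[40 32 44 59 59 59]]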
src/preprocess_coordinates_data.py ADDED
@@ -0,0 +1,173 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import json
5
+ import os
6
+
7
+ # Convert the variables to the correct data type
8
+ # Load the variables from the JSON file
9
+ current_directory = os.path.dirname(os.path.abspath(__file__))
10
+
11
+ # Construct the path to variables.json
12
+ json_file_path = os.path.join(current_directory, 'variables.json')
13
+ with open(json_file_path, 'r') as json_file:
14
+ variables_dict = json.load(json_file)
15
+
16
+ # Lips Landmark Face Ids
17
+ LIPS_LANDMARK_IDXS = variables_dict['LIPS_LANDMARK_IDXS']
18
+ N_TARGET_FRAMES = variables_dict['N_TARGET_FRAMES']
19
+ N_DIMS0 = variables_dict['N_DIMS0']
20
+
21
+ # Read data from a CSV file
22
+ csv_file_path = os.path.join(current_directory, 'landmarks.csv')
23
+
24
+ df = pd.read_csv(csv_file_path)
25
+
26
+ def get_idxs(df, words_pos, words_neg=[], ret_names=True, idxs_pos=None):
27
+ """
28
+ Given a DataFrame and a list of words, this function will find all the column names
29
+ that contain all the words in 'words_pos' and none of the words in 'words_neg'.
30
+
31
+ Parameters:
32
+ df (pandas.DataFrame): Dataframe to search for column names
33
+ words_pos (list of str): List of words that column names should contain
34
+ words_neg (list of str, optional): List of words that column names should not contain. Default is empty list.
35
+ ret_names (bool, optional): Whether to return column names. Default is True.
36
+ idxs_pos (list of int, optional): Column indices to search within. Default is None, which means search all columns.
37
+
38
+ Returns:
39
+ idxs (np.array): Column indices where column names meet the criteria
40
+ names (np.array): Column names that meet the criteria. Only returned if 'ret_names' is True.
41
+ """
42
+ idxs = []
43
+ names = []
44
+ for w in words_pos:
45
+ for col_idx, col in enumerate(df.columns):
46
+ # Exclude Non Landmark Columns
47
+ if col in ['frame']:
48
+ continue
49
+
50
+ col_idx = int(col.split('_')[-1])
51
+ # Keep the column if it contains the word, is in idxs_pos (when given), and contains none of the excluded words
52
+ if (w in col) and (idxs_pos is None or col_idx in idxs_pos) and all(neg not in col for neg in words_neg):
53
+ idxs.append(col_idx)
54
+ names.append(col)
55
+ # Convert to Numpy arrays
56
+ idxs = np.array(idxs)
57
+ names = np.array(names)
58
+ # Returns either both column indices and names
59
+ if ret_names:
60
+ return idxs, names
61
+ # Or only columns indices
62
+ else:
63
+ return idxs
64
+
65
+
66
+ # Get the indices of columns of interest
67
+ LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = get_idxs(df, ['left_hand'], ['z'])
68
+ RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = get_idxs(df, ['right_hand'], ['z'])
69
+ LIPS_IDXS0, LIPS_NAMES0 = get_idxs(df, ['face'], ['z'], idxs_pos=LIPS_LANDMARK_IDXS)
70
+ COLUMNS0 = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0))
71
+ N_COLS0 = len(COLUMNS0)
72
+ N_COLS = N_COLS0
73
+
74
+
75
+ class PreprocessLayerNonNaN(tf.keras.layers.Layer):
76
+ """
77
+ This is a custom layer in Keras that replaces NaN values in the input tensor with 0.
78
+ """
79
+
80
+ def __init__(self):
81
+ super(PreprocessLayerNonNaN, self).__init__()
82
+
83
+ @tf.function(
84
+ input_signature=(tf.TensorSpec(shape=[None, N_COLS0], dtype=tf.float32),),
85
+ )
86
+ def call(self, data0):
87
+ """
88
+ This method is called when the layer instance is called with some inputs.
89
+
90
+ Parameters:
91
+ data0 (Tensor): Input tensor
92
+
93
+ Returns:
94
+ data (Tensor): Output tensor with the same shape as the input, but with NaN values replaced with 0
95
+ """
96
+ # Fill NaN Values With 0
97
+ data = tf.where(tf.math.is_nan(data0), 0.0, data0)
98
+
99
+ # Hacky
100
+ data = data[None]
101
+
102
+ # Empty Hand Frame Filtering
103
+ hands = tf.slice(data, [0, 0, 0], [-1, -1, 84])
104
+ hands = tf.abs(hands)
105
+ mask = tf.reduce_sum(hands, axis=2)
106
+ mask = tf.not_equal(mask, 0)
107
+ data = data[mask][None]
108
+ data = tf.squeeze(data, axis=[0])
109
+
110
+ return data
111
+
112
+
113
+ class PreprocessLayer(tf.keras.layers.Layer):
114
+ """
115
+ This is a custom layer in Keras that pre-processes the input data in a specific way,
116
+ which includes filling NaN values with 0, filtering empty frames and resizing frames.
117
+ """
118
+
119
+ def __init__(self):
120
+ super(PreprocessLayer, self).__init__()
121
+
122
+ @tf.function(
123
+ input_signature=(tf.TensorSpec(shape=[None, None, N_COLS0], dtype=tf.float32),),
124
+ )
125
+ def call(self, data0, resize=True):
126
+ """
127
+ This method is called when the layer instance is called with some inputs.
128
+
129
+ Parameters:
130
+ data0 (Tensor): Input tensor
131
+ resize (bool, optional): Whether to resize the frames. Default is True.
132
+
133
+ Returns:
134
+ data (Tensor): Output tensor after pre-processing
135
+ """
136
+ # Fill NaN Values With 0
137
+ data = tf.where(tf.math.is_nan(data0), 0.0, data0)
138
+
139
+ # Empty Hand Frame Filtering
140
+ hands = tf.slice(data, [0, 0, 0], [-1, -1, 84])
141
+ hands = tf.abs(hands)
142
+ mask = tf.reduce_sum(hands, axis=2)
143
+ mask = tf.not_equal(mask, 0)
144
+ data = data[mask][None]
145
+
146
+ # Pad Zeros
147
+ N_FRAMES = len(data[0])
148
+ if N_FRAMES < N_TARGET_FRAMES:
149
+ data = tf.concat((
150
+ data,
151
+ tf.zeros([1, N_TARGET_FRAMES - N_FRAMES, N_COLS], dtype=tf.float32)
152
+ ), axis=1)
153
+ # Downsample
154
+ data = tf.image.resize(
155
+ data,
156
+ [1, N_TARGET_FRAMES],
157
+ method=tf.image.ResizeMethod.BILINEAR,
158
+ )
159
+
160
+ # Squeeze Batch Dimension
161
+ data = tf.squeeze(data, axis=[0])
162
+
163
+ return data
164
+
165
+
166
+ df = df[COLUMNS0] # select only columns of interest equal to N_COLS0
167
+ hand_tracking_sequence = df.values.reshape(1, -1, N_COLS0) # reshape after converting DataFrame to numpy array
168
+
169
+ preprocess_layer_instance = PreprocessLayer() # instantiate PreprocessLayer class
170
+ processed_sequence = preprocess_layer_instance(hand_tracking_sequence) # call instance with data
171
+
172
+ # print(f'input sequence shape: {hand_tracking_sequence.shape}')
173
+ # print(f'processed sequence shape: {processed_sequence.shape}')
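
PreprocessLayer's contract is that any recording comes out with exactly N_TARGET_FRAMES (128) frames: shorter clips are zero-padded and longer clips are bilinearly resampled. A dummy-data sketch, assuming it is run in this module's namespace so PreprocessLayer and N_COLS0 are in scope:

import numpy as np

layer = PreprocessLayer()
for n_frames in (40, 300):                                 # one clip shorter, one longer than 128 frames
    dummy = np.random.rand(1, n_frames, N_COLS0).astype(np.float32)
    print(n_frames, "->", layer(dummy).shape)              # (128, N_COLS0) in both cases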
src/variables.json ADDED
@@ -0,0 +1,394 @@
1
+ {
2
+ "LIPS_LANDMARK_IDXS": [
3
+ 61,
4
+ 185,
5
+ 40,
6
+ 39,
7
+ 37,
8
+ 0,
9
+ 267,
10
+ 269,
11
+ 270,
12
+ 409,
13
+ 291,
14
+ 146,
15
+ 91,
16
+ 181,
17
+ 84,
18
+ 17,
19
+ 314,
20
+ 405,
21
+ 321,
22
+ 375,
23
+ 78,
24
+ 191,
25
+ 80,
26
+ 81,
27
+ 82,
28
+ 13,
29
+ 312,
30
+ 311,
31
+ 310,
32
+ 415,
33
+ 95,
34
+ 88,
35
+ 178,
36
+ 87,
37
+ 14,
38
+ 317,
39
+ 402,
40
+ 318,
41
+ 324,
42
+ 308
43
+ ],
44
+ "LAYER_NORM_EPS": 1e-05,
45
+ "UNITS_ENCODER": 384,
46
+ "UNITS_DECODER": 384,
47
+ "NUM_BLOCKS_ENCODER": 3,
48
+ "NUM_BLOCKS_DECODER": 2,
49
+ "NUM_HEADS": 4,
50
+ "MLP_RATIO": 2,
51
+ "EMBEDDING_DROPOUT": 0.2,
52
+ "MLP_DROPOUT_RATIO": 0.2,
53
+ "MHA_DROPOUT_RATIO": 0.2,
54
+ "CLASSIFIER_DROPOUT_RATIO": 0.2,
55
+ "N_TARGET_FRAMES": 128,
56
+ "N_DIMS0": 2,
57
+ "N_UNIQUE_CHARACTERS": 62,
58
+ "N_UNIQUE_CHARACTERS0": 59,
59
+ "PAD_TOKEN": 59,
60
+ "SOS_TOKEN": 60,
61
+ "MAX_PHRASE_LENGTH": 32,
62
+ "MEANS": [
63
+ 0.69352776,
64
+ 0.60659605,
65
+ 0.53412515,
66
+ 0.4970676,
67
+ 0.48584947,
68
+ 0.5761701,
69
+ 0.5300588,
70
+ 0.49778917,
71
+ 0.47764367,
72
+ 0.6305243,
73
+ 0.5822572,
74
+ 0.55222154,
75
+ 0.53908557,
76
+ 0.68544865,
77
+ 0.63951194,
78
+ 0.6104323,
79
+ 0.5991277,
80
+ 0.7378051,
81
+ 0.7018211,
82
+ 0.6776119,
83
+ 0.6673842,
84
+ 0.76445776,
85
+ 0.7457853,
86
+ 0.7062872,
87
+ 0.67484325,
88
+ 0.6514734,
89
+ 0.6304427,
90
+ 0.5906848,
91
+ 0.5854317,
92
+ 0.5849309,
93
+ 0.6276549,
94
+ 0.5890438,
95
+ 0.59771925,
96
+ 0.6047316,
97
+ 0.6383216,
98
+ 0.60959125,
99
+ 0.6295764,
100
+ 0.6437836,
101
+ 0.6588292,
102
+ 0.6397078,
103
+ 0.65018004,
104
+ 0.65816236,
105
+ 0.26357186,
106
+ 0.35093567,
107
+ 0.4236605,
108
+ 0.45704976,
109
+ 0.4634739,
110
+ 0.37947592,
111
+ 0.4234214,
112
+ 0.45306972,
113
+ 0.4717593,
114
+ 0.3199842,
115
+ 0.36261505,
116
+ 0.38926786,
117
+ 0.40241373,
118
+ 0.26189587,
119
+ 0.30273047,
120
+ 0.3301876,
121
+ 0.34255308,
122
+ 0.20624675,
123
+ 0.23920882,
124
+ 0.263005,
125
+ 0.27461466,
126
+ 0.75472385,
127
+ 0.73504084,
128
+ 0.6943852,
129
+ 0.6608657,
130
+ 0.63613355,
131
+ 0.6144105,
132
+ 0.5700216,
133
+ 0.56217206,
134
+ 0.5597008,
135
+ 0.611077,
136
+ 0.56800383,
137
+ 0.575002,
138
+ 0.5811821,
139
+ 0.62163454,
140
+ 0.59134597,
141
+ 0.61230445,
142
+ 0.6277079,
143
+ 0.64273566,
144
+ 0.6216118,
145
+ 0.6318555,
146
+ 0.63973725,
147
+ 0.56342137,
148
+ 0.5647059,
149
+ 0.5649758,
150
+ 0.5657689,
151
+ 0.54460865,
152
+ 0.52689284,
153
+ 0.51569146,
154
+ 0.5043293,
155
+ 0.51033896,
156
+ 0.52668756,
157
+ 0.53708506,
158
+ 0.54991424,
159
+ 0.5468167,
160
+ 0.55006754,
161
+ 0.5267238,
162
+ 0.5178957,
163
+ 0.51888436,
164
+ 0.5099791,
165
+ 0.53717476,
166
+ 0.5305108,
167
+ 0.5081805,
168
+ 0.51886874,
169
+ 0.58258605,
170
+ 0.6024338,
171
+ 0.6155048,
172
+ 0.6306914,
173
+ 0.6245343,
174
+ 0.6058631,
175
+ 0.59408224,
176
+ 0.58018464,
177
+ 0.5852319,
178
+ 0.5804903,
179
+ 0.60605526,
180
+ 0.61589545,
181
+ 0.61500907,
182
+ 0.6246284,
183
+ 0.59435004,
184
+ 0.6024958,
185
+ 0.6250273,
186
+ 0.61513,
187
+ 0.508501,
188
+ 0.5193109,
189
+ 0.52219623,
190
+ 0.53701967,
191
+ 0.5069547,
192
+ 0.51169485,
193
+ 0.51677644,
194
+ 0.5253185,
195
+ 0.5245756,
196
+ 0.521367,
197
+ 0.5199756,
198
+ 0.51932734,
199
+ 0.5365361,
200
+ 0.5221106,
201
+ 0.5230684,
202
+ 0.53079647,
203
+ 0.5238175,
204
+ 0.52800494,
205
+ 0.5223436,
206
+ 0.5342269,
207
+ 0.5212379,
208
+ 0.52289945,
209
+ 0.506347,
210
+ 0.5106173,
211
+ 0.51533395,
212
+ 0.5235456,
213
+ 0.5230225,
214
+ 0.52027595,
215
+ 0.51917976,
216
+ 0.5189014,
217
+ 0.5361387,
218
+ 0.5216965,
219
+ 0.5220167,
220
+ 0.52960336,
221
+ 0.5225625,
222
+ 0.5264617,
223
+ 0.5215638,
224
+ 0.53341466,
225
+ 0.51952803,
226
+ 0.5216051
227
+ ],
228
+ "STDS": [
229
+ 0.10834738,
230
+ 0.10391748,
231
+ 0.10296664,
232
+ 0.10752504,
233
+ 0.12336373,
234
+ 0.10313869,
235
+ 0.10744168,
236
+ 0.11199072,
237
+ 0.1193621,
238
+ 0.10597368,
239
+ 0.11260378,
240
+ 0.1170811,
241
+ 0.12447591,
242
+ 0.11238337,
243
+ 0.12130429,
244
+ 0.12248141,
245
+ 0.1267081,
246
+ 0.1224081,
247
+ 0.13301295,
248
+ 0.13806877,
249
+ 0.1437398,
250
+ 0.08867608,
251
+ 0.08839962,
252
+ 0.08913112,
253
+ 0.09358086,
254
+ 0.09968524,
255
+ 0.08439907,
256
+ 0.09381164,
257
+ 0.10565417,
258
+ 0.11996002,
259
+ 0.08592986,
260
+ 0.1002507,
261
+ 0.11805841,
262
+ 0.13548768,
263
+ 0.08893858,
264
+ 0.1042807,
265
+ 0.11806193,
266
+ 0.13066797,
267
+ 0.09283979,
268
+ 0.1044982,
269
+ 0.11446757,
270
+ 0.12410894,
271
+ 0.08575833,
272
+ 0.08688664,
273
+ 0.08871841,
274
+ 0.09452496,
275
+ 0.11280894,
276
+ 0.08605019,
277
+ 0.09069607,
278
+ 0.09625262,
279
+ 0.10480069,
280
+ 0.08209087,
281
+ 0.08907479,
282
+ 0.09521613,
283
+ 0.10375828,
284
+ 0.0827678,
285
+ 0.09389319,
286
+ 0.09721766,
287
+ 0.10260603,
288
+ 0.0892784,
289
+ 0.10309231,
290
+ 0.11121955,
291
+ 0.11911318,
292
+ 0.08014706,
293
+ 0.07939664,
294
+ 0.07666104,
295
+ 0.07640523,
296
+ 0.07845239,
297
+ 0.06779566,
298
+ 0.06928173,
299
+ 0.07995176,
300
+ 0.09609538,
301
+ 0.06776656,
302
+ 0.07411631,
303
+ 0.09502285,
304
+ 0.11704809,
305
+ 0.06976698,
306
+ 0.07840788,
307
+ 0.09568293,
308
+ 0.11219386,
309
+ 0.07334771,
310
+ 0.07997227,
311
+ 0.09204492,
312
+ 0.10471888,
313
+ 0.1324311,
314
+ 0.13287905,
315
+ 0.13296498,
316
+ 0.13300247,
317
+ 0.13251117,
318
+ 0.13296743,
319
+ 0.13352127,
320
+ 0.13476767,
321
+ 0.13467269,
322
+ 0.13386367,
323
+ 0.13339657,
324
+ 0.13304512,
325
+ 0.13318144,
326
+ 0.13313657,
327
+ 0.13394693,
328
+ 0.13404495,
329
+ 0.1343446,
330
+ 0.13446471,
331
+ 0.13349241,
332
+ 0.13355125,
333
+ 0.13414721,
334
+ 0.13430822,
335
+ 0.13283393,
336
+ 0.13377732,
337
+ 0.1346423,
338
+ 0.13602652,
339
+ 0.13584861,
340
+ 0.13470158,
341
+ 0.1339573,
342
+ 0.13331288,
343
+ 0.13342074,
344
+ 0.133372,
345
+ 0.13473015,
346
+ 0.13483934,
347
+ 0.13534908,
348
+ 0.13551436,
349
+ 0.13399816,
350
+ 0.13405652,
351
+ 0.1354323,
352
+ 0.13537434,
353
+ 0.06685787,
354
+ 0.06737807,
355
+ 0.06767439,
356
+ 0.06927998,
357
+ 0.06658512,
358
+ 0.06643137,
359
+ 0.0663855,
360
+ 0.06645988,
361
+ 0.06653237,
362
+ 0.06679216,
363
+ 0.06700299,
364
+ 0.06721594,
365
+ 0.06899743,
366
+ 0.06748881,
367
+ 0.06692849,
368
+ 0.06752784,
369
+ 0.06670087,
370
+ 0.06690367,
371
+ 0.06722134,
372
+ 0.06834918,
373
+ 0.06637124,
374
+ 0.06663854,
375
+ 0.06680202,
376
+ 0.06691353,
377
+ 0.06701645,
378
+ 0.06724831,
379
+ 0.06726662,
380
+ 0.06730385,
381
+ 0.06735906,
382
+ 0.06739713,
383
+ 0.06924284,
384
+ 0.06767783,
385
+ 0.06744281,
386
+ 0.06815296,
387
+ 0.06732813,
388
+ 0.0676265,
389
+ 0.06758311,
390
+ 0.06880609,
391
+ 0.06710069,
392
+ 0.0672657
393
+ ]
394
+ }
src/video_to_landmark_coordinates.py ADDED
@@ -0,0 +1,100 @@
1
+ import cv2
2
+ import mediapipe as mp
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+
7
+ def generate_column_names():
8
+ """
9
+ Generate column names for a DataFrame that will store coordinates of landmarks.
10
+
11
+ Column names are formatted as '{coordinate}_{landmark_type}_{landmark_index}'.
12
+
13
+ Returns:
14
+ list: A list of strings representing the column names.
15
+ """
16
+ columns = ['frame']
17
+
18
+ # face columns
19
+ for coordinate in ['x', 'y']:
20
+ for i in range(468): # Mediapipe face mesh contains 468 landmarks
21
+ columns.append(f'{coordinate}_face_{i}')
22
+
23
+ # hands columns
24
+ for hand in ['left_hand', 'right_hand']:
25
+ for coordinate in ['x', 'y']:
26
+ for i in range(21): # Mediapipe hand model contains 21 landmarks
27
+ columns.append(f'{coordinate}_{hand}_{i}')
28
+
29
+ return columns
30
+
31
+
32
+ def video_to_landmarks(video_path, columns):
33
+ """
34
+ Extract face and hand landmarks from a video and store them in a DataFrame.
35
+
36
+ The video is processed frame by frame. For each frame, face and hand landmarks
37
+ are detected using MediaPipe's face mesh and hand models, respectively.
38
+ The coordinates of the landmarks are stored in a DataFrame.
39
+
40
+ Parameters:
41
+ video_path (str): Path to the video file.
42
+ columns (list): List of column names for the DataFrame.
43
+
44
+ Returns:
45
+ pd.DataFrame: A DataFrame where each row corresponds to a frame and each column corresponds to a landmark.
46
+ """
47
+ mp_drawing = mp.solutions.drawing_utils
48
+ mp_face_mesh = mp.solutions.face_mesh
49
+ mp_hands = mp.solutions.hands
50
+
51
+ cap = cv2.VideoCapture(video_path)
52
+ df = pd.DataFrame(columns=columns)
53
+
54
+ with mp_face_mesh.FaceMesh() as face_mesh, mp_hands.Hands(max_num_hands=2) as hands:
55
+ frame_count = 0
56
+ while cap.isOpened():
57
+ success, frame = cap.read()
58
+ if not success:
59
+ break
60
+
61
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
62
+ results_face = face_mesh.process(rgb_frame)
63
+ results_hands = hands.process(rgb_frame)
64
+
65
+ # Initialize frame dictionary with NaNs
66
+ frame_data = {column: np.nan for column in columns}
67
+ frame_data['frame'] = frame_count
68
+
69
+ # Process face landmarks
70
+ if results_face.multi_face_landmarks:
71
+ for face_landmarks in results_face.multi_face_landmarks:
72
+ for i, landmark in enumerate(face_landmarks.landmark):
73
+ frame_data[f'x_face_{i}'] = landmark.x
74
+ frame_data[f'y_face_{i}'] = landmark.y
75
+
76
+ # Process hand landmarks
77
+ if results_hands.multi_hand_landmarks:
78
+ for hand_landmarks in results_hands.multi_hand_landmarks:
79
+ if hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x < hand_landmarks.landmark[
80
+ mp_hands.HandLandmark.THUMB_TIP].x:
81
+ hand_type = 'left_hand'
82
+ else:
83
+ hand_type = 'right_hand'
84
+
85
+ for i, landmark in enumerate(hand_landmarks.landmark):
86
+ frame_data[f'x_{hand_type}_{i}'] = landmark.x
87
+ frame_data[f'y_{hand_type}_{i}'] = landmark.y
88
+
89
+ df = pd.concat([df, pd.DataFrame([frame_data])], ignore_index=True)
90
+ frame_count += 1
91
+
92
+ cap.release()
93
+
94
+ return df
95
+
96
+ # video_path = "videoplayback_with_landmarks.mp4"
97
+ # df = video_to_landmarks(video_path, generate_column_names())
98
+ #
99
+ # # Save the DataFrame to a CSV file
100
+ # df.to_csv('landmarks.csv', index=False)
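
As a quick sanity check of the layout produced by generate_column_names(): 1 frame column + 2 coordinates x 468 face landmarks + 2 hands x 2 coordinates x 21 landmarks = 1021 columns. A short sketch:

from src.video_to_landmark_coordinates import generate_column_names

cols = generate_column_names()
print(len(cols))                    # 1021
print(cols[0], cols[1], cols[-1])   # frame x_face_0 y_right_hand_20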
src/video_with_landmarks.py ADDED
@@ -0,0 +1,89 @@
1
+ import cv2
2
+ import mediapipe as mp
3
+
4
+ def process_video_with_landmarks(video_path, output_path, scale_percent=100):
5
+ """
6
+ Process a video to identify and draw landmarks on faces and hands.
7
+
8
+ Parameters:
9
+ video_path (str): The path to the input video file.
10
+ output_path (str): The path to the output video file.
11
+ scale_percent (int, optional): The percentage of the original size. Default is 100.
12
+ """
13
+ # MediaPipe solutions
14
+ mp_drawing = mp.solutions.drawing_utils
15
+ mp_face_mesh = mp.solutions.face_mesh
16
+ mp_hands = mp.solutions.hands
17
+
18
+ # Open the video file
19
+ cap = cv2.VideoCapture(video_path)
20
+
21
+ # Get the video properties
22
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
23
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
24
+ fps = cap.get(cv2.CAP_PROP_FPS)
25
+
26
+ # Calculate the scale dimensions
27
+ width = int(width * scale_percent / 100)
28
+ height = int(height * scale_percent / 100)
29
+
30
+ # Define the output video file
31
+ # fourcc = cv2.VideoWriter_fourcc(*'h264')
32
+
33
+ fourcc = cv2.VideoWriter_fourcc(*'HEVC')
34
+ out_fps = fps / 0.6 # Speed up playback: output fps is the original fps divided by 0.6 (about 1.67x)
35
+ out = cv2.VideoWriter(output_path, fourcc, out_fps, (width, height))
36
+
37
+ # Process each frame
38
+ with mp_face_mesh.FaceMesh() as face_mesh, mp_hands.Hands() as hands:
39
+ while cap.isOpened():
40
+ success, frame = cap.read()
41
+ if not success:
42
+ break
43
+
44
+ # Resize the frame
45
+ frame = cv2.resize(frame, (width, height), interpolation = cv2.INTER_AREA)
46
+
47
+ # Convert the frame to RGB
48
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
49
+
50
+ # Process face landmarks
51
+ results_face = face_mesh.process(rgb_frame)
52
+ if results_face.multi_face_landmarks:
53
+ for face_landmarks in results_face.multi_face_landmarks:
54
+ mp_drawing.draw_landmarks(
55
+ frame,
56
+ face_landmarks,
57
+ mp_face_mesh.FACEMESH_TESSELATION,
58
+ landmark_drawing_spec=mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=1, circle_radius=1),
59
+ connection_drawing_spec=mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=1)
60
+ )
61
+
62
+ # Process hand landmarks
63
+ results_hands = hands.process(rgb_frame)
64
+ if results_hands.multi_hand_landmarks:
65
+ for hand_landmarks in results_hands.multi_hand_landmarks:
66
+ if hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x < hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x:
67
+ landmark_color = (255, 0, 0) # Left hand (Blue)
68
+ else:
69
+ landmark_color = (0, 0, 255) # Right hand (Red)
70
+
71
+ mp_drawing.draw_landmarks(
72
+ frame,
73
+ hand_landmarks,
74
+ mp_hands.HAND_CONNECTIONS,
75
+ landmark_drawing_spec=mp_drawing.DrawingSpec(color=landmark_color, thickness=1, circle_radius=1),
76
+ connection_drawing_spec=mp_drawing.DrawingSpec(color=landmark_color, thickness=1)
77
+ )
78
+
79
+ # Write the annotated frame to the output video
80
+ out.write(frame)
81
+
82
+ # If 'q' is pressed on the keyboard, exit this loop
83
+ if cv2.waitKey(1) & 0xFF == ord('q'):
84
+ break
85
+
86
+ # Close the video file
87
+ cap.release()
88
+ out.release()
89
+ cv2.destroyAllWindows()
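
A minimal usage sketch of the function above (the paths are illustrative; any readable video works, and scale_percent shrinks the frames before drawing):

from src.video_with_landmarks import process_video_with_landmarks

process_video_with_landmarks("src/videoplayback.mp4", "annotated.mp4", scale_percent=50)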
src/videoplayback.mp4 ADDED
Binary file (768 kB).