Ariel committed · c6cec04
1 Parent(s): ed3ae15
Update files
Files changed:
- README.md +5 -5
- app.py +158 -0
- requirements.txt +6 -0
- src/character_to_prediction_index.json +61 -0
- src/landmarks.csv +0 -0
- src/load_model.py +346 -0
- src/model.h5 +3 -0
- src/model_architecture.json +1 -0
- src/predict_sequence.py +89 -0
- src/preprocess_coordinates_data.py +173 -0
- src/variables.json +394 -0
- src/video_to_landmark_coordinates.py +100 -0
- src/video_with_landmarks.py +89 -0
- src/videoplayback.mp4 +0 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
+title: SpellNet
-emoji:
+emoji: π
-colorFrom:
+colorFrom: yellow
-colorTo:
+colorTo: blue
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.10.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py
ADDED
@@ -0,0 +1,158 @@
import numpy as np
import os
import json
from tensorflow.math import argmax
import gradio as gr
from tensorflow.keras.models import model_from_json
import src.video_with_landmarks
import src.video_to_landmark_coordinates
import src.preprocess_coordinates_data
from src.load_model import Embedding, Encoder, Decoder, LandmarkEmbedding, EncoderTransformerBlock, MultiHeadAttention, \
    DecoderTransformerBlock
import src.predict_sequence

# Load the character to prediction index dictionary
character_to_prediction = 'src/character_to_prediction_index.json'
with open(character_to_prediction) as json_file:
    ORD2CHAR = json.load(json_file)

# Load the variables from the JSON file
json_file_path = "src/variables.json"
with open(json_file_path, 'r') as json_file:
    variables_dict = json.load(json_file)

# Load the model architecture from the JSON file
with open('src/model_architecture.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Import lips landmark indices
LIPS_LANDMARK_IDXS = np.array(variables_dict['LIPS_LANDMARK_IDXS'])

custom_objects = {'Embedding': Embedding,
                  'Encoder': Encoder,
                  'Decoder': Decoder,
                  'LandmarkEmbedding': LandmarkEmbedding,
                  'EncoderTransformerBlock': EncoderTransformerBlock,
                  'MultiHeadAttention': MultiHeadAttention,
                  'DecoderTransformerBlock': DecoderTransformerBlock}


def process_and_print_sequence(df):
    """
    Process the input DataFrame using the specified data processing steps and print the shapes of the sequences.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing tracking data.

    Returns:
        processed_sequence (np.ndarray): Processed sequence as a NumPy array.
    """
    LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['left_hand'], ['z'])
    RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['right_hand'], ['z'])
    LIPS_IDXS0, LIPS_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['face'], ['z'], idxs_pos=LIPS_LANDMARK_IDXS)
    COLUMNS0 = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0))
    N_COLS0 = len(COLUMNS0)

    df = df[COLUMNS0]  # select only the N_COLS0 columns of interest
    all_tracking_sequence = df.values.reshape(1, -1, N_COLS0).astype(
        np.float32)  # reshape after converting the DataFrame to a NumPy array
    preprocess_layer_instance = src.preprocess_coordinates_data.PreprocessLayer()  # instantiate the PreprocessLayer class
    processed_sequence = preprocess_layer_instance(all_tracking_sequence)  # call the instance with the data

    print(f'input sequence shape: {all_tracking_sequence.shape}')
    print(f'processed sequence shape: {processed_sequence.shape}')

    return processed_sequence


def predict_final_sequence(processed_sequence, model):
    """
    Make a prediction on a given sequence using a pre-trained model.

    The sequence is expanded along the 0th dimension to account for batch size.
    The prediction is made using the `predict_phrase` function, which should return a one-hot encoded prediction.
    This one-hot encoded prediction is then converted into index values using argmax.
    Finally, these index values are converted into a string representation using the `outputs2phrase` function.

    Args:
        processed_sequence (numpy array): An array representing the sequence to make a prediction on.
            This should be of shape (128, 164).
        model (tensorflow.python.keras.engine.training.Model): The pre-trained model to use for making predictions.

    Returns:
        final_prediction (str): The final prediction made by the model, represented as a string.
    """
    sequence = np.expand_dims(processed_sequence, axis=0)  # change shape to (1, 128, 164)

    # Convert the one-hot encoded prediction to a string
    predicted_phrase_one_hot = src.predict_sequence.predict_phrase(sequence, model)
    predicted_phrase_one_hot = predicted_phrase_one_hot[0]  # remove the batch dimension
    predicted_phrase = argmax(predicted_phrase_one_hot, axis=-1).numpy()  # convert one-hot encoding to index values
    print(predicted_phrase)
    final_prediction = src.predict_sequence.outputs2phrase(predicted_phrase, ORD2CHAR)
    return final_prediction


def video_identity(video):
    """
    Process a video, extract landmarks, feed them to a pre-trained model, and make a prediction.

    The processing pipeline consists of the following steps:
    1. Process the video with landmarks.
    2. Extract landmark coordinates and save them into a DataFrame.
    3. Preprocess the landmarks.
    4. Load a pre-trained model.
    5. Feed the preprocessed landmarks to the model and get a prediction.

    Parameters:
        video (str): Path to the video file.

    Returns:
        tuple: The path to the processed video with landmarks and the predicted outcome.
    """
    # 1. load the video and process it with landmarks
    original_video_path = video
    output_path = "src/video_landmarks.mp4"
    src.video_with_landmarks.process_video_with_landmarks(original_video_path, output_path)

    # 2. extract landmark coordinates
    df = src.video_to_landmark_coordinates.video_to_landmarks(output_path,
                                                              src.video_to_landmark_coordinates.generate_column_names())
    # Save the DataFrame to a CSV file
    # df.to_csv('landmarks.csv', index=False)

    # 3. preprocess the landmarks
    # Read data from a CSV file
    # df = pd.read_csv('landmarks2.csv')
    # df.drop(['sequence_id'], axis=1, inplace=True)
    processed_sequence = process_and_print_sequence(df)

    # 4. load the model architecture from the JSON file
    model = model_from_json(loaded_model_json, custom_objects=custom_objects)

    # load the weights into the new model
    model.load_weights("src/model.h5")

    # 5. predict
    prediction = predict_final_sequence(processed_sequence, model)
    print(prediction)

    return output_path, prediction


iface = gr.Interface(video_identity,
                     inputs=gr.Video(label="Upload your video"),  # Adding a label to the input
                     outputs=[gr.Video(label="Processed video"), gr.Textbox(label="Predicted sequence")],
                     # Adding labels to the outputs
                     title="SpellNet",  # Adding a title
                     # Adding a description
                     description="This application analyzes your video input to interpret American Sign Language (ASL) gestures corresponding to letters, numbers, and other signs. The output consists of the original video enhanced with overlaid landmarks that represent key points of ASL gestures, along with the predicted decoded ASL sequence expressed in textual form.",
                     theme="gradio/monochrome",  # Changing the theme
                     examples=[os.path.join(os.path.dirname(__file__), "src/videoplayback.mp4")],
                     cache_examples=False)  # Disabling caching

if __name__ == "__main__":
    iface.launch(share=False)
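
Reviewer note: the pipeline above can be smoke-tested without the Gradio UI by calling video_identity directly — a minimal sketch, assuming the dependencies in requirements.txt are installed and the bundled example clip src/videoplayback.mp4 is present:

# Minimal smoke test of the app.py pipeline, bypassing the Gradio interface.
from app import video_identity

processed_path, predicted_text = video_identity("src/videoplayback.mp4")
print(processed_path)   # "src/video_landmarks.mp4", the landmark-overlaid video
print(predicted_text)   # the decoded ASL sequence as a string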
requirements.txt
ADDED
@@ -0,0 +1,6 @@
opencv-python
mediapipe
tensorflow
numpy
gradio
src/character_to_prediction_index.json
ADDED
@@ -0,0 +1,61 @@
{
  "0": " ", "1": "!", "2": "#", "3": "$", "4": "%", "5": "&",
  "6": "'", "7": "(", "8": ")", "9": "*", "10": "+", "11": ",",
  "12": "-", "13": ".", "14": "/", "15": "0", "16": "1", "17": "2",
  "18": "3", "19": "4", "20": "5", "21": "6", "22": "7", "23": "8",
  "24": "9", "25": ":", "26": ";", "27": "=", "28": "?", "29": "@",
  "30": "[", "31": "_", "32": "a", "33": "b", "34": "c", "35": "d",
  "36": "e", "37": "f", "38": "g", "39": "h", "40": "i", "41": "j",
  "42": "k", "43": "l", "44": "m", "45": "n", "46": "o", "47": "p",
  "48": "q", "49": "r", "50": "s", "51": "t", "52": "u", "53": "v",
  "54": "w", "55": "x", "56": "y", "57": "z", "58": "~"
}
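
The decoder predicts integer class IDs; this file maps IDs 0-58 back to printable characters, while IDs 59-61 are reserved for the special tokens defined in src/variables.json (PAD_TOKEN=59, SOS_TOKEN=60; 61 is presumably the EOS slot implied by N_UNIQUE_CHARACTERS=62). A small decoding sketch using this mapping (the ID sequence is made up for illustration):

import json

# Keys are strings in the JSON file, so convert them to integers first.
with open('src/character_to_prediction_index.json') as f:
    ORD2CHAR = {int(k): v for k, v in json.load(f).items()}

ids = [39, 36, 43, 43, 46]  # hypothetical model output
print(''.join(ORD2CHAR.get(i, '') for i in ids))  # -> hello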
src/landmarks.csv
ADDED
The diff for this file is too large to render.
src/load_model.py
ADDED
@@ -0,0 +1,346 @@
import tensorflow as tf
import json
import numpy as np
from tensorflow.keras.models import model_from_json
import os

# Load the variables from the JSON file and convert them to the correct data type.
# Get the directory of the current file (load_model.py)
current_directory = os.path.dirname(os.path.abspath(__file__))

# Construct the path to variables.json
json_file_path = os.path.join(current_directory, 'variables.json')
with open(json_file_path, 'r') as json_file:
    variables_dict = json.load(json_file)

# Epsilon value for layer normalisation
LAYER_NORM_EPS = variables_dict['LAYER_NORM_EPS']

# Final embedding and transformer embedding size
UNITS_ENCODER = variables_dict['UNITS_ENCODER']
UNITS_DECODER = variables_dict['UNITS_DECODER']

# Transformer
NUM_BLOCKS_ENCODER = variables_dict['NUM_BLOCKS_ENCODER']
NUM_BLOCKS_DECODER = variables_dict['NUM_BLOCKS_DECODER']
NUM_HEADS = variables_dict['NUM_HEADS']
MLP_RATIO = variables_dict['MLP_RATIO']

# Dropout
EMBEDDING_DROPOUT = variables_dict['EMBEDDING_DROPOUT']
MLP_DROPOUT_RATIO = variables_dict['MLP_DROPOUT_RATIO']
MHA_DROPOUT_RATIO = variables_dict['MHA_DROPOUT_RATIO']
CLASSIFIER_DROPOUT_RATIO = variables_dict['CLASSIFIER_DROPOUT_RATIO']

# Number of frames to resize a recording to
N_TARGET_FRAMES = variables_dict['N_TARGET_FRAMES']
N_UNIQUE_CHARACTERS = variables_dict['N_UNIQUE_CHARACTERS']
N_UNIQUE_CHARACTERS0 = variables_dict['N_UNIQUE_CHARACTERS0']
PAD_TOKEN = variables_dict['PAD_TOKEN']
SOS_TOKEN = variables_dict['SOS_TOKEN']

# Length of phrase + EOS token
MAX_PHRASE_LENGTH = variables_dict['MAX_PHRASE_LENGTH']

# Mean/standard deviations of the data, used for normalizing
MEANS = np.array(variables_dict['MEANS'])
STDS = np.array(variables_dict['STDS'])

# Initializers
INIT_HE_UNIFORM = tf.keras.initializers.he_uniform
INIT_GLOROT_UNIFORM = tf.keras.initializers.glorot_uniform
INIT_ZEROS = tf.keras.initializers.constant(0.0)
# Activations
GELU = tf.keras.activations.gelu


class Embedding(tf.keras.Model):
    def __init__(self, **kwargs):
        super(Embedding, self).__init__(**kwargs)
        self.supports_masking = True

    def build(self, input_shape):
        self.positional_embedding = tf.Variable(
            initial_value=tf.zeros([N_TARGET_FRAMES, UNITS_ENCODER], dtype=tf.float32),
            trainable=True, name='embedding_positional_encoder')
        self.dominant_hand_embedding = LandmarkEmbedding(UNITS_ENCODER, 'dominant_hand')

    def call(self, x, training=False):
        x = tf.where(tf.math.equal(x, 0.0), 0.0, (x - MEANS) / STDS)
        x = self.dominant_hand_embedding(x)
        x = x + self.positional_embedding
        return x

    def get_config(self):
        return super().get_config()

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class Encoder(tf.keras.Model):
    def __init__(self, num_blocks, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.num_blocks = num_blocks
        self.supports_masking = True
        self.blocks = [
            EncoderTransformerBlock(UNITS_ENCODER, NUM_HEADS, MLP_RATIO, MHA_DROPOUT_RATIO, MLP_DROPOUT_RATIO)
            for _ in range(num_blocks)]

        if UNITS_ENCODER != UNITS_DECODER:
            self.dense_out = tf.keras.layers.Dense(UNITS_DECODER, kernel_initializer=INIT_GLOROT_UNIFORM,
                                                   use_bias=False)
            self.apply_dense_out = True
        else:
            self.apply_dense_out = False

    def call(self, x, x_inp, training=False):
        # Frames whose features are all zero are padding; mask them out.
        attention_mask = tf.where(tf.math.reduce_sum(x_inp, axis=[2]) == 0.0, 0.0, 1.0)
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        attention_mask = tf.repeat(attention_mask, repeats=N_TARGET_FRAMES, axis=1)

        for block in self.blocks:
            x = block(x, attention_mask=attention_mask, training=training)

        if self.apply_dense_out:
            x = self.dense_out(x)

        return x, attention_mask

    def get_config(self):
        config = super().get_config()
        config.update({"num_blocks": self.num_blocks})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class Decoder(tf.keras.Model):
    def __init__(self, num_blocks, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.num_blocks = num_blocks
        self.supports_masking = True
        self.positional_embedding = tf.Variable(
            initial_value=tf.zeros([N_TARGET_FRAMES, UNITS_DECODER], dtype=tf.float32),
            trainable=True, name='embedding_positional_encoder')
        self.char_emb = tf.keras.layers.Embedding(N_UNIQUE_CHARACTERS, UNITS_DECODER, embeddings_initializer=INIT_ZEROS)
        self.pos_emb_mha = MultiHeadAttention(UNITS_DECODER, NUM_HEADS, MHA_DROPOUT_RATIO)
        self.pos_emb_ln = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        self.blocks = [
            DecoderTransformerBlock(UNITS_DECODER, NUM_HEADS, MLP_RATIO, MHA_DROPOUT_RATIO, MLP_DROPOUT_RATIO)
            for _ in range(num_blocks)]

    def get_causal_attention_mask(self, B):
        ones = tf.ones((N_TARGET_FRAMES, N_TARGET_FRAMES))
        mask = tf.linalg.band_part(ones, 0, -1)  # upper triangular
        mask = tf.transpose(mask)  # lower triangular: position i attends only to positions <= i
        mask = tf.expand_dims(mask, axis=0)
        mask = tf.tile(mask, [B, 1, 1])
        mask = tf.cast(mask, tf.float32)
        return mask

    def call(self, encoder_outputs, attention_mask, phrase, training=False):
        B = tf.shape(encoder_outputs)[0]
        phrase = tf.cast(phrase, tf.int32)
        phrase = tf.pad(phrase, [[0, 0], [1, 0]], constant_values=SOS_TOKEN, name='prepend_sos_token')
        phrase = tf.pad(phrase, [[0, 0], [0, N_TARGET_FRAMES - MAX_PHRASE_LENGTH - 1]], constant_values=PAD_TOKEN,
                        name='append_pad_token')
        causal_mask = self.get_causal_attention_mask(B)
        x = self.positional_embedding + self.char_emb(phrase)
        x = self.pos_emb_ln(x + self.pos_emb_mha(x, x, x, attention_mask=causal_mask))

        for block in self.blocks:
            x = block(x, encoder_outputs, attention_mask=attention_mask, training=training)

        x = tf.slice(x, [0, 0, 0], [-1, MAX_PHRASE_LENGTH, -1])
        return x

    def get_config(self):
        config = super().get_config()
        config.update({"num_blocks": self.num_blocks})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


# Embeds a landmark using fully connected layers
class LandmarkEmbedding(tf.keras.Model):
    def __init__(self, units, name):
        super(LandmarkEmbedding, self).__init__(name=f'{name}_embedding')
        self.units = units
        self.supports_masking = True

    def build(self, input_shape):
        # Embedding for a missing landmark in a frame, initialized with zeros
        self.empty_embedding = self.add_weight(
            name=f'{self.name}_empty_embedding',
            shape=[self.units],
            initializer=INIT_ZEROS,
        )
        # Embedding
        self.dense = tf.keras.Sequential([
            tf.keras.layers.Dense(self.units, name=f'{self.name}_dense_1', use_bias=False,
                                  kernel_initializer=INIT_GLOROT_UNIFORM, activation=GELU),  # Can change activation
            tf.keras.layers.Dense(self.units, name=f'{self.name}_dense_2', use_bias=False,
                                  kernel_initializer=INIT_HE_UNIFORM),
        ], name=f'{self.name}_dense')

    def call(self, x):
        return tf.where(
            # Checks whether the landmark is missing in the frame
            tf.reduce_sum(x, axis=2, keepdims=True) == 0,
            # If so, the empty embedding is used
            self.empty_embedding,
            # Otherwise the landmark data is embedded
            self.dense(x),
        )

    def get_config(self):
        config = super().get_config()
        config.update({"units": self.units, "name": self.name})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class EncoderTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, units, num_heads, mlp_ratio, mha_dropout_ratio, mlp_dropout_ratio, **kwargs):
        super(EncoderTransformerBlock, self).__init__(**kwargs)
        # Store the hyperparameters so get_config() can serialize them
        self.units = units
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.mha_dropout_ratio = mha_dropout_ratio
        self.mlp_dropout_ratio = mlp_dropout_ratio
        self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        self.mha = MultiHeadAttention(units, num_heads, mha_dropout_ratio)
        self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        self.mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(units * mlp_ratio, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM,
                                  use_bias=False),
            tf.keras.layers.Dropout(mlp_dropout_ratio),
            tf.keras.layers.Dense(units, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
        ])

    def call(self, inputs, attention_mask, training=False):
        x = self.layer_norm_1(inputs + self.mha(inputs, inputs, inputs, attention_mask=attention_mask))
        x = self.layer_norm_2(x + self.mlp(x))
        return x

    def get_config(self):
        config = super().get_config()
        config.update({"units": self.units, "num_heads": self.num_heads, "mlp_ratio": self.mlp_ratio,
                       "mha_dropout_ratio": self.mha_dropout_ratio, "mlp_dropout_ratio": self.mlp_dropout_ratio})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


# A Softmax layer is used instead of the softmax function to support masked softmax
def scaled_dot_product(q, k, v, softmax, attention_mask):
    # calculates Q . K(transpose)
    qkt = tf.matmul(q, k, transpose_b=True)
    # calculates the scaling factor
    dk = tf.math.sqrt(tf.cast(q.shape[-1], dtype=tf.float32))
    scaled_qkt = qkt / dk
    softmax = softmax(scaled_qkt, mask=attention_mask)
    z = tf.matmul(softmax, v)
    # shape: (m, Tx, depth), the same shape as q, k, v
    return z


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_of_heads, dropout, d_out=None):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_of_heads = num_of_heads
        self.dropout = dropout  # stored so get_config() can serialize it
        self.depth = d_model // num_of_heads  # Can change
        self.wq = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in
                   range(num_of_heads)]  # depth//2 isn't common, we can try different numbers
        self.wk = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in range(num_of_heads)]
        self.wv = [tf.keras.layers.Dense(self.depth, use_bias=False) for i in range(num_of_heads)]
        self.softmax = tf.keras.layers.Softmax()
        self.do = tf.keras.layers.Dropout(dropout)
        self.supports_masking = True
        self.wo = tf.keras.layers.Dense(d_model if d_out is None else d_out, use_bias=False)

    def call(self, q, k, v, attention_mask=None, training=False):
        multi_attn = []
        for i in range(self.num_of_heads):
            Q = self.wq[i](q)
            K = self.wk[i](k)
            V = self.wv[i](v)
            multi_attn.append(scaled_dot_product(Q, K, V, self.softmax, attention_mask))

        multi_head = tf.concat(multi_attn, axis=-1)
        multi_head_attention = self.wo(multi_head)
        multi_head_attention = self.do(multi_head_attention, training=training)

        return multi_head_attention

    def get_config(self):
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_of_heads": self.num_of_heads, "dropout": self.dropout})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class DecoderTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, units, num_heads, mlp_ratio, mha_dropout_ratio, mlp_dropout_ratio, **kwargs):
        super(DecoderTransformerBlock, self).__init__(**kwargs)
        # Store the hyperparameters so get_config() can serialize them
        self.units = units
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.mha_dropout_ratio = mha_dropout_ratio
        self.mlp_dropout_ratio = mlp_dropout_ratio
        self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        self.mha = MultiHeadAttention(units, num_heads, mha_dropout_ratio)
        self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        self.mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(units * mlp_ratio, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM,
                                  use_bias=False),
            tf.keras.layers.Dropout(mlp_dropout_ratio),
            tf.keras.layers.Dense(units, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
        ])

    def call(self, inputs, encoder_outputs, attention_mask, training=False):
        x = self.layer_norm_1(
            inputs + self.mha(inputs, encoder_outputs, encoder_outputs, attention_mask=attention_mask))
        x = self.layer_norm_2(x + self.mlp(x))
        return x

    def get_config(self):
        config = super().get_config()
        config.update({"units": self.units, "num_heads": self.num_heads, "mlp_ratio": self.mlp_ratio,
                       "mha_dropout_ratio": self.mha_dropout_ratio, "mlp_dropout_ratio": self.mlp_dropout_ratio})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


custom_objects = {'Embedding': Embedding,
                  'Encoder': Encoder,
                  'Decoder': Decoder,
                  'LandmarkEmbedding': LandmarkEmbedding,
                  'EncoderTransformerBlock': EncoderTransformerBlock,
                  'MultiHeadAttention': MultiHeadAttention,
                  'DecoderTransformerBlock': DecoderTransformerBlock}

# load the JSON architecture and create the model
model_architecture_file_path = os.path.join(current_directory, 'model_architecture.json')
with open(model_architecture_file_path, 'r') as json_file:
    loaded_model_json = json_file.read()

# load the model from the JSON file
loaded_model = model_from_json(loaded_model_json, custom_objects=custom_objects)

# load the weights into the new model
model_weights_file_path = os.path.join(current_directory, 'model.h5')
loaded_model.load_weights(model_weights_file_path)

# loaded_model.summary(expand_nested=True, show_trainable=True)
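
A toy illustration of what Decoder.get_causal_attention_mask builds: tf.linalg.band_part(ones, 0, -1) keeps the upper triangle, and the transpose turns it into a lower-triangular mask, so position i can only attend to positions <= i. Shown here at size 4 instead of N_TARGET_FRAMES=128:

import tensorflow as tf

ones = tf.ones((4, 4))
mask = tf.transpose(tf.linalg.band_part(ones, 0, -1))  # lower triangular
print(mask.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]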
src/model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d18e0fb837407b4b8f45411e48758a71d7ff149e31fec29b5868bc5b60e032ef
size 27599904
src/model_architecture.json
ADDED
@@ -0,0 +1 @@
{"class_name": "Functional", "config": {"name": "model", "trainable": true, "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 128, 164], "dtype": "float32", "sparse": false, "ragged": false, "name": "frames"}, "name": "frames", "inbound_nodes": []}, {"class_name": "Masking", "config": {"name": "masking", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 128, 164], "mask_value": 0.0}, "name": "masking", "inbound_nodes": [[["frames", 0, 0, {}]]]}, {"class_name": "Embedding", "config": {}, "name": "embedding", "inbound_nodes": [[["masking", 0, 0, {}]]]}, {"class_name": "Encoder", "config": {"num_blocks": 3}, "name": "encoder", "inbound_nodes": [[["embedding", 0, 0, {"x_inp": ["frames", 0, 0]}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": [null, 32], "dtype": "int32", "sparse": false, "ragged": false, "name": "phrase"}, "name": "phrase", "inbound_nodes": []}, {"class_name": "Decoder", "config": {"num_blocks": 2}, "name": "decoder", "inbound_nodes": [[["encoder", 0, 0, {"attention_mask": ["encoder", 0, 1], "phrase": ["phrase", 0, 0]}]]]}, {"class_name": "Sequential", "config": {"name": "classifier", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 32, 384], "dtype": "float32", "sparse": false, "ragged": false, "name": "dropout_11_input"}}, {"class_name": "Dropout", "config": {"name": "dropout_11", "trainable": true, "dtype": "float32", "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_88", "trainable": true, "dtype": "float32", "units": 62, "activation": "linear", "use_bias": false, "kernel_initializer": {"class_name": "HeUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "name": "classifier", "inbound_nodes": [[["decoder", 0, 0, {}]]]}], "input_layers": [["frames", 0, 0], ["phrase", 0, 0]], "output_layers": [["classifier", 1, 0]]}, "keras_version": "2.12.0", "backend": "tensorflow"}
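
Per this config, the graph takes two inputs, 'frames' (batch, 128, 164) float32 and 'phrase' (batch, 32) int32, and emits (batch, 32, 62) character logits. A quick shape check against the model built by src/load_model.py — a sketch using dummy zero inputs, for illustration only:

import numpy as np
from src.load_model import loaded_model  # built and weight-loaded at import time

frames = np.zeros((1, 128, 164), dtype=np.float32)  # dummy landmark sequence
phrase = np.zeros((1, 32), dtype=np.int32)          # dummy token IDs
logits = loaded_model({'frames': frames, 'phrase': phrase})
print(logits.shape)  # (1, 32, 62)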
src/predict_sequence.py
ADDED
@@ -0,0 +1,89 @@
import json
import numpy as np
import tensorflow as tf
import os

# Load the variables from the JSON file and convert them to the correct data type.
# Get the directory of the current file (predict_sequence.py)
current_directory = os.path.dirname(os.path.abspath(__file__))

# Construct the path to variables.json
json_file_path = os.path.join(current_directory, 'variables.json')
with open(json_file_path, 'r') as json_file:
    variables_dict = json.load(json_file)

MAX_PHRASE_LENGTH = variables_dict['MAX_PHRASE_LENGTH']
PAD_TOKEN = variables_dict['PAD_TOKEN']
N_UNIQUE_CHARACTERS = variables_dict['N_UNIQUE_CHARACTERS']

# Read the character-to-ordinal encoding mapping
character_to_prediction = os.path.join(current_directory, 'character_to_prediction_index.json')
with open(character_to_prediction) as json_file:
    ORD2CHAR = json.load(json_file)


# Convert output predictions to a string
def outputs2phrase(outputs, ORD2CHAR):
    """
    Convert an output sequence to a human-readable phrase by mapping
    each output to a corresponding character.

    Parameters:
        outputs (np.array): A sequence of model outputs,
            either 1D (sequence of character IDs)
            or 2D (sequence of one-hot encodings).
        ORD2CHAR (dict): A mapping from character IDs to characters.

    Returns:
        str: The converted phrase.
    """
    ORD2CHAR = {int(k): v for k, v in ORD2CHAR.items()}  # convert keys to integers
    if outputs.ndim == 2:
        outputs = np.argmax(outputs, axis=1)
    return ''.join([ORD2CHAR.get(s, '') for s in outputs])


@tf.function()
def predict_phrase(batch_frames, model):
    """
    Use a pre-trained model to predict a phrase from a batch of frame sequences.

    Parameters:
        batch_frames (np.array): A batch of frame sequences.
        model (tf.keras.Model): The pre-trained model to use for prediction.

    Returns:
        tf.Tensor: One-hot encoding of the predicted phrase.
    """
    batch_frames = tf.convert_to_tensor(batch_frames)
    phrase = tf.fill([batch_frames.shape[0], MAX_PHRASE_LENGTH], PAD_TOKEN)
    phrase = tf.cast(phrase, tf.int32)  # cast phrase to int32 initially
    for idx in tf.range(MAX_PHRASE_LENGTH):
        # Predict the next token
        outputs = model({
            'frames': batch_frames,
            'phrase': phrase,
        })

        # Commit the tokens predicted so far; later positions keep PAD
        phrase = tf.where(
            tf.range(MAX_PHRASE_LENGTH)[None, :] < idx + 1,
            tf.argmax(outputs, axis=-1, output_type=tf.int32),
            phrase,
        )
    # one-hot encode the outputs
    outputs_one_hot = tf.one_hot(phrase, depth=N_UNIQUE_CHARACTERS)
    return outputs_one_hot

# # Assuming `sequence` is an array of shape (128, 164):
# sequence = processed_sequence.reshape(1, *processed_sequence.shape)  # reshapes the sequence to (1, 128, 164)
#
# # Now the sequence can be fed to the prediction function
# pred_phrase_one_hot = predict_phrase(sequence)
#
# # Convert the one-hot encoded prediction to a string.
# # The output is one-hot encoded, so it must be converted to integers first.
# pred_phrase = outputs2phrase(tf.argmax(pred_phrase_one_hot, axis=-1).numpy())
#
# print(pred_phrase)
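
The tf.where in predict_phrase implements the greedy commit: positions up to the current step take the fresh argmax, while later positions keep the PAD placeholder. A toy run of that one line, with made-up "argmax" values:

import tensorflow as tf

MAX_PHRASE_LENGTH, PAD = 8, 59
phrase = tf.cast(tf.fill([1, MAX_PHRASE_LENGTH], PAD), tf.int32)
new = tf.constant([[7, 3, 5, 1, 2, 4, 6, 0]], dtype=tf.int32)  # pretend argmax of the logits
idx = 2
phrase = tf.where(tf.range(MAX_PHRASE_LENGTH)[None, :] < idx + 1, new, phrase)
print(phrase.numpy())  # [[ 7  3  5 59 59 59 59 59]]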
src/preprocess_coordinates_data.py
ADDED
@@ -0,0 +1,173 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import os

# Load the variables from the JSON file and convert them to the correct data type.
# Get the directory of the current file (preprocess_coordinates_data.py)
current_directory = os.path.dirname(os.path.abspath(__file__))

# Construct the path to variables.json
json_file_path = os.path.join(current_directory, 'variables.json')
with open(json_file_path, 'r') as json_file:
    variables_dict = json.load(json_file)

# Lips landmark face ids
LIPS_LANDMARK_IDXS = variables_dict['LIPS_LANDMARK_IDXS']
N_TARGET_FRAMES = variables_dict['N_TARGET_FRAMES']
N_DIMS0 = variables_dict['N_DIMS0']

# Read data from a CSV file
csv_file_path = os.path.join(current_directory, 'landmarks.csv')

df = pd.read_csv(csv_file_path)


def get_idxs(df, words_pos, words_neg=[], ret_names=True, idxs_pos=None):
    """
    Given a DataFrame and a list of words, find all the column names
    that contain all the words in 'words_pos' and none of the words in 'words_neg'.

    Parameters:
        df (pandas.DataFrame): DataFrame to search for column names
        words_pos (list of str): List of words that column names should contain
        words_neg (list of str, optional): List of words that column names should not contain. Defaults to an empty list.
        ret_names (bool, optional): Whether to return column names. Default is True.
        idxs_pos (list of int, optional): Column indices to search within. Default is None, meaning all columns are searched.

    Returns:
        idxs (np.array): Column indices where the column names meet the criteria
        names (np.array): Column names that meet the criteria. Only returned if 'ret_names' is True.
    """
    idxs = []
    names = []
    for w in words_pos:
        for col_idx, col in enumerate(df.columns):
            # Exclude non-landmark columns
            if col in ['frame']:
                continue

            col_idx = int(col.split('_')[-1])
            # Check that the column name contains the positive word and none of the negative words
            if (w in col) and (idxs_pos is None or col_idx in idxs_pos) and all([wn not in col for wn in words_neg]):
                idxs.append(col_idx)
                names.append(col)
    # Convert to NumPy arrays
    idxs = np.array(idxs)
    names = np.array(names)
    # Return either both column indices and names
    if ret_names:
        return idxs, names
    # or only the column indices
    else:
        return idxs


# Get the indices of the columns of interest
LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = get_idxs(df, ['left_hand'], ['z'])
RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = get_idxs(df, ['right_hand'], ['z'])
LIPS_IDXS0, LIPS_NAMES0 = get_idxs(df, ['face'], ['z'], idxs_pos=LIPS_LANDMARK_IDXS)
COLUMNS0 = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0))
N_COLS0 = len(COLUMNS0)
N_COLS = N_COLS0


class PreprocessLayerNonNaN(tf.keras.layers.Layer):
    """
    A custom Keras layer that replaces NaN values in the input tensor with 0.
    """

    def __init__(self):
        super(PreprocessLayerNonNaN, self).__init__()

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_COLS0], dtype=tf.float32),),
    )
    def call(self, data0):
        """
        Called when the layer instance is called with some inputs.

        Parameters:
            data0 (Tensor): Input tensor

        Returns:
            data (Tensor): Output tensor with the same shape as the input, but with NaN values replaced by 0
        """
        # Fill NaN values with 0
        data = tf.where(tf.math.is_nan(data0), 0.0, data0)

        # Hacky: add a batch dimension
        data = data[None]

        # Empty hand frame filtering
        hands = tf.slice(data, [0, 0, 0], [-1, -1, 84])
        hands = tf.abs(hands)
        mask = tf.reduce_sum(hands, axis=2)
        mask = tf.not_equal(mask, 0)
        data = data[mask][None]
        data = tf.squeeze(data, axis=[0])

        return data


class PreprocessLayer(tf.keras.layers.Layer):
    """
    A custom Keras layer that pre-processes the input data:
    it fills NaN values with 0, filters out empty frames and resizes the frame sequence.
    """

    def __init__(self):
        super(PreprocessLayer, self).__init__()

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, None, N_COLS0], dtype=tf.float32),),
    )
    def call(self, data0, resize=True):
        """
        Called when the layer instance is called with some inputs.

        Parameters:
            data0 (Tensor): Input tensor
            resize (bool, optional): Whether to resize the frames. Default is True.

        Returns:
            data (Tensor): Output tensor after pre-processing
        """
        # Fill NaN values with 0
        data = tf.where(tf.math.is_nan(data0), 0.0, data0)

        # Empty hand frame filtering
        hands = tf.slice(data, [0, 0, 0], [-1, -1, 84])
        hands = tf.abs(hands)
        mask = tf.reduce_sum(hands, axis=2)
        mask = tf.not_equal(mask, 0)
        data = data[mask][None]

        # Pad with zeros
        N_FRAMES = len(data[0])
        if N_FRAMES < N_TARGET_FRAMES:
            data = tf.concat((
                data,
                tf.zeros([1, N_TARGET_FRAMES - N_FRAMES, N_COLS], dtype=tf.float32)
            ), axis=1)
        # Downsample
        data = tf.image.resize(
            data,
            [1, N_TARGET_FRAMES],
            method=tf.image.ResizeMethod.BILINEAR,
        )

        # Squeeze the batch dimension
        data = tf.squeeze(data, axis=[0])

        return data


df = df[COLUMNS0]  # select only the N_COLS0 columns of interest
hand_tracking_sequence = df.values.reshape(1, -1, N_COLS0)  # reshape after converting the DataFrame to a NumPy array

preprocess_layer_instance = PreprocessLayer()  # instantiate the PreprocessLayer class
processed_sequence = preprocess_layer_instance(hand_tracking_sequence)  # call the instance with the data

# print(f'input sequence shape: {hand_tracking_sequence.shape}')
# print(f'processed sequence shape: {processed_sequence.shape}')
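
get_idxs keys off the '{coordinate}_{type}_{index}' column naming convention. A toy example (note that importing this module also runs the landmarks.csv pipeline at the bottom of the file, so src/landmarks.csv must be present):

import pandas as pd
from src.preprocess_coordinates_data import get_idxs

# A hypothetical frame with a few columns in the repository's naming scheme
toy = pd.DataFrame(columns=['frame', 'x_left_hand_0', 'y_left_hand_0', 'x_face_61', 'y_face_61'])
idxs, names = get_idxs(toy, ['left_hand'], ['z'])
print(list(names))  # ['x_left_hand_0', 'y_left_hand_0']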
src/variables.json
ADDED
@@ -0,0 +1,394 @@
{
  "LIPS_LANDMARK_IDXS": [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 146, 91, 181, 84, 17, 314, 405,
                         321, 375, 78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 95, 88, 178, 87, 14, 317,
                         402, 318, 324, 308],
  "LAYER_NORM_EPS": 1e-05,
  "UNITS_ENCODER": 384,
  "UNITS_DECODER": 384,
  "NUM_BLOCKS_ENCODER": 3,
  "NUM_BLOCKS_DECODER": 2,
  "NUM_HEADS": 4,
  "MLP_RATIO": 2,
  "EMBEDDING_DROPOUT": 0.2,
  "MLP_DROPOUT_RATIO": 0.2,
  "MHA_DROPOUT_RATIO": 0.2,
  "CLASSIFIER_DROPOUT_RATIO": 0.2,
  "N_TARGET_FRAMES": 128,
  "N_DIMS0": 2,
  "N_UNIQUE_CHARACTERS": 62,
  "N_UNIQUE_CHARACTERS0": 59,
  "PAD_TOKEN": 59,
  "SOS_TOKEN": 60,
  "MAX_PHRASE_LENGTH": 32,
  "MEANS": [0.69352776, 0.60659605, 0.53412515, 0.4970676, 0.48584947, 0.5761701, 0.5300588,
            0.49778917, 0.47764367, 0.6305243, 0.5822572, 0.55222154, 0.53908557, 0.68544865,
            0.63951194, 0.6104323, 0.5991277, 0.7378051, 0.7018211, 0.6776119, 0.6673842,
            0.76445776, 0.7457853, 0.7062872, 0.67484325, 0.6514734, 0.6304427, 0.5906848,
            0.5854317, 0.5849309, 0.6276549, 0.5890438, 0.59771925, 0.6047316, 0.6383216,
            0.60959125, 0.6295764, 0.6437836, 0.6588292, 0.6397078, 0.65018004, 0.65816236,
            0.26357186, 0.35093567, 0.4236605, 0.45704976, 0.4634739, 0.37947592, 0.4234214,
            0.45306972, 0.4717593, 0.3199842, 0.36261505, 0.38926786, 0.40241373, 0.26189587,
            0.30273047, 0.3301876, 0.34255308, 0.20624675, 0.23920882, 0.263005, 0.27461466,
            0.75472385, 0.73504084, 0.6943852, 0.6608657, 0.63613355, 0.6144105, 0.5700216,
            0.56217206, 0.5597008, 0.611077, 0.56800383, 0.575002, 0.5811821, 0.62163454,
            0.59134597, 0.61230445, 0.6277079, 0.64273566, 0.6216118, 0.6318555, 0.63973725,
            0.56342137, 0.5647059, 0.5649758, 0.5657689, 0.54460865, 0.52689284, 0.51569146,
            0.5043293, 0.51033896, 0.52668756, 0.53708506, 0.54991424, 0.5468167, 0.55006754,
            0.5267238, 0.5178957, 0.51888436, 0.5099791, 0.53717476, 0.5305108, 0.5081805,
            0.51886874, 0.58258605, 0.6024338, 0.6155048, 0.6306914, 0.6245343, 0.6058631,
            0.59408224, 0.58018464, 0.5852319, 0.5804903, 0.60605526, 0.61589545, 0.61500907,
            0.6246284, 0.59435004, 0.6024958, 0.6250273, 0.61513, 0.508501, 0.5193109,
            0.52219623, 0.53701967, 0.5069547, 0.51169485, 0.51677644, 0.5253185, 0.5245756,
            0.521367, 0.5199756, 0.51932734, 0.5365361, 0.5221106, 0.5230684, 0.53079647,
            0.5238175, 0.52800494, 0.5223436, 0.5342269, 0.5212379, 0.52289945, 0.506347,
            0.5106173, 0.51533395, 0.5235456, 0.5230225, 0.52027595, 0.51917976, 0.5189014,
            0.5361387, 0.5216965, 0.5220167, 0.52960336, 0.5225625, 0.5264617, 0.5215638,
            0.53341466, 0.51952803, 0.5216051],
  "STDS": [0.10834738, 0.10391748, 0.10296664, 0.10752504, 0.12336373, 0.10313869, 0.10744168,
           0.11199072, 0.1193621, 0.10597368, 0.11260378, 0.1170811, 0.12447591, 0.11238337,
           0.12130429, 0.12248141, 0.1267081, 0.1224081, 0.13301295, 0.13806877, 0.1437398,
           0.08867608, 0.08839962, 0.08913112, 0.09358086, 0.09968524, 0.08439907, 0.09381164,
           0.10565417, 0.11996002, 0.08592986, 0.1002507, 0.11805841, 0.13548768, 0.08893858,
           0.1042807, 0.11806193, 0.13066797, 0.09283979, 0.1044982, 0.11446757, 0.12410894,
           0.08575833, 0.08688664, 0.08871841, 0.09452496, 0.11280894, 0.08605019, 0.09069607,
           0.09625262, 0.10480069, 0.08209087, 0.08907479, 0.09521613, 0.10375828, 0.0827678,
           0.09389319, 0.09721766, 0.10260603, 0.0892784, 0.10309231, 0.11121955, 0.11911318,
           0.08014706, 0.07939664, 0.07666104, 0.07640523, 0.07845239, 0.06779566, 0.06928173,
           0.07995176, 0.09609538, 0.06776656, 0.07411631, 0.09502285, 0.11704809, 0.06976698,
           0.07840788, 0.09568293, 0.11219386, 0.07334771, 0.07997227, 0.09204492, 0.10471888,
           0.1324311, 0.13287905, 0.13296498, 0.13300247, 0.13251117, 0.13296743, 0.13352127,
           0.13476767, 0.13467269, 0.13386367, 0.13339657, 0.13304512, 0.13318144, 0.13313657,
           0.13394693, 0.13404495, 0.1343446, 0.13446471, 0.13349241, 0.13355125, 0.13414721,
           0.13430822, 0.13283393, 0.13377732, 0.1346423, 0.13602652, 0.13584861, 0.13470158,
           0.1339573, 0.13331288, 0.13342074, 0.133372, 0.13473015, 0.13483934, 0.13534908,
           0.13551436, 0.13399816, 0.13405652, 0.1354323, 0.13537434, 0.06685787, 0.06737807,
           0.06767439, 0.06927998, 0.06658512, 0.06643137, 0.0663855, 0.06645988, 0.06653237,
           0.06679216, 0.06700299, 0.06721594, 0.06899743, 0.06748881, 0.06692849, 0.06752784,
           0.06670087, 0.06690367, 0.06722134, 0.06834918, 0.06637124, 0.06663854, 0.06680202,
           0.06691353, 0.06701645, 0.06724831, 0.06726662, 0.06730385, 0.06735906, 0.06739713,
           0.06924284, 0.06767783, 0.06744281, 0.06815296, 0.06732813, 0.0676265, 0.06758311,
           0.06880609, 0.06710069, 0.0672657]
}
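
A sanity check on the feature dimensions in this file: the 164-entry MEANS and STDS vectors correspond to x/y coordinates for 21 left-hand landmarks, 21 right-hand landmarks, and the 40 lip landmarks listed in LIPS_LANDMARK_IDXS — the same 164 that appears in the (None, 128, 164) 'frames' input of model_architecture.json:

import json

with open('src/variables.json') as f:
    v = json.load(f)

# x/y for left hand (21) + right hand (21) + lips (40) landmarks
n_cols = 2 * (21 + 21 + len(v['LIPS_LANDMARK_IDXS']))
print(n_cols, len(v['MEANS']), len(v['STDS']))  # 164 164 164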
src/video_to_landmark_coordinates.py
ADDED
@@ -0,0 +1,100 @@
import cv2
import mediapipe as mp
import pandas as pd
import numpy as np


def generate_column_names():
    """
    Generate column names for a DataFrame that will store the coordinates of landmarks.

    Column names are formatted as '{coordinate}_{landmark_type}_{landmark_index}'.

    Returns:
        list: A list of strings representing the column names.
    """
    columns = ['frame']

    # face columns
    for coordinate in ['x', 'y']:
        for i in range(468):  # the MediaPipe face mesh contains 468 landmarks
            columns.append(f'{coordinate}_face_{i}')

    # hand columns
    for hand in ['left_hand', 'right_hand']:
        for coordinate in ['x', 'y']:
            for i in range(21):  # the MediaPipe hand model contains 21 landmarks
                columns.append(f'{coordinate}_{hand}_{i}')

    return columns


def video_to_landmarks(video_path, columns):
    """
    Extract face and hand landmarks from a video and store them in a DataFrame.

    The video is processed frame by frame. For each frame, face and hand landmarks
    are detected using MediaPipe's face mesh and hand models, respectively.
    The coordinates of the landmarks are stored in a DataFrame.

    Parameters:
        video_path (str): Path to the video file.
        columns (list): List of column names for the DataFrame.

    Returns:
        pd.DataFrame: A DataFrame where each row corresponds to a frame and each column corresponds to a landmark.
    """
    mp_face_mesh = mp.solutions.face_mesh
    mp_hands = mp.solutions.hands

    cap = cv2.VideoCapture(video_path)
    rows = []  # collect one dict per frame; building the DataFrame once at the end

    with mp_face_mesh.FaceMesh() as face_mesh, mp_hands.Hands(max_num_hands=2) as hands:
        frame_count = 0
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results_face = face_mesh.process(rgb_frame)
            results_hands = hands.process(rgb_frame)

            # Initialize the frame dictionary with NaNs
            frame_data = {column: np.NaN for column in columns}
            frame_data['frame'] = frame_count

            # Process face landmarks
            if results_face.multi_face_landmarks:
                for face_landmarks in results_face.multi_face_landmarks:
                    for i, landmark in enumerate(face_landmarks.landmark):
                        frame_data[f'x_face_{i}'] = landmark.x
                        frame_data[f'y_face_{i}'] = landmark.y

            # Process hand landmarks; a simple wrist-vs-thumb-tip heuristic
            # distinguishes the left hand from the right
            if results_hands.multi_hand_landmarks:
                for hand_landmarks in results_hands.multi_hand_landmarks:
                    if hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x < hand_landmarks.landmark[
                            mp_hands.HandLandmark.THUMB_TIP].x:
                        hand_type = 'left_hand'
                    else:
                        hand_type = 'right_hand'

                    for i, landmark in enumerate(hand_landmarks.landmark):
                        frame_data[f'x_{hand_type}_{i}'] = landmark.x
                        frame_data[f'y_{hand_type}_{i}'] = landmark.y

            rows.append(frame_data)
            frame_count += 1

    cap.release()

    return pd.DataFrame(rows, columns=columns)

# video_path = "videoplayback_with_landmarks.mp4"
# df = video_to_landmarks(video_path, generate_column_names())
#
# # Save the DataFrame to a CSV file
# df.to_csv('landmarks.csv', index=False)
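
Example use, mirroring the commented-out snippet above; the column layout is 1 frame counter + 2x468 face + 2x2x21 hand coordinates = 1021 columns (the bundled src/videoplayback.mp4 is assumed):

from src.video_to_landmark_coordinates import generate_column_names, video_to_landmarks

cols = generate_column_names()
print(len(cols))  # 1021
df = video_to_landmarks('src/videoplayback.mp4', cols)
print(df.shape)   # (number_of_frames, 1021)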
src/video_with_landmarks.py
ADDED
@@ -0,0 +1,89 @@
import cv2
import mediapipe as mp


def process_video_with_landmarks(video_path, output_path, scale_percent=100):
    """
    Process a video to identify and draw landmarks on faces and hands.

    Parameters:
        video_path (str): The path to the input video file.
        output_path (str): The path to the output video file.
        scale_percent (int, optional): The percentage of the original size. Default is 100.
    """
    # MediaPipe solutions
    mp_drawing = mp.solutions.drawing_utils
    mp_face_mesh = mp.solutions.face_mesh
    mp_hands = mp.solutions.hands

    # Open the video file
    cap = cv2.VideoCapture(video_path)

    # Get the video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Calculate the scaled dimensions
    width = int(width * scale_percent / 100)
    height = int(height * scale_percent / 100)

    # Define the output video file
    # fourcc = cv2.VideoWriter_fourcc(*'h264')
    fourcc = cv2.VideoWriter_fourcc(*'HEVC')
    out_fps = fps / 0.6  # raise the output fps to ~1.67x the original, which speeds up playback
    out = cv2.VideoWriter(output_path, fourcc, out_fps, (width, height))

    # Process each frame
    with mp_face_mesh.FaceMesh() as face_mesh, mp_hands.Hands() as hands:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            # Resize the frame
            frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)

            # Convert the frame to RGB
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process face landmarks
            results_face = face_mesh.process(rgb_frame)
            if results_face.multi_face_landmarks:
                for face_landmarks in results_face.multi_face_landmarks:
                    mp_drawing.draw_landmarks(
                        frame,
                        face_landmarks,
                        mp_face_mesh.FACEMESH_TESSELATION,
                        landmark_drawing_spec=mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=1, circle_radius=1),
                        connection_drawing_spec=mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=1)
                    )

            # Process hand landmarks
            results_hands = hands.process(rgb_frame)
            if results_hands.multi_hand_landmarks:
                for hand_landmarks in results_hands.multi_hand_landmarks:
                    if hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x < hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x:
                        landmark_color = (255, 0, 0)  # left hand (blue in BGR)
                    else:
                        landmark_color = (0, 0, 255)  # right hand (red in BGR)

                    mp_drawing.draw_landmarks(
                        frame,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        landmark_drawing_spec=mp_drawing.DrawingSpec(color=landmark_color, thickness=1, circle_radius=1),
                        connection_drawing_spec=mp_drawing.DrawingSpec(color=landmark_color, thickness=1)
                    )

            # Write the annotated frame to the output video
            out.write(frame)

            # If 'q' is pressed on the keyboard, exit this loop
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    # Close the video files
    cap.release()
    out.release()
    cv2.destroyAllWindows()
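
Example use, matching the call in app.py (paths assume the repository layout of this commit):

from src.video_with_landmarks import process_video_with_landmarks

# Overlay face/hand landmarks on the bundled sample clip and write the result.
process_video_with_landmarks('src/videoplayback.mp4', 'src/video_landmarks.mp4')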
src/videoplayback.mp4
ADDED
Binary file (768 kB).