import numpy as np
import os
import json
from tensorflow.math import argmax
import gradio as gr
from tensorflow.keras.models import model_from_json
import src.video_with_landmarks
import src.video_to_landmark_coordinates
import src.preprocess_coordinates_data
from src.load_model import Embedding, Encoder, Decoder, LandmarkEmbedding, EncoderTransformerBlock, MultiHeadAttention, \
    DecoderTransformerBlock
import src.predict_sequence
# Load the character to prediction index dictionary
character_to_prediction = 'src/character_to_prediction_index.json'
with open(character_to_prediction) as json_file:
    ORD2CHAR = json.load(json_file)
# Load the variables from the JSON file
json_file_path = "src/variables.json"
with open(json_file_path, 'r') as json_file:
    variables_dict = json.load(json_file)
# Load the model architecture from the JSON file
with open('src/model_architecture.json', 'r') as json_file:
    loaded_model_json = json_file.read()
# Import lips landmark indices
LIPS_LANDMARK_IDXS = np.array(variables_dict['LIPS_LANDMARK_IDXS'])
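
# Custom layers/blocks used by the serialized architecture; model_from_json
# needs this mapping to rebuild them from src/model_architecture.json.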
custom_objects = {'Embedding': Embedding,
                  'Encoder': Encoder,
                  'Decoder': Decoder,
                  'LandmarkEmbedding': LandmarkEmbedding,
                  'EncoderTransformerBlock': EncoderTransformerBlock,
                  'MultiHeadAttention': MultiHeadAttention,
                  'DecoderTransformerBlock': DecoderTransformerBlock}
def process_and_print_sequence(df):
    """
    Process the input DataFrame using the specified data processing steps and print the shapes of the sequences.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing tracking data.

    Returns:
    processed_sequence (np.ndarray): Processed sequence as a NumPy array.
    """
    LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['left_hand'], ['z'])
    RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['right_hand'], ['z'])
    LIPS_IDXS0, LIPS_NAMES0 = src.preprocess_coordinates_data.get_idxs(df, ['face'], ['z'], idxs_pos=LIPS_LANDMARK_IDXS)
    COLUMNS0 = np.concatenate((LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0))
    N_COLS0 = len(COLUMNS0)

    df = df[COLUMNS0]  # select only the N_COLS0 columns of interest
    all_tracking_sequence = df.values.reshape(1, -1, N_COLS0).astype(np.float32)  # DataFrame -> (1, n_frames, N_COLS0) array
    preprocess_layer_instance = src.preprocess_coordinates_data.PreprocessLayer()  # instantiate the PreprocessLayer class
    processed_sequence = preprocess_layer_instance(all_tracking_sequence)  # call the instance with the data

    print(f'input sequence shape: {all_tracking_sequence.shape}')
    print(f'processed sequence shape: {processed_sequence.shape}')
    return processed_sequence
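
# Illustrative usage (a sketch, not executed here): `df` would come from
# src.video_to_landmark_coordinates.video_to_landmarks, as in video_identity below.
#   processed = process_and_print_sequence(df)
#   processed.shape  # expected to be (128, 164), the model's fixed input size
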
def predict_final_sequence(processed_sequence, model):
    """
    Make a prediction on a given sequence using a pre-trained model.

    The sequence is expanded along the 0th dimension to account for the batch size.
    The prediction is made using the `predict_phrase` function, which returns a one-hot encoded prediction.
    This one-hot encoded prediction is then converted into index values using argmax.
    Finally, these index values are converted into a string representation using the `outputs2phrase` function.

    Args:
    processed_sequence (np.ndarray): The sequence to make a prediction on, of shape (128, 164).
    model (tf.keras.Model): The pre-trained model to use for making predictions.

    Returns:
    final_prediction (str): The final prediction made by the model, represented as a string.
    """
    sequence = np.expand_dims(processed_sequence, axis=0)  # change shape to (1, 128, 164)
    predicted_phrase_one_hot = src.predict_sequence.predict_phrase(sequence, model)
    predicted_phrase_one_hot = predicted_phrase_one_hot[0]  # remove the batch dimension
    predicted_phrase = argmax(predicted_phrase_one_hot, axis=-1).numpy()  # convert one-hot encoding to index values
    print(predicted_phrase)
    # Convert the index values to a string using the ORD2CHAR mapping
    final_prediction = src.predict_sequence.outputs2phrase(predicted_phrase, ORD2CHAR)
    return final_prediction
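
# Illustrative decoding sketch (hypothetical values): argmax reduces the one-hot
# prediction to token indices, which outputs2phrase maps to characters via ORD2CHAR.
#   np.argmax([[0, 1, 0], [1, 0, 0]], axis=-1)  # -> [1, 0]
#   e.g. "ab", assuming ORD2CHAR maps 1 -> 'a' and 0 -> 'b'
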
def video_identity(video):
    """
    Processes a video, extracts landmarks, feeds them to a pre-trained model, and makes a prediction.

    The processing pipeline consists of the following steps:
    1. Process the video with landmarks.
    2. Extract landmark coordinates and save them into a DataFrame.
    3. Preprocess the landmarks.
    4. Load a pre-trained model.
    5. Feed the preprocessed landmarks to the model and get a prediction.

    Parameters:
    video (str): Path to the video file.

    Returns:
    tuple: The path to the processed video with landmarks and the predicted outcome.
    """
    # 1. load the video and process it with landmarks
    original_video_path = video
    current_directory = os.path.dirname(os.path.abspath(__file__))
    output_file = "src/video_landmarks.mp4"
    output_path = os.path.join(current_directory, output_file)
    src.video_with_landmarks.process_video_with_landmarks(original_video_path, output_path)

    # 2. extract landmark coordinates
    df = src.video_to_landmark_coordinates.video_to_landmarks(output_path,
                                                              src.video_to_landmark_coordinates.generate_column_names())
    # Optionally save the DataFrame to a CSV file:
    # df.to_csv('landmarks.csv', index=False)

    # 3. preprocess the landmarks
    # Alternatively, read previously extracted data from a CSV file:
    # df = pd.read_csv('landmarks2.csv')
    # df.drop(['sequence_id'], axis=1, inplace=True)
    processed_sequence = process_and_print_sequence(df)

    # 4. load the model architecture from the JSON file
    model = model_from_json(loaded_model_json, custom_objects=custom_objects)
    # load the weights into the new model
    model.load_weights("src/model.h5")

    # 5. predict
    prediction = predict_final_sequence(processed_sequence, model)
    print(prediction)
    return output_path, prediction
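
# Illustrative local test (commented out so it does not run on import): exercises
# the full pipeline on the bundled sample clip used as the Gradio example below.
#   out_path, text = video_identity("src/videoplayback.mp4")
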
iface = gr.Interface(video_identity,
                     inputs=gr.Video(label="Upload your video"),  # label the input
                     outputs=[gr.Video(label="Processed video"),
                              gr.Textbox(label="Predicted sequence")],  # label the outputs
                     title="SpellNet",
                     description="This application analyzes your video input to interpret American Sign Language (ASL) "
                                 "gestures corresponding to letters, numbers, and other signs. The output consists of "
                                 "the original video with overlaid landmarks marking the key points of the ASL "
                                 "gestures, along with the predicted ASL sequence decoded into text.",
                     theme="gradio/monochrome",
                     examples=[os.path.join(os.path.dirname(__file__), "src/videoplayback.mp4")],
                     cache_examples=False)  # disable example caching
if __name__ == "__main__":
    iface.launch(share=False)