# Spell-Net / src / video_to_landmark_coordinates.py
# Update files
import cv2
import mediapipe as mp
import pandas as pd
import numpy as np
def generate_column_names():
    """
    Generate column names for a DataFrame that will store coordinates of landmarks.

    Column names are formatted as '{coordinate}_{landmark_type}_{landmark_index}',
    e.g. 'x_face_0' or 'y_left_hand_20'. The order is: 'frame', all face x
    columns, all face y columns, then for each hand (left, right) its x columns
    followed by its y columns.

    Returns:
        list: A list of strings representing the column names.
    """
    coordinates = ('x', 'y')
    num_face_landmarks = 468  # Mediapipe face mesh contains 468 landmarks
    num_hand_landmarks = 21  # Mediapipe hand model contains 21 landmarks

    columns = ['frame']

    # face columns
    columns.extend(
        f'{coordinate}_face_{i}'
        for coordinate in coordinates
        for i in range(num_face_landmarks)
    )

    # hands columns
    columns.extend(
        f'{coordinate}_{hand}_{i}'
        for hand in ('left_hand', 'right_hand')
        for coordinate in coordinates
        for i in range(num_hand_landmarks)
    )

    return columns
def video_to_landmarks(video_path, columns):
    """
    Extract face and hand landmarks from a video and store them in a DataFrame.

    The video is processed frame by frame. For each frame, face and hand landmarks
    are detected using MediaPipe's face mesh and hand models, respectively.
    The coordinates of the landmarks are stored in a DataFrame.

    Args:
        video_path (str): Path to the video file.
        columns (list): List of column names for the DataFrame.

    Returns:
        pd.DataFrame: A DataFrame where each row corresponds to a frame and each
        column corresponds to a landmark. Columns for which nothing was written
        in a frame hold NaN. If no frame could be read, an empty DataFrame with
        the given columns is returned.
    """
    mp_face_mesh = mp.solutions.face_mesh
    mp_hands = mp.solutions.hands

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # appending to a DataFrame per frame copies the whole frame every time.
    rows = []

    cap = cv2.VideoCapture(video_path)
    try:
        with mp_face_mesh.FaceMesh() as face_mesh, \
                mp_hands.Hands(max_num_hands=2) as hands:
            frame_count = 0
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    # End of stream or read failure: stop processing.
                    break

                # OpenCV yields BGR frames; the frame is converted to RGB
                # before being handed to the MediaPipe models.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results_face = face_mesh.process(rgb_frame)
                results_hands = hands.process(rgb_frame)

                # Initialize frame dictionary with NaNs
                frame_data = {column: np.nan for column in columns}
                frame_data['frame'] = frame_count

                # Process face landmarks
                if results_face.multi_face_landmarks:
                    for face_landmarks in results_face.multi_face_landmarks:
                        for i, landmark in enumerate(face_landmarks.landmark):
                            frame_data[f'x_face_{i}'] = landmark.x
                            frame_data[f'y_face_{i}'] = landmark.y

                # Process hand landmarks
                if results_hands.multi_hand_landmarks:
                    for hand_landmarks in results_hands.multi_hand_landmarks:
                        wrist_x = hand_landmarks.landmark[mp_hands.HandLandmark.WRIST].x
                        # NOTE(review): the right-hand operand of this comparison
                        # was truncated in the reviewed source; THUMB_TIP is an
                        # assumed reconstruction -- confirm against the repo.
                        thumb_x = hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP].x
                        hand_type = 'left_hand' if wrist_x < thumb_x else 'right_hand'

                        # If both detected hands get the same label, the later
                        # one overwrites the earlier one for this frame.
                        for i, landmark in enumerate(hand_landmarks.landmark):
                            frame_data[f'x_{hand_type}_{i}'] = landmark.x
                            frame_data[f'y_{hand_type}_{i}'] = landmark.y

                rows.append(frame_data)
                frame_count += 1
    finally:
        # Always free the capture handle, even if processing raised.
        cap.release()

    if not rows:
        return pd.DataFrame(columns=columns)
    # Each row dict starts with the requested columns in order; any landmark
    # keys not listed in `columns` end up as extra trailing columns.
    return pd.DataFrame(rows)
# video_path = "videoplayback_with_landmarks.mp4"
# df = video_to_landmarks(video_path, generate_column_names())
# # Save the DataFrame to a CSV file
# df.to_csv('landmarks.csv', index=False)