innat
committed on
Commit
·
24cfc1b
1
Parent(s):
434dc3e
update
Browse files- .gitattributes +1 -0
- app.py +128 -0
- examples/k400.mp4 +0 -0
- examples/ucf.avi +0 -0
- labels.py +682 -0
- requirements.txt +3 -0
- utils.py +105 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*__pycache__
|
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import numpy as np
|
3 |
+
import imageio
|
4 |
+
|
5 |
+
import tensorflow as tf
|
6 |
+
from tensorflow import keras
|
7 |
+
|
8 |
+
from utils import TubeMaskingGenerator
|
9 |
+
from utils import read_video, frame_sampling, denormalize, reconstrunction
|
10 |
+
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
|
11 |
+
from labels import K400_label_map, SSv2_label_map, UCF_label_map
|
12 |
+
|
13 |
+
|
14 |
+
# Cache of already-loaded models, keyed by dataset name. get_model() stores a
# (fine-tuned model, pre-trained model, index-to-label map) tuple per dataset.
MODEL_CACHE = {}

# Saved-model locations per dataset, in the order get_model() indexes them:
# element 0 is loaded as the fine-tuned model, element 1 as the pre-trained one.
# NOTE(review): the K400 entries look like absolute paths on the author's local
# machine, and the SSv2/UCF lists are empty, so get_model() has nothing to load
# for those two datasets — confirm the intended deployment paths.
MODELS = {
    'K400': [
        '/mnt/c/Users/innat/Desktop/videomae/K400/small/TFVideoMAE_S_K400_16x224_FT',
        '/mnt/c/Users/innat/Desktop/videomae/K400/small/TFVideoMAE_S_K400_16x224_PT'
    ],
    'SSv2': [],
    'UCF' : []
}
|
23 |
+
|
24 |
+
|
25 |
+
def tube_mask_generator():
    """Create a fresh tube mask as a boolean tensor with a leading batch axis.

    The masking window is derived from the module-level ``num_frames``,
    ``input_size`` and ``patch_size`` settings, and the generator is created
    with a ``mask_ratio`` of 0.70.

    Returns:
        A ``tf.bool`` tensor: the generator's output with one extra axis
        inserted at position 0.
    """
    patch_height, patch_width = patch_size[0], patch_size[1]
    temporal_tokens = num_frames // 2
    grid = (
        temporal_tokens,
        input_size // patch_height,
        input_size // patch_width,
    )

    generator = TubeMaskingGenerator(input_size=grid, mask_ratio=0.70)

    # Materialise the mask as int32 first and only convert to booleans at the
    # very end, after the batch axis has been added.
    mask = tf.constant(generator(), dtype=tf.int32)
    mask = tf.expand_dims(mask, axis=0)
    return tf.cast(mask, tf.bool)
|
40 |
+
|
41 |
+
|
42 |
+
def video_to_gif(video_array, gif_filename):
    """Write ``video_array`` to ``gif_filename`` via ``imageio.mimsave``.

    Args:
        video_array: Sequence of frames, handed to imageio unchanged.
        gif_filename: Destination path of the GIF.
    """
    # NOTE(review): confirm the unit of ``duration`` (seconds vs. milliseconds)
    # for the installed imageio version.
    gif_options = {'duration': 100}
    imageio.mimsave(gif_filename, video_array, **gif_options)
|
46 |
+
|
47 |
+
|
48 |
+
def get_model(data_type):
    """Return the models and label lookup for one dataset, loading them once.

    Args:
        data_type: Dataset name; one of ``'K400'``, ``'SSv2'`` or ``'UCF'``.

    Returns:
        A ``(ft_model, pt_model, label_map)`` tuple. ``label_map`` maps a class
        index to its label name, and both models have ``trainable`` set to
        ``False``. The tuple is stored in ``MODEL_CACHE`` and returned directly
        on later calls with the same ``data_type``.

    Raises:
        ValueError: If ``data_type`` is not a supported dataset (for example
            ``None`` when nothing was selected), or if ``MODELS`` does not
            list both a fine-tuned and a pre-trained model path for it.
    """
    if data_type in MODEL_CACHE:
        return MODEL_CACHE[data_type]

    label_tables = {
        'K400': K400_label_map,
        'SSv2': SSv2_label_map,
        'UCF': UCF_label_map,
    }
    # Fail with a readable message instead of an unbound-local error further
    # down when the dataset name is missing or misspelled.
    if data_type not in label_tables:
        raise ValueError(
            f"Unknown dataset type {data_type!r}; "
            f"expected one of {sorted(label_tables)}."
        )

    model_paths = MODELS.get(data_type, [])
    if len(model_paths) < 2:
        raise ValueError(
            "No fine-tuned/pre-trained model paths are configured for "
            f"{data_type!r}."
        )

    # Index 0 is the fine-tuned model, index 1 the pre-trained one.
    ft_model = keras.models.load_model(model_paths[0])
    pt_model = keras.models.load_model(model_paths[1])
    ft_model.trainable = False
    pt_model.trainable = False

    # The label tables map name -> index; predictions need index -> name.
    label_map = {index: name for name, index in label_tables[data_type].items()}

    MODEL_CACHE[data_type] = (ft_model, pt_model, label_map)
    return ft_model, pt_model, label_map
|
73 |
+
|
74 |
+
|
75 |
+
def inference(video_file, dataset_type):
    """Classify a video and render a masked-autoencoder reconstruction GIF.

    Args:
        video_file: Video location passed to ``read_video``.
        dataset_type: Dataset name passed to ``get_model``.

    Returns:
        A ``(confidences, gif_path)`` tuple. ``confidences`` is a dict from
        label name to softmax probability, inserted from most to least
        probable. ``gif_path`` is the path of the GIF written to disk
        (``'combined.gif'``).
    """
    clip = frame_sampling(read_video(video_file), num_frames=num_frames)
    mask_positions = tube_mask_generator()
    ft_model, pt_model, label_map = get_model(dataset_type)

    # Add the batch axis once and reuse it for both models.
    batch = clip[None, ...]

    # Fine-tuned model: scores -> softmax -> {label: probability}.
    scores = ft_model(batch, training=False)
    probabilities = tf.nn.softmax(scores).numpy().squeeze(0)
    ranked = np.argsort(probabilities)[::-1]
    confidences = {label_map[idx]: float(probabilities[idx]) for idx in ranked}

    # Pre-trained model: run it with the mask and rebuild the frames from its
    # output.
    decoded = pt_model(batch, mask_positions, training=False)
    reconstruct_output, mask = reconstrunction(batch, mask_positions, decoded)

    panels = (
        denormalize(clip),
        denormalize(mask[0] * clip),
        denormalize(reconstruct_output),
    )

    # One GIF frame per time step, laid out side by side:
    # input | masked input | reconstruction.
    gif_frames = [
        np.hstack([original, masked, rebuilt])
        for original, masked, rebuilt in zip(*panels)
    ]

    combined_gif = 'combined.gif'
    imageio.mimsave(combined_gif, gif_frames, duration=300, loop=0)
    return confidences, combined_gif
|
106 |
+
|
107 |
+
|
108 |
+
# Build and start the demo UI: a video plus a dataset choice go in; the class
# confidences and the reconstruction GIF come out.
_demo_inputs = [
    gr.Video(type="file"),
    gr.Radio(['K400', 'SSv2', 'UCF'], label='Dataset'),
]

_demo_outputs = [
    gr.Label(num_top_classes=3, label='confidence scores'),
    gr.Image(type="filepath", label='reconstructed masked autoencoder'),
]

# NOTE(review): each example row supplies only a video path although the
# interface has two inputs — confirm Gradio accepts this and that every listed
# file is present in the repository.
_demo_examples = [
    ["examples/k400.mp4"],
    ["examples/ssv2.mp4"],
    ["examples/ucf.avi"],
]

gr.Interface(
    fn=inference,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    examples=_demo_examples,
    title="VideoMAE",
).launch()
|
examples/k400.mp4
ADDED
Binary file (653 kB). View file
|
|
examples/ucf.avi
ADDED
Binary file (210 kB). View file
|
|
labels.py
ADDED
@@ -0,0 +1,682 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
K400_label_map = {
|
2 |
+
"abseiling": 0,
|
3 |
+
"air_drumming": 1,
|
4 |
+
"answering_questions": 2,
|
5 |
+
"applauding": 3,
|
6 |
+
"applying_cream": 4,
|
7 |
+
"archery": 5,
|
8 |
+
"arm_wrestling": 6,
|
9 |
+
"arranging_flowers": 7,
|
10 |
+
"assembling_computer": 8,
|
11 |
+
"auctioning": 9,
|
12 |
+
"baby_waking_up": 10,
|
13 |
+
"baking_cookies": 11,
|
14 |
+
"balloon_blowing": 12,
|
15 |
+
"bandaging": 13,
|
16 |
+
"barbequing": 14,
|
17 |
+
"bartending": 15,
|
18 |
+
"beatboxing": 16,
|
19 |
+
"bee_keeping": 17,
|
20 |
+
"belly_dancing": 18,
|
21 |
+
"bench_pressing": 19,
|
22 |
+
"bending_back": 20,
|
23 |
+
"bending_metal": 21,
|
24 |
+
"biking_through_snow": 22,
|
25 |
+
"blasting_sand": 23,
|
26 |
+
"blowing_glass": 24,
|
27 |
+
"blowing_leaves": 25,
|
28 |
+
"blowing_nose": 26,
|
29 |
+
"blowing_out_candles": 27,
|
30 |
+
"bobsledding": 28,
|
31 |
+
"bookbinding": 29,
|
32 |
+
"bouncing_on_trampoline": 30,
|
33 |
+
"bowling": 31,
|
34 |
+
"braiding_hair": 32,
|
35 |
+
"breading_or_breadcrumbing": 33,
|
36 |
+
"breakdancing": 34,
|
37 |
+
"brush_painting": 35,
|
38 |
+
"brushing_hair": 36,
|
39 |
+
"brushing_teeth": 37,
|
40 |
+
"building_cabinet": 38,
|
41 |
+
"building_shed": 39,
|
42 |
+
"bungee_jumping": 40,
|
43 |
+
"busking": 41,
|
44 |
+
"canoeing_or_kayaking": 42,
|
45 |
+
"capoeira": 43,
|
46 |
+
"carrying_baby": 44,
|
47 |
+
"cartwheeling": 45,
|
48 |
+
"carving_pumpkin": 46,
|
49 |
+
"catching_fish": 47,
|
50 |
+
"catching_or_throwing_baseball": 48,
|
51 |
+
"catching_or_throwing_frisbee": 49,
|
52 |
+
"catching_or_throwing_softball": 50,
|
53 |
+
"celebrating": 51,
|
54 |
+
"changing_oil": 52,
|
55 |
+
"changing_wheel": 53,
|
56 |
+
"checking_tires": 54,
|
57 |
+
"cheerleading": 55,
|
58 |
+
"chopping_wood": 56,
|
59 |
+
"clapping": 57,
|
60 |
+
"clay_pottery_making": 58,
|
61 |
+
"clean_and_jerk": 59,
|
62 |
+
"cleaning_floor": 60,
|
63 |
+
"cleaning_gutters": 61,
|
64 |
+
"cleaning_pool": 62,
|
65 |
+
"cleaning_shoes": 63,
|
66 |
+
"cleaning_toilet": 64,
|
67 |
+
"cleaning_windows": 65,
|
68 |
+
"climbing_a_rope": 66,
|
69 |
+
"climbing_ladder": 67,
|
70 |
+
"climbing_tree": 68,
|
71 |
+
"contact_juggling": 69,
|
72 |
+
"cooking_chicken": 70,
|
73 |
+
"cooking_egg": 71,
|
74 |
+
"cooking_on_campfire": 72,
|
75 |
+
"cooking_sausages": 73,
|
76 |
+
"counting_money": 74,
|
77 |
+
"country_line_dancing": 75,
|
78 |
+
"cracking_neck": 76,
|
79 |
+
"crawling_baby": 77,
|
80 |
+
"crossing_river": 78,
|
81 |
+
"crying": 79,
|
82 |
+
"curling_hair": 80,
|
83 |
+
"cutting_nails": 81,
|
84 |
+
"cutting_pineapple": 82,
|
85 |
+
"cutting_watermelon": 83,
|
86 |
+
"dancing_ballet": 84,
|
87 |
+
"dancing_charleston": 85,
|
88 |
+
"dancing_gangnam_style": 86,
|
89 |
+
"dancing_macarena": 87,
|
90 |
+
"deadlifting": 88,
|
91 |
+
"decorating_the_christmas_tree": 89,
|
92 |
+
"digging": 90,
|
93 |
+
"dining": 91,
|
94 |
+
"disc_golfing": 92,
|
95 |
+
"diving_cliff": 93,
|
96 |
+
"dodgeball": 94,
|
97 |
+
"doing_aerobics": 95,
|
98 |
+
"doing_laundry": 96,
|
99 |
+
"doing_nails": 97,
|
100 |
+
"drawing": 98,
|
101 |
+
"dribbling_basketball": 99,
|
102 |
+
"drinking": 100,
|
103 |
+
"drinking_beer": 101,
|
104 |
+
"drinking_shots": 102,
|
105 |
+
"driving_car": 103,
|
106 |
+
"driving_tractor": 104,
|
107 |
+
"drop_kicking": 105,
|
108 |
+
"drumming_fingers": 106,
|
109 |
+
"dunking_basketball": 107,
|
110 |
+
"dying_hair": 108,
|
111 |
+
"eating_burger": 109,
|
112 |
+
"eating_cake": 110,
|
113 |
+
"eating_carrots": 111,
|
114 |
+
"eating_chips": 112,
|
115 |
+
"eating_doughnuts": 113,
|
116 |
+
"eating_hotdog": 114,
|
117 |
+
"eating_ice_cream": 115,
|
118 |
+
"eating_spaghetti": 116,
|
119 |
+
"eating_watermelon": 117,
|
120 |
+
"egg_hunting": 118,
|
121 |
+
"exercising_arm": 119,
|
122 |
+
"exercising_with_an_exercise_ball": 120,
|
123 |
+
"extinguishing_fire": 121,
|
124 |
+
"faceplanting": 122,
|
125 |
+
"feeding_birds": 123,
|
126 |
+
"feeding_fish": 124,
|
127 |
+
"feeding_goats": 125,
|
128 |
+
"filling_eyebrows": 126,
|
129 |
+
"finger_snapping": 127,
|
130 |
+
"fixing_hair": 128,
|
131 |
+
"flipping_pancake": 129,
|
132 |
+
"flying_kite": 130,
|
133 |
+
"folding_clothes": 131,
|
134 |
+
"folding_napkins": 132,
|
135 |
+
"folding_paper": 133,
|
136 |
+
"front_raises": 134,
|
137 |
+
"frying_vegetables": 135,
|
138 |
+
"garbage_collecting": 136,
|
139 |
+
"gargling": 137,
|
140 |
+
"getting_a_haircut": 138,
|
141 |
+
"getting_a_tattoo": 139,
|
142 |
+
"giving_or_receiving_award": 140,
|
143 |
+
"golf_chipping": 141,
|
144 |
+
"golf_driving": 142,
|
145 |
+
"golf_putting": 143,
|
146 |
+
"grinding_meat": 144,
|
147 |
+
"grooming_dog": 145,
|
148 |
+
"grooming_horse": 146,
|
149 |
+
"gymnastics_tumbling": 147,
|
150 |
+
"hammer_throw": 148,
|
151 |
+
"headbanging": 149,
|
152 |
+
"headbutting": 150,
|
153 |
+
"high_jump": 151,
|
154 |
+
"high_kick": 152,
|
155 |
+
"hitting_baseball": 153,
|
156 |
+
"hockey_stop": 154,
|
157 |
+
"holding_snake": 155,
|
158 |
+
"hopscotch": 156,
|
159 |
+
"hoverboarding": 157,
|
160 |
+
"hugging": 158,
|
161 |
+
"hula_hooping": 159,
|
162 |
+
"hurdling": 160,
|
163 |
+
"hurling_(sport)": 161,
|
164 |
+
"ice_climbing": 162,
|
165 |
+
"ice_fishing": 163,
|
166 |
+
"ice_skating": 164,
|
167 |
+
"ironing": 165,
|
168 |
+
"javelin_throw": 166,
|
169 |
+
"jetskiing": 167,
|
170 |
+
"jogging": 168,
|
171 |
+
"juggling_balls": 169,
|
172 |
+
"juggling_fire": 170,
|
173 |
+
"juggling_soccer_ball": 171,
|
174 |
+
"jumping_into_pool": 172,
|
175 |
+
"jumpstyle_dancing": 173,
|
176 |
+
"kicking_field_goal": 174,
|
177 |
+
"kicking_soccer_ball": 175,
|
178 |
+
"kissing": 176,
|
179 |
+
"kitesurfing": 177,
|
180 |
+
"knitting": 178,
|
181 |
+
"krumping": 179,
|
182 |
+
"laughing": 180,
|
183 |
+
"laying_bricks": 181,
|
184 |
+
"long_jump": 182,
|
185 |
+
"lunge": 183,
|
186 |
+
"making_a_cake": 184,
|
187 |
+
"making_a_sandwich": 185,
|
188 |
+
"making_bed": 186,
|
189 |
+
"making_jewelry": 187,
|
190 |
+
"making_pizza": 188,
|
191 |
+
"making_snowman": 189,
|
192 |
+
"making_sushi": 190,
|
193 |
+
"making_tea": 191,
|
194 |
+
"marching": 192,
|
195 |
+
"massaging_back": 193,
|
196 |
+
"massaging_feet": 194,
|
197 |
+
"massaging_legs": 195,
|
198 |
+
"massaging_person's_head": 196,
|
199 |
+
"milking_cow": 197,
|
200 |
+
"mopping_floor": 198,
|
201 |
+
"motorcycling": 199,
|
202 |
+
"moving_furniture": 200,
|
203 |
+
"mowing_lawn": 201,
|
204 |
+
"news_anchoring": 202,
|
205 |
+
"opening_bottle": 203,
|
206 |
+
"opening_present": 204,
|
207 |
+
"paragliding": 205,
|
208 |
+
"parasailing": 206,
|
209 |
+
"parkour": 207,
|
210 |
+
"passing_American_football_(in_game)": 208,
|
211 |
+
"passing_American_football_(not_in_game)": 209,
|
212 |
+
"peeling_apples": 210,
|
213 |
+
"peeling_potatoes": 211,
|
214 |
+
"petting_animal_(not_cat)": 212,
|
215 |
+
"petting_cat": 213,
|
216 |
+
"picking_fruit": 214,
|
217 |
+
"planting_trees": 215,
|
218 |
+
"plastering": 216,
|
219 |
+
"playing_accordion": 217,
|
220 |
+
"playing_badminton": 218,
|
221 |
+
"playing_bagpipes": 219,
|
222 |
+
"playing_basketball": 220,
|
223 |
+
"playing_bass_guitar": 221,
|
224 |
+
"playing_cards": 222,
|
225 |
+
"playing_cello": 223,
|
226 |
+
"playing_chess": 224,
|
227 |
+
"playing_clarinet": 225,
|
228 |
+
"playing_controller": 226,
|
229 |
+
"playing_cricket": 227,
|
230 |
+
"playing_cymbals": 228,
|
231 |
+
"playing_didgeridoo": 229,
|
232 |
+
"playing_drums": 230,
|
233 |
+
"playing_flute": 231,
|
234 |
+
"playing_guitar": 232,
|
235 |
+
"playing_harmonica": 233,
|
236 |
+
"playing_harp": 234,
|
237 |
+
"playing_ice_hockey": 235,
|
238 |
+
"playing_keyboard": 236,
|
239 |
+
"playing_kickball": 237,
|
240 |
+
"playing_monopoly": 238,
|
241 |
+
"playing_organ": 239,
|
242 |
+
"playing_paintball": 240,
|
243 |
+
"playing_piano": 241,
|
244 |
+
"playing_poker": 242,
|
245 |
+
"playing_recorder": 243,
|
246 |
+
"playing_saxophone": 244,
|
247 |
+
"playing_squash_or_racquetball": 245,
|
248 |
+
"playing_tennis": 246,
|
249 |
+
"playing_trombone": 247,
|
250 |
+
"playing_trumpet": 248,
|
251 |
+
"playing_ukulele": 249,
|
252 |
+
"playing_violin": 250,
|
253 |
+
"playing_volleyball": 251,
|
254 |
+
"playing_xylophone": 252,
|
255 |
+
"pole_vault": 253,
|
256 |
+
"presenting_weather_forecast": 254,
|
257 |
+
"pull_ups": 255,
|
258 |
+
"pumping_fist": 256,
|
259 |
+
"pumping_gas": 257,
|
260 |
+
"punching_bag": 258,
|
261 |
+
"punching_person_(boxing)": 259,
|
262 |
+
"push_up": 260,
|
263 |
+
"pushing_car": 261,
|
264 |
+
"pushing_cart": 262,
|
265 |
+
"pushing_wheelchair": 263,
|
266 |
+
"reading_book": 264,
|
267 |
+
"reading_newspaper": 265,
|
268 |
+
"recording_music": 266,
|
269 |
+
"riding_a_bike": 267,
|
270 |
+
"riding_camel": 268,
|
271 |
+
"riding_elephant": 269,
|
272 |
+
"riding_mechanical_bull": 270,
|
273 |
+
"riding_mountain_bike": 271,
|
274 |
+
"riding_mule": 272,
|
275 |
+
"riding_or_walking_with_horse": 273,
|
276 |
+
"riding_scooter": 274,
|
277 |
+
"riding_unicycle": 275,
|
278 |
+
"ripping_paper": 276,
|
279 |
+
"robot_dancing": 277,
|
280 |
+
"rock_climbing": 278,
|
281 |
+
"rock_scissors_paper": 279,
|
282 |
+
"roller_skating": 280,
|
283 |
+
"running_on_treadmill": 281,
|
284 |
+
"sailing": 282,
|
285 |
+
"salsa_dancing": 283,
|
286 |
+
"sanding_floor": 284,
|
287 |
+
"scrambling_eggs": 285,
|
288 |
+
"scuba_diving": 286,
|
289 |
+
"setting_table": 287,
|
290 |
+
"shaking_hands": 288,
|
291 |
+
"shaking_head": 289,
|
292 |
+
"sharpening_knives": 290,
|
293 |
+
"sharpening_pencil": 291,
|
294 |
+
"shaving_head": 292,
|
295 |
+
"shaving_legs": 293,
|
296 |
+
"shearing_sheep": 294,
|
297 |
+
"shining_shoes": 295,
|
298 |
+
"shooting_basketball": 296,
|
299 |
+
"shooting_goal_(soccer)": 297,
|
300 |
+
"shot_put": 298,
|
301 |
+
"shoveling_snow": 299,
|
302 |
+
"shredding_paper": 300,
|
303 |
+
"shuffling_cards": 301,
|
304 |
+
"side_kick": 302,
|
305 |
+
"sign_language_interpreting": 303,
|
306 |
+
"singing": 304,
|
307 |
+
"situp": 305,
|
308 |
+
"skateboarding": 306,
|
309 |
+
"ski_jumping": 307,
|
310 |
+
"skiing_(not_slalom_or_crosscountry)": 308,
|
311 |
+
"skiing_crosscountry": 309,
|
312 |
+
"skiing_slalom": 310,
|
313 |
+
"skipping_rope": 311,
|
314 |
+
"skydiving": 312,
|
315 |
+
"slacklining": 313,
|
316 |
+
"slapping": 314,
|
317 |
+
"sled_dog_racing": 315,
|
318 |
+
"smoking": 316,
|
319 |
+
"smoking_hookah": 317,
|
320 |
+
"snatch_weight_lifting": 318,
|
321 |
+
"sneezing": 319,
|
322 |
+
"sniffing": 320,
|
323 |
+
"snorkeling": 321,
|
324 |
+
"snowboarding": 322,
|
325 |
+
"snowkiting": 323,
|
326 |
+
"snowmobiling": 324,
|
327 |
+
"somersaulting": 325,
|
328 |
+
"spinning_poi": 326,
|
329 |
+
"spray_painting": 327,
|
330 |
+
"spraying": 328,
|
331 |
+
"springboard_diving": 329,
|
332 |
+
"squat": 330,
|
333 |
+
"sticking_tongue_out": 331,
|
334 |
+
"stomping_grapes": 332,
|
335 |
+
"stretching_arm": 333,
|
336 |
+
"stretching_leg": 334,
|
337 |
+
"strumming_guitar": 335,
|
338 |
+
"surfing_crowd": 336,
|
339 |
+
"surfing_water": 337,
|
340 |
+
"sweeping_floor": 338,
|
341 |
+
"swimming_backstroke": 339,
|
342 |
+
"swimming_breast_stroke": 340,
|
343 |
+
"swimming_butterfly_stroke": 341,
|
344 |
+
"swing_dancing": 342,
|
345 |
+
"swinging_legs": 343,
|
346 |
+
"swinging_on_something": 344,
|
347 |
+
"sword_fighting": 345,
|
348 |
+
"tai_chi": 346,
|
349 |
+
"taking_a_shower": 347,
|
350 |
+
"tango_dancing": 348,
|
351 |
+
"tap_dancing": 349,
|
352 |
+
"tapping_guitar": 350,
|
353 |
+
"tapping_pen": 351,
|
354 |
+
"tasting_beer": 352,
|
355 |
+
"tasting_food": 353,
|
356 |
+
"testifying": 354,
|
357 |
+
"texting": 355,
|
358 |
+
"throwing_axe": 356,
|
359 |
+
"throwing_ball": 357,
|
360 |
+
"throwing_discus": 358,
|
361 |
+
"tickling": 359,
|
362 |
+
"tobogganing": 360,
|
363 |
+
"tossing_coin": 361,
|
364 |
+
"tossing_salad": 362,
|
365 |
+
"training_dog": 363,
|
366 |
+
"trapezing": 364,
|
367 |
+
"trimming_or_shaving_beard": 365,
|
368 |
+
"trimming_trees": 366,
|
369 |
+
"triple_jump": 367,
|
370 |
+
"tying_bow_tie": 368,
|
371 |
+
"tying_knot_(not_on_a_tie)": 369,
|
372 |
+
"tying_tie": 370,
|
373 |
+
"unboxing": 371,
|
374 |
+
"unloading_truck": 372,
|
375 |
+
"using_computer": 373,
|
376 |
+
"using_remote_controller_(not_gaming)": 374,
|
377 |
+
"using_segway": 375,
|
378 |
+
"vault": 376,
|
379 |
+
"waiting_in_line": 377,
|
380 |
+
"walking_the_dog": 378,
|
381 |
+
"washing_dishes": 379,
|
382 |
+
"washing_feet": 380,
|
383 |
+
"washing_hair": 381,
|
384 |
+
"washing_hands": 382,
|
385 |
+
"water_skiing": 383,
|
386 |
+
"water_sliding": 384,
|
387 |
+
"watering_plants": 385,
|
388 |
+
"waxing_back": 386,
|
389 |
+
"waxing_chest": 387,
|
390 |
+
"waxing_eyebrows": 388,
|
391 |
+
"waxing_legs": 389,
|
392 |
+
"weaving_basket": 390,
|
393 |
+
"welding": 391,
|
394 |
+
"whistling": 392,
|
395 |
+
"windsurfing": 393,
|
396 |
+
"wrapping_present": 394,
|
397 |
+
"wrestling": 395,
|
398 |
+
"writing": 396,
|
399 |
+
"yawning": 397,
|
400 |
+
"yoga": 398,
|
401 |
+
"zumba": 399,
|
402 |
+
}
|
403 |
+
SSv2_label_map = {
|
404 |
+
"Approaching something with your camera": 0,
|
405 |
+
"Attaching something to something": 1,
|
406 |
+
"Bending something so that it deforms": 2,
|
407 |
+
"Bending something until it breaks": 3,
|
408 |
+
"Burying something in something": 4,
|
409 |
+
"Closing something": 5,
|
410 |
+
"Covering something with something": 6,
|
411 |
+
"Digging something out of something": 7,
|
412 |
+
"Dropping something behind something": 8,
|
413 |
+
"Dropping something in front of something": 9,
|
414 |
+
"Dropping something into something": 10,
|
415 |
+
"Dropping something next to something": 11,
|
416 |
+
"Dropping something onto something": 12,
|
417 |
+
"Failing to put something into something because something does not fit": 13,
|
418 |
+
"Folding something": 14,
|
419 |
+
"Hitting something with something": 15,
|
420 |
+
"Holding something": 16,
|
421 |
+
"Holding something behind something": 17,
|
422 |
+
"Holding something in front of something": 18,
|
423 |
+
"Holding something next to something": 19,
|
424 |
+
"Holding something over something": 20,
|
425 |
+
"Laying something on the table on its side, not upright": 21,
|
426 |
+
"Letting something roll along a flat surface": 22,
|
427 |
+
"Letting something roll down a slanted surface": 23,
|
428 |
+
"Letting something roll up a slanted surface, so it rolls back down": 24,
|
429 |
+
"Lifting a surface with something on it but not enough for it to slide down": 25,
|
430 |
+
"Lifting a surface with something on it until it starts sliding down": 26,
|
431 |
+
"Lifting something up completely without letting it drop down": 27,
|
432 |
+
"Lifting something up completely, then letting it drop down": 28,
|
433 |
+
"Lifting something with something on it": 29,
|
434 |
+
"Lifting up one end of something without letting it drop down": 30,
|
435 |
+
"Lifting up one end of something, then letting it drop down": 31,
|
436 |
+
"Moving away from something with your camera": 32,
|
437 |
+
"Moving part of something": 33,
|
438 |
+
"Moving something across a surface until it falls down": 34,
|
439 |
+
"Moving something across a surface without it falling down": 35,
|
440 |
+
"Moving something and something away from each other": 36,
|
441 |
+
"Moving something and something closer to each other": 37,
|
442 |
+
"Moving something and something so they collide with each other": 38,
|
443 |
+
"Moving something and something so they pass each other": 39,
|
444 |
+
"Moving something away from something": 40,
|
445 |
+
"Moving something away from the camera": 41,
|
446 |
+
"Moving something closer to something": 42,
|
447 |
+
"Moving something down": 43,
|
448 |
+
"Moving something towards the camera": 44,
|
449 |
+
"Moving something up": 45,
|
450 |
+
"Opening something": 46,
|
451 |
+
"Picking something up": 47,
|
452 |
+
"Piling something up": 48,
|
453 |
+
"Plugging something into something": 49,
|
454 |
+
"Plugging something into something but pulling it right out as you remove your hand": 50,
|
455 |
+
"Poking a hole into some substance": 51,
|
456 |
+
"Poking a hole into something soft": 52,
|
457 |
+
"Poking a stack of something so the stack collapses": 53,
|
458 |
+
"Poking a stack of something without the stack collapsing": 54,
|
459 |
+
"Poking something so it slightly moves": 55,
|
460 |
+
"Poking something so lightly that it doesn't or almost doesn't move": 56,
|
461 |
+
"Poking something so that it falls over": 57,
|
462 |
+
"Poking something so that it spins around": 58,
|
463 |
+
"Pouring something into something": 59,
|
464 |
+
"Pouring something into something until it overflows": 60,
|
465 |
+
"Pouring something onto something": 61,
|
466 |
+
"Pouring something out of something": 62,
|
467 |
+
"Pretending or failing to wipe something off of something": 63,
|
468 |
+
"Pretending or trying and failing to twist something": 64,
|
469 |
+
"Pretending to be tearing something that is not tearable": 65,
|
470 |
+
"Pretending to close something without actually closing it": 66,
|
471 |
+
"Pretending to open something without actually opening it": 67,
|
472 |
+
"Pretending to pick something up": 68,
|
473 |
+
"Pretending to poke something": 69,
|
474 |
+
"Pretending to pour something out of something, but something is empty": 70,
|
475 |
+
"Pretending to put something behind something": 71,
|
476 |
+
"Pretending to put something into something": 72,
|
477 |
+
"Pretending to put something next to something": 73,
|
478 |
+
"Pretending to put something on a surface": 74,
|
479 |
+
"Pretending to put something onto something": 75,
|
480 |
+
"Pretending to put something underneath something": 76,
|
481 |
+
"Pretending to scoop something up with something": 77,
|
482 |
+
"Pretending to spread air onto something": 78,
|
483 |
+
"Pretending to sprinkle air onto something": 79,
|
484 |
+
"Pretending to squeeze something": 80,
|
485 |
+
"Pretending to take something from somewhere": 81,
|
486 |
+
"Pretending to take something out of something": 82,
|
487 |
+
"Pretending to throw something": 83,
|
488 |
+
"Pretending to turn something upside down": 84,
|
489 |
+
"Pulling something from behind of something": 85,
|
490 |
+
"Pulling something from left to right": 86,
|
491 |
+
"Pulling something from right to left": 87,
|
492 |
+
"Pulling something onto something": 88,
|
493 |
+
"Pulling something out of something": 89,
|
494 |
+
"Pulling two ends of something but nothing happens": 90,
|
495 |
+
"Pulling two ends of something so that it gets stretched": 91,
|
496 |
+
"Pulling two ends of something so that it separates into two pieces": 92,
|
497 |
+
"Pushing something from left to right": 93,
|
498 |
+
"Pushing something from right to left": 94,
|
499 |
+
"Pushing something off of something": 95,
|
500 |
+
"Pushing something onto something": 96,
|
501 |
+
"Pushing something so it spins": 97,
|
502 |
+
"Pushing something so that it almost falls off but doesn't": 98,
|
503 |
+
"Pushing something so that it falls off the table": 99,
|
504 |
+
"Pushing something so that it slightly moves": 100,
|
505 |
+
"Pushing something with something": 101,
|
506 |
+
"Putting number of something onto something": 102,
|
507 |
+
"Putting something and something on the table": 103,
|
508 |
+
"Putting something behind something": 104,
|
509 |
+
"Putting something in front of something": 105,
|
510 |
+
"Putting something into something": 106,
|
511 |
+
"Putting something next to something": 107,
|
512 |
+
"Putting something on a flat surface without letting it roll": 108,
|
513 |
+
"Putting something on a surface": 109,
|
514 |
+
"Putting something on the edge of something so it is not supported and falls down": 110,
|
515 |
+
"Putting something onto a slanted surface but it doesn't glide down": 111,
|
516 |
+
"Putting something onto something": 112,
|
517 |
+
"Putting something onto something else that cannot support it so it falls down": 113,
|
518 |
+
"Putting something similar to other things that are already on the table": 114,
|
519 |
+
"Putting something that can't roll onto a slanted surface, so it slides down": 115,
|
520 |
+
"Putting something that can't roll onto a slanted surface, so it stays where it is": 116,
|
521 |
+
"Putting something that cannot actually stand upright upright on the table, so it falls on its side": 117,
|
522 |
+
"Putting something underneath something": 118,
|
523 |
+
"Putting something upright on the table": 119,
|
524 |
+
"Putting something, something and something on the table": 120,
|
525 |
+
"Removing something, revealing something behind": 121,
|
526 |
+
"Rolling something on a flat surface": 122,
|
527 |
+
"Scooping something up with something": 123,
|
528 |
+
"Showing a photo of something to the camera": 124,
|
529 |
+
"Showing something behind something": 125,
|
530 |
+
"Showing something next to something": 126,
|
531 |
+
"Showing something on top of something": 127,
|
532 |
+
"Showing something to the camera": 128,
|
533 |
+
"Showing that something is empty": 129,
|
534 |
+
"Showing that something is inside something": 130,
|
535 |
+
"Something being deflected from something": 131,
|
536 |
+
"Something colliding with something and both are being deflected": 132,
|
537 |
+
"Something colliding with something and both come to a halt": 133,
|
538 |
+
"Something falling like a feather or paper": 134,
|
539 |
+
"Something falling like a rock": 135,
|
540 |
+
"Spilling something behind something": 136,
|
541 |
+
"Spilling something next to something": 137,
|
542 |
+
"Spilling something onto something": 138,
|
543 |
+
"Spinning something so it continues spinning": 139,
|
544 |
+
"Spinning something that quickly stops spinning": 140,
|
545 |
+
"Spreading something onto something": 141,
|
546 |
+
"Sprinkling something onto something": 142,
|
547 |
+
"Squeezing something": 143,
|
548 |
+
"Stacking number of something": 144,
|
549 |
+
"Stuffing something into something": 145,
|
550 |
+
"Taking one of many similar things on the table": 146,
|
551 |
+
"Taking something from somewhere": 147,
|
552 |
+
"Taking something out of something": 148,
|
553 |
+
"Tearing something into two pieces": 149,
|
554 |
+
"Tearing something just a little bit": 150,
|
555 |
+
"Throwing something": 151,
|
556 |
+
"Throwing something against something": 152,
|
557 |
+
"Throwing something in the air and catching it": 153,
|
558 |
+
"Throwing something in the air and letting it fall": 154,
|
559 |
+
"Throwing something onto a surface": 155,
|
560 |
+
"Tilting something with something on it slightly so it doesn't fall down": 156,
|
561 |
+
"Tilting something with something on it until it falls off": 157,
|
562 |
+
"Tipping something over": 158,
|
563 |
+
"Tipping something with something in it over, so something in it falls out": 159,
|
564 |
+
"Touching (without moving) part of something": 160,
|
565 |
+
"Trying but failing to attach something to something because it doesn't stick": 161,
|
566 |
+
"Trying to bend something unbendable so nothing happens": 162,
|
567 |
+
"Trying to pour something into something, but missing so it spills next to it": 163,
|
568 |
+
"Turning something upside down": 164,
|
569 |
+
"Turning the camera downwards while filming something": 165,
|
570 |
+
"Turning the camera left while filming something": 166,
|
571 |
+
"Turning the camera right while filming something": 167,
|
572 |
+
"Turning the camera upwards while filming something": 168,
|
573 |
+
"Twisting (wringing) something wet until water comes out": 169,
|
574 |
+
"Twisting something": 170,
|
575 |
+
"Uncovering something": 171,
|
576 |
+
"Unfolding something": 172,
|
577 |
+
"Wiping something off of something": 173,
|
578 |
+
"Moving something and something so they overlap each other": 174,
|
579 |
+
}
|
580 |
+
UCF_label_map = {
|
581 |
+
"ApplyEyeMakeup": 0,
|
582 |
+
"ApplyLipstick": 1,
|
583 |
+
"Archery": 2,
|
584 |
+
"BabyCrawling": 3,
|
585 |
+
"BalanceBeam": 4,
|
586 |
+
"BandMarching": 5,
|
587 |
+
"BaseballPitch": 6,
|
588 |
+
"Basketball": 7,
|
589 |
+
"BasketballDunk": 8,
|
590 |
+
"BenchPress": 9,
|
591 |
+
"Biking": 10,
|
592 |
+
"Billiards": 11,
|
593 |
+
"BlowDryHair": 12,
|
594 |
+
"BlowingCandles": 13,
|
595 |
+
"BodyWeightSquats": 14,
|
596 |
+
"Bowling": 15,
|
597 |
+
"BoxingPunchingBag": 16,
|
598 |
+
"BoxingSpeedBag": 17,
|
599 |
+
"BreastStroke": 18,
|
600 |
+
"BrushingTeeth": 19,
|
601 |
+
"CleanAndJerk": 20,
|
602 |
+
"CliffDiving": 21,
|
603 |
+
"CricketBowling": 22,
|
604 |
+
"CricketShot": 23,
|
605 |
+
"CuttingInKitchen": 24,
|
606 |
+
"Diving": 25,
|
607 |
+
"Drumming": 26,
|
608 |
+
"Fencing": 27,
|
609 |
+
"FieldHockeyPenalty": 28,
|
610 |
+
"FloorGymnastics": 29,
|
611 |
+
"FrisbeeCatch": 30,
|
612 |
+
"FrontCrawl": 31,
|
613 |
+
"GolfSwing": 32,
|
614 |
+
"Haircut": 33,
|
615 |
+
"Hammering": 34,
|
616 |
+
"HammerThrow": 35,
|
617 |
+
"HandstandPushups": 36,
|
618 |
+
"HandstandWalking": 37,
|
619 |
+
"HeadMassage": 38,
|
620 |
+
"HighJump": 39,
|
621 |
+
"HorseRace": 40,
|
622 |
+
"HorseRiding": 41,
|
623 |
+
"HulaHoop": 42,
|
624 |
+
"IceDancing": 43,
|
625 |
+
"JavelinThrow": 44,
|
626 |
+
"JugglingBalls": 45,
|
627 |
+
"JumpingJack": 46,
|
628 |
+
"JumpRope": 47,
|
629 |
+
"Kayaking": 48,
|
630 |
+
"Knitting": 49,
|
631 |
+
"LongJump": 50,
|
632 |
+
"Lunges": 51,
|
633 |
+
"MilitaryParade": 52,
|
634 |
+
"Mixing": 53,
|
635 |
+
"MoppingFloor": 54,
|
636 |
+
"Nunchucks": 55,
|
637 |
+
"ParallelBars": 56,
|
638 |
+
"PizzaTossing": 57,
|
639 |
+
"PlayingCello": 58,
|
640 |
+
"PlayingDaf": 59,
|
641 |
+
"PlayingDhol": 60,
|
642 |
+
"PlayingFlute": 61,
|
643 |
+
"PlayingGuitar": 62,
|
644 |
+
"PlayingPiano": 63,
|
645 |
+
"PlayingSitar": 64,
|
646 |
+
"PlayingTabla": 65,
|
647 |
+
"PlayingViolin": 66,
|
648 |
+
"PoleVault": 67,
|
649 |
+
"PommelHorse": 68,
|
650 |
+
"PullUps": 69,
|
651 |
+
"Punch": 70,
|
652 |
+
"PushUps": 71,
|
653 |
+
"Rafting": 72,
|
654 |
+
"RockClimbingIndoor": 73,
|
655 |
+
"RopeClimbing": 74,
|
656 |
+
"Rowing": 75,
|
657 |
+
"SalsaSpin": 76,
|
658 |
+
"ShavingBeard": 77,
|
659 |
+
"Shotput": 78,
|
660 |
+
"SkateBoarding": 79,
|
661 |
+
"Skiing": 80,
|
662 |
+
"Skijet": 81,
|
663 |
+
"SkyDiving": 82,
|
664 |
+
"SoccerJuggling": 83,
|
665 |
+
"SoccerPenalty": 84,
|
666 |
+
"StillRings": 85,
|
667 |
+
"SumoWrestling": 86,
|
668 |
+
"Surfing": 87,
|
669 |
+
"Swing": 88,
|
670 |
+
"TableTennisShot": 89,
|
671 |
+
"TaiChi": 90,
|
672 |
+
"TennisSwing": 91,
|
673 |
+
"ThrowDiscus": 92,
|
674 |
+
"TrampolineJumping": 93,
|
675 |
+
"Typing": 94,
|
676 |
+
"UnevenBars": 95,
|
677 |
+
"VolleyballSpiking": 96,
|
678 |
+
"WalkingWithDog": 97,
|
679 |
+
"WallPushups": 98,
|
680 |
+
"WritingOnBoard": 99,
|
681 |
+
"YoYo": 100,
|
682 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
opencv-python
|
2 |
+
einops
|
3 |
+
decord
|
utils.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import tensorflow as tf
|
3 |
+
import numpy as np
|
4 |
+
from einops import rearrange
|
5 |
+
from decord import VideoReader
|
6 |
+
|
7 |
+
# Clip geometry shared by the sampling and reconstruction helpers.
num_frames = 16          # frames sampled per clip
input_size = 224         # side length (pixels) frames are resized to
patch_size = (16, 16)    # spatial patch (height, width)

# Per-channel normalisation statistics applied to frames scaled to [0, 1].
# NOTE(review): the same value is used for all three channels; confirm these
# match the statistics the checkpoints were trained with.
IMAGENET_MEAN = np.array([0.45] * 3)
IMAGENET_STD = np.array([0.225] * 3)
|
12 |
+
|
13 |
+
def format_frames(frame, output_size):
    """Resize frames and normalise them with the module mean/std.

    Args:
        frame: Image or batch of images accepted by
            ``tf.image.convert_image_dtype``.
        output_size: Target ``[height, width]`` passed to ``tf.image.resize``.

    Returns:
        The resized frames, divided by 255 and then standardised with
        ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    as_uint8 = tf.image.convert_image_dtype(frame, tf.uint8)
    resized = tf.image.resize(as_uint8, size=output_size)
    unit_range = resized / 255.
    return (unit_range - IMAGENET_MEAN) / IMAGENET_STD
|
20 |
+
|
21 |
+
def read_video(file_path):
    """Open the video at ``file_path`` with decord and return the reader."""
    return VideoReader(file_path)
|
24 |
+
|
25 |
+
def frame_sampling(container, num_frames):
    """Sample ``num_frames`` frames spread over a video and preprocess them.

    The video is divided into ``num_frames`` equally sized segments and one
    frame is drawn at a random offset inside each segment. When the video has
    fewer frames than requested, evenly spaced indices are used instead
    (repeating frames as needed) so that short clips no longer crash.

    Args:
        container: Opened video supporting ``len()`` and
            ``get_batch(indices).asnumpy()``.
        num_frames: Number of frames to sample.

    Returns:
        The sampled frames after ``format_frames`` has resized them to
        ``input_size`` x ``input_size`` and normalised them.

    Raises:
        ValueError: If the video contains no frames.
    """
    total_frames = len(container)
    if total_frames == 0:
        raise ValueError("Cannot sample frames from an empty video.")

    interval = total_frames // num_frames
    if interval > 0:
        segment_starts = np.arange(num_frames) * interval
        offset = np.random.randint(interval, size=segment_starts.shape)
        frame_index = segment_starts + offset
    else:
        # Fewer frames than requested: np.random.randint(0, ...) would raise,
        # so spread the indices evenly over the clip and repeat frames.
        frame_index = (
            np.linspace(0, total_frames - 1, num_frames).round().astype(int)
        )

    # asnumpy() already yields a single stacked array; no extra np.stack needed.
    frames = container.get_batch(frame_index).asnumpy()
    return format_frames(frames, [input_size] * 2)
|
34 |
+
|
35 |
+
def denormalize(image):
    """Undo the mean/std normalisation and return a ``uint8`` image array.

    Args:
        image: A NumPy array, or any object exposing ``.numpy()``, holding
            values normalised with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Returns:
        NumPy ``uint8`` array with values scaled by 255 and clipped to
        ``[0, 255]``.
    """
    if isinstance(image, np.ndarray):
        pixels = image
    else:
        pixels = image.numpy()
    restored = pixels * IMAGENET_STD + IMAGENET_MEAN
    return (restored * 255).clip(0, 255).astype('uint8')
|
40 |
+
|
41 |
+
def reconstrunction(input_frame, bool_mask, pretrained_pred):
    """Build the reconstructed clip and the visibility mask for visualisation.

    The input clip is cut into tubelet patches, each patch is normalised with
    its own mean and standard deviation, the masked patches are replaced by
    the model prediction, and the result is de-normalised with the same
    per-patch statistics and folded back into video layout.

    Args:
        input_frame: Tensor exposing ``.numpy()`` with layout
            ``(batch, time, height, width, channels)``; ``time`` must be
            divisible by 2 and the spatial dimensions by ``patch_size``.
        bool_mask: Index selecting the masked patches of the
            ``(batch, num_patches, patch_dim)`` patch array.
            NOTE(review): presumably a boolean ``(batch, num_patches)`` mask;
            confirm against the caller.
        pretrained_pred: Predicted values assigned to the masked patches.

    Returns:
        Tuple ``(rec_img, mask)`` for the first batch element, both laid out
        as ``(time, height, width, channels)``. ``mask`` is 1 for visible
        patches and 0 for masked ones.
    """
    tubelet = 2  # temporal length of one patch
    patch_h, patch_w = patch_size
    video = input_frame.numpy()

    # Derive the patch grid from the input instead of hard-coding 14 x 14.
    grid_h = video.shape[2] // patch_h
    grid_w = video.shape[3] // patch_w
    to_video = 'b (t h w) (p0 p1 p2) c -> b (t p0) (h p1) (w p2) c'

    img_squeeze = rearrange(
        video,
        'b (t p0) (h p1) (w p2) c -> b (t h w) (p0 p1 p2) c',
        p0=tubelet, p1=patch_h, p2=patch_w
    )

    # Per-patch statistics, computed once and reused for de-normalisation so
    # that visible patches are restored exactly.
    img_mean = np.mean(img_squeeze, axis=-2, keepdims=True)
    img_std = np.sqrt(
        np.var(img_squeeze, axis=-2, ddof=1, keepdims=True)
    ) + 1e-6
    img_norm = (img_squeeze - img_mean) / img_std
    img_patch = rearrange(img_norm, 'b n p c -> b n (p c)')
    img_patch[bool_mask] = pretrained_pred

    # make mask
    mask = np.ones_like(img_patch)
    mask[bool_mask] = 0
    mask = rearrange(mask, 'b n (p c) -> b n p c', c=3)
    mask = rearrange(
        mask, to_video,
        p0=tubelet, p1=patch_h, p2=patch_w, h=grid_h, w=grid_w
    )

    # To visualise the reconstruction, add back the original mean and
    # standard deviation of each patch to the (partly predicted) patches.
    rec_img = rearrange(img_patch, 'b n (p c) -> b n p c', c=3)
    rec_img = rec_img * img_std + img_mean
    rec_img = rearrange(
        rec_img, to_video,
        p0=tubelet, p1=patch_h, p2=patch_w, h=grid_h, w=grid_w
    )

    return (
        rec_img[0],
        mask[0]
    )
|
82 |
+
|
83 |
+
|
84 |
+
class TubeMaskingGenerator:
    """Generate a random patch mask that is repeated for every frame.

    One random spatial mask is drawn per call and tiled along the temporal
    axis, so the same patch positions are masked in every frame.

    Args:
        input_size: ``(frames, height, width)`` of the patch grid.
        mask_ratio: Fraction of patches per frame to mask, in ``[0, 1]``.

    Raises:
        ValueError: If ``mask_ratio`` is outside ``[0, 1]``.
    """

    def __init__(self, input_size, mask_ratio):
        # Fail early with a clear message; an out-of-range ratio would
        # otherwise only surface as a negative-dimension error in __call__.
        if not 0 <= mask_ratio <= 1:
            raise ValueError(
                f"mask_ratio must be in [0, 1], got {mask_ratio!r}"
            )
        self.frames, self.height, self.width = input_size
        self.num_patches_per_frame = self.height * self.width
        self.total_patches = self.frames * self.num_patches_per_frame
        self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame)
        self.total_masks = self.frames * self.num_masks_per_frame

    def __repr__(self):
        return (
            f"Mask: total patches {self.total_patches}, "
            f"mask patches {self.total_masks}"
        )

    def __call__(self):
        """Return a flat array of length ``total_patches`` (1 = masked)."""
        mask_per_frame = np.hstack([
            np.zeros(self.num_patches_per_frame - self.num_masks_per_frame),
            np.ones(self.num_masks_per_frame),
        ])
        np.random.shuffle(mask_per_frame)
        # Repeat the same spatial mask for every frame.
        return np.tile(mask_per_frame, (self.frames, 1)).flatten()
|