Spaces:

innat
/

VideoMAE

Running

App Files Files Community

innat committed on Oct 9, 2023

Commit

3ee2d3e

1 Parent(s): 0b2cebb

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -53

app.py CHANGED Viewed

@@ -11,23 +11,7 @@ from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_siz
 from labels import K400_label_map, SSv2_label_map, UCF_label_map
-MODELS = {
-    'K400': [
-        './TFVideoMAE_S_K400_16x224_FT',
-        './TFVideoMAE_S_K400_16x224_PT'
-        ],
-    'SSv2': [
-        './TFVideoMAE_S_K400_16x224_FT',
-        './TFVideoMAE_S_K400_16x224_PT'
-        ],
-    'UCF' : [
-        'innat/videomae/TFVideoMAE_S_K400_16x224_FT',
-        './TFVideoMAE_S_K400_16x224_PT'
-        ]
-}
-def tube_mask_generator():
     window_size = (
         num_frames // 2,
         input_size // patch_size[0],
@@ -35,7 +19,7 @@ def tube_mask_generator():
     )
     tube_mask = TubeMaskingGenerator(
         input_size=window_size,
-        mask_ratio=0.70
     )
     make_bool = tube_mask()
     bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
@@ -44,28 +28,17 @@ def tube_mask_generator():
     return bool_masked_pos_tf
-def video_to_gif(video_array, gif_filename):
-    imageio.mimsave(
-        gif_filename, video_array, duration=100
-    )
 def get_model(data_type):
-    print()
-    print('-------------------- ', data_type)
-    print()
-    data_type ='K400'
     ft_model = keras.models.load_model(MODELS[data_type][0])
     pt_model = keras.models.load_model(MODELS[data_type][1])
     label_map = {v: k for k, v in K400_label_map.items()}
     return ft_model, pt_model, label_map
-def inference(video_file, dataset_type):
     container = read_video(video_file)
     frames = frame_sampling(container, num_frames=num_frames)
-    bool_masked_pos_tf = tube_mask_generator()
     ft_model, pt_model, label_map = get_model(dataset_type)
     ft_model.trainable = False
     pt_model.trainable = False
@@ -97,25 +70,57 @@ def inference(video_file, dataset_type):
     return confidences, combined_gif
-gr.Interface(
-    fn=inference,
-    inputs=[
-        gr.Video(type="file"),
-        gr.Radio(
-            ['K400', 'SSv2', 'UCF'],
-            type='value',
-            default='K400',
-            label='Dataset',
-        ),
-    ],
-    outputs=[
-        gr.Label(num_top_classes=3, label='confidence scores'),
-        gr.Image(type="filepath", label='reconstructed masked autoencoder')
-    ],
-    examples=[
-        ["examples/k400.mp4"],
-        ["examples/k400.mp4"],
-        ["examples/k400.mp4"],
-    ],
-    title="VideoMAE",
-).launch()

 from labels import K400_label_map, SSv2_label_map, UCF_label_map
+def tube_mask_generator(mask_ratio):
     window_size = (
         num_frames // 2,
         input_size // patch_size[0],
     )
     tube_mask = TubeMaskingGenerator(
         input_size=window_size,
+        mask_ratio=mask_ratio
     )
     make_bool = tube_mask()
     bool_masked_pos_tf = tf.constant(make_bool, dtype=tf.int32)
     return bool_masked_pos_tf
 def get_model(data_type):
     """Load the fine-tuned and pre-trained models registered for ``data_type``.

     ``MODELS[data_type]`` supplies two paths: index 0 is loaded as the
     fine-tuned model and index 1 as the pre-trained model. The returned
     label map is ``K400_label_map`` with keys and values swapped; it is
     built from the K400 map regardless of ``data_type``.

     Returns:
         A ``(ft_model, pt_model, label_map)`` tuple.
     """
     checkpoints = MODELS[data_type]
     finetuned = keras.models.load_model(checkpoints[0])
     pretrained = keras.models.load_model(checkpoints[1])
     inverted_labels = dict(
         (value, key) for key, value in K400_label_map.items()
     )
     return finetuned, pretrained, inverted_labels
+def inference(video_file, dataset_type, mask_ratio):
     container = read_video(video_file)
     frames = frame_sampling(container, num_frames=num_frames)
+    bool_masked_pos_tf = tube_mask_generator(mask_ratio)
     ft_model, pt_model, label_map = get_model(dataset_type)
     ft_model.trainable = False
     pt_model.trainable = False
     return confidences, combined_gif
+def main():
+    MODELS = {
+        'K400': [
+            './TFVideoMAE_S_K400_16x224_FT',
+            './TFVideoMAE_S_K400_16x224_PT'
+            ],
+        'SSv2': [
+            './TFVideoMAE_S_K400_16x224_FT',
+            './TFVideoMAE_S_K400_16x224_PT'
+            ],
+        'UCF' : [
+            'innat/videomae/TFVideoMAE_S_K400_16x224_FT',
+            './TFVideoMAE_S_K400_16x224_PT'
+            ]
+    }
+    BENCHMARK_DATASETS = ['K400', 'SSv2', 'UCF']
+    SAMPLE_EXAMPLES = [
+        ["examples/k400.mp4", 'Kintetics-400'],
+        ["examples/k400.mp4", 'SSv2'],
+        ["examples/k400.mp4", 'UCF']
+    ]
+    iface = gr.Interface(
+        fn=inference,
+        inputs=[
+            gr.Video(type="file", label="Input Video"),
+            gr.Radio(
+                BENCHMARK_DATASETS,
+                type='value',
+                default=BENCHMARK_DATASETS[0],
+                label='Dataset',
+            ),
+            gr.inputs.Slider(
+                0.5,
+                1.0,
+                step=0.1,
+                default=0.7,
+                label='Mask Ratio'
+            )
+        ],
+        outputs=[
+            gr.Label(num_top_classes=3, label='scores'),
+            gr.Image(type="filepath", label='reconstructed')
+        ],
+        examples=SAMPLE_EXAMPLES,
+        title="VideoMAE",
+        description="Keras reimplementation of <a href='https://github.com/innat/VideoMAE'>VideoMAE</a> is presented here."
+    )
+    iface.launch()
+if __name__ == '__main__':
+    main()