# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Video classification configuration definition.""" | |
import dataclasses | |
from typing import Optional, Tuple, Union | |
from official.core import config_definitions as cfg | |
from official.core import exp_factory | |
from official.modeling import hyperparams | |
from official.modeling import optimization | |
from official.vision.configs import backbones_3d | |
from official.vision.configs import common | |
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """The base configuration for building video classification datasets.

  Restored the `@dataclasses.dataclass` decorator: without it the annotated
  class attributes are plain class-level defaults rather than dataclass
  fields, which breaks per-instance overrides via the config framework.
  """
  name: Optional[str] = None
  file_type: Optional[str] = 'tfrecord'
  compressed_input: bool = False
  split: str = 'train'
  variant_name: Optional[str] = None
  # Clip shape as (num_frames, height, width, channels).
  feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
  temporal_stride: int = 1
  random_stride_range: int = 0
  num_test_clips: int = 1
  num_test_crops: int = 1
  # -1 means "not set"; dataset factories below fill these in.
  num_classes: int = -1
  num_examples: int = -1
  global_batch_size: int = 128
  data_format: str = 'channels_last'
  dtype: str = 'float32'
  label_dtype: str = 'int32'
  one_hot: bool = True
  shuffle_buffer_size: int = 64
  cache: bool = False
  input_path: Union[str, cfg.base_config.Config] = ''
  is_training: bool = True
  cycle_length: int = 10
  drop_remainder: bool = True
  min_image_size: int = 256
  zero_centering_image: bool = False
  is_multilabel: bool = False
  output_audio: bool = False
  audio_feature: str = ''
  # (-1,) keeps the audio feature length unconstrained.
  audio_feature_shape: Tuple[int, ...] = (-1,)
  # Random resized-crop augmentation bounds.
  aug_min_aspect_ratio: float = 0.5
  aug_max_aspect_ratio: float = 2.0
  aug_min_area_ratio: float = 0.49
  aug_max_area_ratio: float = 1.0
  aug_type: Optional[
      common.Augmentation] = None  # AutoAugment and RandAugment.
  mixup_and_cutmix: Optional[common.MixupAndCutmix] = None
  image_field_key: str = 'image/encoded'
  label_field_key: str = 'clip/label/index'
  input_image_format: str = 'jpeg'
def kinetics400(is_training):
  """Generated Kinetics 400 dataset configs."""
  # Train reads 64-frame clips; eval reads longer 250-frame clips.
  if is_training:
    num_examples, num_frames = 215570, 64
  else:
    num_examples, num_frames = 17706, 250
  return DataConfig(
      name='kinetics400',
      num_classes=400,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=(num_frames, 224, 224, 3))
def kinetics600(is_training):
  """Generated Kinetics 600 dataset configs."""
  # Train reads 64-frame clips; eval reads longer 250-frame clips.
  if is_training:
    num_examples, num_frames = 366016, 64
  else:
    num_examples, num_frames = 27780, 250
  return DataConfig(
      name='kinetics600',
      num_classes=600,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=num_examples,
      feature_shape=(num_frames, 224, 224, 3))
def kinetics700(is_training):
  """Generated Kinetics 700 dataset configs."""
  # Fixed copy-pasted docstring that said "Kinetics 600".
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=522883 if is_training else 33441,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
  """Generated Kinetics 700 (2020 release) dataset configs."""
  # Fixed copy-pasted docstring that said "Kinetics 600".
  # NOTE(review): `name` is 'kinetics700', same as the 2019 variant — if the
  # name must distinguish the releases, confirm against the data loaders.
  return DataConfig(
      name='kinetics700',
      num_classes=700,
      is_training=is_training,
      split='train' if is_training else 'valid',
      drop_remainder=is_training,
      num_examples=535982 if is_training else 33640,
      feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
  """The video classification model config.

  Restored the `@dataclasses.dataclass` decorator: the class uses
  `dataclasses.field(default_factory=...)`, which is inert (the attribute
  stays a `Field` object) unless the class is an actual dataclass.
  """
  model_type: str = 'video_classification'
  backbone: backbones_3d.Backbone3D = dataclasses.field(
      default_factory=lambda: backbones_3d.Backbone3D(  # pylint: disable=g-long-lambda
          type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()
      )
  )
  norm_activation: common.NormActivation = dataclasses.field(
      default_factory=lambda: common.NormActivation(use_sync_bn=False)
  )
  dropout_rate: float = 0.2
  aggregate_endpoints: bool = False
  require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss config: restored the missing `@dataclasses.dataclass` decorator."""
  one_hot: bool = True
  label_smoothing: float = 0.0
  l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
  """Metric config: restored the missing `@dataclasses.dataclass` decorator."""
  use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
  """The video classification task config.

  Restored the `@dataclasses.dataclass` decorator: the class uses
  `dataclasses.field(default_factory=...)`, which only takes effect on a
  real dataclass.
  """
  model: VideoClassificationModel = dataclasses.field(
      default_factory=VideoClassificationModel
  )
  train_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(is_training=True, drop_remainder=True)
  )
  validation_data: DataConfig = dataclasses.field(
      default_factory=lambda: DataConfig(  # pylint: disable=g-long-lambda
          is_training=False, drop_remainder=False
      )
  )
  losses: Losses = dataclasses.field(default_factory=Losses)
  metrics: Metrics = dataclasses.field(default_factory=Metrics)
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: str = 'all'  # all or backbone
  freeze_backbone: bool = False
  # Spatial Partitioning fields.
  train_input_partition_dims: Optional[Tuple[int, ...]] = None
  eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
                train_batch_size: int,
                eval_batch_size: int,
                learning_rate: float = 1.6,
                train_epochs: int = 44,
                warmup_epochs: int = 5):
  """Adds and configures a trainer on the experiment config.

  Installs an SGD + cosine-decay + linear-warmup trainer whose step counts
  are derived from the dataset sizes already set on `experiment.task`.

  Args:
    experiment: The experiment config to mutate.
    train_batch_size: Global training batch size.
    eval_batch_size: Global evaluation batch size.
    learning_rate: Initial learning rate of the cosine schedule.
    train_epochs: Total number of training epochs.
    warmup_epochs: Number of linear warmup epochs.

  Returns:
    The same `experiment`, with `trainer` replaced.

  Raises:
    ValueError: If either dataset has a non-positive `num_examples`.
  """
  task = experiment.task
  if task.train_data.num_examples <= 0:
    raise ValueError('Wrong train dataset size {!r}'.format(task.train_data))
  if task.validation_data.num_examples <= 0:
    raise ValueError(
        'Wrong validation dataset size {!r}'.format(task.validation_data))

  task.train_data.global_batch_size = train_batch_size
  task.validation_data.global_batch_size = eval_batch_size

  steps_per_epoch = task.train_data.num_examples // train_batch_size
  total_steps = train_epochs * steps_per_epoch
  optimizer_config = optimization.OptimizationConfig({
      'optimizer': {
          'type': 'sgd',
          'sgd': {
              'momentum': 0.9,
              'nesterov': True,
          }
      },
      'learning_rate': {
          'type': 'cosine',
          'cosine': {
              'initial_learning_rate': learning_rate,
              'decay_steps': total_steps,
          }
      },
      'warmup': {
          'type': 'linear',
          'linear': {
              'warmup_steps': warmup_epochs * steps_per_epoch,
              'warmup_learning_rate': 0
          }
      }
  })
  experiment.trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=total_steps,
      validation_steps=task.validation_data.num_examples // eval_batch_size,
      validation_interval=steps_per_epoch,
      optimizer_config=optimizer_config)
  return experiment
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
  """Video classification general.

  Restored the `exp_factory` registration decorator — `exp_factory` is
  imported by this module but was otherwise unused, so the experiment was
  never discoverable by name.
  """
  return cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=VideoClassificationTask(),
      trainer=cfg.TrainerConfig(),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
  """Video classification on UCF-101 with resnet.

  Restored the `exp_factory` registration decorator (see
  `video_classification`).
  """
  train_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='train',
      drop_remainder=True,
      num_examples=9537,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  train_dataset.tfds_name = 'ucf101'
  train_dataset.tfds_split = 'train'
  # NOTE(review): `is_training=True` on the *validation* split looks
  # suspicious — confirm whether it is intentional before changing it.
  validation_dataset = DataConfig(
      name='ucf101',
      num_classes=101,
      is_training=True,
      split='test',
      drop_remainder=False,
      num_examples=3783,
      temporal_stride=2,
      feature_shape=(32, 224, 224, 3))
  validation_dataset.tfds_name = 'ucf101'
  validation_dataset.tfds_split = 'test'
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(
      config,
      train_batch_size=64,
      eval_batch_size=16,
      learning_rate=0.8,
      train_epochs=100)
  return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 400 with resnet.

  Restored the `exp_factory` registration decorator (see
  `video_classification`).
  """
  train_dataset = kinetics400(is_training=True)
  validation_dataset = kinetics400(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 600 with resnet.

  Restored the `exp_factory` registration decorator (see
  `video_classification`).
  """
  train_dataset = kinetics600(is_training=True)
  validation_dataset = kinetics600(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 with resnet.

  Restored the `exp_factory` registration decorator (see
  `video_classification`).
  """
  train_dataset = kinetics700(is_training=True)
  validation_dataset = kinetics700(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
  """Video classification on Kinetics 700 2020 with resnet.

  Restored the `exp_factory` registration decorator (see
  `video_classification`).
  """
  train_dataset = kinetics700_2020(is_training=True)
  validation_dataset = kinetics700_2020(is_training=False)
  task = VideoClassificationTask(
      model=VideoClassificationModel(
          backbone=backbones_3d.Backbone3D(
              type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
          norm_activation=common.NormActivation(
              norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
      losses=Losses(l2_weight_decay=1e-4),
      train_data=train_dataset,
      validation_data=validation_dataset)
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
      task=task,
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None',
          'task.train_data.num_classes == task.validation_data.num_classes',
      ])
  add_trainer(config, train_batch_size=1024, eval_batch_size=64)
  return config