deanna-emery's picture
updates
93528c6
raw
history blame
14.5 kB
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Video classification configuration definition."""
import dataclasses
from typing import Optional, Tuple, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import backbones_3d
from official.vision.configs import common
@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
"""The base configuration for building datasets."""
name: Optional[str] = None
file_type: Optional[str] = 'tfrecord'
compressed_input: bool = False
split: str = 'train'
variant_name: Optional[str] = None
feature_shape: Tuple[int, ...] = (64, 224, 224, 3)
temporal_stride: int = 1
random_stride_range: int = 0
num_test_clips: int = 1
num_test_crops: int = 1
num_classes: int = -1
num_examples: int = -1
global_batch_size: int = 128
data_format: str = 'channels_last'
dtype: str = 'float32'
label_dtype: str = 'int32'
one_hot: bool = True
shuffle_buffer_size: int = 64
cache: bool = False
input_path: Union[str, cfg.base_config.Config] = ''
is_training: bool = True
cycle_length: int = 10
drop_remainder: bool = True
min_image_size: int = 256
zero_centering_image: bool = False
is_multilabel: bool = False
output_audio: bool = False
audio_feature: str = ''
audio_feature_shape: Tuple[int, ...] = (-1,)
aug_min_aspect_ratio: float = 0.5
aug_max_aspect_ratio: float = 2.0
aug_min_area_ratio: float = 0.49
aug_max_area_ratio: float = 1.0
aug_type: Optional[
common.Augmentation] = None # AutoAugment and RandAugment.
mixup_and_cutmix: Optional[common.MixupAndCutmix] = None
image_field_key: str = 'image/encoded'
label_field_key: str = 'clip/label/index'
input_image_format: str = 'jpeg'
def kinetics400(is_training):
"""Generated Kinetics 400 dataset configs."""
return DataConfig(
name='kinetics400',
num_classes=400,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=215570 if is_training else 17706,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics600(is_training):
"""Generated Kinetics 600 dataset configs."""
return DataConfig(
name='kinetics600',
num_classes=600,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=366016 if is_training else 27780,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700(is_training):
"""Generated Kinetics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=522883 if is_training else 33441,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
def kinetics700_2020(is_training):
"""Generated Kinetics 600 dataset configs."""
return DataConfig(
name='kinetics700',
num_classes=700,
is_training=is_training,
split='train' if is_training else 'valid',
drop_remainder=is_training,
num_examples=535982 if is_training else 33640,
feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3))
@dataclasses.dataclass
class VideoClassificationModel(hyperparams.Config):
"""The model config."""
model_type: str = 'video_classification'
backbone: backbones_3d.Backbone3D = dataclasses.field(
default_factory=lambda: backbones_3d.Backbone3D( # pylint: disable=g-long-lambda
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()
)
)
norm_activation: common.NormActivation = dataclasses.field(
default_factory=lambda: common.NormActivation(use_sync_bn=False)
)
dropout_rate: float = 0.2
aggregate_endpoints: bool = False
require_endpoints: Optional[Tuple[str, ...]] = None
@dataclasses.dataclass
class Losses(hyperparams.Config):
one_hot: bool = True
label_smoothing: float = 0.0
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Metrics(hyperparams.Config):
use_per_class_recall: bool = False
@dataclasses.dataclass
class VideoClassificationTask(cfg.TaskConfig):
"""The task config."""
model: VideoClassificationModel = dataclasses.field(
default_factory=VideoClassificationModel
)
train_data: DataConfig = dataclasses.field(
default_factory=lambda: DataConfig(is_training=True, drop_remainder=True)
)
validation_data: DataConfig = dataclasses.field(
default_factory=lambda: DataConfig( # pylint: disable=g-long-lambda
is_training=False, drop_remainder=False
)
)
losses: Losses = dataclasses.field(default_factory=Losses)
metrics: Metrics = dataclasses.field(default_factory=Metrics)
init_checkpoint: Optional[str] = None
init_checkpoint_modules: str = 'all' # all or backbone
freeze_backbone: bool = False
# Spatial Partitioning fields.
train_input_partition_dims: Optional[Tuple[int, ...]] = None
eval_input_partition_dims: Optional[Tuple[int, ...]] = None
def add_trainer(experiment: cfg.ExperimentConfig,
train_batch_size: int,
eval_batch_size: int,
learning_rate: float = 1.6,
train_epochs: int = 44,
warmup_epochs: int = 5):
"""Add and config a trainer to the experiment config."""
if experiment.task.train_data.num_examples <= 0:
raise ValueError('Wrong train dataset size {!r}'.format(
experiment.task.train_data))
if experiment.task.validation_data.num_examples <= 0:
raise ValueError('Wrong validation dataset size {!r}'.format(
experiment.task.validation_data))
experiment.task.train_data.global_batch_size = train_batch_size
experiment.task.validation_data.global_batch_size = eval_batch_size
steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size
experiment.trainer = cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=train_epochs * steps_per_epoch,
validation_steps=experiment.task.validation_data.num_examples //
eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9,
'nesterov': True,
}
},
'learning_rate': {
'type': 'cosine',
'cosine': {
'initial_learning_rate': learning_rate,
'decay_steps': train_epochs * steps_per_epoch,
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': warmup_epochs * steps_per_epoch,
'warmup_learning_rate': 0
}
}
}))
return experiment
@exp_factory.register_config_factory('video_classification')
def video_classification() -> cfg.ExperimentConfig:
"""Video classification general."""
return cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=VideoClassificationTask(),
trainer=cfg.TrainerConfig(),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
@exp_factory.register_config_factory('video_classification_ucf101')
def video_classification_ucf101() -> cfg.ExperimentConfig:
"""Video classification on UCF-101 with resnet."""
train_dataset = DataConfig(
name='ucf101',
num_classes=101,
is_training=True,
split='train',
drop_remainder=True,
num_examples=9537,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
train_dataset.tfds_name = 'ucf101'
train_dataset.tfds_split = 'train'
validation_dataset = DataConfig(
name='ucf101',
num_classes=101,
is_training=True,
split='test',
drop_remainder=False,
num_examples=3783,
temporal_stride=2,
feature_shape=(32, 224, 224, 3))
validation_dataset.tfds_name = 'ucf101'
validation_dataset.tfds_split = 'test'
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(
config,
train_batch_size=64,
eval_batch_size=16,
learning_rate=0.8,
train_epochs=100)
return config
@exp_factory.register_config_factory('video_classification_kinetics400')
def video_classification_kinetics400() -> cfg.ExperimentConfig:
"""Video classification on Kinetics 400 with resnet."""
train_dataset = kinetics400(is_training=True)
validation_dataset = kinetics400(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics600')
def video_classification_kinetics600() -> cfg.ExperimentConfig:
"""Video classification on Kinetics 600 with resnet."""
train_dataset = kinetics600(is_training=True)
validation_dataset = kinetics600(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700')
def video_classification_kinetics700() -> cfg.ExperimentConfig:
"""Video classification on Kinetics 700 with resnet."""
train_dataset = kinetics700(is_training=True)
validation_dataset = kinetics700(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config
@exp_factory.register_config_factory('video_classification_kinetics700_2020')
def video_classification_kinetics700_2020() -> cfg.ExperimentConfig:
"""Video classification on Kinetics 700 2020 with resnet."""
train_dataset = kinetics700_2020(is_training=True)
validation_dataset = kinetics700_2020(is_training=False)
task = VideoClassificationTask(
model=VideoClassificationModel(
backbone=backbones_3d.Backbone3D(
type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()),
norm_activation=common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
losses=Losses(l2_weight_decay=1e-4),
train_data=train_dataset,
validation_data=validation_dataset)
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=task,
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.train_data.num_classes == task.validation_data.num_classes',
])
add_trainer(config, train_batch_size=1024, eval_batch_size=64)
return config