# Task configuration for the CrossTask dataset.
#
# Every processor/aligner class named below is a CrossTask* class, and the
# model class is MMFusionActionLocalization trained with a BCE loss.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML (": " inside a plain scalar, and the first " #"
# commented out everything after it). The nesting below was reconstructed
# from the value-less section keys; verify it against the consumer's schema.

# Base configuration this file builds on.
# Presumably merged in by the config loader before the keys below are
# applied - TODO confirm the merge/override order.
includes: projects/task/ft.yaml

dataset:
  meta_processor: CrossTaskMetaProcessor
  # `train_path` / `val_path` are marked "dummy" by the original author and
  # carry the same values as `train_csv_path` / `val_csv_path`.
  train_path: data/crosstask/crosstask_release/videos.csv  # dummy
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv  # dummy
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  video_processor: CrossTaskVideoProcessor
  text_processor: CrossTaskTextProcessor
  aligner: CrossTaskAligner
  num_iso_layer: 12
  # NOTE(review): units and exact semantics of the two window settings are
  # not visible from this file - confirm against the processor/aligner code.
  sliding_window: 16
  sliding_window_size: 32

model:
  model_cls: MMFusionActionLocalization
  mm_encoder_cls: MMBertForJoint

loss:
  loss_cls: BCE

# Options grouped under `fairseq`; presumably forwarded to the fairseq
# trainer - TODO confirm.
fairseq:
  dataset:
    batch_size: 1
  optimization:
    max_epoch: 5
  checkpoint:
    save_dir: runs/task/crosstask
    restore_file: runs/task/checkpoint11.pt  # for VLM