VideoModelStudio / training /prepare_dataset.sh
jbilcke-hf's picture
jbilcke-hf HF staff
initial commit log 🪵🦫
91fb4ef
raw
history blame
1.49 kB
#!/bin/bash
# ----- Model and launcher settings -----
MODEL_ID='THUDM/CogVideoX-2b'
NUM_GPUS='8'

# ----- Dataset locations -----
# For more details on the expected data format, please refer to the README.
# DATA_ROOT must be the path to the base directory where your videos are located.
DATA_ROOT='/path/to/my/datasets/video-dataset'
CAPTION_COLUMN='prompt.txt'
VIDEO_COLUMN='videos.txt'
OUTPUT_DIR='/path/to/my/datasets/preprocessed-dataset'

# ----- Resolution / frame buckets -----
# Each bucket setting is a space-separated list of values.
HEIGHT_BUCKETS='480 720'
WIDTH_BUCKETS='720 960'
FRAME_BUCKETS='49'
MAX_NUM_FRAMES='49'

# ----- Preprocessing parameters -----
MAX_SEQUENCE_LENGTH='226'
TARGET_FPS='8'
BATCH_SIZE='1'
DTYPE='fp32'
# Build and run the dataset preparation command.
#
# The command is assembled as a Bash array and executed directly instead of
# being stored in a string and passed to `eval`. Each array element reaches
# torchrun as exactly one argument, so paths containing spaces or shell
# metacharacters are passed through verbatim rather than being re-split or
# re-interpreted by the shell.

# Refuse to run with an empty setting: an empty value would otherwise make the
# following flag be consumed as the value of the previous one.
for required_var in NUM_GPUS MODEL_ID DATA_ROOT CAPTION_COLUMN VIDEO_COLUMN \
  OUTPUT_DIR HEIGHT_BUCKETS WIDTH_BUCKETS FRAME_BUCKETS MAX_NUM_FRAMES \
  MAX_SEQUENCE_LENGTH TARGET_FPS BATCH_SIZE DTYPE; do
  if [[ -z "${!required_var:-}" ]]; then
    printf 'error: %s is empty or unset\n' "$required_var" >&2
    exit 1
  fi
done

# The bucket settings are space-separated lists; split each one on whitespace
# so every value becomes its own argument (e.g. "480 720" -> 480 720).
read -r -a height_buckets <<< "$HEIGHT_BUCKETS"
read -r -a width_buckets <<< "$WIDTH_BUCKETS"
read -r -a frame_buckets <<< "$FRAME_BUCKETS"

# To create a folder-style dataset structure without pre-encoding videos and captions
# For Image-to-Video finetuning, make sure to pass `--save_image_latents`
CMD_WITHOUT_PRE_ENCODING=(
  torchrun --nproc_per_node="$NUM_GPUS"
  training/prepare_dataset.py
  --model_id "$MODEL_ID"
  --data_root "$DATA_ROOT"
  --caption_column "$CAPTION_COLUMN"
  --video_column "$VIDEO_COLUMN"
  --output_dir "$OUTPUT_DIR"
  --height_buckets "${height_buckets[@]}"
  --width_buckets "${width_buckets[@]}"
  --frame_buckets "${frame_buckets[@]}"
  --max_num_frames "$MAX_NUM_FRAMES"
  --max_sequence_length "$MAX_SEQUENCE_LENGTH"
  --target_fps "$TARGET_FPS"
  --batch_size "$BATCH_SIZE"
  --dtype "$DTYPE"
)

# Same command with --save_latents_and_embeddings appended.
CMD_WITH_PRE_ENCODING=("${CMD_WITHOUT_PRE_ENCODING[@]}" --save_latents_and_embeddings)

# Select which you'd like to run
CMD=("${CMD_WITH_PRE_ENCODING[@]}")

# Log the command in shell-quoted form so it can be copy-pasted and re-run.
printf -v cmd_display '%q ' "${CMD[@]}"
printf '===== Running `%s` =====\n' "${cmd_display% }"

# Run the command and propagate its exit status, so a failed run is not
# reported as finished and callers can detect the failure.
status=0
"${CMD[@]}" || status=$?
if (( status != 0 )); then
  printf '===== Command failed with exit status %d =====\n' "$status" >&2
  exit "$status"
fi

printf '===== Finished running script =====\n'