#!/usr/bin/env bash
# Launcher for sdlm.run_pretrain: builds the training command once, then
# either submits it to Beaker via gantry (when BEAKER is set) or runs a
# small local debug configuration.

PYTHON_CMD="
accelerate launch \
    --mixed_precision bf16 -m sdlm.run_pretrain \
    --per_device_train_batch_size 1  \
    --per_device_eval_batch_size 1 \
    --do_train \
    --do_eval \
    --log_level info \
    --evaluation_strategy steps \
    --report_to tensorboard \
    --max_seq_length 512 \
    --simplex_value 5 \
    --num_diffusion_steps 5000  \
    --lr_scheduler_type constant_with_warmup \
    --learning_rate 1e-5 \
    --pad_to_max_length \
    --beta_schedule squaredcos_improved_ddpm \
    --top_p 0.99 \
    --max_steps 35000 \
    --warmup_steps 5000 \
    --logging_steps 50 \
    --save_total_limit 1 \
    --conditional_generation ul2 \
    --self_condition logits_mean \
    --self_condition_mix_before_weights \
    --streaming \
    --bf16 \
    --gradient_checkpointing \
    --use_flash_attention2 \
    --is_causal false \
    --mask_padding_in_loss false \
    --without_compute_metrics true \
    --dataloader_num_workers 8 \
    --remove_unused_columns false \
    --dispatch_batches false \
    --shuffle true \
    --preprocessing_num_workers 16 \
    --line_by_line false \
    --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
    --fsdp auto_wrap \
    --fsdp_transformer_layer_cls_to_wrap MistralDecoderLayer \
"
# Optional flags (uncomment and move into PYTHON_CMD as needed):
#     --min_sample_seq_length 650 \    # filter out samples shorter than this
#     --fsdp auto_wrap \               # FSDP auto-wrapping for Llama models
#     --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
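
# Optional sanity check (a sketch, not part of the launch flow): preview the
# fully expanded training command before running anything, e.g.:
#   echo ${PYTHON_CMD}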


GANTRY_CMD="
gantry run -y -n random_35k_pretrain -t random_35k_pretrain --allow-dirty \
    --workspace ai2/tess2 \
    --gpus 7 \
    --priority normal \
    --budget ai2/allennlp \
    --preemptible \
    --env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python \
    --env-secret HF_TOKEN=HF_TOKEN \
    --beaker-image ai2/pytorch2.0.0-cuda11.8-python3.10 \
    --venv base \
    --pip requirements.txt \
    --no-nfs \
    --cluster ai2/jupiter-cirrascale-2 \
    --weka oe-data-default:/data/input \
    -- ${PYTHON_CMD} \
    --from_scratch True \
    --model_name_or_path mistralai/Mistral-7B-v0.1 \
    --eval_steps 2000 \
    --save_steps 2000 \
    --ignore_data_skip \
    --seed 101 \
    --max_eval_samples 200 \
    --gradient_accumulation_steps 16 \
    --num_inference_diffusion_steps 100 \
    --overwrite_output_dir false \
    --beaker \
    --output_dir /results \
"


if [ -n "${BEAKER}" ]; then
    ${GANTRY_CMD}
else
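    # Local debug run. Note: PYTHON_CMD pins --fsdp_transformer_layer_cls_to_wrap
    # to MistralDecoderLayer; when running the Llama model below, the FSDP flags
    # may need adjusting or dropping.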
    ${PYTHON_CMD} \
        --model_name_or_path meta-llama/Llama-3.1-8B \
        --eval_steps 10 \
        --save_steps 50 \
        --max_eval_samples 16 \
        --gradient_accumulation_steps 1 \
        --num_inference_diffusion_steps 10 \
        --output_dir outputs/test \
        --overwrite_output_dir true
fi
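
# Usage sketch (dispatch is driven by the BEAKER variable checked above; the
# script filename is hypothetical):
#   BEAKER=1 bash scripts/run_pretrain.sh    # submit to Beaker via gantry
#   bash scripts/run_pretrain.sh             # quick local debug run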


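# An earlier single-node variant of the same command (2048-token sequences,
# no FSDP), left commented out below.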
# accelerate launch \
#     --mixed_precision bf16 -m sdlm.run_pretrain \
#     --per_device_train_batch_size 1  \
#     --per_device_eval_batch_size 1 \
#     --do_train \
#     --do_eval \
#     --log_level info \
#     --evaluation_strategy steps \
#     --report_to tensorboard \
#     --max_seq_length 2048 \
#     --simplex_value 5 \
#     --num_diffusion_steps 5000  \
#     --lr_scheduler_type constant_with_warmup \
#     --learning_rate 1e-5 \
#     --pad_to_max_length \
#     --beta_schedule squaredcos_improved_ddpm \
#     --top_p 0.99 \
#     --max_steps 10000000 \
#     --warmup_steps 5000 \
#     --logging_steps 50 \
#     --save_total_limit 1 \
#     --conditional_generation ul2 \
#     --self_condition logits_mean \
#     --self_condition_mix_before_weights \
#     --streaming \
#     --bf16 \
#     --optim adamw_torch_fused \
#     --gradient_checkpointing \
#     --is_causal false \
#     --mask_padding_in_loss false \
#     --ddp_find_unused_parameters false \
#     --without_compute_metrics true \
#     --dataloader_num_workers 0 \
#     --remove_unused_columns false \
#     --dispatch_batches false \
#     --shuffle true \
#     --preprocessing_num_workers 16 \
#     --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
#     --line_by_line false \
#     --model_name_or_path mistralai/Mistral-7B-v0.1 \
#     --eval_steps 2000 \
#     --save_steps 2000 \
#     --max_eval_samples 200 \
#     --gradient_accumulation_steps 16 \
#     --num_inference_diffusion_steps 100 \
#     --overwrite_output_dir false \
#     --output_dir testing_output \
#     --use_fast_tokenizer false