Commit ce770ec by yhavinga
Parent(s): 589236f

Update weights and scripts

flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:036fa7f4aada6641bbb9a798fec098ac3350e03e6d985c9f007ee7d3ddc85438
+oid sha256:4fc61ad7414d32c991fb5e568ee6d64eb92653a6f68e7386a7a1c1fd43973a45
 size 891548548
flax_to_pt.py CHANGED
@@ -1,4 +1,6 @@
-from transformers import T5ForConditionalGeneration
+from transformers import T5ForConditionalGeneration, TFT5ForConditionalGeneration
 
 pt_model = T5ForConditionalGeneration.from_pretrained(".", from_flax=True)
 pt_model.save_pretrained(".")
+tf_model = TFT5ForConditionalGeneration.from_pretrained(".", from_pt=True)
+tf_model.save_pretrained(".")
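
The updated flax_to_pt.py now chains the conversion Flax → PyTorch → TensorFlow. As a sanity check (not part of this commit), one can compare logits from the two exported checkpoints; the tokenizer lookup and the prompt below are illustrative assumptions:

import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFT5ForConditionalGeneration

# Assumes a tokenizer is saved alongside the weights in the repo root.
tokenizer = T5Tokenizer.from_pretrained(".")
pt_model = T5ForConditionalGeneration.from_pretrained(".")
tf_model = TFT5ForConditionalGeneration.from_pretrained(".")

text = "translate English to Dutch: The weather is nice today."
pt_inputs = tokenizer(text, return_tensors="pt")
tf_inputs = tokenizer(text, return_tensors="tf")

with torch.no_grad():
    pt_logits = pt_model(**pt_inputs, decoder_input_ids=pt_inputs.input_ids).logits.numpy()
tf_logits = tf_model(tf_inputs.input_ids, decoder_input_ids=tf_inputs.input_ids).logits.numpy()

# Small numerical drift between frameworks is expected; exact equality is not.
print("max abs logit diff:", np.abs(pt_logits - tf_logits).max())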
opt_state.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f23f514d5431d8d92660e8f7e4db2596d70dab81682f269dc5b49a8e0861d1f1
+oid sha256:3457cc629735027c0d39cfcc9c4978f8180617df4023786ca1c542c79c466335
 size 1985609
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ff6ef4311cd2c471c693622864f85d1b37bae57d9ec06b7773aa27edd4f966d4
+oid sha256:d1b04c56abcc3a5bd4d7e871c7d017f44ab5b75af1c4adcc30c205da5fc5ede1
 size 891650495
run_t5.sh CHANGED
@@ -50,9 +50,9 @@ while true; do
   --per_device_train_batch_size="16" \
   --per_device_eval_batch_size="16" \
   --dtype="bfloat16" \
-  --learning_rate="1e-2" \
+  --learning_rate="1e-3" \
   --overwrite_output_dir \
-  --num_train_epochs="3" \
+  --num_train_epochs="1" \
   --logging_steps="50" \
   --save_steps="500" \
   --eval_steps="5000" \
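
The flag changes drop the peak learning rate from 1e-2 to 1e-3 and cut training to a single epoch. The training script presumably builds its schedule from --learning_rate the way the upstream run_t5_mlm_flax.py example does (linear warmup, then linear decay); a minimal optax sketch of that pattern, with illustrative step counts:

import optax

def create_learning_rate_fn(num_train_steps, num_warmup_steps, learning_rate):
    # Linear warmup from 0 to the peak rate, then linear decay back to 0.
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
    )
    decay_fn = optax.linear_schedule(
        init_value=learning_rate,
        end_value=0.0,
        transition_steps=num_train_steps - num_warmup_steps,
    )
    return optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])

# With the new flag value the peak is 1e-3; the step counts here are made up.
lr_fn = create_learning_rate_fn(num_train_steps=100_000, num_warmup_steps=5_000, learning_rate=1e-3)
print(lr_fn(0), lr_fn(5_000), lr_fn(100_000))  # ~0.0, 1e-3, ~0.0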
run_t5_mlm_flax_custom_dataset.py CHANGED
@@ -583,7 +583,7 @@ if __name__ == "__main__":
 
         return train, val
 
-    train, val = train_val_files()
+    # train, val = train_val_files()
 
     load_grouped = True
 
@@ -649,7 +649,7 @@ if __name__ == "__main__":
         logger.info("Loading tokenized and grouped dataset")
        tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")
         logger.info("Setting max validation examples to 500")
-        tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(500))
+        tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(1000))
     else:
         if training_args.do_train:
             column_names = datasets["train"].column_names
@@ -906,11 +906,16 @@ if __name__ == "__main__":
         for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
             cur_step = epoch * (num_train_samples // train_batch_size) + step
             # skip to the step from which we are resuming
-            # if cur_step < resume_step:
-            #     continue
+            if cur_step < resume_step:
+                continue
 
             samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
-            model_inputs = data_collator(samples)
+            try:
+                model_inputs = data_collator(samples)
+            except ValueError as e:
+                logger.warning(str(e))
+                logger.info("Continuing with the next batch")
+                continue
 
             # Model forward
             model_inputs = shard(model_inputs.data)
@@ -926,7 +931,7 @@ if __name__ == "__main__":
                 write_train_metric(summary_writer, train_metrics, train_time, cur_step)
 
             epochs.write(
-                f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                f"Step... ({cur_step} ({cur_step + resume_step}) | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
             )
 
             train_metrics = []
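
The diff enables the resume path (if cur_step < resume_step: continue) but does not show where resume_step comes from. A plausible sketch, assuming it is read from the training_state.json tracked in this repo (the helper name is hypothetical):

import json
import os

def load_resume_step(output_dir):
    # Hypothetical helper: read the last saved global step so training can
    # fast-forward past already-seen batches on restart.
    state_path = os.path.join(output_dir, "training_state.json")
    if not os.path.exists(state_path):
        return 0
    with open(state_path) as f:
        return json.load(f)["step"]

resume_step = load_resume_step(".")
# Note: the skip loop still iterates over (without training on) every skipped
# batch, so resuming from a late step pays the full data-iteration cost.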
runs/Jul16_09-14-47_t1v-n-0e7426e8-w-0/events.out.tfevents.1626426893.t1v-n-0e7426e8-w-0.21179.3.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6bd1363991ec767bd11c50387d5c37ce29cd84fba0daa9bd9fbbe1bc246a5d6
+size 865193
runs/Jul16_11-53-22_t1v-n-0e7426e8-w-0/events.out.tfevents.1626436407.t1v-n-0e7426e8-w-0.23523.3.v2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6246ff2cb2ae46428e8c6faadbbbedefd5e718271f8db555ce4bc45d1f5a8d0e
+size 40
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6cd74fd58e4f93094e537b8353035a56384c99fd1c576957092e41ece202e471
+oid sha256:7ca091f719f88d0c460cb709fead1521082e46ac9b1d9873a06e65bb0ca2d94c
 size 892067416
training_state.json CHANGED
@@ -1 +1 @@
1
- {"step": 48500}
 
1
+ {"step": 54001}