diff --git a/.gitattributes b/.gitattributes index 5bed51a6cab5da5308c399769937de5e4a4b0c64..b13a8aa90164f3bc55aa2a40cec60b6d5d626d1b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -37,3 +37,13 @@ checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text checkpoint-10000/training_args.bin filter=lfs diff=lfs merge=lfs -text pytorch_model.bin filter=lfs diff=lfs merge=lfs -text **/**/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +*/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base/checkpoint-15000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base/checkpoint-5000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +**/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base-flatten/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base-flatten/checkpoint-15000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base-flatten/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +function-base-flatten/checkpoint-5000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text diff --git a/archived/checkpoint-10000/optimizer.pt b/archived/checkpoint-10000/optimizer.pt deleted file mode 100644 index 1be1f8ab5294ea96e4553d4c7dfaf91ee2a6339c..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:337bb55603ebee89cb75dec21a9c866c67a9172a10eaa13590c6e617c99537c4 -size 2371333 diff --git a/archived/checkpoint-10000/pytorch_model.bin b/archived/checkpoint-10000/pytorch_model.bin deleted file mode 100644 index 759138ebf83561d9d1685bdedbf37bb3bb226911..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/pytorch_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b745769676f63fdcd414ff882cdcee4e322aba6ef69bea206e1d7fb4e5f7e6c -size 990408885 diff --git a/archived/checkpoint-10000/rng_state.pth b/archived/checkpoint-10000/rng_state.pth deleted file mode 100644 index 68d25f39d7441e3f61ec3938f1c07eb9632f46ef..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4dc462a4cd692df29deb3db06d4e4ea5357a3f9583326b1e4246b17e9779a0bd -size 14575 diff --git a/archived/checkpoint-10000/scheduler.pt b/archived/checkpoint-10000/scheduler.pt deleted file mode 100644 index 48e973a1e874eaee28ee630bc9e69a2120cf3e62..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f5e24b8bf255cbefe9d307944a9741807d095b40cc5429a7befe9515b366b0f -size 627 diff --git a/archived/checkpoint-10000/trainer_state.json b/archived/checkpoint-10000/trainer_state.json deleted file mode 100644 index 6ab8ce3ae9458bb9ef9a43127bfb8f14b5fa751f..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/trainer_state.json +++ /dev/null @@ -1,296 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.2616704516780217, - "global_step": 10000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.06, - "learning_rate": 6.25e-05, - "loss": 5.1725, - "step": 500 - }, - { - "epoch": 0.06, - "eval_loss": 0.6939732432365417, - "eval_runtime": 5.8238, - "eval_samples_per_second": 17.171, - "eval_steps_per_second": 8.585, - "step": 500 - }, - { - "epoch": 0.13, - "learning_rate": 9.782608695652174e-05, - "loss": 0.5104, - "step": 1000 - }, - { - "epoch": 0.13, - "eval_loss": 0.641968846321106, - "eval_runtime": 6.0125, - "eval_samples_per_second": 16.632, - "eval_steps_per_second": 8.316, - "step": 1000 - }, - { - "epoch": 0.19, - "learning_rate": 9.239130434782609e-05, - "loss": 0.485, - "step": 1500 - }, - { - "epoch": 0.19, - "eval_loss": 0.6206984519958496, - "eval_runtime": 5.6411, - "eval_samples_per_second": 17.727, - "eval_steps_per_second": 8.864, - "step": 1500 - }, - { - "epoch": 0.25, - "learning_rate": 8.695652173913044e-05, - "loss": 0.4644, - "step": 2000 - }, - { - "epoch": 0.25, - "eval_loss": 0.6094934344291687, - "eval_runtime": 4.7677, - "eval_samples_per_second": 20.974, - "eval_steps_per_second": 10.487, - "step": 2000 - }, - { - "epoch": 0.32, - "learning_rate": 8.152173913043478e-05, - "loss": 0.4563, - "step": 2500 - }, - { - "epoch": 0.32, - "eval_loss": 0.6136000156402588, - "eval_runtime": 5.1919, - "eval_samples_per_second": 19.261, - "eval_steps_per_second": 9.63, - "step": 2500 - }, - { - "epoch": 0.38, - "learning_rate": 7.608695652173914e-05, - "loss": 0.4426, - "step": 3000 - }, - { - "epoch": 0.38, - "eval_loss": 0.6097093224525452, - "eval_runtime": 5.562, - "eval_samples_per_second": 17.979, - "eval_steps_per_second": 8.99, - "step": 3000 - }, - { - "epoch": 0.44, - "learning_rate": 7.065217391304349e-05, - "loss": 0.4401, - "step": 3500 - }, - { - "epoch": 0.44, - "eval_loss": 0.5967560410499573, - "eval_runtime": 5.8426, - "eval_samples_per_second": 17.116, - "eval_steps_per_second": 8.558, - "step": 3500 - }, - { - "epoch": 0.5, - "learning_rate": 6.521739130434783e-05, - "loss": 0.4258, - "step": 4000 - }, - { - "epoch": 0.5, - "eval_loss": 0.6082923412322998, - "eval_runtime": 5.3584, - "eval_samples_per_second": 18.662, - "eval_steps_per_second": 9.331, - "step": 4000 - }, - { - "epoch": 0.57, - "learning_rate": 5.9782608695652175e-05, - "loss": 0.424, - "step": 4500 - }, - { - "epoch": 0.57, - "eval_loss": 0.5975988507270813, - "eval_runtime": 4.7572, - "eval_samples_per_second": 21.021, - "eval_steps_per_second": 10.51, - "step": 4500 - }, - { - "epoch": 0.63, - "learning_rate": 5.4347826086956524e-05, - "loss": 0.4375, - "step": 5000 - }, - { - "epoch": 0.63, - "eval_loss": 0.5958980321884155, - "eval_runtime": 5.8907, - "eval_samples_per_second": 16.976, - "eval_steps_per_second": 8.488, - "step": 5000 - }, - { - "epoch": 0.69, - "learning_rate": 4.891304347826087e-05, - "loss": 0.4341, - "step": 5500 - }, - { - "epoch": 0.69, - "eval_loss": 0.5830276012420654, - "eval_runtime": 5.3687, - "eval_samples_per_second": 18.626, - "eval_steps_per_second": 9.313, - "step": 5500 - }, - { - "epoch": 0.76, - "learning_rate": 4.347826086956522e-05, - "loss": 0.4337, - "step": 6000 - }, - { - "epoch": 0.76, - "eval_loss": 0.5838184356689453, - "eval_runtime": 5.9136, - "eval_samples_per_second": 16.91, - "eval_steps_per_second": 8.455, - "step": 6000 - }, - { - "epoch": 0.82, - "learning_rate": 3.804347826086957e-05, - "loss": 0.4363, - "step": 6500 - }, - { - "epoch": 0.82, - "eval_loss": 0.5774537920951843, - "eval_runtime": 5.28, - "eval_samples_per_second": 18.939, - "eval_steps_per_second": 9.47, - "step": 6500 - }, - { - "epoch": 0.88, - "learning_rate": 3.260869565217392e-05, - "loss": 0.4122, - "step": 7000 - }, - { - "epoch": 0.88, - "eval_loss": 0.5706260800361633, - "eval_runtime": 5.6473, - "eval_samples_per_second": 17.707, - "eval_steps_per_second": 8.854, - "step": 7000 - }, - { - "epoch": 0.95, - "learning_rate": 2.7173913043478262e-05, - "loss": 0.4074, - "step": 7500 - }, - { - "epoch": 0.95, - "eval_loss": 0.5714925527572632, - "eval_runtime": 3.7863, - "eval_samples_per_second": 26.411, - "eval_steps_per_second": 13.206, - "step": 7500 - }, - { - "epoch": 1.01, - "learning_rate": 2.173913043478261e-05, - "loss": 0.4137, - "step": 8000 - }, - { - "epoch": 1.01, - "eval_loss": 0.5754862427711487, - "eval_runtime": 6.0221, - "eval_samples_per_second": 16.605, - "eval_steps_per_second": 8.303, - "step": 8000 - }, - { - "epoch": 1.07, - "learning_rate": 1.630434782608696e-05, - "loss": 0.362, - "step": 8500 - }, - { - "epoch": 1.07, - "eval_loss": 0.5741321444511414, - "eval_runtime": 5.8113, - "eval_samples_per_second": 17.208, - "eval_steps_per_second": 8.604, - "step": 8500 - }, - { - "epoch": 1.14, - "learning_rate": 1.0869565217391305e-05, - "loss": 0.3791, - "step": 9000 - }, - { - "epoch": 1.14, - "eval_loss": 0.570868968963623, - "eval_runtime": 5.5735, - "eval_samples_per_second": 17.942, - "eval_steps_per_second": 8.971, - "step": 9000 - }, - { - "epoch": 1.2, - "learning_rate": 5.4347826086956525e-06, - "loss": 0.3628, - "step": 9500 - }, - { - "epoch": 1.2, - "eval_loss": 0.5734958052635193, - "eval_runtime": 5.3626, - "eval_samples_per_second": 18.648, - "eval_steps_per_second": 9.324, - "step": 9500 - }, - { - "epoch": 1.26, - "learning_rate": 0.0, - "loss": 0.3694, - "step": 10000 - }, - { - "epoch": 1.26, - "eval_loss": 0.573215663433075, - "eval_runtime": 4.6899, - "eval_samples_per_second": 21.322, - "eval_steps_per_second": 10.661, - "step": 10000 - } - ], - "max_steps": 10000, - "num_train_epochs": 2, - "total_flos": 3.8513087293824e+16, - "trial_name": null, - "trial_params": null -} diff --git a/archived/checkpoint-10000/training_args.bin b/archived/checkpoint-10000/training_args.bin deleted file mode 100644 index e7ba3bf101337a3c450c5ad0c1872c59201dbb7d..0000000000000000000000000000000000000000 --- a/archived/checkpoint-10000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92133b734236cd14623f5f32d7dac13646e59db8ee0cae12878e084143f07cb4 -size 4219 diff --git a/archived/checkpoint-10000/config.json b/function-base-flatten/checkpoint-10000/config.json similarity index 95% rename from archived/checkpoint-10000/config.json rename to function-base-flatten/checkpoint-10000/config.json index 80d83d0dd8315482ed1b8b635798a64e41042913..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 100644 --- a/archived/checkpoint-10000/config.json +++ b/function-base-flatten/checkpoint-10000/config.json @@ -3,6 +3,7 @@ "architectures": [ "FiDT5" ], + "classifier_dropout": 0.0, "d_ff": 2048, "d_kv": 64, "d_model": 768, @@ -55,7 +56,7 @@ }, "tie_word_embeddings": false, "torch_dtype": "float32", - "transformers_version": "4.31.0", + "transformers_version": "4.33.1", "use_cache": true, "vocab_size": 32128 } diff --git a/generation_config.json b/function-base-flatten/checkpoint-10000/generation_config.json similarity index 75% rename from generation_config.json rename to function-base-flatten/checkpoint-10000/generation_config.json index 7528dbb1b6ce860d242aff71294a5fef12a41572..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 100644 --- a/generation_config.json +++ b/function-base-flatten/checkpoint-10000/generation_config.json @@ -3,5 +3,5 @@ "decoder_start_token_id": 0, "eos_token_id": 1, "pad_token_id": 0, - "transformers_version": "4.31.0" + "transformers_version": "4.33.1" } diff --git a/function-base-flatten/checkpoint-10000/optimizer.pt b/function-base-flatten/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1eb1a7d636af01bb041531eca32f54f819cf0b1 --- /dev/null +++ b/function-base-flatten/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02862daee9f5279b5eb8cd89e34c2669c673cdf1d59a98c134566a9b2e31be64 +size 2372293 diff --git a/function-base-flatten/checkpoint-10000/pytorch_model.bin b/function-base-flatten/checkpoint-10000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..43cbec7acb57820561f6780ef88a9bde8ad1a57a --- /dev/null +++ b/function-base-flatten/checkpoint-10000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298486418d9e1499d63285dd7ef1ea2dc836d8fcb2372dba48dac0d9bd8acf4e +size 990410745 diff --git a/function-base-flatten/checkpoint-10000/rng_state.pth b/function-base-flatten/checkpoint-10000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aa27370ca2732aa046f77401844f870d25636ea --- /dev/null +++ b/function-base-flatten/checkpoint-10000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffe83d5f6e5972b42f5e84134778b78cd2b53ac4084db95835b17046131594e +size 14575 diff --git a/function-base-flatten/checkpoint-10000/scheduler.pt b/function-base-flatten/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b737addbba331bad572a394dfad27ee2b14c3887 --- /dev/null +++ b/function-base-flatten/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca84735770f2a357642bb07a692869b77f0409ed01b19bf3c890697e28d947e +size 627 diff --git a/function-base-flatten/checkpoint-10000/trainer_state.json b/function-base-flatten/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea27520e287b62cbdbf1e25713c8d5cbfe98104e --- /dev/null +++ b/function-base-flatten/checkpoint-10000/trainer_state.json @@ -0,0 +1,299 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2597631645250693, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 1.0322, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8672707676887512, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9884, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8287830948829651, + "eval_runtime": 15.4828, + "eval_samples_per_second": 64.588, + "eval_steps_per_second": 32.294, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.9418, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8468040823936462, + "eval_runtime": 15.4288, + "eval_samples_per_second": 64.814, + "eval_steps_per_second": 32.407, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.9232, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8540903329849243, + "eval_runtime": 15.6504, + "eval_samples_per_second": 63.896, + "eval_steps_per_second": 31.948, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.9107, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.809073269367218, + "eval_runtime": 15.3867, + "eval_samples_per_second": 64.991, + "eval_steps_per_second": 32.496, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.9233, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8151862621307373, + "eval_runtime": 15.0851, + "eval_samples_per_second": 66.291, + "eval_steps_per_second": 33.145, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.9038, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8232718706130981, + "eval_runtime": 15.5073, + "eval_samples_per_second": 64.486, + "eval_steps_per_second": 32.243, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8747, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7864852547645569, + "eval_runtime": 15.3637, + "eval_samples_per_second": 65.089, + "eval_steps_per_second": 32.544, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8981, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.770944356918335, + "eval_runtime": 15.3602, + "eval_samples_per_second": 65.103, + "eval_steps_per_second": 32.552, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8538, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7905715107917786, + "eval_runtime": 16.0451, + "eval_samples_per_second": 62.324, + "eval_steps_per_second": 31.162, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.8295, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7717331051826477, + "eval_runtime": 16.2335, + "eval_samples_per_second": 61.601, + "eval_steps_per_second": 30.8, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.8346, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7587910890579224, + "eval_runtime": 15.3749, + "eval_samples_per_second": 65.041, + "eval_steps_per_second": 32.521, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8366, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7654258012771606, + "eval_runtime": 15.1755, + "eval_samples_per_second": 65.895, + "eval_steps_per_second": 32.948, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.8017, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.759145975112915, + "eval_runtime": 15.3597, + "eval_samples_per_second": 65.106, + "eval_steps_per_second": 32.553, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.7788, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7703807950019836, + "eval_runtime": 15.9867, + "eval_samples_per_second": 62.552, + "eval_steps_per_second": 31.276, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7764, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7488923668861389, + "eval_runtime": 16.1496, + "eval_samples_per_second": 61.921, + "eval_steps_per_second": 30.961, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5766, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7688985466957092, + "eval_runtime": 15.5729, + "eval_samples_per_second": 64.214, + "eval_steps_per_second": 32.107, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.6024, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.7472162842750549, + "eval_runtime": 15.4638, + "eval_samples_per_second": 64.667, + "eval_steps_per_second": 32.334, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5822, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7330933809280396, + "eval_runtime": 15.5588, + "eval_samples_per_second": 64.272, + "eval_steps_per_second": 32.136, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5812, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7249069809913635, + "eval_runtime": 15.1664, + "eval_samples_per_second": 65.935, + "eval_steps_per_second": 32.968, + "step": 10000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 5.445321533503488e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base-flatten/checkpoint-10000/training_args.bin b/function-base-flatten/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d611223c11c75c5c99b38fe7fbcabc326ee71ca --- /dev/null +++ b/function-base-flatten/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857168942562a5574ce0eadda06d15062fa7c087954ae9df9c3df06ec62f42c4 +size 4283 diff --git a/config.json b/function-base-flatten/checkpoint-15000/config.json similarity index 95% rename from config.json rename to function-base-flatten/checkpoint-15000/config.json index 80d83d0dd8315482ed1b8b635798a64e41042913..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 100644 --- a/config.json +++ b/function-base-flatten/checkpoint-15000/config.json @@ -3,6 +3,7 @@ "architectures": [ "FiDT5" ], + "classifier_dropout": 0.0, "d_ff": 2048, "d_kv": 64, "d_model": 768, @@ -55,7 +56,7 @@ }, "tie_word_embeddings": false, "torch_dtype": "float32", - "transformers_version": "4.31.0", + "transformers_version": "4.33.1", "use_cache": true, "vocab_size": 32128 } diff --git a/archived/checkpoint-10000/generation_config.json b/function-base-flatten/checkpoint-15000/generation_config.json similarity index 75% rename from archived/checkpoint-10000/generation_config.json rename to function-base-flatten/checkpoint-15000/generation_config.json index 7528dbb1b6ce860d242aff71294a5fef12a41572..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 100644 --- a/archived/checkpoint-10000/generation_config.json +++ b/function-base-flatten/checkpoint-15000/generation_config.json @@ -3,5 +3,5 @@ "decoder_start_token_id": 0, "eos_token_id": 1, "pad_token_id": 0, - "transformers_version": "4.31.0" + "transformers_version": "4.33.1" } diff --git a/function-base-flatten/checkpoint-15000/optimizer.pt b/function-base-flatten/checkpoint-15000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b0b304cf832f10d1f152854957c58ccef52c164 --- /dev/null +++ b/function-base-flatten/checkpoint-15000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3fc6096a4bea0a15a0ef50591ecf5d74f46d983ae8dcc8255a582e94c0d786e +size 2372293 diff --git a/function-base-flatten/checkpoint-15000/pytorch_model.bin b/function-base-flatten/checkpoint-15000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b41cf0d99f637b41249c7143ca2fa0b454caaad0 --- /dev/null +++ b/function-base-flatten/checkpoint-15000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc4d7035318a7d9356a8c80980d5e8e98a09127dace85985989b9a5114f139b +size 990410745 diff --git a/function-base-flatten/checkpoint-15000/rng_state.pth b/function-base-flatten/checkpoint-15000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e03ab6dcd93d38d8768b106c71aaf80d27729a80 --- /dev/null +++ b/function-base-flatten/checkpoint-15000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84148658af0ffc7e402e61e276c1e47ca4c371c6495ebac7d6c4bc7bb1680e2c +size 14575 diff --git a/function-base-flatten/checkpoint-15000/scheduler.pt b/function-base-flatten/checkpoint-15000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce7cf3e27bb1aa9687a91c4fe4262f73b459f8b3 --- /dev/null +++ b/function-base-flatten/checkpoint-15000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b793f9ea6eba39185d82063b1e7434411e2aeca1bab5a010024f955d1696b3 +size 627 diff --git a/function-base-flatten/checkpoint-15000/trainer_state.json b/function-base-flatten/checkpoint-15000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f5eb492e705e3a055ab8c0ac9e6da7015fd5f74e --- /dev/null +++ b/function-base-flatten/checkpoint-15000/trainer_state.json @@ -0,0 +1,439 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8896447467876039, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 1.0322, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8672707676887512, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9884, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8287830948829651, + "eval_runtime": 15.4828, + "eval_samples_per_second": 64.588, + "eval_steps_per_second": 32.294, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.9418, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8468040823936462, + "eval_runtime": 15.4288, + "eval_samples_per_second": 64.814, + "eval_steps_per_second": 32.407, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.9232, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8540903329849243, + "eval_runtime": 15.6504, + "eval_samples_per_second": 63.896, + "eval_steps_per_second": 31.948, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.9107, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.809073269367218, + "eval_runtime": 15.3867, + "eval_samples_per_second": 64.991, + "eval_steps_per_second": 32.496, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.9233, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8151862621307373, + "eval_runtime": 15.0851, + "eval_samples_per_second": 66.291, + "eval_steps_per_second": 33.145, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.9038, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8232718706130981, + "eval_runtime": 15.5073, + "eval_samples_per_second": 64.486, + "eval_steps_per_second": 32.243, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8747, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7864852547645569, + "eval_runtime": 15.3637, + "eval_samples_per_second": 65.089, + "eval_steps_per_second": 32.544, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8981, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.770944356918335, + "eval_runtime": 15.3602, + "eval_samples_per_second": 65.103, + "eval_steps_per_second": 32.552, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8538, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7905715107917786, + "eval_runtime": 16.0451, + "eval_samples_per_second": 62.324, + "eval_steps_per_second": 31.162, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.8295, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7717331051826477, + "eval_runtime": 16.2335, + "eval_samples_per_second": 61.601, + "eval_steps_per_second": 30.8, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.8346, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7587910890579224, + "eval_runtime": 15.3749, + "eval_samples_per_second": 65.041, + "eval_steps_per_second": 32.521, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8366, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7654258012771606, + "eval_runtime": 15.1755, + "eval_samples_per_second": 65.895, + "eval_steps_per_second": 32.948, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.8017, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.759145975112915, + "eval_runtime": 15.3597, + "eval_samples_per_second": 65.106, + "eval_steps_per_second": 32.553, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.7788, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7703807950019836, + "eval_runtime": 15.9867, + "eval_samples_per_second": 62.552, + "eval_steps_per_second": 31.276, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7764, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7488923668861389, + "eval_runtime": 16.1496, + "eval_samples_per_second": 61.921, + "eval_steps_per_second": 30.961, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5766, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7688985466957092, + "eval_runtime": 15.5729, + "eval_samples_per_second": 64.214, + "eval_steps_per_second": 32.107, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.6024, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.7472162842750549, + "eval_runtime": 15.4638, + "eval_samples_per_second": 64.667, + "eval_steps_per_second": 32.334, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5822, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7330933809280396, + "eval_runtime": 15.5588, + "eval_samples_per_second": 64.272, + "eval_steps_per_second": 32.136, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5812, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7249069809913635, + "eval_runtime": 15.1664, + "eval_samples_per_second": 65.935, + "eval_steps_per_second": 32.968, + "step": 10000 + }, + { + "epoch": 1.32, + "learning_rate": 0.000475, + "loss": 0.5758, + "step": 10500 + }, + { + "epoch": 1.32, + "eval_loss": 0.7344868779182434, + "eval_runtime": 15.267, + "eval_samples_per_second": 65.501, + "eval_steps_per_second": 32.75, + "step": 10500 + }, + { + "epoch": 1.39, + "learning_rate": 0.00045000000000000004, + "loss": 0.5714, + "step": 11000 + }, + { + "epoch": 1.39, + "eval_loss": 0.7404966950416565, + "eval_runtime": 15.2995, + "eval_samples_per_second": 65.361, + "eval_steps_per_second": 32.681, + "step": 11000 + }, + { + "epoch": 1.45, + "learning_rate": 0.000425, + "loss": 0.5787, + "step": 11500 + }, + { + "epoch": 1.45, + "eval_loss": 0.7168460488319397, + "eval_runtime": 15.277, + "eval_samples_per_second": 65.458, + "eval_steps_per_second": 32.729, + "step": 11500 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004, + "loss": 0.5694, + "step": 12000 + }, + { + "epoch": 1.51, + "eval_loss": 0.7175167202949524, + "eval_runtime": 15.6988, + "eval_samples_per_second": 63.699, + "eval_steps_per_second": 31.85, + "step": 12000 + }, + { + "epoch": 1.57, + "learning_rate": 0.000375, + "loss": 0.5835, + "step": 12500 + }, + { + "epoch": 1.57, + "eval_loss": 0.7070069313049316, + "eval_runtime": 17.1549, + "eval_samples_per_second": 58.293, + "eval_steps_per_second": 29.146, + "step": 12500 + }, + { + "epoch": 1.64, + "learning_rate": 0.00035, + "loss": 0.5664, + "step": 13000 + }, + { + "epoch": 1.64, + "eval_loss": 0.7037546038627625, + "eval_runtime": 15.2843, + "eval_samples_per_second": 65.427, + "eval_steps_per_second": 32.713, + "step": 13000 + }, + { + "epoch": 1.7, + "learning_rate": 0.00032500000000000004, + "loss": 0.5534, + "step": 13500 + }, + { + "epoch": 1.7, + "eval_loss": 0.706645667552948, + "eval_runtime": 15.983, + "eval_samples_per_second": 62.567, + "eval_steps_per_second": 31.283, + "step": 13500 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003, + "loss": 0.5588, + "step": 14000 + }, + { + "epoch": 1.76, + "eval_loss": 0.7000927329063416, + "eval_runtime": 15.4052, + "eval_samples_per_second": 64.913, + "eval_steps_per_second": 32.457, + "step": 14000 + }, + { + "epoch": 1.83, + "learning_rate": 0.000275, + "loss": 0.5568, + "step": 14500 + }, + { + "epoch": 1.83, + "eval_loss": 0.6883980631828308, + "eval_runtime": 15.226, + "eval_samples_per_second": 65.677, + "eval_steps_per_second": 32.839, + "step": 14500 + }, + { + "epoch": 1.89, + "learning_rate": 0.00025, + "loss": 0.5661, + "step": 15000 + }, + { + "epoch": 1.89, + "eval_loss": 0.6906747817993164, + "eval_runtime": 15.225, + "eval_samples_per_second": 65.681, + "eval_steps_per_second": 32.841, + "step": 15000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 8.170201074567168e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base-flatten/checkpoint-15000/training_args.bin b/function-base-flatten/checkpoint-15000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d611223c11c75c5c99b38fe7fbcabc326ee71ca --- /dev/null +++ b/function-base-flatten/checkpoint-15000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857168942562a5574ce0eadda06d15062fa7c087954ae9df9c3df06ec62f42c4 +size 4283 diff --git a/function-base-flatten/checkpoint-20000/config.json b/function-base-flatten/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base-flatten/checkpoint-20000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base-flatten/checkpoint-20000/generation_config.json b/function-base-flatten/checkpoint-20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base-flatten/checkpoint-20000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base-flatten/checkpoint-20000/optimizer.pt b/function-base-flatten/checkpoint-20000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea132f63afb9d6b733ead12e0243399945ef2b3c --- /dev/null +++ b/function-base-flatten/checkpoint-20000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f48f34683d2ac110d59dd00b1585aaef74de33fcadd1e19a361eb0b516be913c +size 2372293 diff --git a/function-base-flatten/checkpoint-20000/pytorch_model.bin b/function-base-flatten/checkpoint-20000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8970416d6bfff49a4562e2a188cf92bd97eafc0b --- /dev/null +++ b/function-base-flatten/checkpoint-20000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca407d24447508342f9dfc23627a02773a09e95bd307d9602b44bd70f333450 +size 990410745 diff --git a/function-base-flatten/checkpoint-20000/rng_state.pth b/function-base-flatten/checkpoint-20000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0d223bdd9f0c8ca55a863bb30ea65ede164b2f3 --- /dev/null +++ b/function-base-flatten/checkpoint-20000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d45fe887815d6bbd1e0ca49c6430d2920293486016d8671f425007a097024ef6 +size 14575 diff --git a/function-base-flatten/checkpoint-20000/scheduler.pt b/function-base-flatten/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdb61285f3b746a1c11414ad9dca7340e1758206 --- /dev/null +++ b/function-base-flatten/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58a0653df4cbbcf9d6cc03d846193b654e5e1cc8a7d6462c99377d7fbe445ea +size 627 diff --git a/function-base-flatten/checkpoint-20000/trainer_state.json b/function-base-flatten/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..293426170c482b9c28fd755c6dfe3939424de352 --- /dev/null +++ b/function-base-flatten/checkpoint-20000/trainer_state.json @@ -0,0 +1,579 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5195263290501386, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 1.0322, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8672707676887512, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9884, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8287830948829651, + "eval_runtime": 15.4828, + "eval_samples_per_second": 64.588, + "eval_steps_per_second": 32.294, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.9418, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8468040823936462, + "eval_runtime": 15.4288, + "eval_samples_per_second": 64.814, + "eval_steps_per_second": 32.407, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.9232, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8540903329849243, + "eval_runtime": 15.6504, + "eval_samples_per_second": 63.896, + "eval_steps_per_second": 31.948, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.9107, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.809073269367218, + "eval_runtime": 15.3867, + "eval_samples_per_second": 64.991, + "eval_steps_per_second": 32.496, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.9233, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8151862621307373, + "eval_runtime": 15.0851, + "eval_samples_per_second": 66.291, + "eval_steps_per_second": 33.145, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.9038, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8232718706130981, + "eval_runtime": 15.5073, + "eval_samples_per_second": 64.486, + "eval_steps_per_second": 32.243, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8747, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7864852547645569, + "eval_runtime": 15.3637, + "eval_samples_per_second": 65.089, + "eval_steps_per_second": 32.544, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8981, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.770944356918335, + "eval_runtime": 15.3602, + "eval_samples_per_second": 65.103, + "eval_steps_per_second": 32.552, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8538, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7905715107917786, + "eval_runtime": 16.0451, + "eval_samples_per_second": 62.324, + "eval_steps_per_second": 31.162, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.8295, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7717331051826477, + "eval_runtime": 16.2335, + "eval_samples_per_second": 61.601, + "eval_steps_per_second": 30.8, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.8346, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7587910890579224, + "eval_runtime": 15.3749, + "eval_samples_per_second": 65.041, + "eval_steps_per_second": 32.521, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8366, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7654258012771606, + "eval_runtime": 15.1755, + "eval_samples_per_second": 65.895, + "eval_steps_per_second": 32.948, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.8017, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.759145975112915, + "eval_runtime": 15.3597, + "eval_samples_per_second": 65.106, + "eval_steps_per_second": 32.553, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.7788, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7703807950019836, + "eval_runtime": 15.9867, + "eval_samples_per_second": 62.552, + "eval_steps_per_second": 31.276, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7764, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7488923668861389, + "eval_runtime": 16.1496, + "eval_samples_per_second": 61.921, + "eval_steps_per_second": 30.961, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5766, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7688985466957092, + "eval_runtime": 15.5729, + "eval_samples_per_second": 64.214, + "eval_steps_per_second": 32.107, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.6024, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.7472162842750549, + "eval_runtime": 15.4638, + "eval_samples_per_second": 64.667, + "eval_steps_per_second": 32.334, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5822, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7330933809280396, + "eval_runtime": 15.5588, + "eval_samples_per_second": 64.272, + "eval_steps_per_second": 32.136, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5812, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7249069809913635, + "eval_runtime": 15.1664, + "eval_samples_per_second": 65.935, + "eval_steps_per_second": 32.968, + "step": 10000 + }, + { + "epoch": 1.32, + "learning_rate": 0.000475, + "loss": 0.5758, + "step": 10500 + }, + { + "epoch": 1.32, + "eval_loss": 0.7344868779182434, + "eval_runtime": 15.267, + "eval_samples_per_second": 65.501, + "eval_steps_per_second": 32.75, + "step": 10500 + }, + { + "epoch": 1.39, + "learning_rate": 0.00045000000000000004, + "loss": 0.5714, + "step": 11000 + }, + { + "epoch": 1.39, + "eval_loss": 0.7404966950416565, + "eval_runtime": 15.2995, + "eval_samples_per_second": 65.361, + "eval_steps_per_second": 32.681, + "step": 11000 + }, + { + "epoch": 1.45, + "learning_rate": 0.000425, + "loss": 0.5787, + "step": 11500 + }, + { + "epoch": 1.45, + "eval_loss": 0.7168460488319397, + "eval_runtime": 15.277, + "eval_samples_per_second": 65.458, + "eval_steps_per_second": 32.729, + "step": 11500 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004, + "loss": 0.5694, + "step": 12000 + }, + { + "epoch": 1.51, + "eval_loss": 0.7175167202949524, + "eval_runtime": 15.6988, + "eval_samples_per_second": 63.699, + "eval_steps_per_second": 31.85, + "step": 12000 + }, + { + "epoch": 1.57, + "learning_rate": 0.000375, + "loss": 0.5835, + "step": 12500 + }, + { + "epoch": 1.57, + "eval_loss": 0.7070069313049316, + "eval_runtime": 17.1549, + "eval_samples_per_second": 58.293, + "eval_steps_per_second": 29.146, + "step": 12500 + }, + { + "epoch": 1.64, + "learning_rate": 0.00035, + "loss": 0.5664, + "step": 13000 + }, + { + "epoch": 1.64, + "eval_loss": 0.7037546038627625, + "eval_runtime": 15.2843, + "eval_samples_per_second": 65.427, + "eval_steps_per_second": 32.713, + "step": 13000 + }, + { + "epoch": 1.7, + "learning_rate": 0.00032500000000000004, + "loss": 0.5534, + "step": 13500 + }, + { + "epoch": 1.7, + "eval_loss": 0.706645667552948, + "eval_runtime": 15.983, + "eval_samples_per_second": 62.567, + "eval_steps_per_second": 31.283, + "step": 13500 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003, + "loss": 0.5588, + "step": 14000 + }, + { + "epoch": 1.76, + "eval_loss": 0.7000927329063416, + "eval_runtime": 15.4052, + "eval_samples_per_second": 64.913, + "eval_steps_per_second": 32.457, + "step": 14000 + }, + { + "epoch": 1.83, + "learning_rate": 0.000275, + "loss": 0.5568, + "step": 14500 + }, + { + "epoch": 1.83, + "eval_loss": 0.6883980631828308, + "eval_runtime": 15.226, + "eval_samples_per_second": 65.677, + "eval_steps_per_second": 32.839, + "step": 14500 + }, + { + "epoch": 1.89, + "learning_rate": 0.00025, + "loss": 0.5661, + "step": 15000 + }, + { + "epoch": 1.89, + "eval_loss": 0.6906747817993164, + "eval_runtime": 15.225, + "eval_samples_per_second": 65.681, + "eval_steps_per_second": 32.841, + "step": 15000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00022500000000000002, + "loss": 0.536, + "step": 15500 + }, + { + "epoch": 1.95, + "eval_loss": 0.6931287050247192, + "eval_runtime": 15.5471, + "eval_samples_per_second": 64.321, + "eval_steps_per_second": 32.16, + "step": 15500 + }, + { + "epoch": 2.02, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 16000 + }, + { + "epoch": 2.02, + "eval_loss": 0.712044894695282, + "eval_runtime": 15.4441, + "eval_samples_per_second": 64.75, + "eval_steps_per_second": 32.375, + "step": 16000 + }, + { + "epoch": 2.08, + "learning_rate": 0.000175, + "loss": 0.3784, + "step": 16500 + }, + { + "epoch": 2.08, + "eval_loss": 0.7175341844558716, + "eval_runtime": 15.0821, + "eval_samples_per_second": 66.304, + "eval_steps_per_second": 33.152, + "step": 16500 + }, + { + "epoch": 2.14, + "learning_rate": 0.00015, + "loss": 0.3755, + "step": 17000 + }, + { + "epoch": 2.14, + "eval_loss": 0.7178177833557129, + "eval_runtime": 15.1581, + "eval_samples_per_second": 65.972, + "eval_steps_per_second": 32.986, + "step": 17000 + }, + { + "epoch": 2.2, + "learning_rate": 0.000125, + "loss": 0.3632, + "step": 17500 + }, + { + "epoch": 2.2, + "eval_loss": 0.7224026918411255, + "eval_runtime": 15.2639, + "eval_samples_per_second": 65.514, + "eval_steps_per_second": 32.757, + "step": 17500 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.3632, + "step": 18000 + }, + { + "epoch": 2.27, + "eval_loss": 0.7143183350563049, + "eval_runtime": 15.506, + "eval_samples_per_second": 64.491, + "eval_steps_per_second": 32.246, + "step": 18000 + }, + { + "epoch": 2.33, + "learning_rate": 7.5e-05, + "loss": 0.3614, + "step": 18500 + }, + { + "epoch": 2.33, + "eval_loss": 0.7125980257987976, + "eval_runtime": 15.2549, + "eval_samples_per_second": 65.552, + "eval_steps_per_second": 32.776, + "step": 18500 + }, + { + "epoch": 2.39, + "learning_rate": 5e-05, + "loss": 0.3838, + "step": 19000 + }, + { + "epoch": 2.39, + "eval_loss": 0.7115333080291748, + "eval_runtime": 15.4705, + "eval_samples_per_second": 64.639, + "eval_steps_per_second": 32.32, + "step": 19000 + }, + { + "epoch": 2.46, + "learning_rate": 2.5e-05, + "loss": 0.3652, + "step": 19500 + }, + { + "epoch": 2.46, + "eval_loss": 0.7107402086257935, + "eval_runtime": 16.4678, + "eval_samples_per_second": 60.725, + "eval_steps_per_second": 30.362, + "step": 19500 + }, + { + "epoch": 2.52, + "learning_rate": 0.0, + "loss": 0.3608, + "step": 20000 + }, + { + "epoch": 2.52, + "eval_loss": 0.7085164189338684, + "eval_runtime": 15.3333, + "eval_samples_per_second": 65.218, + "eval_steps_per_second": 32.609, + "step": 20000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 1.0899423608893747e+17, + "trial_name": null, + "trial_params": null +} diff --git a/function-base-flatten/checkpoint-20000/training_args.bin b/function-base-flatten/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d611223c11c75c5c99b38fe7fbcabc326ee71ca --- /dev/null +++ b/function-base-flatten/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857168942562a5574ce0eadda06d15062fa7c087954ae9df9c3df06ec62f42c4 +size 4283 diff --git a/function-base-flatten/checkpoint-5000/config.json b/function-base-flatten/checkpoint-5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base-flatten/checkpoint-5000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base-flatten/checkpoint-5000/generation_config.json b/function-base-flatten/checkpoint-5000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base-flatten/checkpoint-5000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base-flatten/checkpoint-5000/optimizer.pt b/function-base-flatten/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c1436d75a4d93409b880f6c81ad61364eddfbfe --- /dev/null +++ b/function-base-flatten/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0908a23754ae755048af4506000784b7466bc0b906c915f3a3ecf011c299b63 +size 2372293 diff --git a/function-base-flatten/checkpoint-5000/pytorch_model.bin b/function-base-flatten/checkpoint-5000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..bf1ae6b9c30fb8388874be9e837a2593622af041 --- /dev/null +++ b/function-base-flatten/checkpoint-5000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b274ec351daa7fada5fee63ca9860df026dabfc9939fcc1d6405f0e533689e +size 990410745 diff --git a/function-base-flatten/checkpoint-5000/rng_state.pth b/function-base-flatten/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..05360e3defcf786b8d7999cc10643bb62dc4ff2e --- /dev/null +++ b/function-base-flatten/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9d06088956111c6e20d5851243172fc06211d12a6d319b3a987552ba61c609 +size 14575 diff --git a/function-base-flatten/checkpoint-5000/scheduler.pt b/function-base-flatten/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3954ff0e21d53d2cce084035a7d0bbac4698c7 --- /dev/null +++ b/function-base-flatten/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2e868687b4ae6ba785ff2503f15cde78e0501a53544557c44b05f9fae02479 +size 627 diff --git a/function-base-flatten/checkpoint-5000/trainer_state.json b/function-base-flatten/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7c549e6807c1bcc0c432d7fd304ce584cba412 --- /dev/null +++ b/function-base-flatten/checkpoint-5000/trainer_state.json @@ -0,0 +1,159 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6298815822625347, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 1.0322, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8672707676887512, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9884, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8287830948829651, + "eval_runtime": 15.4828, + "eval_samples_per_second": 64.588, + "eval_steps_per_second": 32.294, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.9418, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8468040823936462, + "eval_runtime": 15.4288, + "eval_samples_per_second": 64.814, + "eval_steps_per_second": 32.407, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.9232, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8540903329849243, + "eval_runtime": 15.6504, + "eval_samples_per_second": 63.896, + "eval_steps_per_second": 31.948, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.9107, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.809073269367218, + "eval_runtime": 15.3867, + "eval_samples_per_second": 64.991, + "eval_steps_per_second": 32.496, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.9233, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8151862621307373, + "eval_runtime": 15.0851, + "eval_samples_per_second": 66.291, + "eval_steps_per_second": 33.145, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.9038, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8232718706130981, + "eval_runtime": 15.5073, + "eval_samples_per_second": 64.486, + "eval_steps_per_second": 32.243, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8747, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7864852547645569, + "eval_runtime": 15.3637, + "eval_samples_per_second": 65.089, + "eval_steps_per_second": 32.544, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8981, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.770944356918335, + "eval_runtime": 15.3602, + "eval_samples_per_second": 65.103, + "eval_steps_per_second": 32.552, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8538, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7905715107917786, + "eval_runtime": 16.0451, + "eval_samples_per_second": 62.324, + "eval_steps_per_second": 31.162, + "step": 5000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 2.716206661221581e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base-flatten/checkpoint-5000/training_args.bin b/function-base-flatten/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d611223c11c75c5c99b38fe7fbcabc326ee71ca --- /dev/null +++ b/function-base-flatten/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857168942562a5574ce0eadda06d15062fa7c087954ae9df9c3df06ec62f42c4 +size 4283 diff --git a/function-base/checkpoint-10000/config.json b/function-base/checkpoint-10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base/checkpoint-10000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base/checkpoint-10000/generation_config.json b/function-base/checkpoint-10000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base/checkpoint-10000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base/checkpoint-10000/optimizer.pt b/function-base/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1cb01c6f7d0cd08905f4c5a6604ca27f13d5fc --- /dev/null +++ b/function-base/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a8a4d069a6841f5bc2ea4a8f73c35b1643780ee12aea2bacb523db2f0f8014 +size 2372293 diff --git a/function-base/checkpoint-10000/pytorch_model.bin b/function-base/checkpoint-10000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..56c39f36e09c60a22b9e06ccca1c0a764c2296cb --- /dev/null +++ b/function-base/checkpoint-10000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84906aa788bf8a1e878057086eb3a908712a1f7edc9948f79bf943f0a8211a7c +size 990410745 diff --git a/function-base/checkpoint-10000/rng_state.pth b/function-base/checkpoint-10000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..563b733e0faaa5b38f566144c3408490ab78d7e1 --- /dev/null +++ b/function-base/checkpoint-10000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1e8dfcc14f0e4c7687859bd95ad0362e76e36c0d62c4a58370f1dfda0a5a17 +size 14575 diff --git a/function-base/checkpoint-10000/scheduler.pt b/function-base/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b737addbba331bad572a394dfad27ee2b14c3887 --- /dev/null +++ b/function-base/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca84735770f2a357642bb07a692869b77f0409ed01b19bf3c890697e28d947e +size 627 diff --git a/function-base/checkpoint-10000/trainer_state.json b/function-base/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4af2071c4473e13f55d14266efccacf94978d730 --- /dev/null +++ b/function-base/checkpoint-10000/trainer_state.json @@ -0,0 +1,299 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2597631645250693, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 0.9774, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8144938945770264, + "eval_runtime": 14.6573, + "eval_samples_per_second": 68.225, + "eval_steps_per_second": 34.113, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9617, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8077166080474854, + "eval_runtime": 15.1765, + "eval_samples_per_second": 65.891, + "eval_steps_per_second": 32.946, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.911, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8064053058624268, + "eval_runtime": 15.3268, + "eval_samples_per_second": 65.245, + "eval_steps_per_second": 32.623, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.8954, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8163686990737915, + "eval_runtime": 14.5944, + "eval_samples_per_second": 68.519, + "eval_steps_per_second": 34.26, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.883, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.8068735003471375, + "eval_runtime": 14.8614, + "eval_samples_per_second": 67.288, + "eval_steps_per_second": 33.644, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.8867, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7819482088088989, + "eval_runtime": 14.6896, + "eval_samples_per_second": 68.076, + "eval_steps_per_second": 34.038, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.8688, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8062307238578796, + "eval_runtime": 14.6856, + "eval_samples_per_second": 68.094, + "eval_steps_per_second": 34.047, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8446, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7707250714302063, + "eval_runtime": 15.0517, + "eval_samples_per_second": 66.438, + "eval_steps_per_second": 33.219, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8617, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.7528353333473206, + "eval_runtime": 14.7368, + "eval_samples_per_second": 67.857, + "eval_steps_per_second": 33.929, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8158, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7551385760307312, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.7889, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7405046820640564, + "eval_runtime": 15.4488, + "eval_samples_per_second": 64.73, + "eval_steps_per_second": 32.365, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.7992, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7292428016662598, + "eval_runtime": 15.892, + "eval_samples_per_second": 62.925, + "eval_steps_per_second": 31.462, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8051, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7345249056816101, + "eval_runtime": 14.8049, + "eval_samples_per_second": 67.545, + "eval_steps_per_second": 33.773, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.7684, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.7357723712921143, + "eval_runtime": 14.7316, + "eval_samples_per_second": 67.881, + "eval_steps_per_second": 33.941, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.753, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7323009371757507, + "eval_runtime": 14.6239, + "eval_samples_per_second": 68.381, + "eval_steps_per_second": 34.191, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7464, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7275989651679993, + "eval_runtime": 15.2815, + "eval_samples_per_second": 65.439, + "eval_steps_per_second": 32.719, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5429, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7231326103210449, + "eval_runtime": 15.5099, + "eval_samples_per_second": 64.475, + "eval_steps_per_second": 32.238, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.5704, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.717272162437439, + "eval_runtime": 14.9897, + "eval_samples_per_second": 66.712, + "eval_steps_per_second": 33.356, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5459, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7188604474067688, + "eval_runtime": 14.7366, + "eval_samples_per_second": 67.858, + "eval_steps_per_second": 33.929, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5435, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7037996053695679, + "eval_runtime": 14.5588, + "eval_samples_per_second": 68.687, + "eval_steps_per_second": 34.343, + "step": 10000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 6.278100154500096e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base/checkpoint-10000/training_args.bin b/function-base/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e800d36fb5b28196c127b2b6130e5a079a9071ee --- /dev/null +++ b/function-base/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ea3910cc2b6cda26544c9471813d33e4dddc44d4ed360de38519e745497679 +size 4219 diff --git a/function-base/checkpoint-15000/config.json b/function-base/checkpoint-15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base/checkpoint-15000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base/checkpoint-15000/generation_config.json b/function-base/checkpoint-15000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base/checkpoint-15000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base/checkpoint-15000/optimizer.pt b/function-base/checkpoint-15000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eacfa35560d05443da76d42a5c8af1a5df1b4a8 --- /dev/null +++ b/function-base/checkpoint-15000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3138aeb908f29a6626871da58849ebc4a60358542679230032fbe02f4e3658e4 +size 2372293 diff --git a/function-base/checkpoint-15000/pytorch_model.bin b/function-base/checkpoint-15000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..37f9658ff4d3e39d452f5b05009caa1d0fceab1e --- /dev/null +++ b/function-base/checkpoint-15000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e85c9b3d2de2c3e436770a52af0d80543e90725ecee7e845d35843969e56ce2 +size 990410745 diff --git a/function-base/checkpoint-15000/rng_state.pth b/function-base/checkpoint-15000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1532697274a14615f027b753d1c650692a0304a7 --- /dev/null +++ b/function-base/checkpoint-15000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:941a40eace4874c8bc2176908a7a69bb49bf28a46f7d454a00dba777352366c7 +size 14575 diff --git a/function-base/checkpoint-15000/scheduler.pt b/function-base/checkpoint-15000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce7cf3e27bb1aa9687a91c4fe4262f73b459f8b3 --- /dev/null +++ b/function-base/checkpoint-15000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b793f9ea6eba39185d82063b1e7434411e2aeca1bab5a010024f955d1696b3 +size 627 diff --git a/function-base/checkpoint-15000/trainer_state.json b/function-base/checkpoint-15000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae08dc816fc29918747d772139c2509fe040955 --- /dev/null +++ b/function-base/checkpoint-15000/trainer_state.json @@ -0,0 +1,439 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8896447467876039, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 0.9774, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8144938945770264, + "eval_runtime": 14.6573, + "eval_samples_per_second": 68.225, + "eval_steps_per_second": 34.113, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9617, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8077166080474854, + "eval_runtime": 15.1765, + "eval_samples_per_second": 65.891, + "eval_steps_per_second": 32.946, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.911, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8064053058624268, + "eval_runtime": 15.3268, + "eval_samples_per_second": 65.245, + "eval_steps_per_second": 32.623, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.8954, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8163686990737915, + "eval_runtime": 14.5944, + "eval_samples_per_second": 68.519, + "eval_steps_per_second": 34.26, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.883, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.8068735003471375, + "eval_runtime": 14.8614, + "eval_samples_per_second": 67.288, + "eval_steps_per_second": 33.644, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.8867, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7819482088088989, + "eval_runtime": 14.6896, + "eval_samples_per_second": 68.076, + "eval_steps_per_second": 34.038, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.8688, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8062307238578796, + "eval_runtime": 14.6856, + "eval_samples_per_second": 68.094, + "eval_steps_per_second": 34.047, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8446, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7707250714302063, + "eval_runtime": 15.0517, + "eval_samples_per_second": 66.438, + "eval_steps_per_second": 33.219, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8617, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.7528353333473206, + "eval_runtime": 14.7368, + "eval_samples_per_second": 67.857, + "eval_steps_per_second": 33.929, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8158, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7551385760307312, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.7889, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7405046820640564, + "eval_runtime": 15.4488, + "eval_samples_per_second": 64.73, + "eval_steps_per_second": 32.365, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.7992, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7292428016662598, + "eval_runtime": 15.892, + "eval_samples_per_second": 62.925, + "eval_steps_per_second": 31.462, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8051, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7345249056816101, + "eval_runtime": 14.8049, + "eval_samples_per_second": 67.545, + "eval_steps_per_second": 33.773, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.7684, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.7357723712921143, + "eval_runtime": 14.7316, + "eval_samples_per_second": 67.881, + "eval_steps_per_second": 33.941, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.753, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7323009371757507, + "eval_runtime": 14.6239, + "eval_samples_per_second": 68.381, + "eval_steps_per_second": 34.191, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7464, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7275989651679993, + "eval_runtime": 15.2815, + "eval_samples_per_second": 65.439, + "eval_steps_per_second": 32.719, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5429, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7231326103210449, + "eval_runtime": 15.5099, + "eval_samples_per_second": 64.475, + "eval_steps_per_second": 32.238, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.5704, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.717272162437439, + "eval_runtime": 14.9897, + "eval_samples_per_second": 66.712, + "eval_steps_per_second": 33.356, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5459, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7188604474067688, + "eval_runtime": 14.7366, + "eval_samples_per_second": 67.858, + "eval_steps_per_second": 33.929, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5435, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7037996053695679, + "eval_runtime": 14.5588, + "eval_samples_per_second": 68.687, + "eval_steps_per_second": 34.343, + "step": 10000 + }, + { + "epoch": 1.32, + "learning_rate": 0.000475, + "loss": 0.5429, + "step": 10500 + }, + { + "epoch": 1.32, + "eval_loss": 0.7000067234039307, + "eval_runtime": 14.629, + "eval_samples_per_second": 68.357, + "eval_steps_per_second": 34.179, + "step": 10500 + }, + { + "epoch": 1.39, + "learning_rate": 0.00045000000000000004, + "loss": 0.5363, + "step": 11000 + }, + { + "epoch": 1.39, + "eval_loss": 0.7090610861778259, + "eval_runtime": 15.5146, + "eval_samples_per_second": 64.455, + "eval_steps_per_second": 32.228, + "step": 11000 + }, + { + "epoch": 1.45, + "learning_rate": 0.000425, + "loss": 0.551, + "step": 11500 + }, + { + "epoch": 1.45, + "eval_loss": 0.6937999129295349, + "eval_runtime": 15.2752, + "eval_samples_per_second": 65.466, + "eval_steps_per_second": 32.733, + "step": 11500 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004, + "loss": 0.5345, + "step": 12000 + }, + { + "epoch": 1.51, + "eval_loss": 0.6926913261413574, + "eval_runtime": 14.5585, + "eval_samples_per_second": 68.688, + "eval_steps_per_second": 34.344, + "step": 12000 + }, + { + "epoch": 1.57, + "learning_rate": 0.000375, + "loss": 0.5519, + "step": 12500 + }, + { + "epoch": 1.57, + "eval_loss": 0.6763409972190857, + "eval_runtime": 14.6685, + "eval_samples_per_second": 68.173, + "eval_steps_per_second": 34.087, + "step": 12500 + }, + { + "epoch": 1.64, + "learning_rate": 0.00035, + "loss": 0.5324, + "step": 13000 + }, + { + "epoch": 1.64, + "eval_loss": 0.6778369545936584, + "eval_runtime": 17.3898, + "eval_samples_per_second": 57.505, + "eval_steps_per_second": 28.753, + "step": 13000 + }, + { + "epoch": 1.7, + "learning_rate": 0.00032500000000000004, + "loss": 0.5272, + "step": 13500 + }, + { + "epoch": 1.7, + "eval_loss": 0.6725330948829651, + "eval_runtime": 14.6869, + "eval_samples_per_second": 68.088, + "eval_steps_per_second": 34.044, + "step": 13500 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003, + "loss": 0.5258, + "step": 14000 + }, + { + "epoch": 1.76, + "eval_loss": 0.6668800115585327, + "eval_runtime": 15.0231, + "eval_samples_per_second": 66.564, + "eval_steps_per_second": 33.282, + "step": 14000 + }, + { + "epoch": 1.83, + "learning_rate": 0.000275, + "loss": 0.5229, + "step": 14500 + }, + { + "epoch": 1.83, + "eval_loss": 0.6599903106689453, + "eval_runtime": 14.6661, + "eval_samples_per_second": 68.184, + "eval_steps_per_second": 34.092, + "step": 14500 + }, + { + "epoch": 1.89, + "learning_rate": 0.00025, + "loss": 0.5386, + "step": 15000 + }, + { + "epoch": 1.89, + "eval_loss": 0.659939706325531, + "eval_runtime": 14.6708, + "eval_samples_per_second": 68.163, + "eval_steps_per_second": 34.081, + "step": 15000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 9.418761150170112e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base/checkpoint-15000/training_args.bin b/function-base/checkpoint-15000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e800d36fb5b28196c127b2b6130e5a079a9071ee --- /dev/null +++ b/function-base/checkpoint-15000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ea3910cc2b6cda26544c9471813d33e4dddc44d4ed360de38519e745497679 +size 4219 diff --git a/function-base/checkpoint-20000/config.json b/function-base/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base/checkpoint-20000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base/checkpoint-20000/generation_config.json b/function-base/checkpoint-20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base/checkpoint-20000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base/checkpoint-20000/optimizer.pt b/function-base/checkpoint-20000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..225437b35ea5c710e3df068f04f1510b930dda26 --- /dev/null +++ b/function-base/checkpoint-20000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8e37f284bd4d064c062e6a59da6eeebea137bed75c4e7276db0e544fe1c3e0 +size 2372293 diff --git a/function-base/checkpoint-20000/pytorch_model.bin b/function-base/checkpoint-20000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..872195651329b9125c2e66e92c1f59274dbfdad3 --- /dev/null +++ b/function-base/checkpoint-20000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccc6459f43f82cbfcfe2b0f277e415257fd46815c7fbc812d981814626d3d7c +size 990410745 diff --git a/function-base/checkpoint-20000/rng_state.pth b/function-base/checkpoint-20000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c17377028b3bd741f2497f0e556aa4f8297d7830 --- /dev/null +++ b/function-base/checkpoint-20000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36be9af23ec2bb156d0e53a3f444777c5beaab1c946153b12f7d7cc2fda29fd8 +size 14575 diff --git a/function-base/checkpoint-20000/scheduler.pt b/function-base/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdb61285f3b746a1c11414ad9dca7340e1758206 --- /dev/null +++ b/function-base/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58a0653df4cbbcf9d6cc03d846193b654e5e1cc8a7d6462c99377d7fbe445ea +size 627 diff --git a/function-base/checkpoint-20000/trainer_state.json b/function-base/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b5919861012b7b09bbc30fc50db106a795ec4c91 --- /dev/null +++ b/function-base/checkpoint-20000/trainer_state.json @@ -0,0 +1,579 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5195263290501386, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 0.9774, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8144938945770264, + "eval_runtime": 14.6573, + "eval_samples_per_second": 68.225, + "eval_steps_per_second": 34.113, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9617, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8077166080474854, + "eval_runtime": 15.1765, + "eval_samples_per_second": 65.891, + "eval_steps_per_second": 32.946, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.911, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8064053058624268, + "eval_runtime": 15.3268, + "eval_samples_per_second": 65.245, + "eval_steps_per_second": 32.623, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.8954, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8163686990737915, + "eval_runtime": 14.5944, + "eval_samples_per_second": 68.519, + "eval_steps_per_second": 34.26, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.883, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.8068735003471375, + "eval_runtime": 14.8614, + "eval_samples_per_second": 67.288, + "eval_steps_per_second": 33.644, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.8867, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7819482088088989, + "eval_runtime": 14.6896, + "eval_samples_per_second": 68.076, + "eval_steps_per_second": 34.038, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.8688, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8062307238578796, + "eval_runtime": 14.6856, + "eval_samples_per_second": 68.094, + "eval_steps_per_second": 34.047, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8446, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7707250714302063, + "eval_runtime": 15.0517, + "eval_samples_per_second": 66.438, + "eval_steps_per_second": 33.219, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8617, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.7528353333473206, + "eval_runtime": 14.7368, + "eval_samples_per_second": 67.857, + "eval_steps_per_second": 33.929, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8158, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7551385760307312, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 5000 + }, + { + "epoch": 0.69, + "learning_rate": 0.000725, + "loss": 0.7889, + "step": 5500 + }, + { + "epoch": 0.69, + "eval_loss": 0.7405046820640564, + "eval_runtime": 15.4488, + "eval_samples_per_second": 64.73, + "eval_steps_per_second": 32.365, + "step": 5500 + }, + { + "epoch": 0.76, + "learning_rate": 0.0007, + "loss": 0.7992, + "step": 6000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7292428016662598, + "eval_runtime": 15.892, + "eval_samples_per_second": 62.925, + "eval_steps_per_second": 31.462, + "step": 6000 + }, + { + "epoch": 0.82, + "learning_rate": 0.000675, + "loss": 0.8051, + "step": 6500 + }, + { + "epoch": 0.82, + "eval_loss": 0.7345249056816101, + "eval_runtime": 14.8049, + "eval_samples_per_second": 67.545, + "eval_steps_per_second": 33.773, + "step": 6500 + }, + { + "epoch": 0.88, + "learning_rate": 0.0006500000000000001, + "loss": 0.7684, + "step": 7000 + }, + { + "epoch": 0.88, + "eval_loss": 0.7357723712921143, + "eval_runtime": 14.7316, + "eval_samples_per_second": 67.881, + "eval_steps_per_second": 33.941, + "step": 7000 + }, + { + "epoch": 0.94, + "learning_rate": 0.000625, + "loss": 0.753, + "step": 7500 + }, + { + "epoch": 0.94, + "eval_loss": 0.7323009371757507, + "eval_runtime": 14.6239, + "eval_samples_per_second": 68.381, + "eval_steps_per_second": 34.191, + "step": 7500 + }, + { + "epoch": 1.01, + "learning_rate": 0.0006, + "loss": 0.7464, + "step": 8000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7275989651679993, + "eval_runtime": 15.2815, + "eval_samples_per_second": 65.439, + "eval_steps_per_second": 32.719, + "step": 8000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000575, + "loss": 0.5429, + "step": 8500 + }, + { + "epoch": 1.07, + "eval_loss": 0.7231326103210449, + "eval_runtime": 15.5099, + "eval_samples_per_second": 64.475, + "eval_steps_per_second": 32.238, + "step": 8500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00055, + "loss": 0.5704, + "step": 9000 + }, + { + "epoch": 1.13, + "eval_loss": 0.717272162437439, + "eval_runtime": 14.9897, + "eval_samples_per_second": 66.712, + "eval_steps_per_second": 33.356, + "step": 9000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005250000000000001, + "loss": 0.5459, + "step": 9500 + }, + { + "epoch": 1.2, + "eval_loss": 0.7188604474067688, + "eval_runtime": 14.7366, + "eval_samples_per_second": 67.858, + "eval_steps_per_second": 33.929, + "step": 9500 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 0.5435, + "step": 10000 + }, + { + "epoch": 1.26, + "eval_loss": 0.7037996053695679, + "eval_runtime": 14.5588, + "eval_samples_per_second": 68.687, + "eval_steps_per_second": 34.343, + "step": 10000 + }, + { + "epoch": 1.32, + "learning_rate": 0.000475, + "loss": 0.5429, + "step": 10500 + }, + { + "epoch": 1.32, + "eval_loss": 0.7000067234039307, + "eval_runtime": 14.629, + "eval_samples_per_second": 68.357, + "eval_steps_per_second": 34.179, + "step": 10500 + }, + { + "epoch": 1.39, + "learning_rate": 0.00045000000000000004, + "loss": 0.5363, + "step": 11000 + }, + { + "epoch": 1.39, + "eval_loss": 0.7090610861778259, + "eval_runtime": 15.5146, + "eval_samples_per_second": 64.455, + "eval_steps_per_second": 32.228, + "step": 11000 + }, + { + "epoch": 1.45, + "learning_rate": 0.000425, + "loss": 0.551, + "step": 11500 + }, + { + "epoch": 1.45, + "eval_loss": 0.6937999129295349, + "eval_runtime": 15.2752, + "eval_samples_per_second": 65.466, + "eval_steps_per_second": 32.733, + "step": 11500 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004, + "loss": 0.5345, + "step": 12000 + }, + { + "epoch": 1.51, + "eval_loss": 0.6926913261413574, + "eval_runtime": 14.5585, + "eval_samples_per_second": 68.688, + "eval_steps_per_second": 34.344, + "step": 12000 + }, + { + "epoch": 1.57, + "learning_rate": 0.000375, + "loss": 0.5519, + "step": 12500 + }, + { + "epoch": 1.57, + "eval_loss": 0.6763409972190857, + "eval_runtime": 14.6685, + "eval_samples_per_second": 68.173, + "eval_steps_per_second": 34.087, + "step": 12500 + }, + { + "epoch": 1.64, + "learning_rate": 0.00035, + "loss": 0.5324, + "step": 13000 + }, + { + "epoch": 1.64, + "eval_loss": 0.6778369545936584, + "eval_runtime": 17.3898, + "eval_samples_per_second": 57.505, + "eval_steps_per_second": 28.753, + "step": 13000 + }, + { + "epoch": 1.7, + "learning_rate": 0.00032500000000000004, + "loss": 0.5272, + "step": 13500 + }, + { + "epoch": 1.7, + "eval_loss": 0.6725330948829651, + "eval_runtime": 14.6869, + "eval_samples_per_second": 68.088, + "eval_steps_per_second": 34.044, + "step": 13500 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003, + "loss": 0.5258, + "step": 14000 + }, + { + "epoch": 1.76, + "eval_loss": 0.6668800115585327, + "eval_runtime": 15.0231, + "eval_samples_per_second": 66.564, + "eval_steps_per_second": 33.282, + "step": 14000 + }, + { + "epoch": 1.83, + "learning_rate": 0.000275, + "loss": 0.5229, + "step": 14500 + }, + { + "epoch": 1.83, + "eval_loss": 0.6599903106689453, + "eval_runtime": 14.6661, + "eval_samples_per_second": 68.184, + "eval_steps_per_second": 34.092, + "step": 14500 + }, + { + "epoch": 1.89, + "learning_rate": 0.00025, + "loss": 0.5386, + "step": 15000 + }, + { + "epoch": 1.89, + "eval_loss": 0.659939706325531, + "eval_runtime": 14.6708, + "eval_samples_per_second": 68.163, + "eval_steps_per_second": 34.081, + "step": 15000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00022500000000000002, + "loss": 0.5057, + "step": 15500 + }, + { + "epoch": 1.95, + "eval_loss": 0.6644103527069092, + "eval_runtime": 14.8001, + "eval_samples_per_second": 67.567, + "eval_steps_per_second": 33.784, + "step": 15500 + }, + { + "epoch": 2.02, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 16000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6815780401229858, + "eval_runtime": 14.6458, + "eval_samples_per_second": 68.279, + "eval_steps_per_second": 34.139, + "step": 16000 + }, + { + "epoch": 2.08, + "learning_rate": 0.000175, + "loss": 0.3459, + "step": 16500 + }, + { + "epoch": 2.08, + "eval_loss": 0.6946858763694763, + "eval_runtime": 14.801, + "eval_samples_per_second": 67.563, + "eval_steps_per_second": 33.782, + "step": 16500 + }, + { + "epoch": 2.14, + "learning_rate": 0.00015, + "loss": 0.3451, + "step": 17000 + }, + { + "epoch": 2.14, + "eval_loss": 0.6909430027008057, + "eval_runtime": 14.6191, + "eval_samples_per_second": 68.404, + "eval_steps_per_second": 34.202, + "step": 17000 + }, + { + "epoch": 2.2, + "learning_rate": 0.000125, + "loss": 0.333, + "step": 17500 + }, + { + "epoch": 2.2, + "eval_loss": 0.702035665512085, + "eval_runtime": 14.7343, + "eval_samples_per_second": 67.869, + "eval_steps_per_second": 33.934, + "step": 17500 + }, + { + "epoch": 2.27, + "learning_rate": 0.0001, + "loss": 0.3327, + "step": 18000 + }, + { + "epoch": 2.27, + "eval_loss": 0.6883909702301025, + "eval_runtime": 15.0915, + "eval_samples_per_second": 66.262, + "eval_steps_per_second": 33.131, + "step": 18000 + }, + { + "epoch": 2.33, + "learning_rate": 7.5e-05, + "loss": 0.3324, + "step": 18500 + }, + { + "epoch": 2.33, + "eval_loss": 0.6886125802993774, + "eval_runtime": 14.6855, + "eval_samples_per_second": 68.094, + "eval_steps_per_second": 34.047, + "step": 18500 + }, + { + "epoch": 2.39, + "learning_rate": 5e-05, + "loss": 0.3485, + "step": 19000 + }, + { + "epoch": 2.39, + "eval_loss": 0.6856953501701355, + "eval_runtime": 14.5913, + "eval_samples_per_second": 68.534, + "eval_steps_per_second": 34.267, + "step": 19000 + }, + { + "epoch": 2.46, + "learning_rate": 2.5e-05, + "loss": 0.3322, + "step": 19500 + }, + { + "epoch": 2.46, + "eval_loss": 0.6854106187820435, + "eval_runtime": 14.68, + "eval_samples_per_second": 68.12, + "eval_steps_per_second": 34.06, + "step": 19500 + }, + { + "epoch": 2.52, + "learning_rate": 0.0, + "loss": 0.3254, + "step": 20000 + }, + { + "epoch": 2.52, + "eval_loss": 0.6846584677696228, + "eval_runtime": 15.7573, + "eval_samples_per_second": 63.463, + "eval_steps_per_second": 31.731, + "step": 20000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 1.2562673673330893e+17, + "trial_name": null, + "trial_params": null +} diff --git a/function-base/checkpoint-20000/training_args.bin b/function-base/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e800d36fb5b28196c127b2b6130e5a079a9071ee --- /dev/null +++ b/function-base/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ea3910cc2b6cda26544c9471813d33e4dddc44d4ed360de38519e745497679 +size 4219 diff --git a/function-base/checkpoint-5000/config.json b/function-base/checkpoint-5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..68e6643e8bf7f6ccfa0fc70582266a5d1a440ec7 --- /dev/null +++ b/function-base/checkpoint-5000/config.json @@ -0,0 +1,62 @@ +{ + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "FiDT5" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/function-base/checkpoint-5000/generation_config.json b/function-base/checkpoint-5000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1022bfc1f42eb4ca98e3fb8efb6fb5982748b51 --- /dev/null +++ b/function-base/checkpoint-5000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.33.1" +} diff --git a/function-base/checkpoint-5000/optimizer.pt b/function-base/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..993483530c3215bd94a817e736f4c70055aa852a --- /dev/null +++ b/function-base/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7139499506a25943f2374be21441300dd226ccb5dc4b38d284438862c4a8cd96 +size 2372293 diff --git a/function-base/checkpoint-5000/pytorch_model.bin b/function-base/checkpoint-5000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9bb100666d6a4a354a2761382b15f08c70104c4 --- /dev/null +++ b/function-base/checkpoint-5000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8076443edf359bc2d735d6a25622e2989669459d6507109a5b5c4a3d88835fc1 +size 990410745 diff --git a/function-base/checkpoint-5000/rng_state.pth b/function-base/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d879cfe93e2e4c37d8b66dce0140c94703ad00cf --- /dev/null +++ b/function-base/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4cf47bfb9b70a8cdae819c162bdd55138f97a85d5f5ccf8857518f5393979a +size 14575 diff --git a/function-base/checkpoint-5000/scheduler.pt b/function-base/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3954ff0e21d53d2cce084035a7d0bbac4698c7 --- /dev/null +++ b/function-base/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2e868687b4ae6ba785ff2503f15cde78e0501a53544557c44b05f9fae02479 +size 627 diff --git a/function-base/checkpoint-5000/trainer_state.json b/function-base/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b99ab00f44f13eafacd258b596bd0cc125ebb198 --- /dev/null +++ b/function-base/checkpoint-5000/trainer_state.json @@ -0,0 +1,159 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6298815822625347, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06, + "learning_rate": 0.000975, + "loss": 0.9774, + "step": 500 + }, + { + "epoch": 0.06, + "eval_loss": 0.8144938945770264, + "eval_runtime": 14.6573, + "eval_samples_per_second": 68.225, + "eval_steps_per_second": 34.113, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00095, + "loss": 0.9617, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.8077166080474854, + "eval_runtime": 15.1765, + "eval_samples_per_second": 65.891, + "eval_steps_per_second": 32.946, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.000925, + "loss": 0.911, + "step": 1500 + }, + { + "epoch": 0.19, + "eval_loss": 0.8064053058624268, + "eval_runtime": 15.3268, + "eval_samples_per_second": 65.245, + "eval_steps_per_second": 32.623, + "step": 1500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009000000000000001, + "loss": 0.8954, + "step": 2000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8163686990737915, + "eval_runtime": 14.5944, + "eval_samples_per_second": 68.519, + "eval_steps_per_second": 34.26, + "step": 2000 + }, + { + "epoch": 0.31, + "learning_rate": 0.000875, + "loss": 0.883, + "step": 2500 + }, + { + "epoch": 0.31, + "eval_loss": 0.8068735003471375, + "eval_runtime": 14.8614, + "eval_samples_per_second": 67.288, + "eval_steps_per_second": 33.644, + "step": 2500 + }, + { + "epoch": 0.38, + "learning_rate": 0.00085, + "loss": 0.8867, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7819482088088989, + "eval_runtime": 14.6896, + "eval_samples_per_second": 68.076, + "eval_steps_per_second": 34.038, + "step": 3000 + }, + { + "epoch": 0.44, + "learning_rate": 0.000825, + "loss": 0.8688, + "step": 3500 + }, + { + "epoch": 0.44, + "eval_loss": 0.8062307238578796, + "eval_runtime": 14.6856, + "eval_samples_per_second": 68.094, + "eval_steps_per_second": 34.047, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0008, + "loss": 0.8446, + "step": 4000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7707250714302063, + "eval_runtime": 15.0517, + "eval_samples_per_second": 66.438, + "eval_steps_per_second": 33.219, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 0.0007750000000000001, + "loss": 0.8617, + "step": 4500 + }, + { + "epoch": 0.57, + "eval_loss": 0.7528353333473206, + "eval_runtime": 14.7368, + "eval_samples_per_second": 67.857, + "eval_steps_per_second": 33.929, + "step": 4500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00075, + "loss": 0.8158, + "step": 5000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7551385760307312, + "eval_runtime": 15.3736, + "eval_samples_per_second": 65.047, + "eval_steps_per_second": 32.523, + "step": 5000 + } + ], + "logging_steps": 500, + "max_steps": 20000, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 3.1313397362098176e+16, + "trial_name": null, + "trial_params": null +} diff --git a/function-base/checkpoint-5000/training_args.bin b/function-base/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e800d36fb5b28196c127b2b6130e5a079a9071ee --- /dev/null +++ b/function-base/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ea3910cc2b6cda26544c9471813d33e4dddc44d4ed360de38519e745497679 +size 4219 diff --git a/optimizer.pt b/optimizer.pt deleted file mode 100644 index ac9d31af021717192e7522476e64765d1dd6fafa..0000000000000000000000000000000000000000 --- a/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50d4d0984f6d3afa328cdb63469c5a8ff9da0e29f70ebc2b46f803ea273608cd -size 2371333 diff --git a/pytorch_model.bin b/pytorch_model.bin deleted file mode 100644 index f987c06606fd212fbe05ec3abada35d7e228924f..0000000000000000000000000000000000000000 --- a/pytorch_model.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70f77c75f6db835162d64802678671bae5af6ec92907394913c274e72964591a -size 990408885 diff --git a/rng_state.pth b/rng_state.pth deleted file mode 100644 index e542a7167033712b6ac7cc3b0d9eb25823e1ad8f..0000000000000000000000000000000000000000 --- a/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96805bdb6c1ed875dd7da931970e8b852a1c747ef4c73dba10e101dc01ad5c13 -size 14575 diff --git a/scheduler.pt b/scheduler.pt deleted file mode 100644 index 48e973a1e874eaee28ee630bc9e69a2120cf3e62..0000000000000000000000000000000000000000 --- a/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f5e24b8bf255cbefe9d307944a9741807d095b40cc5429a7befe9515b366b0f -size 627 diff --git a/trainer_state.json b/trainer_state.json deleted file mode 100644 index 6966fbf3cfe27f4e0dc5f5b5cc8c334982bc9714..0000000000000000000000000000000000000000 --- a/trainer_state.json +++ /dev/null @@ -1,296 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.5195263290501386, - "global_step": 10000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.13, - "learning_rate": 9.5e-05, - "loss": 0.8023, - "step": 500 - }, - { - "epoch": 0.13, - "eval_loss": 0.6604204177856445, - "eval_runtime": 5.6621, - "eval_samples_per_second": 52.984, - "eval_steps_per_second": 26.492, - "step": 500 - }, - { - "epoch": 0.25, - "learning_rate": 9e-05, - "loss": 0.7113, - "step": 1000 - }, - { - "epoch": 0.25, - "eval_loss": 0.6501660943031311, - "eval_runtime": 5.6949, - "eval_samples_per_second": 52.679, - "eval_steps_per_second": 26.339, - "step": 1000 - }, - { - "epoch": 0.38, - "learning_rate": 8.5e-05, - "loss": 0.7123, - "step": 1500 - }, - { - "epoch": 0.38, - "eval_loss": 0.6351694464683533, - "eval_runtime": 7.0443, - "eval_samples_per_second": 42.588, - "eval_steps_per_second": 21.294, - "step": 1500 - }, - { - "epoch": 0.5, - "learning_rate": 8e-05, - "loss": 0.7002, - "step": 2000 - }, - { - "epoch": 0.5, - "eval_loss": 0.6252465844154358, - "eval_runtime": 5.6908, - "eval_samples_per_second": 52.716, - "eval_steps_per_second": 26.358, - "step": 2000 - }, - { - "epoch": 0.63, - "learning_rate": 7.500000000000001e-05, - "loss": 0.6974, - "step": 2500 - }, - { - "epoch": 0.63, - "eval_loss": 0.6316436529159546, - "eval_runtime": 5.5384, - "eval_samples_per_second": 54.167, - "eval_steps_per_second": 27.083, - "step": 2500 - }, - { - "epoch": 0.76, - "learning_rate": 7e-05, - "loss": 0.668, - "step": 3000 - }, - { - "epoch": 0.76, - "eval_loss": 0.6197097301483154, - "eval_runtime": 5.9147, - "eval_samples_per_second": 50.721, - "eval_steps_per_second": 25.361, - "step": 3000 - }, - { - "epoch": 0.88, - "learning_rate": 6.500000000000001e-05, - "loss": 0.6721, - "step": 3500 - }, - { - "epoch": 0.88, - "eval_loss": 0.6143491268157959, - "eval_runtime": 6.0601, - "eval_samples_per_second": 49.504, - "eval_steps_per_second": 24.752, - "step": 3500 - }, - { - "epoch": 1.01, - "learning_rate": 6e-05, - "loss": 0.6603, - "step": 4000 - }, - { - "epoch": 1.01, - "eval_loss": 0.6048210859298706, - "eval_runtime": 5.3107, - "eval_samples_per_second": 56.49, - "eval_steps_per_second": 28.245, - "step": 4000 - }, - { - "epoch": 1.13, - "learning_rate": 5.500000000000001e-05, - "loss": 0.598, - "step": 4500 - }, - { - "epoch": 1.13, - "eval_loss": 0.6044912934303284, - "eval_runtime": 5.4715, - "eval_samples_per_second": 54.829, - "eval_steps_per_second": 27.415, - "step": 4500 - }, - { - "epoch": 1.26, - "learning_rate": 5e-05, - "loss": 0.5961, - "step": 5000 - }, - { - "epoch": 1.26, - "eval_loss": 0.5996153950691223, - "eval_runtime": 5.2909, - "eval_samples_per_second": 56.701, - "eval_steps_per_second": 28.35, - "step": 5000 - }, - { - "epoch": 1.39, - "learning_rate": 4.5e-05, - "loss": 0.5712, - "step": 5500 - }, - { - "epoch": 1.39, - "eval_loss": 0.6034131646156311, - "eval_runtime": 5.0673, - "eval_samples_per_second": 59.203, - "eval_steps_per_second": 29.602, - "step": 5500 - }, - { - "epoch": 1.51, - "learning_rate": 4e-05, - "loss": 0.5863, - "step": 6000 - }, - { - "epoch": 1.51, - "eval_loss": 0.5980101227760315, - "eval_runtime": 5.146, - "eval_samples_per_second": 58.297, - "eval_steps_per_second": 29.149, - "step": 6000 - }, - { - "epoch": 1.64, - "learning_rate": 3.5e-05, - "loss": 0.5814, - "step": 6500 - }, - { - "epoch": 1.64, - "eval_loss": 0.5993916988372803, - "eval_runtime": 5.2338, - "eval_samples_per_second": 57.32, - "eval_steps_per_second": 28.66, - "step": 6500 - }, - { - "epoch": 1.76, - "learning_rate": 3e-05, - "loss": 0.5726, - "step": 7000 - }, - { - "epoch": 1.76, - "eval_loss": 0.5940994024276733, - "eval_runtime": 5.1782, - "eval_samples_per_second": 57.935, - "eval_steps_per_second": 28.968, - "step": 7000 - }, - { - "epoch": 1.89, - "learning_rate": 2.5e-05, - "loss": 0.5751, - "step": 7500 - }, - { - "epoch": 1.89, - "eval_loss": 0.5939777493476868, - "eval_runtime": 5.4507, - "eval_samples_per_second": 55.039, - "eval_steps_per_second": 27.52, - "step": 7500 - }, - { - "epoch": 2.02, - "learning_rate": 2e-05, - "loss": 0.5831, - "step": 8000 - }, - { - "epoch": 2.02, - "eval_loss": 0.5925641059875488, - "eval_runtime": 5.3244, - "eval_samples_per_second": 56.344, - "eval_steps_per_second": 28.172, - "step": 8000 - }, - { - "epoch": 2.14, - "learning_rate": 1.5e-05, - "loss": 0.542, - "step": 8500 - }, - { - "epoch": 2.14, - "eval_loss": 0.597409188747406, - "eval_runtime": 5.5596, - "eval_samples_per_second": 53.96, - "eval_steps_per_second": 26.98, - "step": 8500 - }, - { - "epoch": 2.27, - "learning_rate": 1e-05, - "loss": 0.5256, - "step": 9000 - }, - { - "epoch": 2.27, - "eval_loss": 0.594516396522522, - "eval_runtime": 5.486, - "eval_samples_per_second": 54.684, - "eval_steps_per_second": 27.342, - "step": 9000 - }, - { - "epoch": 2.39, - "learning_rate": 5e-06, - "loss": 0.543, - "step": 9500 - }, - { - "epoch": 2.39, - "eval_loss": 0.592924177646637, - "eval_runtime": 5.3977, - "eval_samples_per_second": 55.579, - "eval_steps_per_second": 27.789, - "step": 9500 - }, - { - "epoch": 2.52, - "learning_rate": 0.0, - "loss": 0.5272, - "step": 10000 - }, - { - "epoch": 2.52, - "eval_loss": 0.593059241771698, - "eval_runtime": 5.7212, - "eval_samples_per_second": 52.437, - "eval_steps_per_second": 26.218, - "step": 10000 - } - ], - "max_steps": 10000, - "num_train_epochs": 3, - "total_flos": 1.2121549777083802e+17, - "trial_name": null, - "trial_params": null -} diff --git a/training_args.bin b/training_args.bin deleted file mode 100644 index c7c2c59e486b0ba272164629230e684665398276..0000000000000000000000000000000000000000 --- a/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8289752bca98a3f14e53feb07fafa691076be700549694b90e50635b318dad97 -size 4219