github-actions[bot] commited on
Commit
a76607e
·
0 Parent(s):

🚀 Deploy method comparison app from GH action

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. MetaMathQA/Makefile +90 -0
  2. MetaMathQA/README.md +241 -0
  3. MetaMathQA/cancelled_results/.gitkeep +0 -0
  4. MetaMathQA/data.py +109 -0
  5. MetaMathQA/default_training_params.json +26 -0
  6. MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json +39 -0
  7. MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json +11 -0
  8. MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json +6 -0
  9. MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json +20 -0
  10. MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json +19 -0
  11. MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json +19 -0
  12. MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json +21 -0
  13. MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json +6 -0
  14. MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json +23 -0
  15. MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json +23 -0
  16. MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json +6 -0
  17. MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json +14 -0
  18. MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json +14 -0
  19. MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json +6 -0
  20. MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json +11 -0
  21. MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json +24 -0
  22. MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json +27 -0
  23. MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json +30 -0
  24. MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json +30 -0
  25. MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json +9 -0
  26. MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json +30 -0
  27. MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json +30 -0
  28. MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json +30 -0
  29. MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json +27 -0
  30. MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json +15 -0
  31. MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
  32. MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json +17 -0
  33. MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json +17 -0
  34. MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
  35. MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json +17 -0
  36. MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json +22 -0
  37. MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json +26 -0
  38. MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json +20 -0
  39. MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json +6 -0
  40. MetaMathQA/requirements.txt +4 -0
  41. MetaMathQA/results/.gitkeep +0 -0
  42. MetaMathQA/results/adalora--llama-3.2-3B-rank32.json +4071 -0
  43. MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json +341 -0
  44. MetaMathQA/results/boft--llama-3.2-3B-default.json +354 -0
  45. MetaMathQA/results/bone--llama-3.2-3B-bat.json +350 -0
  46. MetaMathQA/results/bone--llama-3.2-3B-default.json +350 -0
  47. MetaMathQA/results/fourierft--llama-3.2-3B-default.json +354 -0
  48. MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json +354 -0
  49. MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json +331 -0
  50. MetaMathQA/results/ia3--llama-3.2-3B-default.json +351 -0
MetaMathQA/Makefile ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile for running MetaMathQA experiments.
2
+
3
+ # --- Configuration ---
4
+ PYTHON := python
5
+ RUN_SCRIPT := run.py
6
+ EXPERIMENTS_DIR := experiments
7
+ RESULTS_DIR := results
8
+
9
+ # --- Automatic Experiment and Result Discovery ---
10
+
11
+ # 1. Find all experiment directories by looking for adapter_config.json files.
12
+ # This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ...
13
+ EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \
14
+ -name "adapter_config.json" -or \
15
+ -name "training_params.json" | xargs dirname | sort -u)
16
+
17
+ # 2. Define a function to replace all occurrences of a character in a string.
18
+ # This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora-foo").
19
+ # Usage: $(call replace-all, string, char_to_replace, replacement_char)
20
+ replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1))
21
+
22
+ # 3. Define a function to convert an experiment path to its flat result file path.
23
+ # e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora-llama-3.2-3B-rank32.json"
24
+ exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json
25
+
26
+ # 4. Generate the list of all target result files we want to build.
27
+ RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp)))
28
+
29
+
30
+ # --- Main Rules ---
31
+
32
+ # The default 'all' target depends on all possible result files.
33
+ # Running `make` or `make all` will check and run any outdated or missing experiments.
34
+ all: $(RESULT_FILES)
35
+
36
+
37
+ # --- Dynamic Rule Generation ---
38
+
39
+ # This is the core logic. We dynamically generate a specific Makefile rule for each experiment found.
40
+ # This avoids a complex pattern rule and makes the logic clearer.
41
+ define EXPERIMENT_template
42
+ # Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32)
43
+
44
+ # Define the rule:
45
+ # The target is the result file (e.g., results/lora-llama-3.2-3B-rank32.json).
46
+ # The dependencies are its config files, code changes need to be audited manually since they can
47
+ # vary in degree of importance. Note that we explicitly ignore when the script fails to run
48
+ # so that the other experiments still have a chance to run.
49
+ $(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
50
+ @echo "---"
51
+ @echo "Running experiment: $(1)"
52
+ -$(PYTHON) $(RUN_SCRIPT) -v $(1)
53
+ @echo "Finished: $$@"
54
+ @echo "---"
55
+
56
+ endef
57
+
58
+ # This command iterates through every found experiment path and evaluates the template,
59
+ # effectively stamping out a unique, explicit rule for each one.
60
+ $(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path))))
61
+
62
+
63
+ # --- Utility Rules ---
64
+
65
+ .PHONY: all clean list dump_rules
66
+
67
+ # The 'clean' rule removes all generated results.
68
+ clean:
69
+ @echo "Cleaning results directory..."
70
+ @([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0
71
+
72
+ # The 'list' rule is for debugging. It shows the discovered experiments
73
+ # and the result files the Makefile expects to create for them.
74
+ list:
75
+ @echo "Discovered experiment configurations:"
76
+ @$(foreach exp,$(EXPERIMENT_PATHS),echo " - $(exp)/adapter_config.json";)
77
+ @echo "\nTarget result files:"
78
+ @$(foreach res,$(RESULT_FILES),echo " - $(res)";)
79
+
80
+ # The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules.
81
+ define newline
82
+
83
+
84
+ endef
85
+ define DUMPED_RULES
86
+ $(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path)))
87
+ endef
88
+
89
+ dump_rules:
90
+ @echo -e "$(subst $(newline),\n,${DUMPED_RULES})"
MetaMathQA/README.md ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PEFT method comparison on the MetaMathQA and GSM8K datasets
2
+
3
+ ## Goal
4
+
5
+ The goal is to provide a benchmarking framework for the different PEFT methods that are implemented. It is important that evaluating different PEFT methods is reproducible, idempotent, and version-controlled. Results for more PEFT methods can be added over time.
6
+
7
+ ## Dataset
8
+
9
+ This task trains on the [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset and validates/tests on the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset ("main").
10
+
11
+ For the model to attain good accuracy, it needs to learn to adhere to the output format and it must express basic chain of thought reasoning capabilities to get to the correct result in the first place. The task is challenging for models in the sub 7B parameter range.
12
+
13
+ The train set uses the whole of MetaMathQA. The validation set is a random sample from the train set of GSM8K. The test set is the whole of the GSM8K test set.
14
+
15
+ ## Running
16
+
17
+ Create an experiment in the `experiments/<peft-method>` folder of your choice and give it a name (the name itself does not matter but helps identify the experiment). An example would be `experiments/lora/llama-3.2-3B-rank32/`. Inside that directory, create 2 files:
18
+
19
+ - `adapter_config.json`
20
+ - Optional: `training_params.json`
21
+
22
+ Once you created these two files, you can either
23
+
24
+ - run the whole suite by simply calling `make` (takes >24h)
25
+ - run one specific experiment by calling `make results/<experiment_name>--<experiment_variation>.json`,
26
+ for example `results/vblora--llama-3.2-3B-default.json`
27
+
28
+ You can get a list of all runnable experiments by running `make list`, e.g.:
29
+ ```
30
+ % make list (git)-[method-comparison-results] ⛓ peft
31
+ Discovered experiment configurations:
32
+ - experiments/ptuning/llama-3.2-3B-default/adapter_config.json
33
+ [...]
34
+ - experiments/vblora/llama-3.2-3B-default/adapter_config.json
35
+
36
+ Target result files:
37
+ - results/ptuning--llama-3.2-3B-default.json
38
+ [...]
39
+ - results/vblora--llama-3.2-3B-default.json
40
+ ```
41
+
42
+ In case you want to force the execution of an experiment, you can simply `touch` the respective adapter config
43
+ without modifying it. For example:
44
+
45
+ touch experiments/vblora/llama-3.2-3B-default/adapter_config.json
46
+ make
47
+
48
+ to run the VBLoRA default experiment again.
49
+
50
+ ### `adapter_config.json`
51
+
52
+ This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.:
53
+
54
+ ```python
55
+ from peft import LoraConfig
56
+
57
+ config = LoraConfig(...)
58
+ config.save_pretrained(<path-to-experiment>)
59
+ ```
60
+
61
+ ### `training_params.json`
62
+
63
+ There is a default file for the non-PEFT parameters: `default_training_params.json`. This contains all the other parameters that are relevant for training, e.g. the base model id, number of steps, batch size, learning rate, etc. If parameters that differ from the defaults are needed for a specific experiment, place a `training_params.json` into the experiment directory and adjust the parameters that need changing. The other parameters are taken from the aforementioned default config.
64
+
65
+ For an overview of all possible arguments, you can also check the `TrainConfig` `dataclass` in `utils.py`.
66
+
67
+ ### Runtime performance
68
+
69
+ Several factors should be considered to achieve a fast runtime performance. Besides the obvious factors like `max_steps` or the base model size, we found the following factors to have a significant impact:
70
+
71
+ #### Eval batch size
72
+
73
+ Regarding the `batch_size_eval` parameter, it is quite critical since evaluation takes up a significant portion of the training time and batching helps with reducing that. It should be possible to choose a value that is multiple times higher than the batch size used for training (`batch_size`). You should also pay attention to the size of the validation set -- e.g. if it's 50, don't choose a `batch_size_eval` of 40, as that results in a large batch of 40 and a small batch of 10. 25 might be a better choice. Also, ensure via a quick train run that the batch size does not lead to out of memory errors -- getting this error at the very end on evaluating the test set would be quite a loss of time.
74
+
75
+ #### Generation length
76
+
77
+ During testing, we discovered that the validation time is greatly inflated by just a few very long generations. Those can inflate the validation time by a factor of 3 or more. At the same time, we discovered that these long generations do not help with accuracy -- in fact, if they exceed the maximum configured length, they're just cut off mid sentence and would thus produce an accuracy of 0 anyway.
78
+
79
+ To remedy this, we now set both `max_length` and `max_new_tokens` for the generation kwargs in the default training parameters. Normally, this is not possible when using transformers, as the latter argument overrides the former. However, we have added special logic inside of `get_generation_config` which takes both and chooses the smaller of the two. This way, we can get rid of these excessively long generations, thus considerably reducing eval times, while still guaranteeing a maximum total generation length to guard against OOM errors. Testing showed that this does not hamper test accuracy. It is therefore recommended not to change these settings.
80
+
81
+ #### Bucketing
82
+
83
+ The length of the sequences in the training data can vary a lot. Therefore, if samples are taken randomly from the training dataset, we will end up with batches containing very short and very long sequences. This is bad because the batch will be padded to the longest sequence, slowing down training. The obvious solution would be to sort the whole dataset by sequence length, but this is also bad because it introduces an order bias (e.g. first training on only short and then on only long answers).
84
+
85
+ The solution is to find a trade-off between the two factors. This is achieved by the `BucketIterator`. It first creates buckets that contain multiple batches, e.g. 20x the batch size. The bucket is then sorted by sequence length and then batches are yielded from the bucket. Therefore, we have a small order bias within a bucket but not between buckets, striking a good balance between training speed and training loss.
86
+
87
+ From practical experiments, for a batch size of 4, a bucket size of 80 provides a good balance with only slightly lower training loss but cutting training time by 25%. For eval, we don't use the iterator since there, the batch size is relatively big and thus there is little upside.
88
+
89
+ ### Start a run
90
+
91
+ Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be:
92
+
93
+ ```sh
94
+ python run.py -v experiments/lora/llama-3.2-3B-rank32/
95
+ ```
96
+
97
+ By default, the adapter will be saved in a temporary file for further inspection if needed. To prevent this, add the `--clean` flag to the call.
98
+
99
+ ### Run status
100
+
101
+ The run can be categorized into 3 different states:
102
+
103
+ 1. Main run: You are on the `main` branch and the run ended successfully. The results are stored in the `results` folder and are used for further analysis.
104
+ 2. Test run: You are not on the `main` branch and the run ended successfully. The results are stored in the `temporary_results` folder and are not used for further analysis.
105
+ 3. The run was cancelled (`ctrl + c`). The results are stored in the `cancelled_results` folder and are not used for further analysis.
106
+
107
+ ## Outputs
108
+
109
+ Results are stored in one of the result directories. An example output could look like so:
110
+
111
+ ```js
112
+ {
113
+ "run_info": {
114
+ "created_at": "2025-03-05T13:50:05+00:00",
115
+ "total_time": 2711.0915009640157,
116
+ "experiment_name": "ia3/lr_0.001",
117
+ "peft_branch": "ben-method-comparison",
118
+ "train_config": {
119
+ "model_id": "meta-llama/Llama-3.2-3B",
120
+ "dtype": "bfloat16",
121
+ "max_seq_length": 768,
122
+ "batch_size": 4,
123
+ "batch_size_eval": 51,
124
+ "max_steps": 5000,
125
+ "eval_steps": 250,
126
+ "compile": false,
127
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
128
+ "seed": 0,
129
+ "grad_norm_clip": 1.0,
130
+ "optimizer_kwargs": {
131
+ "lr": 0.001
132
+ },
133
+ "lr_scheduler": "cosine",
134
+ "use_amp": false,
135
+ "generation_kwargs": {
136
+ "max_length": 800
137
+ },
138
+ "attn_implementation": null
139
+ },
140
+ "peft_config": {
141
+ "task_type": null,
142
+ "peft_type": "IA3",
143
+ "auto_mapping": null,
144
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
145
+ "revision": null,
146
+ "inference_mode": false,
147
+ "target_modules": [
148
+ "v_proj",
149
+ "k_proj",
150
+ "down_proj"
151
+ ],
152
+ "exclude_modules": null,
153
+ "feedforward_modules": [
154
+ "down_proj"
155
+ ],
156
+ "fan_in_fan_out": false,
157
+ "modules_to_save": null,
158
+ "init_ia3_weights": true
159
+ }
160
+ },
161
+ "train_info": {
162
+ "cuda_memory_reserved_avg": 14229219940,
163
+ "cuda_memory_max": 24847056896,
164
+ "cuda_memory_reserved_99th": 19115624366,
165
+ "train_time": 2238.65277833899,
166
+ "file_size": 1157064,
167
+ "status": "success",
168
+ "metrics": [
169
+ {
170
+ "step": 250,
171
+ "valid accuracy": 0.0784313725490196,
172
+ "train loss": 1.1336498007774354,
173
+ "train samples": 1000
174
+ },
175
+ [...]
176
+ {
177
+ "step": 5000,
178
+ "valid accuracy": 0.21568627450980393,
179
+ "train loss": 0.6345920492410659,
180
+ "train samples": 20000
181
+ },
182
+ {
183
+ "step": 5000,
184
+ "test accuracy": 0.35129740518962077,
185
+ "train loss": 0.6345920492410659,
186
+ "train samples": 20000,
187
+ "train total tokens": 4197579
188
+ }
189
+ ]
190
+ },
191
+ "meta_info": {
192
+ "model_sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
193
+ "model_created_at": "2024-09-18T15:23:48+00:00",
194
+ "dataset_sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
195
+ "dataset_created_at": "2023-09-21T17:22:46+00:00",
196
+ "package_info": {
197
+ "transformers-version": "4.50.0.dev0",
198
+ "transformers-commit-hash": "752ef3fd4e70869626ec70657a770a85c0ad9219",
199
+ "peft-version": "0.14.1.dev0",
200
+ "peft-commit-hash": "a447a4e5ecd87b7d57733f4df9616a328cf130f4",
201
+ "datasets-version": "3.3.2",
202
+ "datasets-commit-hash": null,
203
+ "bitsandbytes-version": "0.45.2",
204
+ "bitsandbytes-commit-hash": null,
205
+ "torch-version": "2.6.0+cu124",
206
+ "torch-commit-hash": null
207
+ },
208
+ "system_info": {
209
+ "system": "Linux",
210
+ "release": "6.11.0-17-generic",
211
+ "version": "#17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2",
212
+ "machine": "x86_64",
213
+ "processor": "x86_64",
214
+ "gpu": "NVIDIA GeForce RTX 4090"
215
+ },
216
+ "pytorch_info": "PyTorch built with: [...]"
217
+ }
218
+ }
219
+ ```
220
+
221
+ ## Dependencies
222
+
223
+ Apart from the normal PEFT dependencies, ensure that the packages in the `requirements.txt` are installed, e.g. via:
224
+
225
+ ```sh
226
+ python -m pip install -r requirements.txt
227
+ ```
228
+
229
+ Python 3.12+ is required.
230
+
231
+ ## Open tasks
232
+
233
+ - consider using `DataLoader`
234
+ - consider adding https://github.com/huggingface/Math-Verify
235
+ - consider adding `weight` argument to cross entropy calculation to downweight the EOS token, but it would require calculating the loss manually instead of relying on transformers (see https://github.com/huggingface/transformers/blob/6a876462c308bd7cd7d3ca8e93abaa7d5b02e90e/src/transformers/loss/loss_utils.py#L24-L48)
236
+ - do a sanity check against/comparison with transformers Trainer
237
+ - consider using vLLM to potentially speed up generations, at least for the test set
238
+ - using `torch.compile` leads to a huge slowdown, investigate (maybe recompiles), although it does save memory
239
+ - AMP does not appear to help, investigate
240
+ - packing of sequences (but this probably requires adjusting the attention matrix)
241
+ - clean up what gets printed and where (stdout, stderr)
MetaMathQA/cancelled_results/.gitkeep ADDED
File without changes
MetaMathQA/data.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025-present the HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ All utilities related to data handling.
17
+ """
18
+
19
+ from functools import partial
20
+ from typing import Callable
21
+
22
+ import datasets
23
+ import numpy as np
24
+ from datasets import Dataset, load_dataset
25
+
26
+
27
+ # with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of
28
+ # the dataset
29
+ CHAR_LIMIT = 1300
30
+ # train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
31
+ # since it's run multiple times during training; test is only run once at the end and thus can be larger
32
+ VALID_SIZE = 50
33
+
34
+
35
+ def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
36
+ """Return the filtered dataset, with long queries removed.
37
+
38
+ We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is
39
+ a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each
40
+ model, but we want the same filter for each model.
41
+
42
+ """
43
+ char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])]
44
+ idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT]
45
+ print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset")
46
+ return ds.select(idx_filtered)
47
+
48
+
49
+ def get_train_valid_test_datasets(
50
+ *, tokenizer, query_template: str, print_fn: Callable[..., None]
51
+ ) -> tuple[Dataset, Dataset, Dataset]:
52
+ """
53
+ Return the indices of the train, valid, and test splits of the dataset.
54
+
55
+ We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives:
56
+
57
+ > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value.
58
+
59
+ even after calling ds_filtered.class_encode_column("type"). Thus, using sklearn's StratifiedKFold instead.
60
+ """
61
+ metamath = load_dataset("meta-math/MetaMathQA")["train"]
62
+ metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)
63
+
64
+ # gsm8k does not need to be filtered as query and response are short enough
65
+ gsm8k = load_dataset("openai/gsm8k", "main")
66
+ gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
67
+ gsm8k_train = gsm8k["train"]
68
+ gsm8k_test = gsm8k["test"]
69
+
70
+ np.random.seed(0)
71
+ indices = np.arange(len(gsm8k_train))
72
+ np.random.shuffle(indices)
73
+ idx_valid = indices[:VALID_SIZE]
74
+
75
+ ds_train = metamath
76
+ ds_valid = gsm8k_train.select(idx_valid)
77
+ ds_test = gsm8k_test
78
+
79
+ print_fn(f"Train size: {len(ds_train)}")
80
+ print_fn(f"Valid size: {len(ds_valid)}")
81
+ print_fn(f"Test size: {len(ds_test)}")
82
+
83
+ tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
84
+ tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
85
+ ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
86
+ ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
87
+ ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
88
+
89
+ return ds_train, ds_valid, ds_test
90
+
91
+
92
+ def tokenize_with_answer(samples, tokenizer, template):
93
+ queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])]
94
+ tokenized = tokenizer(queries)
95
+ tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
96
+ tokenized["attention_mask"] = [
97
+ input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"]
98
+ ]
99
+ return tokenized
100
+
101
+
102
+ def tokenize_wo_answer(samples, tokenizer, template):
103
+ queries = [template.format(query=sample) for sample in samples["query"]]
104
+ tokenized = tokenizer(queries)
105
+ tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
106
+ tokenized["attention_mask"] = [
107
+ input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"]
108
+ ]
109
+ return tokenized
MetaMathQA/default_training_params.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "meta-llama/Llama-3.2-3B",
3
+ "dtype": "bfloat16",
4
+ "max_seq_length": 768,
5
+ "batch_size": 4,
6
+ "batch_size_eval": 50,
7
+ "max_steps": 5000,
8
+ "eval_steps": 250,
9
+ "compile": false,
10
+ "seed": 0,
11
+ "grad_norm_clip": 1.0,
12
+ "optimizer_type": "AdamW",
13
+ "optimizer_kwargs": {
14
+ "lr": 1e-4,
15
+ "weight_decay": 0.1
16
+ },
17
+ "lr_scheduler": "cosine",
18
+ "use_amp": false,
19
+ "autocast_adapter_dtype": true,
20
+ "attn_implementation": null,
21
+ "generation_kwargs": {
22
+ "max_length": 800,
23
+ "max_new_tokens": 300
24
+ },
25
+ "query_template": "Question: {query} Think step by step.\nAnswer:"
26
+ }
MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "beta1": 0.85,
6
+ "beta2": 0.85,
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "deltaT": 1,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "init_lora_weights": true,
15
+ "init_r": 64,
16
+ "layer_replication": null,
17
+ "layers_pattern": null,
18
+ "layers_to_transform": null,
19
+ "loftq_config": {},
20
+ "lora_alpha": 8,
21
+ "lora_bias": false,
22
+ "lora_dropout": 0.0,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "orth_reg_weight": 0.5,
27
+ "peft_type": "ADALORA",
28
+ "r": 8,
29
+ "rank_pattern": null,
30
+ "revision": null,
31
+ "target_modules": null,
32
+ "target_r": 32,
33
+ "task_type": null,
34
+ "tfinal": 500,
35
+ "tinit": 200,
36
+ "total_step": 5000,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adapter_layers": 28,
3
+ "adapter_len": 100,
4
+ "auto_mapping": null,
5
+ "base_model_name_or_path": null,
6
+ "inference_mode": false,
7
+ "peft_type": "ADAPTION_PROMPT",
8
+ "revision": null,
9
+ "target_modules": null,
10
+ "task_type": "CAUSAL_LM"
11
+ }
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 5e-4
4
+ }
5
+ }
6
+
MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "boft_block_num": 0,
6
+ "boft_block_size": 4,
7
+ "boft_dropout": 0.0,
8
+ "boft_n_butterfly_factor": 1,
9
+ "exclude_modules": null,
10
+ "fan_in_fan_out": false,
11
+ "inference_mode": false,
12
+ "init_weights": true,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "modules_to_save": null,
16
+ "peft_type": "BOFT",
17
+ "revision": null,
18
+ "target_modules": null,
19
+ "task_type": null
20
+ }
MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "inference_mode": false,
7
+ "init_weights": "bat",
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "modules_to_save": null,
11
+ "peft_type": "BONE",
12
+ "r": 64,
13
+ "revision": null,
14
+ "target_modules": [
15
+ "v_proj",
16
+ "q_proj"
17
+ ],
18
+ "task_type": null
19
+ }
MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "inference_mode": false,
7
+ "init_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "modules_to_save": null,
11
+ "peft_type": "BONE",
12
+ "r": 64,
13
+ "revision": null,
14
+ "target_modules": [
15
+ "v_proj",
16
+ "q_proj"
17
+ ],
18
+ "task_type": null
19
+ }
MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weights": false,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": null,
12
+ "block_size": 64,
13
+ "block_size_pattern": {},
14
+ "peft_type": "C3A",
15
+ "revision": null,
16
+ "target_modules": [
17
+ "v_proj",
18
+ "q_proj"
19
+ ],
20
+ "task_type": null
21
+ }
MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 3e-1,
4
+ "weight_decay": 1e-5
5
+ }
6
+ }
MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weights": false,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": null,
12
+ "n_frequency": 1000,
13
+ "n_frequency_pattern": {},
14
+ "peft_type": "FOURIERFT",
15
+ "random_loc_seed": 777,
16
+ "revision": null,
17
+ "scaling": 300,
18
+ "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
+ ],
22
+ "task_type": null
23
+ }
MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weights": false,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": null,
12
+ "n_frequency": 5000,
13
+ "n_frequency_pattern": {},
14
+ "peft_type": "FOURIERFT",
15
+ "random_loc_seed": 777,
16
+ "revision": null,
17
+ "scaling": 300,
18
+ "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
+ ],
22
+ "task_type": null
23
+ }
MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 1e-5
4
+ }
5
+ }
6
+
MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "exclude_modules": null,
5
+ "fan_in_fan_out": false,
6
+ "feedforward_modules": null,
7
+ "inference_mode": false,
8
+ "init_ia3_weights": true,
9
+ "modules_to_save": null,
10
+ "peft_type": "IA3",
11
+ "revision": null,
12
+ "target_modules": null,
13
+ "task_type": null
14
+ }
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "exclude_modules": null,
5
+ "fan_in_fan_out": false,
6
+ "feedforward_modules": null,
7
+ "inference_mode": false,
8
+ "init_ia3_weights": true,
9
+ "modules_to_save": null,
10
+ "peft_type": "IA3",
11
+ "revision": null,
12
+ "target_modules": null,
13
+ "task_type": null
14
+ }
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 1e-3
4
+ }
5
+ }
6
+
MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "exclude_modules": null,
5
+ "inference_mode": false,
6
+ "modules_to_save": null,
7
+ "peft_type": "LN_TUNING",
8
+ "revision": null,
9
+ "target_modules": null,
10
+ "task_type": null
11
+ }
MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "alpha_pattern": {},
4
+ "auto_mapping": null,
5
+ "base_model_name_or_path": null,
6
+ "exclude_modules": null,
7
+ "inference_mode": false,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "module_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LOHA",
14
+ "r": 32,
15
+ "rank_dropout": 0.0,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "q_proj",
20
+ "v_proj"
21
+ ],
22
+ "task_type": null,
23
+ "use_effective_conv2d": false
24
+ }
MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "alpha_pattern": {},
4
+ "auto_mapping": null,
5
+ "base_model_name_or_path": null,
6
+ "decompose_both": false,
7
+ "decompose_factor": -1,
8
+ "exclude_modules": null,
9
+ "inference_mode": false,
10
+ "init_weights": true,
11
+ "layers_pattern": null,
12
+ "layers_to_transform": null,
13
+ "module_dropout": 0.0,
14
+ "modules_to_save": null,
15
+ "peft_type": "LOKR",
16
+ "r": 32,
17
+ "rank_dropout": 0.0,
18
+ "rank_dropout_scale": false,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "v_proj"
24
+ ],
25
+ "task_type": null,
26
+ "use_effective_conv2d": false
27
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": null,
27
+ "task_type": "CAUSAL_LM",
28
+ "use_dora": true,
29
+ "use_rslora": false
30
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": null,
27
+ "task_type": "CAUSAL_LM",
28
+ "use_dora": false,
29
+ "use_rslora": false
30
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_type": "lora-fa",
3
+ "optimizer_kwargs": {
4
+ "r": 32,
5
+ "lora_alpha": 64,
6
+ "lr": 1e-4,
7
+ "weight_decay": 0.1
8
+ }
9
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": null,
27
+ "task_type": "CAUSAL_LM",
28
+ "use_dora": false,
29
+ "use_rslora": false
30
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 64,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": null,
27
+ "task_type": "CAUSAL_LM",
28
+ "use_dora": false,
29
+ "use_rslora": true
30
+ }
MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": false,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 64,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": null,
27
+ "task_type": "CAUSAL_LM",
28
+ "use_dora": false,
29
+ "use_rslora": false
30
+ }
MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": null,
5
+ "bias": "none",
6
+ "block_share": false,
7
+ "coft": false,
8
+ "eps": 6e-05,
9
+ "exclude_modules": null,
10
+ "fan_in_fan_out": false,
11
+ "inference_mode": false,
12
+ "init_weights": true,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "module_dropout": 0.0,
16
+ "modules_to_save": null,
17
+ "oft_block_size": 0,
18
+ "peft_type": "OFT",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": null
27
+ }
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "encoder_hidden_size": 3072,
5
+ "inference_mode": false,
6
+ "num_attention_heads": 24,
7
+ "num_layers": 28,
8
+ "num_transformer_submodules": 1,
9
+ "num_virtual_tokens": 200,
10
+ "peft_type": "PREFIX_TUNING",
11
+ "prefix_projection": false,
12
+ "revision": null,
13
+ "task_type": "CAUSAL_LM",
14
+ "token_dim": 3072
15
+ }
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 1e-3
4
+ }
5
+ }
6
+
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "inference_mode": false,
5
+ "num_attention_heads": 24,
6
+ "num_layers": 28,
7
+ "num_transformer_submodules": 1,
8
+ "num_virtual_tokens": 200,
9
+ "peft_type": "PROMPT_TUNING",
10
+ "prompt_tuning_init": "RANDOM",
11
+ "prompt_tuning_init_text": null,
12
+ "revision": null,
13
+ "task_type": "CAUSAL_LM",
14
+ "token_dim": 3072,
15
+ "tokenizer_kwargs": null,
16
+ "tokenizer_name_or_path": null
17
+ }
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "inference_mode": false,
5
+ "num_attention_heads": 24,
6
+ "num_layers": 28,
7
+ "num_transformer_submodules": 1,
8
+ "num_virtual_tokens": 200,
9
+ "peft_type": "PROMPT_TUNING",
10
+ "prompt_tuning_init": "RANDOM",
11
+ "prompt_tuning_init_text": null,
12
+ "revision": null,
13
+ "task_type": "CAUSAL_LM",
14
+ "token_dim": 3072,
15
+ "tokenizer_kwargs": null,
16
+ "tokenizer_name_or_path": null
17
+ }
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 1e-3
4
+ }
5
+ }
6
+
MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "encoder_dropout": 0.0,
5
+ "encoder_hidden_size": 3072,
6
+ "encoder_num_layers": 2,
7
+ "encoder_reparameterization_type": "MLP",
8
+ "inference_mode": false,
9
+ "num_attention_heads": 24,
10
+ "num_layers": 28,
11
+ "num_transformer_submodules": 1,
12
+ "num_virtual_tokens": 20,
13
+ "peft_type": "P_TUNING",
14
+ "revision": null,
15
+ "task_type": "CAUSAL_LM",
16
+ "token_dim": 3072
17
+ }
MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": false,
7
+ "init_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "modules_to_save": null,
11
+ "peft_type": "RANDLORA",
12
+ "projection_prng_key": 0,
13
+ "r": 32,
14
+ "randlora_alpha": 640,
15
+ "randlora_dropout": 0.0,
16
+ "revision": null,
17
+ "save_projection": true,
18
+ "sparse": false,
19
+ "target_modules": null,
20
+ "task_type": null,
21
+ "very_sparse": false
22
+ }
MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "exclude_modules": null,
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_logits_std": 0.1,
9
+ "init_vector_bank_bound": 0.02,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "modules_to_save": null,
13
+ "num_vectors": 256,
14
+ "peft_type": "VBLORA",
15
+ "r": 4,
16
+ "revision": null,
17
+ "save_only_topk_weights": false,
18
+ "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
+ ],
22
+ "task_type": null,
23
+ "topk": 2,
24
+ "vblora_dropout": 0.0,
25
+ "vector_length": 256
26
+ }
MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": null,
4
+ "bias": "none",
5
+ "d_initial": 0.1,
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "modules_to_save": null,
12
+ "peft_type": "VERA",
13
+ "projection_prng_key": 0,
14
+ "r": 256,
15
+ "revision": null,
16
+ "save_projection": true,
17
+ "target_modules": null,
18
+ "task_type": null,
19
+ "vera_dropout": 0.0
20
+ }
MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_kwargs": {
3
+ "lr": 1e-3
4
+ }
5
+ }
6
+
MetaMathQA/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ bitsandbytes
2
+ datasets
3
+ numpy
4
+ tqdm
MetaMathQA/results/.gitkeep ADDED
File without changes
MetaMathQA/results/adalora--llama-3.2-3B-rank32.json ADDED
@@ -0,0 +1,4071 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-19T23:12:19+00:00",
4
+ "total_time": 2209.243281380004,
5
+ "experiment_name": "adalora/llama-3.2-3B-rank32",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "ADALORA",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "r": 8,
41
+ "target_modules": [
42
+ "q_proj",
43
+ "v_proj"
44
+ ],
45
+ "exclude_modules": null,
46
+ "lora_alpha": 8,
47
+ "lora_dropout": 0.0,
48
+ "fan_in_fan_out": false,
49
+ "bias": "none",
50
+ "use_rslora": false,
51
+ "modules_to_save": null,
52
+ "init_lora_weights": true,
53
+ "layers_to_transform": null,
54
+ "layers_pattern": null,
55
+ "rank_pattern": {
56
+ "model.layers.0.self_attn.q_proj.lora_E": [
57
+ false,
58
+ false,
59
+ false,
60
+ false,
61
+ false,
62
+ false,
63
+ false,
64
+ false,
65
+ false,
66
+ false,
67
+ false,
68
+ false,
69
+ false,
70
+ false,
71
+ false,
72
+ false,
73
+ false,
74
+ false,
75
+ false,
76
+ false,
77
+ false,
78
+ false,
79
+ false,
80
+ false,
81
+ false,
82
+ false,
83
+ false,
84
+ false,
85
+ false,
86
+ false,
87
+ false,
88
+ false,
89
+ false,
90
+ false,
91
+ false,
92
+ false,
93
+ false,
94
+ false,
95
+ false,
96
+ false,
97
+ false,
98
+ false,
99
+ false,
100
+ false,
101
+ false,
102
+ false,
103
+ false,
104
+ false,
105
+ false,
106
+ false,
107
+ false,
108
+ false,
109
+ false,
110
+ false,
111
+ false,
112
+ false,
113
+ false,
114
+ false,
115
+ false,
116
+ false,
117
+ false,
118
+ false,
119
+ false,
120
+ false
121
+ ],
122
+ "model.layers.0.self_attn.v_proj.lora_E": [
123
+ true,
124
+ true,
125
+ true,
126
+ true,
127
+ true,
128
+ true,
129
+ true,
130
+ true,
131
+ true,
132
+ false,
133
+ true,
134
+ true,
135
+ true,
136
+ true,
137
+ true,
138
+ false,
139
+ true,
140
+ true,
141
+ true,
142
+ false,
143
+ true,
144
+ true,
145
+ true,
146
+ true,
147
+ true,
148
+ true,
149
+ true,
150
+ true,
151
+ true,
152
+ true,
153
+ true,
154
+ true,
155
+ true,
156
+ true,
157
+ true,
158
+ true,
159
+ true,
160
+ true,
161
+ true,
162
+ true,
163
+ true,
164
+ true,
165
+ true,
166
+ true,
167
+ true,
168
+ true,
169
+ true,
170
+ true,
171
+ true,
172
+ true,
173
+ true,
174
+ true,
175
+ false,
176
+ true,
177
+ true,
178
+ true,
179
+ true,
180
+ true,
181
+ true,
182
+ true,
183
+ true,
184
+ true,
185
+ true,
186
+ true
187
+ ],
188
+ "model.layers.1.self_attn.q_proj.lora_E": [
189
+ false,
190
+ false,
191
+ true,
192
+ true,
193
+ false,
194
+ true,
195
+ true,
196
+ false,
197
+ false,
198
+ false,
199
+ false,
200
+ true,
201
+ false,
202
+ false,
203
+ true,
204
+ true,
205
+ true,
206
+ true,
207
+ false,
208
+ false,
209
+ false,
210
+ false,
211
+ false,
212
+ false,
213
+ true,
214
+ false,
215
+ true,
216
+ true,
217
+ false,
218
+ false,
219
+ true,
220
+ true,
221
+ true,
222
+ false,
223
+ true,
224
+ true,
225
+ false,
226
+ false,
227
+ true,
228
+ true,
229
+ true,
230
+ false,
231
+ false,
232
+ false,
233
+ true,
234
+ false,
235
+ true,
236
+ true,
237
+ true,
238
+ true,
239
+ false,
240
+ true,
241
+ true,
242
+ true,
243
+ false,
244
+ false,
245
+ true,
246
+ true,
247
+ false,
248
+ false,
249
+ true,
250
+ true,
251
+ false,
252
+ false
253
+ ],
254
+ "model.layers.1.self_attn.v_proj.lora_E": [
255
+ true,
256
+ true,
257
+ true,
258
+ true,
259
+ true,
260
+ true,
261
+ true,
262
+ true,
263
+ true,
264
+ true,
265
+ true,
266
+ true,
267
+ true,
268
+ true,
269
+ true,
270
+ true,
271
+ true,
272
+ true,
273
+ false,
274
+ true,
275
+ true,
276
+ true,
277
+ true,
278
+ true,
279
+ true,
280
+ true,
281
+ true,
282
+ true,
283
+ true,
284
+ true,
285
+ true,
286
+ true,
287
+ true,
288
+ true,
289
+ true,
290
+ true,
291
+ true,
292
+ true,
293
+ true,
294
+ true,
295
+ true,
296
+ true,
297
+ false,
298
+ true,
299
+ true,
300
+ true,
301
+ true,
302
+ true,
303
+ true,
304
+ true,
305
+ true,
306
+ true,
307
+ true,
308
+ true,
309
+ true,
310
+ true,
311
+ true,
312
+ true,
313
+ true,
314
+ true,
315
+ true,
316
+ true,
317
+ true,
318
+ true
319
+ ],
320
+ "model.layers.2.self_attn.q_proj.lora_E": [
321
+ true,
322
+ false,
323
+ true,
324
+ false,
325
+ false,
326
+ false,
327
+ true,
328
+ true,
329
+ true,
330
+ true,
331
+ false,
332
+ true,
333
+ true,
334
+ true,
335
+ false,
336
+ false,
337
+ true,
338
+ false,
339
+ false,
340
+ true,
341
+ false,
342
+ false,
343
+ false,
344
+ false,
345
+ true,
346
+ true,
347
+ false,
348
+ false,
349
+ false,
350
+ false,
351
+ true,
352
+ false,
353
+ false,
354
+ false,
355
+ false,
356
+ false,
357
+ false,
358
+ true,
359
+ true,
360
+ true,
361
+ false,
362
+ false,
363
+ false,
364
+ true,
365
+ true,
366
+ false,
367
+ false,
368
+ false,
369
+ true,
370
+ false,
371
+ true,
372
+ true,
373
+ false,
374
+ true,
375
+ false,
376
+ false,
377
+ false,
378
+ true,
379
+ true,
380
+ false,
381
+ true,
382
+ true,
383
+ false,
384
+ false
385
+ ],
386
+ "model.layers.2.self_attn.v_proj.lora_E": [
387
+ true,
388
+ false,
389
+ false,
390
+ false,
391
+ true,
392
+ true,
393
+ true,
394
+ true,
395
+ false,
396
+ true,
397
+ true,
398
+ true,
399
+ false,
400
+ true,
401
+ false,
402
+ true,
403
+ false,
404
+ true,
405
+ false,
406
+ true,
407
+ false,
408
+ true,
409
+ true,
410
+ true,
411
+ true,
412
+ true,
413
+ false,
414
+ true,
415
+ false,
416
+ false,
417
+ false,
418
+ false,
419
+ true,
420
+ false,
421
+ false,
422
+ false,
423
+ false,
424
+ false,
425
+ true,
426
+ true,
427
+ false,
428
+ false,
429
+ true,
430
+ true,
431
+ false,
432
+ true,
433
+ true,
434
+ true,
435
+ true,
436
+ false,
437
+ false,
438
+ true,
439
+ false,
440
+ true,
441
+ false,
442
+ false,
443
+ false,
444
+ true,
445
+ true,
446
+ false,
447
+ false,
448
+ false,
449
+ true,
450
+ true
451
+ ],
452
+ "model.layers.3.self_attn.q_proj.lora_E": [
453
+ false,
454
+ false,
455
+ false,
456
+ false,
457
+ false,
458
+ false,
459
+ false,
460
+ false,
461
+ false,
462
+ false,
463
+ false,
464
+ false,
465
+ false,
466
+ false,
467
+ false,
468
+ false,
469
+ false,
470
+ false,
471
+ false,
472
+ false,
473
+ false,
474
+ false,
475
+ false,
476
+ false,
477
+ false,
478
+ false,
479
+ false,
480
+ false,
481
+ false,
482
+ false,
483
+ false,
484
+ false,
485
+ false,
486
+ false,
487
+ false,
488
+ false,
489
+ false,
490
+ false,
491
+ false,
492
+ false,
493
+ false,
494
+ false,
495
+ false,
496
+ false,
497
+ false,
498
+ false,
499
+ false,
500
+ false,
501
+ false,
502
+ false,
503
+ false,
504
+ false,
505
+ false,
506
+ false,
507
+ false,
508
+ false,
509
+ false,
510
+ false,
511
+ false,
512
+ false,
513
+ false,
514
+ false,
515
+ false,
516
+ false
517
+ ],
518
+ "model.layers.3.self_attn.v_proj.lora_E": [
519
+ false,
520
+ false,
521
+ false,
522
+ false,
523
+ false,
524
+ true,
525
+ false,
526
+ true,
527
+ false,
528
+ false,
529
+ false,
530
+ false,
531
+ true,
532
+ false,
533
+ false,
534
+ false,
535
+ false,
536
+ true,
537
+ true,
538
+ true,
539
+ true,
540
+ false,
541
+ true,
542
+ false,
543
+ false,
544
+ false,
545
+ false,
546
+ false,
547
+ false,
548
+ false,
549
+ false,
550
+ false,
551
+ true,
552
+ false,
553
+ false,
554
+ true,
555
+ false,
556
+ false,
557
+ true,
558
+ false,
559
+ true,
560
+ false,
561
+ true,
562
+ true,
563
+ false,
564
+ true,
565
+ false,
566
+ false,
567
+ true,
568
+ false,
569
+ false,
570
+ false,
571
+ false,
572
+ false,
573
+ true,
574
+ false,
575
+ true,
576
+ false,
577
+ false,
578
+ false,
579
+ false,
580
+ true,
581
+ true,
582
+ true
583
+ ],
584
+ "model.layers.4.self_attn.q_proj.lora_E": [
585
+ false,
586
+ false,
587
+ false,
588
+ false,
589
+ false,
590
+ true,
591
+ false,
592
+ false,
593
+ true,
594
+ true,
595
+ false,
596
+ true,
597
+ false,
598
+ false,
599
+ false,
600
+ false,
601
+ false,
602
+ false,
603
+ false,
604
+ true,
605
+ false,
606
+ false,
607
+ true,
608
+ false,
609
+ true,
610
+ false,
611
+ false,
612
+ false,
613
+ false,
614
+ false,
615
+ true,
616
+ false,
617
+ false,
618
+ false,
619
+ false,
620
+ false,
621
+ false,
622
+ false,
623
+ true,
624
+ true,
625
+ false,
626
+ false,
627
+ true,
628
+ false,
629
+ false,
630
+ false,
631
+ true,
632
+ true,
633
+ false,
634
+ false,
635
+ false,
636
+ false,
637
+ false,
638
+ false,
639
+ false,
640
+ false,
641
+ false,
642
+ false,
643
+ false,
644
+ false,
645
+ true,
646
+ true,
647
+ false,
648
+ false
649
+ ],
650
+ "model.layers.4.self_attn.v_proj.lora_E": [
651
+ true,
652
+ false,
653
+ true,
654
+ true,
655
+ false,
656
+ false,
657
+ true,
658
+ false,
659
+ false,
660
+ false,
661
+ true,
662
+ false,
663
+ true,
664
+ true,
665
+ false,
666
+ true,
667
+ false,
668
+ true,
669
+ true,
670
+ false,
671
+ true,
672
+ true,
673
+ false,
674
+ false,
675
+ true,
676
+ true,
677
+ true,
678
+ true,
679
+ false,
680
+ false,
681
+ false,
682
+ false,
683
+ false,
684
+ false,
685
+ true,
686
+ false,
687
+ true,
688
+ false,
689
+ false,
690
+ true,
691
+ true,
692
+ true,
693
+ true,
694
+ true,
695
+ false,
696
+ false,
697
+ false,
698
+ false,
699
+ false,
700
+ true,
701
+ false,
702
+ true,
703
+ true,
704
+ true,
705
+ true,
706
+ true,
707
+ false,
708
+ true,
709
+ true,
710
+ false,
711
+ true,
712
+ true,
713
+ true,
714
+ true
715
+ ],
716
+ "model.layers.5.self_attn.q_proj.lora_E": [
717
+ false,
718
+ true,
719
+ false,
720
+ false,
721
+ false,
722
+ false,
723
+ false,
724
+ false,
725
+ false,
726
+ false,
727
+ false,
728
+ false,
729
+ false,
730
+ false,
731
+ false,
732
+ false,
733
+ false,
734
+ false,
735
+ false,
736
+ false,
737
+ false,
738
+ false,
739
+ false,
740
+ false,
741
+ false,
742
+ false,
743
+ false,
744
+ false,
745
+ false,
746
+ false,
747
+ false,
748
+ false,
749
+ false,
750
+ false,
751
+ false,
752
+ false,
753
+ false,
754
+ false,
755
+ false,
756
+ false,
757
+ false,
758
+ false,
759
+ false,
760
+ false,
761
+ false,
762
+ false,
763
+ false,
764
+ false,
765
+ false,
766
+ false,
767
+ false,
768
+ false,
769
+ false,
770
+ false,
771
+ false,
772
+ false,
773
+ false,
774
+ false,
775
+ false,
776
+ false,
777
+ false,
778
+ false,
779
+ false,
780
+ false
781
+ ],
782
+ "model.layers.5.self_attn.v_proj.lora_E": [
783
+ true,
784
+ true,
785
+ true,
786
+ true,
787
+ true,
788
+ true,
789
+ true,
790
+ true,
791
+ false,
792
+ true,
793
+ false,
794
+ false,
795
+ true,
796
+ false,
797
+ false,
798
+ true,
799
+ false,
800
+ true,
801
+ false,
802
+ false,
803
+ false,
804
+ false,
805
+ true,
806
+ true,
807
+ false,
808
+ false,
809
+ false,
810
+ false,
811
+ true,
812
+ false,
813
+ true,
814
+ false,
815
+ true,
816
+ true,
817
+ false,
818
+ false,
819
+ true,
820
+ true,
821
+ true,
822
+ true,
823
+ false,
824
+ false,
825
+ true,
826
+ false,
827
+ true,
828
+ false,
829
+ false,
830
+ true,
831
+ true,
832
+ true,
833
+ false,
834
+ true,
835
+ false,
836
+ false,
837
+ false,
838
+ true,
839
+ true,
840
+ true,
841
+ true,
842
+ false,
843
+ false,
844
+ false,
845
+ true,
846
+ true
847
+ ],
848
+ "model.layers.6.self_attn.q_proj.lora_E": [
849
+ false,
850
+ false,
851
+ true,
852
+ true,
853
+ false,
854
+ false,
855
+ true,
856
+ true,
857
+ false,
858
+ false,
859
+ false,
860
+ true,
861
+ false,
862
+ true,
863
+ false,
864
+ true,
865
+ false,
866
+ false,
867
+ false,
868
+ false,
869
+ true,
870
+ true,
871
+ true,
872
+ true,
873
+ false,
874
+ true,
875
+ false,
876
+ true,
877
+ false,
878
+ true,
879
+ false,
880
+ false,
881
+ false,
882
+ true,
883
+ true,
884
+ false,
885
+ false,
886
+ false,
887
+ false,
888
+ true,
889
+ true,
890
+ true,
891
+ false,
892
+ false,
893
+ true,
894
+ false,
895
+ false,
896
+ false,
897
+ false,
898
+ true,
899
+ true,
900
+ false,
901
+ false,
902
+ false,
903
+ true,
904
+ false,
905
+ false,
906
+ false,
907
+ false,
908
+ false,
909
+ false,
910
+ false,
911
+ false,
912
+ false
913
+ ],
914
+ "model.layers.6.self_attn.v_proj.lora_E": [
915
+ false,
916
+ true,
917
+ true,
918
+ true,
919
+ true,
920
+ true,
921
+ true,
922
+ true,
923
+ true,
924
+ true,
925
+ false,
926
+ true,
927
+ true,
928
+ true,
929
+ true,
930
+ true,
931
+ false,
932
+ true,
933
+ true,
934
+ true,
935
+ false,
936
+ false,
937
+ false,
938
+ false,
939
+ true,
940
+ true,
941
+ false,
942
+ false,
943
+ false,
944
+ false,
945
+ true,
946
+ true,
947
+ false,
948
+ true,
949
+ true,
950
+ true,
951
+ false,
952
+ true,
953
+ true,
954
+ true,
955
+ false,
956
+ true,
957
+ true,
958
+ true,
959
+ true,
960
+ false,
961
+ false,
962
+ false,
963
+ true,
964
+ true,
965
+ false,
966
+ false,
967
+ true,
968
+ false,
969
+ true,
970
+ false,
971
+ true,
972
+ true,
973
+ false,
974
+ true,
975
+ false,
976
+ true,
977
+ false,
978
+ true
979
+ ],
980
+ "model.layers.7.self_attn.q_proj.lora_E": [
981
+ false,
982
+ false,
983
+ false,
984
+ false,
985
+ false,
986
+ true,
987
+ false,
988
+ false,
989
+ false,
990
+ false,
991
+ false,
992
+ false,
993
+ false,
994
+ false,
995
+ true,
996
+ false,
997
+ false,
998
+ false,
999
+ false,
1000
+ false,
1001
+ false,
1002
+ false,
1003
+ false,
1004
+ false,
1005
+ false,
1006
+ false,
1007
+ false,
1008
+ false,
1009
+ false,
1010
+ false,
1011
+ true,
1012
+ false,
1013
+ false,
1014
+ false,
1015
+ false,
1016
+ false,
1017
+ false,
1018
+ false,
1019
+ false,
1020
+ false,
1021
+ false,
1022
+ false,
1023
+ false,
1024
+ false,
1025
+ false,
1026
+ false,
1027
+ false,
1028
+ false,
1029
+ false,
1030
+ false,
1031
+ true,
1032
+ false,
1033
+ false,
1034
+ false,
1035
+ false,
1036
+ false,
1037
+ false,
1038
+ false,
1039
+ false,
1040
+ false,
1041
+ false,
1042
+ false,
1043
+ false,
1044
+ false
1045
+ ],
1046
+ "model.layers.7.self_attn.v_proj.lora_E": [
1047
+ false,
1048
+ false,
1049
+ true,
1050
+ true,
1051
+ false,
1052
+ true,
1053
+ true,
1054
+ true,
1055
+ true,
1056
+ false,
1057
+ true,
1058
+ true,
1059
+ true,
1060
+ true,
1061
+ true,
1062
+ true,
1063
+ true,
1064
+ true,
1065
+ true,
1066
+ true,
1067
+ false,
1068
+ false,
1069
+ true,
1070
+ true,
1071
+ true,
1072
+ true,
1073
+ true,
1074
+ false,
1075
+ true,
1076
+ false,
1077
+ false,
1078
+ true,
1079
+ true,
1080
+ true,
1081
+ true,
1082
+ false,
1083
+ false,
1084
+ false,
1085
+ true,
1086
+ false,
1087
+ false,
1088
+ true,
1089
+ true,
1090
+ true,
1091
+ false,
1092
+ true,
1093
+ true,
1094
+ true,
1095
+ true,
1096
+ true,
1097
+ true,
1098
+ false,
1099
+ true,
1100
+ true,
1101
+ true,
1102
+ true,
1103
+ true,
1104
+ false,
1105
+ false,
1106
+ false,
1107
+ true,
1108
+ true,
1109
+ true,
1110
+ true
1111
+ ],
1112
+ "model.layers.8.self_attn.q_proj.lora_E": [
1113
+ false,
1114
+ true,
1115
+ false,
1116
+ false,
1117
+ false,
1118
+ false,
1119
+ false,
1120
+ true,
1121
+ false,
1122
+ false,
1123
+ false,
1124
+ false,
1125
+ true,
1126
+ true,
1127
+ false,
1128
+ false,
1129
+ false,
1130
+ false,
1131
+ false,
1132
+ false,
1133
+ false,
1134
+ false,
1135
+ false,
1136
+ false,
1137
+ false,
1138
+ false,
1139
+ false,
1140
+ false,
1141
+ false,
1142
+ false,
1143
+ false,
1144
+ false,
1145
+ false,
1146
+ true,
1147
+ false,
1148
+ false,
1149
+ false,
1150
+ false,
1151
+ true,
1152
+ false,
1153
+ false,
1154
+ true,
1155
+ false,
1156
+ false,
1157
+ false,
1158
+ false,
1159
+ true,
1160
+ false,
1161
+ false,
1162
+ false,
1163
+ false,
1164
+ false,
1165
+ true,
1166
+ false,
1167
+ false,
1168
+ false,
1169
+ false,
1170
+ false,
1171
+ false,
1172
+ false,
1173
+ false,
1174
+ false,
1175
+ false,
1176
+ true
1177
+ ],
1178
+ "model.layers.8.self_attn.v_proj.lora_E": [
1179
+ false,
1180
+ true,
1181
+ false,
1182
+ false,
1183
+ false,
1184
+ true,
1185
+ false,
1186
+ false,
1187
+ false,
1188
+ false,
1189
+ true,
1190
+ true,
1191
+ true,
1192
+ true,
1193
+ true,
1194
+ false,
1195
+ false,
1196
+ true,
1197
+ true,
1198
+ true,
1199
+ false,
1200
+ true,
1201
+ true,
1202
+ true,
1203
+ true,
1204
+ true,
1205
+ false,
1206
+ true,
1207
+ true,
1208
+ false,
1209
+ false,
1210
+ true,
1211
+ true,
1212
+ false,
1213
+ false,
1214
+ true,
1215
+ false,
1216
+ true,
1217
+ false,
1218
+ true,
1219
+ true,
1220
+ false,
1221
+ true,
1222
+ false,
1223
+ true,
1224
+ true,
1225
+ true,
1226
+ false,
1227
+ true,
1228
+ false,
1229
+ false,
1230
+ true,
1231
+ true,
1232
+ true,
1233
+ false,
1234
+ true,
1235
+ true,
1236
+ true,
1237
+ true,
1238
+ false,
1239
+ false,
1240
+ false,
1241
+ false,
1242
+ true
1243
+ ],
1244
+ "model.layers.9.self_attn.q_proj.lora_E": [
1245
+ true,
1246
+ false,
1247
+ true,
1248
+ true,
1249
+ false,
1250
+ false,
1251
+ true,
1252
+ true,
1253
+ false,
1254
+ false,
1255
+ true,
1256
+ false,
1257
+ false,
1258
+ false,
1259
+ false,
1260
+ true,
1261
+ false,
1262
+ true,
1263
+ false,
1264
+ true,
1265
+ false,
1266
+ false,
1267
+ false,
1268
+ true,
1269
+ false,
1270
+ true,
1271
+ false,
1272
+ true,
1273
+ false,
1274
+ true,
1275
+ false,
1276
+ true,
1277
+ true,
1278
+ false,
1279
+ false,
1280
+ true,
1281
+ true,
1282
+ false,
1283
+ false,
1284
+ false,
1285
+ false,
1286
+ true,
1287
+ true,
1288
+ true,
1289
+ false,
1290
+ false,
1291
+ false,
1292
+ false,
1293
+ true,
1294
+ true,
1295
+ true,
1296
+ false,
1297
+ false,
1298
+ false,
1299
+ false,
1300
+ false,
1301
+ true,
1302
+ false,
1303
+ true,
1304
+ false,
1305
+ false,
1306
+ true,
1307
+ false,
1308
+ true
1309
+ ],
1310
+ "model.layers.9.self_attn.v_proj.lora_E": [
1311
+ true,
1312
+ true,
1313
+ false,
1314
+ true,
1315
+ true,
1316
+ true,
1317
+ true,
1318
+ true,
1319
+ true,
1320
+ false,
1321
+ true,
1322
+ true,
1323
+ true,
1324
+ true,
1325
+ true,
1326
+ false,
1327
+ true,
1328
+ true,
1329
+ true,
1330
+ true,
1331
+ true,
1332
+ true,
1333
+ true,
1334
+ true,
1335
+ false,
1336
+ true,
1337
+ true,
1338
+ true,
1339
+ true,
1340
+ true,
1341
+ true,
1342
+ true,
1343
+ false,
1344
+ true,
1345
+ true,
1346
+ false,
1347
+ true,
1348
+ true,
1349
+ true,
1350
+ true,
1351
+ true,
1352
+ true,
1353
+ false,
1354
+ false,
1355
+ true,
1356
+ false,
1357
+ true,
1358
+ false,
1359
+ true,
1360
+ true,
1361
+ true,
1362
+ true,
1363
+ true,
1364
+ true,
1365
+ true,
1366
+ true,
1367
+ true,
1368
+ true,
1369
+ true,
1370
+ true,
1371
+ true,
1372
+ true,
1373
+ false,
1374
+ false
1375
+ ],
1376
+ "model.layers.10.self_attn.q_proj.lora_E": [
1377
+ false,
1378
+ false,
1379
+ false,
1380
+ false,
1381
+ false,
1382
+ false,
1383
+ false,
1384
+ false,
1385
+ false,
1386
+ false,
1387
+ true,
1388
+ false,
1389
+ false,
1390
+ false,
1391
+ false,
1392
+ false,
1393
+ false,
1394
+ false,
1395
+ false,
1396
+ false,
1397
+ false,
1398
+ false,
1399
+ false,
1400
+ false,
1401
+ false,
1402
+ false,
1403
+ false,
1404
+ false,
1405
+ false,
1406
+ false,
1407
+ false,
1408
+ false,
1409
+ false,
1410
+ false,
1411
+ false,
1412
+ false,
1413
+ false,
1414
+ false,
1415
+ false,
1416
+ true,
1417
+ false,
1418
+ false,
1419
+ false,
1420
+ false,
1421
+ false,
1422
+ false,
1423
+ false,
1424
+ false,
1425
+ false,
1426
+ false,
1427
+ true,
1428
+ false,
1429
+ false,
1430
+ false,
1431
+ false,
1432
+ false,
1433
+ false,
1434
+ false,
1435
+ false,
1436
+ false,
1437
+ false,
1438
+ false,
1439
+ false,
1440
+ false
1441
+ ],
1442
+ "model.layers.10.self_attn.v_proj.lora_E": [
1443
+ true,
1444
+ true,
1445
+ false,
1446
+ false,
1447
+ false,
1448
+ true,
1449
+ true,
1450
+ false,
1451
+ false,
1452
+ true,
1453
+ true,
1454
+ true,
1455
+ true,
1456
+ true,
1457
+ true,
1458
+ true,
1459
+ true,
1460
+ true,
1461
+ true,
1462
+ true,
1463
+ true,
1464
+ true,
1465
+ true,
1466
+ true,
1467
+ true,
1468
+ false,
1469
+ false,
1470
+ false,
1471
+ false,
1472
+ true,
1473
+ true,
1474
+ true,
1475
+ true,
1476
+ false,
1477
+ false,
1478
+ false,
1479
+ false,
1480
+ true,
1481
+ false,
1482
+ false,
1483
+ false,
1484
+ true,
1485
+ true,
1486
+ true,
1487
+ false,
1488
+ true,
1489
+ true,
1490
+ false,
1491
+ true,
1492
+ false,
1493
+ false,
1494
+ true,
1495
+ true,
1496
+ false,
1497
+ false,
1498
+ true,
1499
+ true,
1500
+ true,
1501
+ true,
1502
+ true,
1503
+ true,
1504
+ true,
1505
+ false,
1506
+ true
1507
+ ],
1508
+ "model.layers.11.self_attn.q_proj.lora_E": [
1509
+ true,
1510
+ false,
1511
+ false,
1512
+ false,
1513
+ false,
1514
+ true,
1515
+ false,
1516
+ false,
1517
+ false,
1518
+ true,
1519
+ true,
1520
+ false,
1521
+ true,
1522
+ false,
1523
+ false,
1524
+ false,
1525
+ false,
1526
+ false,
1527
+ false,
1528
+ false,
1529
+ false,
1530
+ true,
1531
+ false,
1532
+ false,
1533
+ true,
1534
+ false,
1535
+ false,
1536
+ false,
1537
+ false,
1538
+ false,
1539
+ false,
1540
+ false,
1541
+ false,
1542
+ false,
1543
+ false,
1544
+ false,
1545
+ true,
1546
+ false,
1547
+ false,
1548
+ false,
1549
+ true,
1550
+ true,
1551
+ true,
1552
+ false,
1553
+ true,
1554
+ false,
1555
+ false,
1556
+ false,
1557
+ true,
1558
+ true,
1559
+ false,
1560
+ false,
1561
+ false,
1562
+ false,
1563
+ true,
1564
+ true,
1565
+ false,
1566
+ true,
1567
+ false,
1568
+ true,
1569
+ true,
1570
+ false,
1571
+ false,
1572
+ false
1573
+ ],
1574
+ "model.layers.11.self_attn.v_proj.lora_E": [
1575
+ false,
1576
+ true,
1577
+ true,
1578
+ true,
1579
+ false,
1580
+ true,
1581
+ true,
1582
+ false,
1583
+ true,
1584
+ false,
1585
+ true,
1586
+ true,
1587
+ true,
1588
+ true,
1589
+ true,
1590
+ true,
1591
+ true,
1592
+ false,
1593
+ false,
1594
+ true,
1595
+ false,
1596
+ true,
1597
+ true,
1598
+ true,
1599
+ true,
1600
+ false,
1601
+ false,
1602
+ true,
1603
+ true,
1604
+ true,
1605
+ true,
1606
+ false,
1607
+ true,
1608
+ true,
1609
+ true,
1610
+ true,
1611
+ true,
1612
+ true,
1613
+ true,
1614
+ true,
1615
+ true,
1616
+ false,
1617
+ false,
1618
+ false,
1619
+ false,
1620
+ false,
1621
+ true,
1622
+ false,
1623
+ false,
1624
+ true,
1625
+ true,
1626
+ false,
1627
+ false,
1628
+ true,
1629
+ true,
1630
+ true,
1631
+ true,
1632
+ true,
1633
+ false,
1634
+ true,
1635
+ true,
1636
+ true,
1637
+ false,
1638
+ false
1639
+ ],
1640
+ "model.layers.12.self_attn.q_proj.lora_E": [
1641
+ false,
1642
+ false,
1643
+ false,
1644
+ false,
1645
+ false,
1646
+ false,
1647
+ false,
1648
+ false,
1649
+ false,
1650
+ false,
1651
+ false,
1652
+ false,
1653
+ false,
1654
+ true,
1655
+ false,
1656
+ true,
1657
+ false,
1658
+ false,
1659
+ false,
1660
+ false,
1661
+ false,
1662
+ true,
1663
+ false,
1664
+ false,
1665
+ false,
1666
+ false,
1667
+ false,
1668
+ false,
1669
+ true,
1670
+ false,
1671
+ false,
1672
+ false,
1673
+ false,
1674
+ false,
1675
+ false,
1676
+ false,
1677
+ false,
1678
+ true,
1679
+ true,
1680
+ false,
1681
+ true,
1682
+ false,
1683
+ false,
1684
+ false,
1685
+ false,
1686
+ true,
1687
+ false,
1688
+ true,
1689
+ false,
1690
+ false,
1691
+ true,
1692
+ false,
1693
+ true,
1694
+ false,
1695
+ false,
1696
+ true,
1697
+ false,
1698
+ false,
1699
+ false,
1700
+ false,
1701
+ false,
1702
+ false,
1703
+ false,
1704
+ false
1705
+ ],
1706
+ "model.layers.12.self_attn.v_proj.lora_E": [
1707
+ true,
1708
+ true,
1709
+ true,
1710
+ true,
1711
+ false,
1712
+ true,
1713
+ true,
1714
+ false,
1715
+ true,
1716
+ true,
1717
+ true,
1718
+ true,
1719
+ true,
1720
+ true,
1721
+ true,
1722
+ true,
1723
+ true,
1724
+ false,
1725
+ false,
1726
+ false,
1727
+ false,
1728
+ true,
1729
+ true,
1730
+ false,
1731
+ false,
1732
+ true,
1733
+ true,
1734
+ true,
1735
+ true,
1736
+ true,
1737
+ true,
1738
+ true,
1739
+ true,
1740
+ true,
1741
+ true,
1742
+ false,
1743
+ false,
1744
+ true,
1745
+ false,
1746
+ true,
1747
+ true,
1748
+ true,
1749
+ true,
1750
+ true,
1751
+ false,
1752
+ false,
1753
+ true,
1754
+ false,
1755
+ true,
1756
+ true,
1757
+ true,
1758
+ true,
1759
+ false,
1760
+ true,
1761
+ false,
1762
+ true,
1763
+ false,
1764
+ true,
1765
+ false,
1766
+ true,
1767
+ false,
1768
+ true,
1769
+ true,
1770
+ false
1771
+ ],
1772
+ "model.layers.13.self_attn.q_proj.lora_E": [
1773
+ true,
1774
+ true,
1775
+ false,
1776
+ true,
1777
+ true,
1778
+ true,
1779
+ false,
1780
+ false,
1781
+ true,
1782
+ true,
1783
+ false,
1784
+ true,
1785
+ false,
1786
+ true,
1787
+ false,
1788
+ true,
1789
+ false,
1790
+ false,
1791
+ true,
1792
+ true,
1793
+ false,
1794
+ true,
1795
+ false,
1796
+ true,
1797
+ true,
1798
+ true,
1799
+ true,
1800
+ false,
1801
+ false,
1802
+ true,
1803
+ true,
1804
+ false,
1805
+ false,
1806
+ true,
1807
+ false,
1808
+ true,
1809
+ false,
1810
+ true,
1811
+ true,
1812
+ true,
1813
+ false,
1814
+ false,
1815
+ false,
1816
+ false,
1817
+ true,
1818
+ true,
1819
+ true,
1820
+ true,
1821
+ false,
1822
+ true,
1823
+ false,
1824
+ true,
1825
+ true,
1826
+ true,
1827
+ false,
1828
+ true,
1829
+ false,
1830
+ true,
1831
+ true,
1832
+ false,
1833
+ false,
1834
+ false,
1835
+ true,
1836
+ false
1837
+ ],
1838
+ "model.layers.13.self_attn.v_proj.lora_E": [
1839
+ true,
1840
+ false,
1841
+ true,
1842
+ true,
1843
+ true,
1844
+ false,
1845
+ true,
1846
+ true,
1847
+ true,
1848
+ false,
1849
+ true,
1850
+ true,
1851
+ true,
1852
+ false,
1853
+ true,
1854
+ false,
1855
+ true,
1856
+ false,
1857
+ true,
1858
+ true,
1859
+ true,
1860
+ true,
1861
+ true,
1862
+ true,
1863
+ true,
1864
+ true,
1865
+ true,
1866
+ false,
1867
+ true,
1868
+ true,
1869
+ false,
1870
+ false,
1871
+ true,
1872
+ true,
1873
+ false,
1874
+ false,
1875
+ true,
1876
+ false,
1877
+ false,
1878
+ true,
1879
+ false,
1880
+ false,
1881
+ true,
1882
+ true,
1883
+ true,
1884
+ true,
1885
+ true,
1886
+ true,
1887
+ true,
1888
+ false,
1889
+ true,
1890
+ false,
1891
+ false,
1892
+ true,
1893
+ true,
1894
+ true,
1895
+ false,
1896
+ true,
1897
+ true,
1898
+ false,
1899
+ true,
1900
+ true,
1901
+ true,
1902
+ true
1903
+ ],
1904
+ "model.layers.14.self_attn.q_proj.lora_E": [
1905
+ false,
1906
+ true,
1907
+ false,
1908
+ true,
1909
+ true,
1910
+ false,
1911
+ false,
1912
+ false,
1913
+ true,
1914
+ false,
1915
+ false,
1916
+ true,
1917
+ false,
1918
+ false,
1919
+ true,
1920
+ true,
1921
+ false,
1922
+ true,
1923
+ true,
1924
+ true,
1925
+ false,
1926
+ false,
1927
+ false,
1928
+ true,
1929
+ false,
1930
+ true,
1931
+ false,
1932
+ true,
1933
+ false,
1934
+ false,
1935
+ true,
1936
+ true,
1937
+ true,
1938
+ true,
1939
+ true,
1940
+ false,
1941
+ false,
1942
+ true,
1943
+ true,
1944
+ false,
1945
+ true,
1946
+ true,
1947
+ false,
1948
+ false,
1949
+ true,
1950
+ false,
1951
+ false,
1952
+ false,
1953
+ true,
1954
+ false,
1955
+ true,
1956
+ true,
1957
+ true,
1958
+ false,
1959
+ true,
1960
+ true,
1961
+ true,
1962
+ false,
1963
+ false,
1964
+ true,
1965
+ false,
1966
+ true,
1967
+ true,
1968
+ false
1969
+ ],
1970
+ "model.layers.14.self_attn.v_proj.lora_E": [
1971
+ true,
1972
+ true,
1973
+ true,
1974
+ false,
1975
+ false,
1976
+ false,
1977
+ true,
1978
+ false,
1979
+ false,
1980
+ false,
1981
+ false,
1982
+ true,
1983
+ true,
1984
+ false,
1985
+ false,
1986
+ true,
1987
+ false,
1988
+ true,
1989
+ true,
1990
+ true,
1991
+ false,
1992
+ true,
1993
+ false,
1994
+ false,
1995
+ true,
1996
+ false,
1997
+ true,
1998
+ false,
1999
+ true,
2000
+ true,
2001
+ false,
2002
+ true,
2003
+ false,
2004
+ true,
2005
+ false,
2006
+ false,
2007
+ true,
2008
+ false,
2009
+ false,
2010
+ true,
2011
+ false,
2012
+ true,
2013
+ true,
2014
+ false,
2015
+ true,
2016
+ false,
2017
+ true,
2018
+ false,
2019
+ true,
2020
+ true,
2021
+ true,
2022
+ true,
2023
+ true,
2024
+ true,
2025
+ false,
2026
+ false,
2027
+ true,
2028
+ true,
2029
+ false,
2030
+ true,
2031
+ true,
2032
+ true,
2033
+ true,
2034
+ false
2035
+ ],
2036
+ "model.layers.15.self_attn.q_proj.lora_E": [
2037
+ false,
2038
+ true,
2039
+ true,
2040
+ true,
2041
+ true,
2042
+ true,
2043
+ false,
2044
+ true,
2045
+ false,
2046
+ true,
2047
+ false,
2048
+ true,
2049
+ false,
2050
+ true,
2051
+ true,
2052
+ true,
2053
+ true,
2054
+ true,
2055
+ true,
2056
+ false,
2057
+ true,
2058
+ true,
2059
+ false,
2060
+ true,
2061
+ false,
2062
+ true,
2063
+ false,
2064
+ true,
2065
+ true,
2066
+ true,
2067
+ false,
2068
+ true,
2069
+ false,
2070
+ false,
2071
+ false,
2072
+ true,
2073
+ true,
2074
+ true,
2075
+ true,
2076
+ false,
2077
+ true,
2078
+ true,
2079
+ false,
2080
+ true,
2081
+ false,
2082
+ true,
2083
+ false,
2084
+ false,
2085
+ true,
2086
+ true,
2087
+ false,
2088
+ true,
2089
+ false,
2090
+ true,
2091
+ false,
2092
+ true,
2093
+ true,
2094
+ true,
2095
+ true,
2096
+ true,
2097
+ false,
2098
+ true,
2099
+ true,
2100
+ true
2101
+ ],
2102
+ "model.layers.15.self_attn.v_proj.lora_E": [
2103
+ true,
2104
+ true,
2105
+ true,
2106
+ true,
2107
+ false,
2108
+ true,
2109
+ true,
2110
+ true,
2111
+ true,
2112
+ false,
2113
+ true,
2114
+ true,
2115
+ false,
2116
+ true,
2117
+ true,
2118
+ false,
2119
+ false,
2120
+ true,
2121
+ false,
2122
+ true,
2123
+ true,
2124
+ true,
2125
+ true,
2126
+ true,
2127
+ false,
2128
+ true,
2129
+ true,
2130
+ true,
2131
+ true,
2132
+ true,
2133
+ false,
2134
+ false,
2135
+ false,
2136
+ true,
2137
+ true,
2138
+ true,
2139
+ true,
2140
+ true,
2141
+ true,
2142
+ true,
2143
+ true,
2144
+ true,
2145
+ true,
2146
+ false,
2147
+ true,
2148
+ true,
2149
+ true,
2150
+ true,
2151
+ true,
2152
+ false,
2153
+ true,
2154
+ true,
2155
+ true,
2156
+ true,
2157
+ false,
2158
+ false,
2159
+ false,
2160
+ true,
2161
+ true,
2162
+ true,
2163
+ true,
2164
+ true,
2165
+ true,
2166
+ true
2167
+ ],
2168
+ "model.layers.16.self_attn.q_proj.lora_E": [
2169
+ false,
2170
+ false,
2171
+ false,
2172
+ false,
2173
+ false,
2174
+ false,
2175
+ false,
2176
+ true,
2177
+ true,
2178
+ true,
2179
+ true,
2180
+ false,
2181
+ true,
2182
+ false,
2183
+ true,
2184
+ true,
2185
+ true,
2186
+ false,
2187
+ true,
2188
+ false,
2189
+ true,
2190
+ true,
2191
+ true,
2192
+ false,
2193
+ false,
2194
+ false,
2195
+ true,
2196
+ false,
2197
+ false,
2198
+ false,
2199
+ true,
2200
+ false,
2201
+ true,
2202
+ true,
2203
+ true,
2204
+ true,
2205
+ false,
2206
+ true,
2207
+ true,
2208
+ false,
2209
+ true,
2210
+ true,
2211
+ false,
2212
+ true,
2213
+ true,
2214
+ true,
2215
+ true,
2216
+ false,
2217
+ true,
2218
+ true,
2219
+ false,
2220
+ true,
2221
+ true,
2222
+ false,
2223
+ true,
2224
+ false,
2225
+ false,
2226
+ true,
2227
+ true,
2228
+ true,
2229
+ false,
2230
+ false,
2231
+ true,
2232
+ true
2233
+ ],
2234
+ "model.layers.16.self_attn.v_proj.lora_E": [
2235
+ true,
2236
+ false,
2237
+ true,
2238
+ false,
2239
+ true,
2240
+ false,
2241
+ true,
2242
+ false,
2243
+ false,
2244
+ true,
2245
+ false,
2246
+ true,
2247
+ true,
2248
+ true,
2249
+ true,
2250
+ true,
2251
+ true,
2252
+ false,
2253
+ true,
2254
+ false,
2255
+ true,
2256
+ true,
2257
+ true,
2258
+ false,
2259
+ true,
2260
+ true,
2261
+ false,
2262
+ false,
2263
+ false,
2264
+ true,
2265
+ true,
2266
+ true,
2267
+ false,
2268
+ true,
2269
+ false,
2270
+ false,
2271
+ true,
2272
+ false,
2273
+ false,
2274
+ false,
2275
+ true,
2276
+ false,
2277
+ true,
2278
+ false,
2279
+ true,
2280
+ true,
2281
+ true,
2282
+ false,
2283
+ true,
2284
+ true,
2285
+ false,
2286
+ false,
2287
+ true,
2288
+ true,
2289
+ true,
2290
+ false,
2291
+ true,
2292
+ true,
2293
+ false,
2294
+ true,
2295
+ false,
2296
+ true,
2297
+ false,
2298
+ false
2299
+ ],
2300
+ "model.layers.17.self_attn.q_proj.lora_E": [
2301
+ true,
2302
+ true,
2303
+ true,
2304
+ true,
2305
+ false,
2306
+ true,
2307
+ false,
2308
+ true,
2309
+ false,
2310
+ false,
2311
+ true,
2312
+ true,
2313
+ true,
2314
+ false,
2315
+ true,
2316
+ false,
2317
+ true,
2318
+ true,
2319
+ true,
2320
+ true,
2321
+ true,
2322
+ true,
2323
+ true,
2324
+ true,
2325
+ true,
2326
+ true,
2327
+ true,
2328
+ true,
2329
+ true,
2330
+ true,
2331
+ true,
2332
+ true,
2333
+ false,
2334
+ false,
2335
+ true,
2336
+ true,
2337
+ false,
2338
+ true,
2339
+ true,
2340
+ true,
2341
+ true,
2342
+ false,
2343
+ true,
2344
+ true,
2345
+ false,
2346
+ true,
2347
+ true,
2348
+ true,
2349
+ false,
2350
+ true,
2351
+ false,
2352
+ true,
2353
+ true,
2354
+ true,
2355
+ true,
2356
+ false,
2357
+ true,
2358
+ true,
2359
+ true,
2360
+ true,
2361
+ true,
2362
+ true,
2363
+ true,
2364
+ true
2365
+ ],
2366
+ "model.layers.17.self_attn.v_proj.lora_E": [
2367
+ false,
2368
+ true,
2369
+ true,
2370
+ true,
2371
+ true,
2372
+ true,
2373
+ true,
2374
+ true,
2375
+ true,
2376
+ true,
2377
+ false,
2378
+ true,
2379
+ true,
2380
+ false,
2381
+ false,
2382
+ false,
2383
+ true,
2384
+ true,
2385
+ false,
2386
+ true,
2387
+ true,
2388
+ false,
2389
+ false,
2390
+ true,
2391
+ true,
2392
+ false,
2393
+ false,
2394
+ false,
2395
+ false,
2396
+ true,
2397
+ true,
2398
+ true,
2399
+ false,
2400
+ true,
2401
+ false,
2402
+ true,
2403
+ false,
2404
+ true,
2405
+ true,
2406
+ true,
2407
+ true,
2408
+ false,
2409
+ true,
2410
+ true,
2411
+ true,
2412
+ true,
2413
+ true,
2414
+ true,
2415
+ false,
2416
+ true,
2417
+ true,
2418
+ true,
2419
+ true,
2420
+ true,
2421
+ true,
2422
+ true,
2423
+ true,
2424
+ true,
2425
+ false,
2426
+ false,
2427
+ true,
2428
+ true,
2429
+ true,
2430
+ true
2431
+ ],
2432
+ "model.layers.18.self_attn.q_proj.lora_E": [
2433
+ false,
2434
+ true,
2435
+ false,
2436
+ true,
2437
+ false,
2438
+ true,
2439
+ false,
2440
+ true,
2441
+ true,
2442
+ true,
2443
+ false,
2444
+ true,
2445
+ true,
2446
+ true,
2447
+ false,
2448
+ true,
2449
+ true,
2450
+ false,
2451
+ true,
2452
+ false,
2453
+ false,
2454
+ false,
2455
+ true,
2456
+ true,
2457
+ false,
2458
+ true,
2459
+ true,
2460
+ true,
2461
+ false,
2462
+ true,
2463
+ true,
2464
+ true,
2465
+ true,
2466
+ true,
2467
+ true,
2468
+ false,
2469
+ true,
2470
+ true,
2471
+ true,
2472
+ true,
2473
+ true,
2474
+ true,
2475
+ false,
2476
+ false,
2477
+ true,
2478
+ true,
2479
+ true,
2480
+ true,
2481
+ true,
2482
+ false,
2483
+ true,
2484
+ false,
2485
+ false,
2486
+ false,
2487
+ false,
2488
+ true,
2489
+ false,
2490
+ false,
2491
+ true,
2492
+ false,
2493
+ true,
2494
+ false,
2495
+ true,
2496
+ true
2497
+ ],
2498
+ "model.layers.18.self_attn.v_proj.lora_E": [
2499
+ true,
2500
+ true,
2501
+ true,
2502
+ true,
2503
+ true,
2504
+ true,
2505
+ true,
2506
+ true,
2507
+ false,
2508
+ true,
2509
+ true,
2510
+ false,
2511
+ false,
2512
+ false,
2513
+ true,
2514
+ false,
2515
+ true,
2516
+ true,
2517
+ true,
2518
+ true,
2519
+ true,
2520
+ true,
2521
+ true,
2522
+ true,
2523
+ true,
2524
+ true,
2525
+ true,
2526
+ true,
2527
+ true,
2528
+ false,
2529
+ true,
2530
+ true,
2531
+ true,
2532
+ false,
2533
+ true,
2534
+ true,
2535
+ true,
2536
+ true,
2537
+ true,
2538
+ true,
2539
+ true,
2540
+ true,
2541
+ false,
2542
+ true,
2543
+ true,
2544
+ false,
2545
+ true,
2546
+ true,
2547
+ true,
2548
+ true,
2549
+ true,
2550
+ true,
2551
+ true,
2552
+ true,
2553
+ false,
2554
+ true,
2555
+ true,
2556
+ true,
2557
+ true,
2558
+ true,
2559
+ true,
2560
+ false,
2561
+ false,
2562
+ true
2563
+ ],
2564
+ "model.layers.19.self_attn.q_proj.lora_E": [
2565
+ false,
2566
+ true,
2567
+ false,
2568
+ true,
2569
+ false,
2570
+ true,
2571
+ false,
2572
+ true,
2573
+ true,
2574
+ true,
2575
+ true,
2576
+ true,
2577
+ true,
2578
+ true,
2579
+ true,
2580
+ true,
2581
+ true,
2582
+ true,
2583
+ true,
2584
+ true,
2585
+ false,
2586
+ true,
2587
+ true,
2588
+ true,
2589
+ true,
2590
+ true,
2591
+ true,
2592
+ false,
2593
+ false,
2594
+ true,
2595
+ true,
2596
+ true,
2597
+ true,
2598
+ false,
2599
+ true,
2600
+ false,
2601
+ true,
2602
+ false,
2603
+ true,
2604
+ false,
2605
+ false,
2606
+ false,
2607
+ true,
2608
+ true,
2609
+ true,
2610
+ true,
2611
+ true,
2612
+ false,
2613
+ false,
2614
+ false,
2615
+ false,
2616
+ true,
2617
+ true,
2618
+ true,
2619
+ true,
2620
+ true,
2621
+ true,
2622
+ false,
2623
+ false,
2624
+ true,
2625
+ true,
2626
+ false,
2627
+ true,
2628
+ true
2629
+ ],
2630
+ "model.layers.19.self_attn.v_proj.lora_E": [
2631
+ false,
2632
+ true,
2633
+ false,
2634
+ true,
2635
+ true,
2636
+ true,
2637
+ false,
2638
+ true,
2639
+ false,
2640
+ false,
2641
+ false,
2642
+ false,
2643
+ true,
2644
+ true,
2645
+ true,
2646
+ false,
2647
+ true,
2648
+ false,
2649
+ false,
2650
+ false,
2651
+ false,
2652
+ true,
2653
+ false,
2654
+ true,
2655
+ true,
2656
+ false,
2657
+ true,
2658
+ true,
2659
+ false,
2660
+ false,
2661
+ true,
2662
+ true,
2663
+ true,
2664
+ true,
2665
+ false,
2666
+ false,
2667
+ false,
2668
+ false,
2669
+ false,
2670
+ true,
2671
+ false,
2672
+ false,
2673
+ true,
2674
+ false,
2675
+ false,
2676
+ false,
2677
+ true,
2678
+ true,
2679
+ false,
2680
+ false,
2681
+ false,
2682
+ true,
2683
+ true,
2684
+ true,
2685
+ true,
2686
+ false,
2687
+ true,
2688
+ true,
2689
+ false,
2690
+ true,
2691
+ false,
2692
+ true,
2693
+ true,
2694
+ true
2695
+ ],
2696
+ "model.layers.20.self_attn.q_proj.lora_E": [
2697
+ false,
2698
+ true,
2699
+ false,
2700
+ false,
2701
+ false,
2702
+ false,
2703
+ true,
2704
+ false,
2705
+ false,
2706
+ false,
2707
+ false,
2708
+ false,
2709
+ false,
2710
+ false,
2711
+ false,
2712
+ true,
2713
+ false,
2714
+ false,
2715
+ false,
2716
+ false,
2717
+ false,
2718
+ false,
2719
+ false,
2720
+ false,
2721
+ false,
2722
+ false,
2723
+ false,
2724
+ true,
2725
+ false,
2726
+ false,
2727
+ false,
2728
+ false,
2729
+ false,
2730
+ true,
2731
+ false,
2732
+ true,
2733
+ true,
2734
+ false,
2735
+ false,
2736
+ false,
2737
+ false,
2738
+ true,
2739
+ false,
2740
+ true,
2741
+ false,
2742
+ true,
2743
+ false,
2744
+ false,
2745
+ false,
2746
+ false,
2747
+ true,
2748
+ true,
2749
+ false,
2750
+ false,
2751
+ true,
2752
+ true,
2753
+ false,
2754
+ false,
2755
+ false,
2756
+ false,
2757
+ false,
2758
+ true,
2759
+ false,
2760
+ false
2761
+ ],
2762
+ "model.layers.20.self_attn.v_proj.lora_E": [
2763
+ true,
2764
+ false,
2765
+ true,
2766
+ true,
2767
+ false,
2768
+ false,
2769
+ false,
2770
+ true,
2771
+ true,
2772
+ false,
2773
+ false,
2774
+ true,
2775
+ true,
2776
+ true,
2777
+ false,
2778
+ true,
2779
+ false,
2780
+ true,
2781
+ false,
2782
+ false,
2783
+ false,
2784
+ false,
2785
+ true,
2786
+ false,
2787
+ false,
2788
+ false,
2789
+ true,
2790
+ false,
2791
+ true,
2792
+ false,
2793
+ true,
2794
+ true,
2795
+ true,
2796
+ false,
2797
+ true,
2798
+ false,
2799
+ true,
2800
+ false,
2801
+ true,
2802
+ false,
2803
+ true,
2804
+ true,
2805
+ true,
2806
+ true,
2807
+ false,
2808
+ false,
2809
+ false,
2810
+ false,
2811
+ false,
2812
+ false,
2813
+ false,
2814
+ false,
2815
+ true,
2816
+ false,
2817
+ false,
2818
+ false,
2819
+ true,
2820
+ false,
2821
+ false,
2822
+ true,
2823
+ false,
2824
+ false,
2825
+ true,
2826
+ true
2827
+ ],
2828
+ "model.layers.21.self_attn.q_proj.lora_E": [
2829
+ false,
2830
+ false,
2831
+ true,
2832
+ false,
2833
+ true,
2834
+ true,
2835
+ true,
2836
+ true,
2837
+ true,
2838
+ false,
2839
+ true,
2840
+ true,
2841
+ true,
2842
+ true,
2843
+ true,
2844
+ true,
2845
+ false,
2846
+ false,
2847
+ false,
2848
+ false,
2849
+ true,
2850
+ true,
2851
+ false,
2852
+ true,
2853
+ true,
2854
+ true,
2855
+ true,
2856
+ false,
2857
+ false,
2858
+ false,
2859
+ false,
2860
+ false,
2861
+ false,
2862
+ false,
2863
+ true,
2864
+ true,
2865
+ true,
2866
+ false,
2867
+ true,
2868
+ false,
2869
+ true,
2870
+ false,
2871
+ true,
2872
+ false,
2873
+ false,
2874
+ false,
2875
+ true,
2876
+ false,
2877
+ true,
2878
+ true,
2879
+ true,
2880
+ true,
2881
+ true,
2882
+ false,
2883
+ false,
2884
+ true,
2885
+ true,
2886
+ false,
2887
+ true,
2888
+ true,
2889
+ false,
2890
+ false,
2891
+ true,
2892
+ true
2893
+ ],
2894
+ "model.layers.21.self_attn.v_proj.lora_E": [
2895
+ true,
2896
+ true,
2897
+ true,
2898
+ true,
2899
+ true,
2900
+ false,
2901
+ true,
2902
+ true,
2903
+ false,
2904
+ true,
2905
+ true,
2906
+ false,
2907
+ false,
2908
+ true,
2909
+ true,
2910
+ false,
2911
+ true,
2912
+ true,
2913
+ false,
2914
+ true,
2915
+ true,
2916
+ true,
2917
+ true,
2918
+ false,
2919
+ false,
2920
+ false,
2921
+ false,
2922
+ false,
2923
+ true,
2924
+ true,
2925
+ true,
2926
+ false,
2927
+ true,
2928
+ true,
2929
+ false,
2930
+ true,
2931
+ true,
2932
+ false,
2933
+ true,
2934
+ true,
2935
+ false,
2936
+ false,
2937
+ false,
2938
+ true,
2939
+ true,
2940
+ false,
2941
+ false,
2942
+ false,
2943
+ true,
2944
+ true,
2945
+ false,
2946
+ true,
2947
+ true,
2948
+ false,
2949
+ false,
2950
+ false,
2951
+ false,
2952
+ true,
2953
+ false,
2954
+ false,
2955
+ false,
2956
+ false,
2957
+ false,
2958
+ false
2959
+ ],
2960
+ "model.layers.22.self_attn.q_proj.lora_E": [
2961
+ false,
2962
+ true,
2963
+ false,
2964
+ true,
2965
+ true,
2966
+ false,
2967
+ true,
2968
+ false,
2969
+ false,
2970
+ true,
2971
+ false,
2972
+ false,
2973
+ false,
2974
+ false,
2975
+ false,
2976
+ false,
2977
+ true,
2978
+ false,
2979
+ true,
2980
+ true,
2981
+ false,
2982
+ false,
2983
+ false,
2984
+ false,
2985
+ true,
2986
+ true,
2987
+ true,
2988
+ false,
2989
+ false,
2990
+ true,
2991
+ false,
2992
+ false,
2993
+ false,
2994
+ false,
2995
+ false,
2996
+ true,
2997
+ false,
2998
+ false,
2999
+ false,
3000
+ false,
3001
+ true,
3002
+ false,
3003
+ false,
3004
+ false,
3005
+ true,
3006
+ false,
3007
+ true,
3008
+ false,
3009
+ false,
3010
+ false,
3011
+ true,
3012
+ false,
3013
+ true,
3014
+ true,
3015
+ true,
3016
+ false,
3017
+ false,
3018
+ true,
3019
+ false,
3020
+ false,
3021
+ true,
3022
+ true,
3023
+ false,
3024
+ true
3025
+ ],
3026
+ "model.layers.22.self_attn.v_proj.lora_E": [
3027
+ false,
3028
+ true,
3029
+ true,
3030
+ true,
3031
+ true,
3032
+ false,
3033
+ false,
3034
+ true,
3035
+ true,
3036
+ true,
3037
+ true,
3038
+ false,
3039
+ true,
3040
+ true,
3041
+ false,
3042
+ true,
3043
+ true,
3044
+ true,
3045
+ true,
3046
+ true,
3047
+ true,
3048
+ true,
3049
+ true,
3050
+ true,
3051
+ false,
3052
+ false,
3053
+ true,
3054
+ true,
3055
+ false,
3056
+ true,
3057
+ false,
3058
+ false,
3059
+ true,
3060
+ false,
3061
+ false,
3062
+ false,
3063
+ false,
3064
+ true,
3065
+ false,
3066
+ false,
3067
+ false,
3068
+ true,
3069
+ true,
3070
+ false,
3071
+ false,
3072
+ false,
3073
+ true,
3074
+ true,
3075
+ false,
3076
+ false,
3077
+ true,
3078
+ true,
3079
+ true,
3080
+ true,
3081
+ true,
3082
+ true,
3083
+ true,
3084
+ true,
3085
+ false,
3086
+ false,
3087
+ true,
3088
+ false,
3089
+ true,
3090
+ true
3091
+ ],
3092
+ "model.layers.23.self_attn.q_proj.lora_E": [
3093
+ true,
3094
+ false,
3095
+ true,
3096
+ false,
3097
+ true,
3098
+ true,
3099
+ true,
3100
+ true,
3101
+ true,
3102
+ false,
3103
+ true,
3104
+ true,
3105
+ true,
3106
+ true,
3107
+ true,
3108
+ true,
3109
+ true,
3110
+ true,
3111
+ false,
3112
+ false,
3113
+ true,
3114
+ true,
3115
+ true,
3116
+ true,
3117
+ true,
3118
+ false,
3119
+ true,
3120
+ true,
3121
+ true,
3122
+ true,
3123
+ true,
3124
+ true,
3125
+ true,
3126
+ true,
3127
+ true,
3128
+ true,
3129
+ true,
3130
+ true,
3131
+ true,
3132
+ true,
3133
+ false,
3134
+ true,
3135
+ true,
3136
+ false,
3137
+ true,
3138
+ false,
3139
+ true,
3140
+ false,
3141
+ true,
3142
+ true,
3143
+ true,
3144
+ true,
3145
+ true,
3146
+ true,
3147
+ true,
3148
+ false,
3149
+ true,
3150
+ false,
3151
+ true,
3152
+ true,
3153
+ true,
3154
+ false,
3155
+ true,
3156
+ true
3157
+ ],
3158
+ "model.layers.23.self_attn.v_proj.lora_E": [
3159
+ false,
3160
+ true,
3161
+ false,
3162
+ true,
3163
+ false,
3164
+ false,
3165
+ true,
3166
+ false,
3167
+ true,
3168
+ false,
3169
+ true,
3170
+ true,
3171
+ true,
3172
+ true,
3173
+ true,
3174
+ true,
3175
+ false,
3176
+ true,
3177
+ true,
3178
+ true,
3179
+ true,
3180
+ true,
3181
+ true,
3182
+ true,
3183
+ true,
3184
+ true,
3185
+ true,
3186
+ true,
3187
+ true,
3188
+ false,
3189
+ false,
3190
+ true,
3191
+ false,
3192
+ true,
3193
+ true,
3194
+ false,
3195
+ true,
3196
+ false,
3197
+ true,
3198
+ true,
3199
+ true,
3200
+ true,
3201
+ true,
3202
+ true,
3203
+ true,
3204
+ true,
3205
+ true,
3206
+ true,
3207
+ true,
3208
+ true,
3209
+ false,
3210
+ true,
3211
+ true,
3212
+ true,
3213
+ false,
3214
+ false,
3215
+ false,
3216
+ true,
3217
+ true,
3218
+ false,
3219
+ true,
3220
+ false,
3221
+ true,
3222
+ true
3223
+ ],
3224
+ "model.layers.24.self_attn.q_proj.lora_E": [
3225
+ true,
3226
+ true,
3227
+ true,
3228
+ true,
3229
+ true,
3230
+ true,
3231
+ false,
3232
+ true,
3233
+ true,
3234
+ true,
3235
+ true,
3236
+ false,
3237
+ true,
3238
+ true,
3239
+ true,
3240
+ false,
3241
+ true,
3242
+ true,
3243
+ true,
3244
+ true,
3245
+ true,
3246
+ true,
3247
+ false,
3248
+ false,
3249
+ true,
3250
+ true,
3251
+ true,
3252
+ true,
3253
+ false,
3254
+ true,
3255
+ true,
3256
+ true,
3257
+ true,
3258
+ true,
3259
+ true,
3260
+ true,
3261
+ true,
3262
+ true,
3263
+ false,
3264
+ true,
3265
+ true,
3266
+ true,
3267
+ false,
3268
+ true,
3269
+ true,
3270
+ true,
3271
+ true,
3272
+ false,
3273
+ true,
3274
+ false,
3275
+ true,
3276
+ true,
3277
+ true,
3278
+ true,
3279
+ false,
3280
+ false,
3281
+ false,
3282
+ true,
3283
+ true,
3284
+ true,
3285
+ true,
3286
+ false,
3287
+ false,
3288
+ true
3289
+ ],
3290
+ "model.layers.24.self_attn.v_proj.lora_E": [
3291
+ true,
3292
+ true,
3293
+ true,
3294
+ false,
3295
+ true,
3296
+ false,
3297
+ false,
3298
+ true,
3299
+ true,
3300
+ true,
3301
+ false,
3302
+ true,
3303
+ true,
3304
+ false,
3305
+ false,
3306
+ true,
3307
+ false,
3308
+ false,
3309
+ false,
3310
+ false,
3311
+ true,
3312
+ true,
3313
+ true,
3314
+ false,
3315
+ true,
3316
+ false,
3317
+ false,
3318
+ true,
3319
+ false,
3320
+ true,
3321
+ false,
3322
+ true,
3323
+ true,
3324
+ false,
3325
+ true,
3326
+ true,
3327
+ false,
3328
+ false,
3329
+ false,
3330
+ true,
3331
+ false,
3332
+ false,
3333
+ true,
3334
+ true,
3335
+ false,
3336
+ true,
3337
+ true,
3338
+ false,
3339
+ false,
3340
+ true,
3341
+ true,
3342
+ true,
3343
+ true,
3344
+ false,
3345
+ false,
3346
+ true,
3347
+ true,
3348
+ true,
3349
+ false,
3350
+ true,
3351
+ false,
3352
+ true,
3353
+ true,
3354
+ true
3355
+ ],
3356
+ "model.layers.25.self_attn.q_proj.lora_E": [
3357
+ false,
3358
+ false,
3359
+ false,
3360
+ false,
3361
+ true,
3362
+ true,
3363
+ false,
3364
+ true,
3365
+ true,
3366
+ false,
3367
+ false,
3368
+ false,
3369
+ false,
3370
+ false,
3371
+ false,
3372
+ false,
3373
+ true,
3374
+ false,
3375
+ false,
3376
+ true,
3377
+ false,
3378
+ false,
3379
+ true,
3380
+ false,
3381
+ false,
3382
+ false,
3383
+ true,
3384
+ false,
3385
+ false,
3386
+ false,
3387
+ false,
3388
+ false,
3389
+ false,
3390
+ false,
3391
+ false,
3392
+ false,
3393
+ false,
3394
+ false,
3395
+ false,
3396
+ false,
3397
+ false,
3398
+ false,
3399
+ false,
3400
+ false,
3401
+ true,
3402
+ false,
3403
+ false,
3404
+ false,
3405
+ false,
3406
+ false,
3407
+ true,
3408
+ false,
3409
+ false,
3410
+ false,
3411
+ true,
3412
+ true,
3413
+ true,
3414
+ false,
3415
+ false,
3416
+ false,
3417
+ false,
3418
+ false,
3419
+ false,
3420
+ false
3421
+ ],
3422
+ "model.layers.25.self_attn.v_proj.lora_E": [
3423
+ false,
3424
+ false,
3425
+ false,
3426
+ true,
3427
+ false,
3428
+ false,
3429
+ false,
3430
+ true,
3431
+ true,
3432
+ false,
3433
+ false,
3434
+ true,
3435
+ false,
3436
+ true,
3437
+ true,
3438
+ true,
3439
+ false,
3440
+ false,
3441
+ false,
3442
+ false,
3443
+ true,
3444
+ false,
3445
+ false,
3446
+ false,
3447
+ true,
3448
+ true,
3449
+ true,
3450
+ true,
3451
+ false,
3452
+ false,
3453
+ false,
3454
+ false,
3455
+ true,
3456
+ false,
3457
+ false,
3458
+ false,
3459
+ false,
3460
+ true,
3461
+ false,
3462
+ true,
3463
+ false,
3464
+ false,
3465
+ false,
3466
+ false,
3467
+ false,
3468
+ false,
3469
+ false,
3470
+ false,
3471
+ false,
3472
+ false,
3473
+ false,
3474
+ false,
3475
+ false,
3476
+ false,
3477
+ true,
3478
+ false,
3479
+ false,
3480
+ false,
3481
+ false,
3482
+ false,
3483
+ false,
3484
+ false,
3485
+ false,
3486
+ false
3487
+ ],
3488
+ "model.layers.26.self_attn.q_proj.lora_E": [
3489
+ true,
3490
+ false,
3491
+ false,
3492
+ true,
3493
+ false,
3494
+ false,
3495
+ false,
3496
+ false,
3497
+ false,
3498
+ false,
3499
+ true,
3500
+ false,
3501
+ true,
3502
+ false,
3503
+ true,
3504
+ true,
3505
+ true,
3506
+ false,
3507
+ false,
3508
+ true,
3509
+ true,
3510
+ true,
3511
+ false,
3512
+ false,
3513
+ true,
3514
+ true,
3515
+ false,
3516
+ false,
3517
+ true,
3518
+ false,
3519
+ true,
3520
+ true,
3521
+ false,
3522
+ false,
3523
+ false,
3524
+ true,
3525
+ true,
3526
+ false,
3527
+ false,
3528
+ false,
3529
+ true,
3530
+ false,
3531
+ false,
3532
+ false,
3533
+ true,
3534
+ true,
3535
+ false,
3536
+ false,
3537
+ true,
3538
+ false,
3539
+ true,
3540
+ true,
3541
+ false,
3542
+ true,
3543
+ false,
3544
+ false,
3545
+ true,
3546
+ true,
3547
+ true,
3548
+ false,
3549
+ true,
3550
+ true,
3551
+ true,
3552
+ true
3553
+ ],
3554
+ "model.layers.26.self_attn.v_proj.lora_E": [
3555
+ false,
3556
+ false,
3557
+ true,
3558
+ false,
3559
+ true,
3560
+ false,
3561
+ false,
3562
+ false,
3563
+ true,
3564
+ false,
3565
+ false,
3566
+ false,
3567
+ false,
3568
+ false,
3569
+ true,
3570
+ false,
3571
+ false,
3572
+ false,
3573
+ false,
3574
+ false,
3575
+ false,
3576
+ false,
3577
+ true,
3578
+ false,
3579
+ false,
3580
+ true,
3581
+ false,
3582
+ true,
3583
+ false,
3584
+ true,
3585
+ true,
3586
+ false,
3587
+ false,
3588
+ false,
3589
+ false,
3590
+ false,
3591
+ false,
3592
+ false,
3593
+ false,
3594
+ false,
3595
+ true,
3596
+ false,
3597
+ false,
3598
+ false,
3599
+ false,
3600
+ false,
3601
+ false,
3602
+ false,
3603
+ true,
3604
+ true,
3605
+ false,
3606
+ false,
3607
+ false,
3608
+ true,
3609
+ false,
3610
+ true,
3611
+ false,
3612
+ true,
3613
+ false,
3614
+ false,
3615
+ false,
3616
+ true,
3617
+ false,
3618
+ false
3619
+ ],
3620
+ "model.layers.27.self_attn.q_proj.lora_E": [
3621
+ true,
3622
+ false,
3623
+ false,
3624
+ true,
3625
+ true,
3626
+ false,
3627
+ false,
3628
+ true,
3629
+ true,
3630
+ false,
3631
+ false,
3632
+ false,
3633
+ true,
3634
+ true,
3635
+ false,
3636
+ true,
3637
+ false,
3638
+ false,
3639
+ true,
3640
+ false,
3641
+ false,
3642
+ true,
3643
+ true,
3644
+ true,
3645
+ true,
3646
+ false,
3647
+ false,
3648
+ true,
3649
+ true,
3650
+ false,
3651
+ false,
3652
+ false,
3653
+ false,
3654
+ true,
3655
+ true,
3656
+ true,
3657
+ false,
3658
+ true,
3659
+ false,
3660
+ false,
3661
+ false,
3662
+ true,
3663
+ false,
3664
+ true,
3665
+ true,
3666
+ true,
3667
+ false,
3668
+ false,
3669
+ false,
3670
+ true,
3671
+ true,
3672
+ true,
3673
+ true,
3674
+ true,
3675
+ false,
3676
+ false,
3677
+ false,
3678
+ false,
3679
+ true,
3680
+ false,
3681
+ false,
3682
+ false,
3683
+ true,
3684
+ false
3685
+ ],
3686
+ "model.layers.27.self_attn.v_proj.lora_E": [
3687
+ false,
3688
+ false,
3689
+ true,
3690
+ true,
3691
+ true,
3692
+ true,
3693
+ true,
3694
+ true,
3695
+ true,
3696
+ false,
3697
+ false,
3698
+ false,
3699
+ true,
3700
+ false,
3701
+ false,
3702
+ false,
3703
+ true,
3704
+ true,
3705
+ false,
3706
+ false,
3707
+ false,
3708
+ true,
3709
+ false,
3710
+ true,
3711
+ true,
3712
+ true,
3713
+ true,
3714
+ true,
3715
+ false,
3716
+ true,
3717
+ true,
3718
+ false,
3719
+ true,
3720
+ false,
3721
+ true,
3722
+ true,
3723
+ false,
3724
+ true,
3725
+ true,
3726
+ false,
3727
+ false,
3728
+ true,
3729
+ false,
3730
+ true,
3731
+ true,
3732
+ false,
3733
+ false,
3734
+ true,
3735
+ false,
3736
+ true,
3737
+ true,
3738
+ true,
3739
+ false,
3740
+ false,
3741
+ true,
3742
+ false,
3743
+ false,
3744
+ true,
3745
+ true,
3746
+ true,
3747
+ true,
3748
+ true,
3749
+ false,
3750
+ true
3751
+ ]
3752
+ },
3753
+ "alpha_pattern": {},
3754
+ "megatron_config": null,
3755
+ "megatron_core": "megatron.core",
3756
+ "trainable_token_indices": null,
3757
+ "loftq_config": {},
3758
+ "eva_config": null,
3759
+ "corda_config": null,
3760
+ "use_dora": false,
3761
+ "layer_replication": null,
3762
+ "lora_bias": false,
3763
+ "target_r": 32,
3764
+ "init_r": 64,
3765
+ "tinit": 200,
3766
+ "tfinal": 500,
3767
+ "deltaT": 1,
3768
+ "beta1": 0.85,
3769
+ "beta2": 0.85,
3770
+ "orth_reg_weight": 0.5,
3771
+ "total_step": 5000
3772
+ },
3773
+ "error_msg": ""
3774
+ },
3775
+ "train_info": {
3776
+ "cuda_memory_reserved_avg": 12361399900,
3777
+ "cuda_memory_max": 22793945088,
3778
+ "cuda_memory_reserved_99th": 18203426160,
3779
+ "train_time": 1986.3603882369862,
3780
+ "file_size": 35147440,
3781
+ "num_trainable_params": 18353664,
3782
+ "num_total_params": 3231103544,
3783
+ "status": "success",
3784
+ "metrics": [
3785
+ {
3786
+ "step": 250,
3787
+ "valid accuracy": 0.0,
3788
+ "train loss": 1.3241184422969818,
3789
+ "train samples": 1000,
3790
+ "train time": 35.95594502204767,
3791
+ "eval time": 11.413120707002236,
3792
+ "tokens / sec": 5888.289123542072,
3793
+ "mem allocated avg": 7292959393.792,
3794
+ "mem reserved avg": 12441731727.36,
3795
+ "elapsed time": 100.98083375500573
3796
+ },
3797
+ {
3798
+ "step": 500,
3799
+ "valid accuracy": 0.38,
3800
+ "train loss": 1.0195633232593537,
3801
+ "train samples": 2000,
3802
+ "train time": 37.64258231502754,
3803
+ "eval time": 11.37802824100072,
3804
+ "tokens / sec": 5525.524212428035,
3805
+ "mem allocated avg": 7285510731.776,
3806
+ "mem reserved avg": 12328493907.968,
3807
+ "elapsed time": 197.93603045200143
3808
+ },
3809
+ {
3810
+ "step": 750,
3811
+ "valid accuracy": 0.28,
3812
+ "train loss": 0.7883218789100647,
3813
+ "train samples": 3000,
3814
+ "train time": 37.909325722001086,
3815
+ "eval time": 11.385932488003164,
3816
+ "tokens / sec": 5655.626838954038,
3817
+ "mem allocated avg": 7296095842.304,
3818
+ "mem reserved avg": 12484438130.688,
3819
+ "elapsed time": 295.9188707240028
3820
+ },
3821
+ {
3822
+ "step": 1000,
3823
+ "valid accuracy": 0.3,
3824
+ "train loss": 0.7408825470209122,
3825
+ "train samples": 4000,
3826
+ "train time": 37.79932949803333,
3827
+ "eval time": 11.34964040399791,
3828
+ "tokens / sec": 5511.6321576772825,
3829
+ "mem allocated avg": 7286506670.08,
3830
+ "mem reserved avg": 12351948455.936,
3831
+ "elapsed time": 393.33776786700037
3832
+ },
3833
+ {
3834
+ "step": 1250,
3835
+ "valid accuracy": 0.36,
3836
+ "train loss": 0.7282904219627381,
3837
+ "train samples": 5000,
3838
+ "train time": 37.475317073069164,
3839
+ "eval time": 11.342822429993248,
3840
+ "tokens / sec": 5564.676066473135,
3841
+ "mem allocated avg": 7287005519.872,
3842
+ "mem reserved avg": 12349910024.192,
3843
+ "elapsed time": 490.5430299360014
3844
+ },
3845
+ {
3846
+ "step": 1500,
3847
+ "valid accuracy": 0.38,
3848
+ "train loss": 0.7161256531476975,
3849
+ "train samples": 6000,
3850
+ "train time": 37.660518338059774,
3851
+ "eval time": 11.34013032400253,
3852
+ "tokens / sec": 5558.367469107556,
3853
+ "mem allocated avg": 7287642494.976,
3854
+ "mem reserved avg": 12380570386.432,
3855
+ "elapsed time": 588.017992052999
3856
+ },
3857
+ {
3858
+ "step": 1750,
3859
+ "valid accuracy": 0.34,
3860
+ "train loss": 0.7056601424217224,
3861
+ "train samples": 7000,
3862
+ "train time": 37.636171496975294,
3863
+ "eval time": 11.3171367870018,
3864
+ "tokens / sec": 5562.600861695649,
3865
+ "mem allocated avg": 7289782888.448,
3866
+ "mem reserved avg": 12389051269.12,
3867
+ "elapsed time": 685.2421731229988
3868
+ },
3869
+ {
3870
+ "step": 2000,
3871
+ "valid accuracy": 0.34,
3872
+ "train loss": 0.7058932571411133,
3873
+ "train samples": 8000,
3874
+ "train time": 37.505602380944765,
3875
+ "eval time": 11.37751964799827,
3876
+ "tokens / sec": 5537.732680318789,
3877
+ "mem allocated avg": 7287054886.912,
3878
+ "mem reserved avg": 12336119152.64,
3879
+ "elapsed time": 782.1823508529997
3880
+ },
3881
+ {
3882
+ "step": 2250,
3883
+ "valid accuracy": 0.3,
3884
+ "train loss": 0.700018577337265,
3885
+ "train samples": 9000,
3886
+ "train time": 38.06487834800646,
3887
+ "eval time": 11.33160761000181,
3888
+ "tokens / sec": 5646.885247730137,
3889
+ "mem allocated avg": 7297638139.904,
3890
+ "mem reserved avg": 12521129902.08,
3891
+ "elapsed time": 880.444039299
3892
+ },
3893
+ {
3894
+ "step": 2500,
3895
+ "valid accuracy": 0.34,
3896
+ "train loss": 0.6984639673233032,
3897
+ "train samples": 10000,
3898
+ "train time": 37.400825600088865,
3899
+ "eval time": 7.680036880999978,
3900
+ "tokens / sec": 5507.017470745635,
3901
+ "mem allocated avg": 7283608303.616,
3902
+ "mem reserved avg": 12278598467.584,
3903
+ "elapsed time": 973.4031999860017
3904
+ },
3905
+ {
3906
+ "step": 2750,
3907
+ "valid accuracy": 0.32,
3908
+ "train loss": 0.691307947397232,
3909
+ "train samples": 11000,
3910
+ "train time": 37.97861938195274,
3911
+ "eval time": 11.376824188999308,
3912
+ "tokens / sec": 5578.954776346737,
3913
+ "mem allocated avg": 7293332232.192,
3914
+ "mem reserved avg": 12452821467.136,
3915
+ "elapsed time": 1071.2981272770048
3916
+ },
3917
+ {
3918
+ "step": 3000,
3919
+ "valid accuracy": 0.3,
3920
+ "train loss": 0.6851879090070725,
3921
+ "train samples": 12000,
3922
+ "train time": 37.862704559986014,
3923
+ "eval time": 11.377599911000289,
3924
+ "tokens / sec": 5512.839149387935,
3925
+ "mem allocated avg": 7288929478.656,
3926
+ "mem reserved avg": 12371468746.752,
3927
+ "elapsed time": 1168.7257358770003
3928
+ },
3929
+ {
3930
+ "step": 3250,
3931
+ "valid accuracy": 0.34,
3932
+ "train loss": 0.6939580011367797,
3933
+ "train samples": 13000,
3934
+ "train time": 37.79518606400961,
3935
+ "eval time": 7.2029460159974406,
3936
+ "tokens / sec": 5580.102176050141,
3937
+ "mem allocated avg": 7290687285.248,
3938
+ "mem reserved avg": 12403068633.088,
3939
+ "elapsed time": 1261.9857917680056
3940
+ },
3941
+ {
3942
+ "step": 3500,
3943
+ "valid accuracy": 0.4,
3944
+ "train loss": 0.6825792235136032,
3945
+ "train samples": 14000,
3946
+ "train time": 37.73422463506722,
3947
+ "eval time": 11.28984081800445,
3948
+ "tokens / sec": 5558.614282617983,
3949
+ "mem allocated avg": 7289277476.864,
3950
+ "mem reserved avg": 12381820289.024,
3951
+ "elapsed time": 1359.695578400002
3952
+ },
3953
+ {
3954
+ "step": 3750,
3955
+ "valid accuracy": 0.34,
3956
+ "train loss": 0.6795008780956269,
3957
+ "train samples": 15000,
3958
+ "train time": 38.156728624038806,
3959
+ "eval time": 11.362600938999094,
3960
+ "tokens / sec": 5679.286663570962,
3961
+ "mem allocated avg": 7299185600.512,
3962
+ "mem reserved avg": 12562561236.992,
3963
+ "elapsed time": 1458.6053942910003
3964
+ },
3965
+ {
3966
+ "step": 4000,
3967
+ "valid accuracy": 0.32,
3968
+ "train loss": 0.6967895623445511,
3969
+ "train samples": 16000,
3970
+ "train time": 37.352128309052205,
3971
+ "eval time": 11.363241717001074,
3972
+ "tokens / sec": 5471.522219805362,
3973
+ "mem allocated avg": 7281535514.624,
3974
+ "mem reserved avg": 12256066666.496,
3975
+ "elapsed time": 1555.2909630150025
3976
+ },
3977
+ {
3978
+ "step": 4250,
3979
+ "valid accuracy": 0.34,
3980
+ "train loss": 0.6776066061258316,
3981
+ "train samples": 17000,
3982
+ "train time": 37.65609644694632,
3983
+ "eval time": 11.334564828997827,
3984
+ "tokens / sec": 5613.672683726684,
3985
+ "mem allocated avg": 7291894349.824,
3986
+ "mem reserved avg": 12418562392.064,
3987
+ "elapsed time": 1652.928281804001
3988
+ },
3989
+ {
3990
+ "step": 4500,
3991
+ "valid accuracy": 0.34,
3992
+ "train loss": 0.6868188911676407,
3993
+ "train samples": 18000,
3994
+ "train time": 37.48494880297949,
3995
+ "eval time": 11.33762150000257,
3996
+ "tokens / sec": 5544.038517760537,
3997
+ "mem allocated avg": 7285549684.736,
3998
+ "mem reserved avg": 12333837451.264,
3999
+ "elapsed time": 1749.9311109990012
4000
+ },
4001
+ {
4002
+ "step": 4750,
4003
+ "valid accuracy": 0.34,
4004
+ "train loss": 0.6806062284708023,
4005
+ "train samples": 19000,
4006
+ "train time": 33.62080936400889,
4007
+ "eval time": 11.34113016500487,
4008
+ "tokens / sec": 6244.31725384755,
4009
+ "mem allocated avg": 7068488509.44,
4010
+ "mem reserved avg": 12120833916.928,
4011
+ "elapsed time": 1843.633759463999
4012
+ },
4013
+ {
4014
+ "step": 5000,
4015
+ "valid accuracy": 0.28,
4016
+ "train loss": 0.6862971596717834,
4017
+ "train samples": 20000,
4018
+ "train time": 33.47089828590106,
4019
+ "eval time": 11.363945298006001,
4020
+ "tokens / sec": 6222.7191580255185,
4021
+ "mem allocated avg": 7065409925.12,
4022
+ "mem reserved avg": 12064965787.648,
4023
+ "elapsed time": 1937.0431615920024
4024
+ },
4025
+ {
4026
+ "step": 5000,
4027
+ "test accuracy": 0.3904473085670963,
4028
+ "train loss": 0.6862971596717834,
4029
+ "train samples": 20000,
4030
+ "train total tokens": 4198051
4031
+ }
4032
+ ]
4033
+ },
4034
+ "meta_info": {
4035
+ "model_info": {
4036
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
4037
+ "created_at": "2024-09-18T15:23:48+00:00"
4038
+ },
4039
+ "dataset_info": {
4040
+ "metamath": {
4041
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
4042
+ "created_at": "2023-09-21T17:22:46+00:00"
4043
+ },
4044
+ "gsm8k": {
4045
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
4046
+ "created_at": "2022-04-12T10:22:10+00:00"
4047
+ }
4048
+ },
4049
+ "package_info": {
4050
+ "transformers-version": "4.52.4",
4051
+ "transformers-commit-hash": null,
4052
+ "peft-version": "0.15.2.dev0",
4053
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
4054
+ "datasets-version": "3.6.0",
4055
+ "datasets-commit-hash": null,
4056
+ "bitsandbytes-version": "0.46.0",
4057
+ "bitsandbytes-commit-hash": null,
4058
+ "torch-version": "2.7.1+cu126",
4059
+ "torch-commit-hash": null
4060
+ },
4061
+ "system_info": {
4062
+ "system": "Linux",
4063
+ "release": "6.8.0-1029-aws",
4064
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
4065
+ "machine": "x86_64",
4066
+ "processor": "x86_64",
4067
+ "gpu": "NVIDIA L40S"
4068
+ },
4069
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
4070
+ }
4071
+ }
MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T04:48:22+00:00",
4
+ "total_time": 2260.6744696069945,
5
+ "experiment_name": "adaptionprompt/llama-3.2-3B-lr_0.0005",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0005
22
+ },
23
+ "lr_scheduler": "cosine",
24
+ "use_amp": false,
25
+ "autocast_adapter_dtype": true,
26
+ "generation_kwargs": {
27
+ "max_length": 800,
28
+ "max_new_tokens": 300
29
+ },
30
+ "attn_implementation": null
31
+ },
32
+ "peft_config": {
33
+ "task_type": "CAUSAL_LM",
34
+ "peft_type": "ADAPTION_PROMPT",
35
+ "auto_mapping": null,
36
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
37
+ "revision": null,
38
+ "inference_mode": false,
39
+ "target_modules": "self_attn",
40
+ "adapter_len": 100,
41
+ "adapter_layers": 28
42
+ },
43
+ "error_msg": ""
44
+ },
45
+ "train_info": {
46
+ "cuda_memory_reserved_avg": 11893757234,
47
+ "cuda_memory_max": 22410166272,
48
+ "cuda_memory_reserved_99th": 17907664814,
49
+ "train_time": 1989.2834085189897,
50
+ "file_size": 17210384,
51
+ "num_trainable_params": 8601628,
52
+ "num_total_params": 3221351452,
53
+ "status": "success",
54
+ "metrics": [
55
+ {
56
+ "step": 250,
57
+ "valid accuracy": 0.0,
58
+ "train loss": 1.3201356165409088,
59
+ "train samples": 1000,
60
+ "train time": 36.18721537806414,
61
+ "eval time": 13.46754032199533,
62
+ "tokens / sec": 5850.657415556191,
63
+ "mem allocated avg": 6848060076.032,
64
+ "mem reserved avg": 11943163199.488,
65
+ "elapsed time": 99.94861951399798
66
+ },
67
+ {
68
+ "step": 500,
69
+ "valid accuracy": 0.1,
70
+ "train loss": 1.153662922859192,
71
+ "train samples": 2000,
72
+ "train time": 35.6493088029747,
73
+ "eval time": 13.314302301005227,
74
+ "tokens / sec": 5834.474972559473,
75
+ "mem allocated avg": 6840933136.384,
76
+ "mem reserved avg": 11833045942.272,
77
+ "elapsed time": 193.4177081749949
78
+ },
79
+ {
80
+ "step": 750,
81
+ "valid accuracy": 0.22,
82
+ "train loss": 0.9016587936878204,
83
+ "train samples": 3000,
84
+ "train time": 36.424757257977035,
85
+ "eval time": 13.392894379001518,
86
+ "tokens / sec": 5886.133941305707,
87
+ "mem allocated avg": 6851972698.112,
88
+ "mem reserved avg": 11989870968.832,
89
+ "elapsed time": 288.2962625699947
90
+ },
91
+ {
92
+ "step": 1000,
93
+ "valid accuracy": 0.2,
94
+ "train loss": 0.8571369113922119,
95
+ "train samples": 4000,
96
+ "train time": 35.59983186099271,
97
+ "eval time": 13.363479856001504,
98
+ "tokens / sec": 5852.1624712581015,
99
+ "mem allocated avg": 6842572642.304,
100
+ "mem reserved avg": 11863001661.44,
101
+ "elapsed time": 381.66334240599826
102
+ },
103
+ {
104
+ "step": 1250,
105
+ "valid accuracy": 0.18,
106
+ "train loss": 0.84929132604599,
107
+ "train samples": 5000,
108
+ "train time": 35.52914607799903,
109
+ "eval time": 13.408120855005109,
110
+ "tokens / sec": 5869.490911551474,
111
+ "mem allocated avg": 6843078866.944,
112
+ "mem reserved avg": 11855409971.2,
113
+ "elapsed time": 475.2031378399988
114
+ },
115
+ {
116
+ "step": 1500,
117
+ "valid accuracy": 0.18,
118
+ "train loss": 0.8379741818904877,
119
+ "train samples": 6000,
120
+ "train time": 35.84657208897261,
121
+ "eval time": 13.451748254003178,
122
+ "tokens / sec": 5839.637873335062,
123
+ "mem allocated avg": 6844234328.064,
124
+ "mem reserved avg": 11880013758.464,
125
+ "elapsed time": 568.970056428996
126
+ },
127
+ {
128
+ "step": 1750,
129
+ "valid accuracy": 0.2,
130
+ "train loss": 0.8320568509101868,
131
+ "train samples": 7000,
132
+ "train time": 36.04748217701126,
133
+ "eval time": 13.354637482996623,
134
+ "tokens / sec": 5807.756529900249,
135
+ "mem allocated avg": 6845049858.048,
136
+ "mem reserved avg": 11894333112.32,
137
+ "elapsed time": 663.2131869919976
138
+ },
139
+ {
140
+ "step": 2000,
141
+ "valid accuracy": 0.2,
142
+ "train loss": 0.83651398563385,
143
+ "train samples": 8000,
144
+ "train time": 35.70882848704787,
145
+ "eval time": 13.407459709997056,
146
+ "tokens / sec": 5816.376756110452,
147
+ "mem allocated avg": 6842067818.496,
148
+ "mem reserved avg": 11843724640.256,
149
+ "elapsed time": 756.9679808469955
150
+ },
151
+ {
152
+ "step": 2250,
153
+ "valid accuracy": 0.18,
154
+ "train loss": 0.8321560187339783,
155
+ "train samples": 9000,
156
+ "train time": 36.077689886013104,
157
+ "eval time": 13.313609958000598,
158
+ "tokens / sec": 5957.92027369615,
159
+ "mem allocated avg": 6853360060.416,
160
+ "mem reserved avg": 12025841319.936,
161
+ "elapsed time": 851.5264306229947
162
+ },
163
+ {
164
+ "step": 2500,
165
+ "valid accuracy": 0.22,
166
+ "train loss": 0.830465945482254,
167
+ "train samples": 10000,
168
+ "train time": 35.51607862501987,
169
+ "eval time": 13.570960901000944,
170
+ "tokens / sec": 5799.260728488849,
171
+ "mem allocated avg": 6838232895.488,
172
+ "mem reserved avg": 11785499312.128,
173
+ "elapsed time": 945.1205676109967
174
+ },
175
+ {
176
+ "step": 2750,
177
+ "valid accuracy": 0.2,
178
+ "train loss": 0.8323929319381714,
179
+ "train samples": 11000,
180
+ "train time": 36.33290277811466,
181
+ "eval time": 13.340032396001334,
182
+ "tokens / sec": 5831.6562619276265,
183
+ "mem allocated avg": 6849506107.392,
184
+ "mem reserved avg": 11957667102.72,
185
+ "elapsed time": 1039.698461469001
186
+ },
187
+ {
188
+ "step": 3000,
189
+ "valid accuracy": 0.22,
190
+ "train loss": 0.8273163681030273,
191
+ "train samples": 12000,
192
+ "train time": 36.133581758025684,
193
+ "eval time": 13.486512909999874,
194
+ "tokens / sec": 5776.648476140576,
195
+ "mem allocated avg": 6844330549.248,
196
+ "mem reserved avg": 11874754101.248,
197
+ "elapsed time": 1134.0729920019949
198
+ },
199
+ {
200
+ "step": 3250,
201
+ "valid accuracy": 0.18,
202
+ "train loss": 0.8321007430553437,
203
+ "train samples": 13000,
204
+ "train time": 35.81564853595046,
205
+ "eval time": 13.383609317002993,
206
+ "tokens / sec": 5888.515456820645,
207
+ "mem allocated avg": 6845503963.136,
208
+ "mem reserved avg": 11903065653.248,
209
+ "elapsed time": 1228.1345331240009
210
+ },
211
+ {
212
+ "step": 3500,
213
+ "valid accuracy": 0.18,
214
+ "train loss": 0.8267617487907409,
215
+ "train samples": 14000,
216
+ "train time": 35.759473790014454,
217
+ "eval time": 13.568141147006827,
218
+ "tokens / sec": 5865.578482269809,
219
+ "mem allocated avg": 6844375582.72,
220
+ "mem reserved avg": 11893385199.616,
221
+ "elapsed time": 1322.3741278140005
222
+ },
223
+ {
224
+ "step": 3750,
225
+ "valid accuracy": 0.18,
226
+ "train loss": 0.822540352344513,
227
+ "train samples": 15000,
228
+ "train time": 36.6447854490616,
229
+ "eval time": 13.383382205000089,
230
+ "tokens / sec": 5913.610827418539,
231
+ "mem allocated avg": 6855454945.28,
232
+ "mem reserved avg": 12064244367.36,
233
+ "elapsed time": 1417.8726171529997
234
+ },
235
+ {
236
+ "step": 4000,
237
+ "valid accuracy": 0.22,
238
+ "train loss": 0.842738341331482,
239
+ "train samples": 16000,
240
+ "train time": 35.83419257100468,
241
+ "eval time": 13.484180120998644,
242
+ "tokens / sec": 5703.295800373884,
243
+ "mem allocated avg": 6837201041.408,
244
+ "mem reserved avg": 11769015697.408,
245
+ "elapsed time": 1511.8286734409994
246
+ },
247
+ {
248
+ "step": 4250,
249
+ "valid accuracy": 0.24,
250
+ "train loss": 0.8195172207355499,
251
+ "train samples": 17000,
252
+ "train time": 36.032976000991766,
253
+ "eval time": 13.43221827600064,
254
+ "tokens / sec": 5866.542913196561,
255
+ "mem allocated avg": 6847173238.784,
256
+ "mem reserved avg": 11924070727.68,
257
+ "elapsed time": 1606.2413196950001
258
+ },
259
+ {
260
+ "step": 4500,
261
+ "valid accuracy": 0.22,
262
+ "train loss": 0.8333091423511505,
263
+ "train samples": 18000,
264
+ "train time": 35.92476197002543,
265
+ "eval time": 13.364069708994066,
266
+ "tokens / sec": 5784.812163081199,
267
+ "mem allocated avg": 6842308513.792,
268
+ "mem reserved avg": 11840637632.512,
269
+ "elapsed time": 1700.1633438569988
270
+ },
271
+ {
272
+ "step": 4750,
273
+ "valid accuracy": 0.24,
274
+ "train loss": 0.8247289218902588,
275
+ "train samples": 19000,
276
+ "train time": 36.319470202004595,
277
+ "eval time": 13.367499373998726,
278
+ "tokens / sec": 5780.343128144329,
279
+ "mem allocated avg": 6845010323.456,
280
+ "mem reserved avg": 11893443919.872,
281
+ "elapsed time": 1795.0117048679967
282
+ },
283
+ {
284
+ "step": 5000,
285
+ "valid accuracy": 0.24,
286
+ "train loss": 0.8317011270523071,
287
+ "train samples": 20000,
288
+ "train time": 35.778475134953624,
289
+ "eval time": 13.382634160996531,
290
+ "tokens / sec": 5821.377216731123,
291
+ "mem allocated avg": 6841479706.624,
292
+ "mem reserved avg": 11840956399.616,
293
+ "elapsed time": 1888.9356832179983
294
+ },
295
+ {
296
+ "step": 5000,
297
+ "test accuracy": 0.22062168309325247,
298
+ "train loss": 0.8317011270523071,
299
+ "train samples": 20000,
300
+ "train total tokens": 4198051
301
+ }
302
+ ]
303
+ },
304
+ "meta_info": {
305
+ "model_info": {
306
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
307
+ "created_at": "2024-09-18T15:23:48+00:00"
308
+ },
309
+ "dataset_info": {
310
+ "metamath": {
311
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
312
+ "created_at": "2023-09-21T17:22:46+00:00"
313
+ },
314
+ "gsm8k": {
315
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
316
+ "created_at": "2022-04-12T10:22:10+00:00"
317
+ }
318
+ },
319
+ "package_info": {
320
+ "transformers-version": "4.52.4",
321
+ "transformers-commit-hash": null,
322
+ "peft-version": "0.15.2.dev0",
323
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
324
+ "datasets-version": "3.6.0",
325
+ "datasets-commit-hash": null,
326
+ "bitsandbytes-version": "0.46.0",
327
+ "bitsandbytes-commit-hash": null,
328
+ "torch-version": "2.7.1+cu126",
329
+ "torch-commit-hash": null
330
+ },
331
+ "system_info": {
332
+ "system": "Linux",
333
+ "release": "6.8.0-1029-aws",
334
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
335
+ "machine": "x86_64",
336
+ "processor": "x86_64",
337
+ "gpu": "NVIDIA L40S"
338
+ },
339
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
340
+ }
341
+ }
MetaMathQA/results/boft--llama-3.2-3B-default.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T00:26:06+00:00",
4
+ "total_time": 11113.556226242006,
5
+ "experiment_name": "boft/llama-3.2-3B-default",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "BOFT",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "boft_block_size": 4,
41
+ "boft_block_num": 0,
42
+ "boft_n_butterfly_factor": 1,
43
+ "target_modules": [
44
+ "q_proj",
45
+ "v_proj"
46
+ ],
47
+ "exclude_modules": null,
48
+ "boft_dropout": 0.0,
49
+ "fan_in_fan_out": false,
50
+ "bias": "none",
51
+ "modules_to_save": null,
52
+ "init_weights": true,
53
+ "layers_to_transform": null,
54
+ "layers_pattern": null
55
+ },
56
+ "error_msg": ""
57
+ },
58
+ "train_info": {
59
+ "cuda_memory_reserved_avg": 14814855089,
60
+ "cuda_memory_max": 24427626496,
61
+ "cuda_memory_reserved_99th": 20103445872,
62
+ "train_time": 8291.859631775995,
63
+ "file_size": 3225360,
64
+ "num_trainable_params": 802816,
65
+ "num_total_params": 3213552640,
66
+ "status": "success",
67
+ "metrics": [
68
+ {
69
+ "step": 250,
70
+ "valid accuracy": 0.0,
71
+ "train loss": 1.291453486919403,
72
+ "train samples": 1000,
73
+ "train time": 168.6401632970519,
74
+ "eval time": 140.71104099299555,
75
+ "tokens / sec": 1255.4482625059293,
76
+ "mem allocated avg": 6794374191.104,
77
+ "mem reserved avg": 14862272954.368,
78
+ "elapsed time": 378.35506656600046
79
+ },
80
+ {
81
+ "step": 500,
82
+ "valid accuracy": 0.12,
83
+ "train loss": 1.0658165102005004,
84
+ "train samples": 2000,
85
+ "train time": 168.0782826189752,
86
+ "eval time": 140.55351014900225,
87
+ "tokens / sec": 1237.4888460248842,
88
+ "mem allocated avg": 6786098696.192,
89
+ "mem reserved avg": 14759126630.4,
90
+ "elapsed time": 750.4153373740046
91
+ },
92
+ {
93
+ "step": 750,
94
+ "valid accuracy": 0.38,
95
+ "train loss": 0.8760707340240479,
96
+ "train samples": 3000,
97
+ "train time": 168.35559053501493,
98
+ "eval time": 140.5371915020005,
99
+ "tokens / sec": 1273.5009233649919,
100
+ "mem allocated avg": 6796379451.392,
101
+ "mem reserved avg": 14898109087.744,
102
+ "elapsed time": 1123.1088362480004
103
+ },
104
+ {
105
+ "step": 1000,
106
+ "valid accuracy": 0.42,
107
+ "train loss": 0.8187176239490509,
108
+ "train samples": 4000,
109
+ "train time": 168.23626853094902,
110
+ "eval time": 140.51234973900137,
111
+ "tokens / sec": 1238.3536666570453,
112
+ "mem allocated avg": 6788017170.432,
113
+ "mem reserved avg": 14785978564.608,
114
+ "elapsed time": 1495.2035204040003
115
+ },
116
+ {
117
+ "step": 1250,
118
+ "valid accuracy": 0.44,
119
+ "train loss": 0.7968595073223114,
120
+ "train samples": 5000,
121
+ "train time": 168.06973706404096,
122
+ "eval time": 140.56398986800195,
123
+ "tokens / sec": 1240.7825682534333,
124
+ "mem allocated avg": 6786994073.6,
125
+ "mem reserved avg": 14784728662.016,
126
+ "elapsed time": 1867.293767313
127
+ },
128
+ {
129
+ "step": 1500,
130
+ "valid accuracy": 0.3,
131
+ "train loss": 0.7768308148384094,
132
+ "train samples": 6000,
133
+ "train time": 168.12391281103191,
134
+ "eval time": 140.47015122300218,
135
+ "tokens / sec": 1245.0995013141533,
136
+ "mem allocated avg": 6790023022.592,
137
+ "mem reserved avg": 14800616685.568,
138
+ "elapsed time": 2239.2391544300044
139
+ },
140
+ {
141
+ "step": 1750,
142
+ "valid accuracy": 0.34,
143
+ "train loss": 0.7639130955934524,
144
+ "train samples": 7000,
145
+ "train time": 168.4569528100401,
146
+ "eval time": 140.76006173399946,
147
+ "tokens / sec": 1242.780404772479,
148
+ "mem allocated avg": 6790166409.216,
149
+ "mem reserved avg": 14820103421.952,
150
+ "elapsed time": 2611.854956449002
151
+ },
152
+ {
153
+ "step": 2000,
154
+ "valid accuracy": 0.28,
155
+ "train loss": 0.7575103138685226,
156
+ "train samples": 8000,
157
+ "train time": 168.38565446306166,
158
+ "eval time": 140.82750502999988,
159
+ "tokens / sec": 1233.4542432506432,
160
+ "mem allocated avg": 6787659706.368,
161
+ "mem reserved avg": 14766038843.392,
162
+ "elapsed time": 2984.338527646003
163
+ },
164
+ {
165
+ "step": 2250,
166
+ "valid accuracy": 0.36,
167
+ "train loss": 0.7480558000802994,
168
+ "train samples": 9000,
169
+ "train time": 168.98983921804756,
170
+ "eval time": 140.92262020800263,
171
+ "tokens / sec": 1271.9581307054364,
172
+ "mem allocated avg": 6798715979.776,
173
+ "mem reserved avg": 14937929809.92,
174
+ "elapsed time": 3357.8442202950027
175
+ },
176
+ {
177
+ "step": 2500,
178
+ "valid accuracy": 0.36,
179
+ "train loss": 0.7452825582027436,
180
+ "train samples": 10000,
181
+ "train time": 168.30827127001976,
182
+ "eval time": 140.89225408899802,
183
+ "tokens / sec": 1223.7485326527044,
184
+ "mem allocated avg": 6783722676.224,
185
+ "mem reserved avg": 14710111993.856,
186
+ "elapsed time": 3730.0927005050034
187
+ },
188
+ {
189
+ "step": 2750,
190
+ "valid accuracy": 0.4,
191
+ "train loss": 0.7368131847381592,
192
+ "train samples": 11000,
193
+ "train time": 168.8352410539519,
194
+ "eval time": 140.97951381299936,
195
+ "tokens / sec": 1254.9571918595636,
196
+ "mem allocated avg": 6794155292.672,
197
+ "mem reserved avg": 14876869132.288,
198
+ "elapsed time": 4103.762088249001
199
+ },
200
+ {
201
+ "step": 3000,
202
+ "valid accuracy": 0.38,
203
+ "train loss": 0.7284122853279114,
204
+ "train samples": 12000,
205
+ "train time": 168.7332625999261,
206
+ "eval time": 140.92822863799665,
207
+ "tokens / sec": 1237.0471404616308,
208
+ "mem allocated avg": 6789107718.144,
209
+ "mem reserved avg": 14802571231.232,
210
+ "elapsed time": 4477.013831755001
211
+ },
212
+ {
213
+ "step": 3250,
214
+ "valid accuracy": 0.34,
215
+ "train loss": 0.7360657904148101,
216
+ "train samples": 13000,
217
+ "train time": 168.6564349730761,
218
+ "eval time": 140.91345744199498,
219
+ "tokens / sec": 1250.4770424779092,
220
+ "mem allocated avg": 6791307786.24,
221
+ "mem reserved avg": 14825665069.056,
222
+ "elapsed time": 4850.336532419002
223
+ },
224
+ {
225
+ "step": 3500,
226
+ "valid accuracy": 0.34,
227
+ "train loss": 0.7245372575521469,
228
+ "train samples": 14000,
229
+ "train time": 168.69712368501496,
230
+ "eval time": 141.10813598799723,
231
+ "tokens / sec": 1243.3525564528145,
232
+ "mem allocated avg": 6789542191.104,
233
+ "mem reserved avg": 14803175211.008,
234
+ "elapsed time": 5223.900597244006
235
+ },
236
+ {
237
+ "step": 3750,
238
+ "valid accuracy": 0.36,
239
+ "train loss": 0.7196882257461548,
240
+ "train samples": 15000,
241
+ "train time": 169.02741387199057,
242
+ "eval time": 140.85168583100312,
243
+ "tokens / sec": 1282.0583066135978,
244
+ "mem allocated avg": 6800711397.376,
245
+ "mem reserved avg": 14974772576.256,
246
+ "elapsed time": 5597.923287113001
247
+ },
248
+ {
249
+ "step": 4000,
250
+ "valid accuracy": 0.4,
251
+ "train loss": 0.7386573747396469,
252
+ "train samples": 16000,
253
+ "train time": 168.47688378201565,
254
+ "eval time": 141.17620621900278,
255
+ "tokens / sec": 1213.062560347618,
256
+ "mem allocated avg": 6781920968.704,
257
+ "mem reserved avg": 14703241723.904,
258
+ "elapsed time": 5970.573302798002
259
+ },
260
+ {
261
+ "step": 4250,
262
+ "valid accuracy": 0.36,
263
+ "train loss": 0.7167660998106002,
264
+ "train samples": 17000,
265
+ "train time": 168.66243355697225,
266
+ "eval time": 141.03309625500697,
267
+ "tokens / sec": 1253.3259217358275,
268
+ "mem allocated avg": 6792739334.144,
269
+ "mem reserved avg": 14838457696.256,
270
+ "elapsed time": 6343.574297415005
271
+ },
272
+ {
273
+ "step": 4500,
274
+ "valid accuracy": 0.36,
275
+ "train loss": 0.7278824989795685,
276
+ "train samples": 18000,
277
+ "train time": 168.825120675996,
278
+ "eval time": 141.10180295899772,
279
+ "tokens / sec": 1230.966097745832,
280
+ "mem allocated avg": 6787403542.528,
281
+ "mem reserved avg": 14768026943.488,
282
+ "elapsed time": 6716.868663600006
283
+ },
284
+ {
285
+ "step": 4750,
286
+ "valid accuracy": 0.34,
287
+ "train loss": 0.7206774606704712,
288
+ "train samples": 19000,
289
+ "train time": 168.64492384497134,
290
+ "eval time": 140.88104952100548,
291
+ "tokens / sec": 1244.8581031290848,
292
+ "mem allocated avg": 6790186668.032,
293
+ "mem reserved avg": 14817972715.52,
294
+ "elapsed time": 7090.485984892002
295
+ },
296
+ {
297
+ "step": 5000,
298
+ "valid accuracy": 0.34,
299
+ "train loss": 0.7268091850280761,
300
+ "train samples": 20000,
301
+ "train time": 168.56219975605927,
302
+ "eval time": 140.98389447200316,
303
+ "tokens / sec": 1235.6269691628356,
304
+ "mem allocated avg": 6787183779.84,
305
+ "mem reserved avg": 14761332834.304,
306
+ "elapsed time": 7463.428281595006
307
+ },
308
+ {
309
+ "step": 5000,
310
+ "test accuracy": 0.3646702047005307,
311
+ "train loss": 0.7268091850280761,
312
+ "train samples": 20000,
313
+ "train total tokens": 4198051
314
+ }
315
+ ]
316
+ },
317
+ "meta_info": {
318
+ "model_info": {
319
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
320
+ "created_at": "2024-09-18T15:23:48+00:00"
321
+ },
322
+ "dataset_info": {
323
+ "metamath": {
324
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
325
+ "created_at": "2023-09-21T17:22:46+00:00"
326
+ },
327
+ "gsm8k": {
328
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
329
+ "created_at": "2022-04-12T10:22:10+00:00"
330
+ }
331
+ },
332
+ "package_info": {
333
+ "transformers-version": "4.52.4",
334
+ "transformers-commit-hash": null,
335
+ "peft-version": "0.15.2.dev0",
336
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
337
+ "datasets-version": "3.6.0",
338
+ "datasets-commit-hash": null,
339
+ "bitsandbytes-version": "0.46.0",
340
+ "bitsandbytes-commit-hash": null,
341
+ "torch-version": "2.7.1+cu126",
342
+ "torch-commit-hash": null
343
+ },
344
+ "system_info": {
345
+ "system": "Linux",
346
+ "release": "6.8.0-1029-aws",
347
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
348
+ "machine": "x86_64",
349
+ "processor": "x86_64",
350
+ "gpu": "NVIDIA L40S"
351
+ },
352
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
353
+ }
354
+ }
MetaMathQA/results/bone--llama-3.2-3B-bat.json ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T03:31:24+00:00",
4
+ "total_time": 2742.3845372959986,
5
+ "experiment_name": "bone/llama-3.2-3B-bat",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "BONE",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "r": 64,
41
+ "target_modules": [
42
+ "v_proj",
43
+ "q_proj"
44
+ ],
45
+ "exclude_modules": null,
46
+ "init_weights": "bat",
47
+ "layers_to_transform": null,
48
+ "layers_pattern": null,
49
+ "bias": "none",
50
+ "modules_to_save": null
51
+ },
52
+ "error_msg": ""
53
+ },
54
+ "train_info": {
55
+ "cuda_memory_reserved_avg": 14713983755,
56
+ "cuda_memory_max": 25251807232,
57
+ "cuda_memory_reserved_99th": 20472733368,
58
+ "train_time": 2430.7548372539895,
59
+ "file_size": 29367552,
60
+ "num_trainable_params": 7340032,
61
+ "num_total_params": 3220089856,
62
+ "status": "success",
63
+ "metrics": [
64
+ {
65
+ "step": 250,
66
+ "valid accuracy": 0.34,
67
+ "train loss": 0.8741071329116822,
68
+ "train samples": 1000,
69
+ "train time": 44.769113782072964,
70
+ "eval time": 16.53786130100343,
71
+ "tokens / sec": 4729.130914464948,
72
+ "mem allocated avg": 6898425409.536,
73
+ "mem reserved avg": 14773294989.312,
74
+ "elapsed time": 124.73039968500234
75
+ },
76
+ {
77
+ "step": 500,
78
+ "valid accuracy": 0.42,
79
+ "train loss": 0.6946564470529556,
80
+ "train samples": 2000,
81
+ "train time": 43.747789238033874,
82
+ "eval time": 16.4541177170031,
83
+ "tokens / sec": 4754.4116770858745,
84
+ "mem allocated avg": 6890118709.248,
85
+ "mem reserved avg": 14662749913.088,
86
+ "elapsed time": 242.48505929599924
87
+ },
88
+ {
89
+ "step": 750,
90
+ "valid accuracy": 0.42,
91
+ "train loss": 0.6668610339164733,
92
+ "train samples": 3000,
93
+ "train time": 44.788394879076805,
94
+ "eval time": 8.99262467600056,
95
+ "tokens / sec": 4786.9766393472355,
96
+ "mem allocated avg": 6900886024.192,
97
+ "mem reserved avg": 14820195696.64,
98
+ "elapsed time": 354.3122298879971
99
+ },
100
+ {
101
+ "step": 1000,
102
+ "valid accuracy": 0.42,
103
+ "train loss": 0.6476555281877517,
104
+ "train samples": 4000,
105
+ "train time": 43.08444309095648,
106
+ "eval time": 14.581032188005338,
107
+ "tokens / sec": 4835.527282090601,
108
+ "mem allocated avg": 6892210176.0,
109
+ "mem reserved avg": 14677799075.84,
110
+ "elapsed time": 469.41999823199876
111
+ },
112
+ {
113
+ "step": 1250,
114
+ "valid accuracy": 0.38,
115
+ "train loss": 0.6442477897405624,
116
+ "train samples": 5000,
117
+ "train time": 43.81069704208494,
118
+ "eval time": 16.504536090003967,
119
+ "tokens / sec": 4759.979048031958,
120
+ "mem allocated avg": 6892437598.208,
121
+ "mem reserved avg": 14675995525.12,
122
+ "elapsed time": 587.4669312400001
123
+ },
124
+ {
125
+ "step": 1500,
126
+ "valid accuracy": 0.48,
127
+ "train loss": 0.6370412122011184,
128
+ "train samples": 6000,
129
+ "train time": 44.041188616007275,
130
+ "eval time": 11.50742915799492,
131
+ "tokens / sec": 4753.07335197389,
132
+ "mem allocated avg": 6893869041.664,
133
+ "mem reserved avg": 14704349020.16,
134
+ "elapsed time": 700.887209352004
135
+ },
136
+ {
137
+ "step": 1750,
138
+ "valid accuracy": 0.44,
139
+ "train loss": 0.6277673766613007,
140
+ "train samples": 7000,
141
+ "train time": 44.32280573899334,
142
+ "eval time": 16.494074002999696,
143
+ "tokens / sec": 4723.414876595195,
144
+ "mem allocated avg": 6895170344.96,
145
+ "mem reserved avg": 14718215389.184,
146
+ "elapsed time": 819.4313268580008
147
+ },
148
+ {
149
+ "step": 2000,
150
+ "valid accuracy": 0.48,
151
+ "train loss": 0.6278820457458496,
152
+ "train samples": 8000,
153
+ "train time": 43.325528461049544,
154
+ "eval time": 16.452074027998606,
155
+ "tokens / sec": 4793.848047040501,
156
+ "mem allocated avg": 6891568050.176,
157
+ "mem reserved avg": 14656710115.328,
158
+ "elapsed time": 936.9070930559974
159
+ },
160
+ {
161
+ "step": 2250,
162
+ "valid accuracy": 0.44,
163
+ "train loss": 0.6160005252361298,
164
+ "train samples": 9000,
165
+ "train time": 45.04456213898811,
166
+ "eval time": 16.52133422600309,
167
+ "tokens / sec": 4771.896757188206,
168
+ "mem allocated avg": 6903412344.832,
169
+ "mem reserved avg": 14851812360.192,
170
+ "elapsed time": 1056.8185863660037
171
+ },
172
+ {
173
+ "step": 2500,
174
+ "valid accuracy": 0.5,
175
+ "train loss": 0.6121727240085602,
176
+ "train samples": 10000,
177
+ "train time": 43.16439942702709,
178
+ "eval time": 16.356938169003115,
179
+ "tokens / sec": 4771.686916395162,
180
+ "mem allocated avg": 6888002562.048,
181
+ "mem reserved avg": 14598350569.472,
182
+ "elapsed time": 1173.7929829869972
183
+ },
184
+ {
185
+ "step": 2750,
186
+ "valid accuracy": 0.52,
187
+ "train loss": 0.6007345867156982,
188
+ "train samples": 11000,
189
+ "train time": 44.3066304440581,
190
+ "eval time": 16.514935120998416,
191
+ "tokens / sec": 4782.151065798665,
192
+ "mem allocated avg": 6899352545.28,
193
+ "mem reserved avg": 14785458470.912,
194
+ "elapsed time": 1292.7444534430033
195
+ },
196
+ {
197
+ "step": 3000,
198
+ "valid accuracy": 0.52,
199
+ "train loss": 0.5899704934358597,
200
+ "train samples": 12000,
201
+ "train time": 44.07467572299356,
202
+ "eval time": 16.412788394998643,
203
+ "tokens / sec": 4735.848796979486,
204
+ "mem allocated avg": 6894036676.608,
205
+ "mem reserved avg": 14687865405.44,
206
+ "elapsed time": 1411.115336062001
207
+ },
208
+ {
209
+ "step": 3250,
210
+ "valid accuracy": 0.48,
211
+ "train loss": 0.5988378477096558,
212
+ "train samples": 13000,
213
+ "train time": 44.070030323957326,
214
+ "eval time": 10.250203846997465,
215
+ "tokens / sec": 4785.587812163363,
216
+ "mem allocated avg": 6895260303.36,
217
+ "mem reserved avg": 14725043716.096,
218
+ "elapsed time": 1523.332073521
219
+ },
220
+ {
221
+ "step": 3500,
222
+ "valid accuracy": 0.5,
223
+ "train loss": 0.5801258901357651,
224
+ "train samples": 14000,
225
+ "train time": 43.991991777089424,
226
+ "eval time": 16.38271237299341,
227
+ "tokens / sec": 4767.913238909897,
228
+ "mem allocated avg": 6893688922.112,
229
+ "mem reserved avg": 14703484993.536,
230
+ "elapsed time": 1641.7187374700006
231
+ },
232
+ {
233
+ "step": 3750,
234
+ "valid accuracy": 0.5,
235
+ "train loss": 0.5768071869611741,
236
+ "train samples": 15000,
237
+ "train time": 45.04501243098639,
238
+ "eval time": 16.454509290000715,
239
+ "tokens / sec": 4810.810083180938,
240
+ "mem allocated avg": 6905122422.784,
241
+ "mem reserved avg": 14891314315.264,
242
+ "elapsed time": 1761.645320085001
243
+ },
244
+ {
245
+ "step": 4000,
246
+ "valid accuracy": 0.52,
247
+ "train loss": 0.5858320169448853,
248
+ "train samples": 16000,
249
+ "train time": 42.547905418032315,
250
+ "eval time": 16.350580427999375,
251
+ "tokens / sec": 4803.36218650576,
252
+ "mem allocated avg": 6886491265.024,
253
+ "mem reserved avg": 14582730981.376,
254
+ "elapsed time": 1878.0724109930015
255
+ },
256
+ {
257
+ "step": 4250,
258
+ "valid accuracy": 0.54,
259
+ "train loss": 0.5723247408866883,
260
+ "train samples": 17000,
261
+ "train time": 44.19116178697732,
262
+ "eval time": 16.508775556001638,
263
+ "tokens / sec": 4783.513070305705,
264
+ "mem allocated avg": 6897152284.672,
265
+ "mem reserved avg": 14738381602.816,
266
+ "elapsed time": 1996.8971549050038
267
+ },
268
+ {
269
+ "step": 4500,
270
+ "valid accuracy": 0.48,
271
+ "train loss": 0.5789256048202515,
272
+ "train samples": 18000,
273
+ "train time": 43.87211918797402,
274
+ "eval time": 16.414912490006827,
275
+ "tokens / sec": 4736.903615473535,
276
+ "mem allocated avg": 6893093124.096,
277
+ "mem reserved avg": 14658832433.152,
278
+ "elapsed time": 2114.9650602839974
279
+ },
280
+ {
281
+ "step": 4750,
282
+ "valid accuracy": 0.48,
283
+ "train loss": 0.568240401506424,
284
+ "train samples": 19000,
285
+ "train time": 43.939464293958736,
286
+ "eval time": 16.460097985000175,
287
+ "tokens / sec": 4777.914418698651,
288
+ "mem allocated avg": 6894218592.256,
289
+ "mem reserved avg": 14710372040.704,
290
+ "elapsed time": 2233.517725938
291
+ },
292
+ {
293
+ "step": 5000,
294
+ "valid accuracy": 0.5,
295
+ "train loss": 0.57634852206707,
296
+ "train samples": 20000,
297
+ "train time": 42.787552905057964,
298
+ "eval time": 16.445046182001533,
299
+ "tokens / sec": 4867.770785166333,
300
+ "mem allocated avg": 6890906441.728,
301
+ "mem reserved avg": 14656718503.936,
302
+ "elapsed time": 2350.279711092
303
+ },
304
+ {
305
+ "step": 5000,
306
+ "test accuracy": 0.5170583775587566,
307
+ "train loss": 0.57634852206707,
308
+ "train samples": 20000,
309
+ "train total tokens": 4198051
310
+ }
311
+ ]
312
+ },
313
+ "meta_info": {
314
+ "model_info": {
315
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
316
+ "created_at": "2024-09-18T15:23:48+00:00"
317
+ },
318
+ "dataset_info": {
319
+ "metamath": {
320
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
321
+ "created_at": "2023-09-21T17:22:46+00:00"
322
+ },
323
+ "gsm8k": {
324
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
325
+ "created_at": "2022-04-12T10:22:10+00:00"
326
+ }
327
+ },
328
+ "package_info": {
329
+ "transformers-version": "4.52.4",
330
+ "transformers-commit-hash": null,
331
+ "peft-version": "0.15.2.dev0",
332
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
333
+ "datasets-version": "3.6.0",
334
+ "datasets-commit-hash": null,
335
+ "bitsandbytes-version": "0.46.0",
336
+ "bitsandbytes-commit-hash": null,
337
+ "torch-version": "2.7.1+cu126",
338
+ "torch-commit-hash": null
339
+ },
340
+ "system_info": {
341
+ "system": "Linux",
342
+ "release": "6.8.0-1029-aws",
343
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
344
+ "machine": "x86_64",
345
+ "processor": "x86_64",
346
+ "gpu": "NVIDIA L40S"
347
+ },
348
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
349
+ }
350
+ }
MetaMathQA/results/bone--llama-3.2-3B-default.json ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T04:17:11+00:00",
4
+ "total_time": 1867.121674144997,
5
+ "experiment_name": "bone/llama-3.2-3B-default",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "BONE",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "r": 64,
41
+ "target_modules": [
42
+ "v_proj",
43
+ "q_proj"
44
+ ],
45
+ "exclude_modules": null,
46
+ "init_weights": true,
47
+ "layers_to_transform": null,
48
+ "layers_pattern": null,
49
+ "bias": "none",
50
+ "modules_to_save": null
51
+ },
52
+ "error_msg": ""
53
+ },
54
+ "train_info": {
55
+ "cuda_memory_reserved_avg": 11170837063,
56
+ "cuda_memory_max": 20248002560,
57
+ "cuda_memory_reserved_99th": 16303469363,
58
+ "train_time": 1664.0814183089897,
59
+ "file_size": 29367496,
60
+ "num_trainable_params": 7340032,
61
+ "num_total_params": 3220089856,
62
+ "status": "success",
63
+ "metrics": [
64
+ {
65
+ "step": 250,
66
+ "valid accuracy": 0.34,
67
+ "train loss": 0.8771067566871643,
68
+ "train samples": 1000,
69
+ "train time": 29.468342912026856,
70
+ "eval time": 11.086663477995899,
71
+ "tokens / sec": 7184.625230948821,
72
+ "mem allocated avg": 6894354876.416,
73
+ "mem reserved avg": 11212691603.456,
74
+ "elapsed time": 88.56553585999791
75
+ },
76
+ {
77
+ "step": 500,
78
+ "valid accuracy": 0.38,
79
+ "train loss": 0.6947847135066986,
80
+ "train samples": 2000,
81
+ "train time": 29.13603712292388,
82
+ "eval time": 11.12908834600239,
83
+ "tokens / sec": 7138.753946615206,
84
+ "mem allocated avg": 6887297284.096,
85
+ "mem reserved avg": 11116172279.808,
86
+ "elapsed time": 169.94219922799675
87
+ },
88
+ {
89
+ "step": 750,
90
+ "valid accuracy": 0.42,
91
+ "train loss": 0.6673308206796646,
92
+ "train samples": 3000,
93
+ "train time": 29.74789179801155,
94
+ "eval time": 6.2111000180011615,
95
+ "tokens / sec": 7207.267037805055,
96
+ "mem allocated avg": 6897885888.512,
97
+ "mem reserved avg": 11257109282.816,
98
+ "elapsed time": 247.40845895299572
99
+ },
100
+ {
101
+ "step": 1000,
102
+ "valid accuracy": 0.44,
103
+ "train loss": 0.6480507221221924,
104
+ "train samples": 4000,
105
+ "train time": 29.01437903306214,
106
+ "eval time": 11.063560270995367,
107
+ "tokens / sec": 7180.439731713689,
108
+ "mem allocated avg": 6888501639.168,
109
+ "mem reserved avg": 11141564596.224,
110
+ "elapsed time": 328.43337820599845
111
+ },
112
+ {
113
+ "step": 1250,
114
+ "valid accuracy": 0.42,
115
+ "train loss": 0.6442041766643524,
116
+ "train samples": 5000,
117
+ "train time": 28.86099356606428,
118
+ "eval time": 11.061821620001865,
119
+ "tokens / sec": 7225.600169399779,
120
+ "mem allocated avg": 6888334700.544,
121
+ "mem reserved avg": 11139123511.296,
122
+ "elapsed time": 409.5306018880001
123
+ },
124
+ {
125
+ "step": 1500,
126
+ "valid accuracy": 0.52,
127
+ "train loss": 0.6375475705862045,
128
+ "train samples": 6000,
129
+ "train time": 29.36598393299937,
130
+ "eval time": 6.896059851998871,
131
+ "tokens / sec": 7128.349606047729,
132
+ "mem allocated avg": 6890338080.768,
133
+ "mem reserved avg": 11164893315.072,
134
+ "elapsed time": 487.1438905899995
135
+ },
136
+ {
137
+ "step": 1750,
138
+ "valid accuracy": 0.42,
139
+ "train loss": 0.6282199568748474,
140
+ "train samples": 7000,
141
+ "train time": 29.2208460940019,
142
+ "eval time": 11.139122824002698,
143
+ "tokens / sec": 7164.576936838726,
144
+ "mem allocated avg": 6891485964.288,
145
+ "mem reserved avg": 11174582157.312,
146
+ "elapsed time": 568.6407176649955
147
+ },
148
+ {
149
+ "step": 2000,
150
+ "valid accuracy": 0.44,
151
+ "train loss": 0.628275181055069,
152
+ "train samples": 8000,
153
+ "train time": 28.774674860083906,
154
+ "eval time": 11.096917715003656,
155
+ "tokens / sec": 7218.013791986054,
156
+ "mem allocated avg": 6889055956.992,
157
+ "mem reserved avg": 11126481879.04,
158
+ "elapsed time": 649.4662010969987
159
+ },
160
+ {
161
+ "step": 2250,
162
+ "valid accuracy": 0.5,
163
+ "train loss": 0.6164452042579651,
164
+ "train samples": 9000,
165
+ "train time": 29.666104338008154,
166
+ "eval time": 6.740810982002586,
167
+ "tokens / sec": 7245.575541396888,
168
+ "mem allocated avg": 6899385456.64,
169
+ "mem reserved avg": 11287358603.264,
170
+ "elapsed time": 727.5584506419982
171
+ },
172
+ {
173
+ "step": 2500,
174
+ "valid accuracy": 0.52,
175
+ "train loss": 0.6124898854494095,
176
+ "train samples": 10000,
177
+ "train time": 28.952800227045373,
178
+ "eval time": 11.054138113999215,
179
+ "tokens / sec": 7113.888756349109,
180
+ "mem allocated avg": 6884753041.408,
181
+ "mem reserved avg": 11077492408.32,
182
+ "elapsed time": 808.6757636719994
183
+ },
184
+ {
185
+ "step": 2750,
186
+ "valid accuracy": 0.48,
187
+ "train loss": 0.6010023313760757,
188
+ "train samples": 11000,
189
+ "train time": 29.36040201097785,
190
+ "eval time": 5.933361176998005,
191
+ "tokens / sec": 7216.556500853691,
192
+ "mem allocated avg": 6895703631.872,
193
+ "mem reserved avg": 11229007446.016,
194
+ "elapsed time": 885.2688505609985
195
+ },
196
+ {
197
+ "step": 3000,
198
+ "valid accuracy": 0.36,
199
+ "train loss": 0.590470621585846,
200
+ "train samples": 12000,
201
+ "train time": 29.152743853985157,
202
+ "eval time": 11.051910919995862,
203
+ "tokens / sec": 7159.909236861306,
204
+ "mem allocated avg": 6890226739.2,
205
+ "mem reserved avg": 11156563427.328,
206
+ "elapsed time": 966.2876440099935
207
+ },
208
+ {
209
+ "step": 3250,
210
+ "valid accuracy": 0.46,
211
+ "train loss": 0.5996054347753524,
212
+ "train samples": 13000,
213
+ "train time": 29.23224936202314,
214
+ "eval time": 11.06002619300125,
215
+ "tokens / sec": 7214.668888053154,
216
+ "mem allocated avg": 6892138940.416,
217
+ "mem reserved avg": 11182651998.208,
218
+ "elapsed time": 1047.7634995759945
219
+ },
220
+ {
221
+ "step": 3500,
222
+ "valid accuracy": 0.46,
223
+ "train loss": 0.5810788285732269,
224
+ "train samples": 14000,
225
+ "train time": 29.556202010979177,
226
+ "eval time": 7.767598452002858,
227
+ "tokens / sec": 7096.649289448104,
228
+ "mem allocated avg": 6891370110.976,
229
+ "mem reserved avg": 11166763974.656,
230
+ "elapsed time": 1126.3068484049945
231
+ },
232
+ {
233
+ "step": 3750,
234
+ "valid accuracy": 0.5,
235
+ "train loss": 0.5778432558774949,
236
+ "train samples": 15000,
237
+ "train time": 30.077826159038523,
238
+ "eval time": 11.010653469995304,
239
+ "tokens / sec": 7204.742751493022,
240
+ "mem allocated avg": 6901065279.488,
241
+ "mem reserved avg": 11319788961.792,
242
+ "elapsed time": 1209.0550349339974
243
+ },
244
+ {
245
+ "step": 4000,
246
+ "valid accuracy": 0.4,
247
+ "train loss": 0.5869229323863984,
248
+ "train samples": 16000,
249
+ "train time": 29.213863794990175,
250
+ "eval time": 11.144038623999222,
251
+ "tokens / sec": 6995.753845988955,
252
+ "mem allocated avg": 6883645001.728,
253
+ "mem reserved avg": 11058953584.64,
254
+ "elapsed time": 1290.3985370609953
255
+ },
256
+ {
257
+ "step": 4250,
258
+ "valid accuracy": 0.46,
259
+ "train loss": 0.5733816763162612,
260
+ "train samples": 17000,
261
+ "train time": 29.18649683901458,
262
+ "eval time": 11.153094029003114,
263
+ "tokens / sec": 7242.698607029438,
264
+ "mem allocated avg": 6893432758.272,
265
+ "mem reserved avg": 11193884344.32,
266
+ "elapsed time": 1372.1237251569983
267
+ },
268
+ {
269
+ "step": 4500,
270
+ "valid accuracy": 0.48,
271
+ "train loss": 0.5803762240409851,
272
+ "train samples": 18000,
273
+ "train time": 29.077459994943638,
274
+ "eval time": 11.118935573998897,
275
+ "tokens / sec": 7147.047920834147,
276
+ "mem allocated avg": 6888416004.096,
277
+ "mem reserved avg": 11124485390.336,
278
+ "elapsed time": 1453.4214935309938
279
+ },
280
+ {
281
+ "step": 4750,
282
+ "valid accuracy": 0.48,
283
+ "train loss": 0.5692038584947586,
284
+ "train samples": 19000,
285
+ "train time": 29.40723867896304,
286
+ "eval time": 11.099454375005735,
287
+ "tokens / sec": 7139.024588193769,
288
+ "mem allocated avg": 6890813089.792,
289
+ "mem reserved avg": 11168844349.44,
290
+ "elapsed time": 1535.6791463129994
291
+ },
292
+ {
293
+ "step": 5000,
294
+ "valid accuracy": 0.48,
295
+ "train loss": 0.5775641392469406,
296
+ "train samples": 20000,
297
+ "train time": 28.941933833950316,
298
+ "eval time": 11.18307958800142,
299
+ "tokens / sec": 7196.47834159849,
300
+ "mem allocated avg": 6887869800.448,
301
+ "mem reserved avg": 11118328152.064,
302
+ "elapsed time": 1617.277517963994
303
+ },
304
+ {
305
+ "step": 5000,
306
+ "test accuracy": 0.5079605761940864,
307
+ "train loss": 0.5775641392469406,
308
+ "train samples": 20000,
309
+ "train total tokens": 4198051
310
+ }
311
+ ]
312
+ },
313
+ "meta_info": {
314
+ "model_info": {
315
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
316
+ "created_at": "2024-09-18T15:23:48+00:00"
317
+ },
318
+ "dataset_info": {
319
+ "metamath": {
320
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
321
+ "created_at": "2023-09-21T17:22:46+00:00"
322
+ },
323
+ "gsm8k": {
324
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
325
+ "created_at": "2022-04-12T10:22:10+00:00"
326
+ }
327
+ },
328
+ "package_info": {
329
+ "transformers-version": "4.52.4",
330
+ "transformers-commit-hash": null,
331
+ "peft-version": "0.15.2.dev0",
332
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
333
+ "datasets-version": "3.6.0",
334
+ "datasets-commit-hash": null,
335
+ "bitsandbytes-version": "0.46.0",
336
+ "bitsandbytes-commit-hash": null,
337
+ "torch-version": "2.7.1+cu126",
338
+ "torch-commit-hash": null
339
+ },
340
+ "system_info": {
341
+ "system": "Linux",
342
+ "release": "6.8.0-1029-aws",
343
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
344
+ "machine": "x86_64",
345
+ "processor": "x86_64",
346
+ "gpu": "NVIDIA L40S"
347
+ },
348
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
349
+ }
350
+ }
MetaMathQA/results/fourierft--llama-3.2-3B-default.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T10:18:57+00:00",
4
+ "total_time": 2823.832106703994,
5
+ "experiment_name": "fourierft/llama-3.2-3B-default",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "FOURIERFT",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "n_frequency": 1000,
41
+ "scaling": 300,
42
+ "random_loc_seed": 777,
43
+ "fan_in_fan_out": false,
44
+ "target_modules": [
45
+ "q_proj",
46
+ "v_proj"
47
+ ],
48
+ "exclude_modules": null,
49
+ "bias": "none",
50
+ "modules_to_save": null,
51
+ "layers_to_transform": null,
52
+ "layers_pattern": null,
53
+ "n_frequency_pattern": {},
54
+ "init_weights": false
55
+ },
56
+ "error_msg": ""
57
+ },
58
+ "train_info": {
59
+ "cuda_memory_reserved_avg": 13104129350,
60
+ "cuda_memory_max": 23653777408,
61
+ "cuda_memory_reserved_99th": 19017267937,
62
+ "train_time": 2424.3862988609762,
63
+ "file_size": 231416,
64
+ "num_trainable_params": 56000,
65
+ "num_total_params": 3212805824,
66
+ "status": "success",
67
+ "metrics": [
68
+ {
69
+ "step": 250,
70
+ "valid accuracy": 0.0,
71
+ "train loss": 1.3263031902313231,
72
+ "train samples": 1000,
73
+ "train time": 53.55340486107161,
74
+ "eval time": 19.578013352002017,
75
+ "tokens / sec": 3953.4180982374883,
76
+ "mem allocated avg": 6781303625.728,
77
+ "mem reserved avg": 13152850804.736,
78
+ "elapsed time": 119.84825310099404
79
+ },
80
+ {
81
+ "step": 500,
82
+ "valid accuracy": 0.0,
83
+ "train loss": 1.3399862418174744,
84
+ "train samples": 2000,
85
+ "train time": 52.85717789203045,
86
+ "eval time": 19.544192551999004,
87
+ "tokens / sec": 3935.03793231005,
88
+ "mem allocated avg": 6774035257.344,
89
+ "mem reserved avg": 13043463356.416,
90
+ "elapsed time": 233.5829256769939
91
+ },
92
+ {
93
+ "step": 750,
94
+ "valid accuracy": 0.0,
95
+ "train loss": 1.3045952091217041,
96
+ "train samples": 3000,
97
+ "train time": 53.35706212905643,
98
+ "eval time": 19.607110917990212,
99
+ "tokens / sec": 4018.2309790861696,
100
+ "mem allocated avg": 6783920330.752,
101
+ "mem reserved avg": 13205673869.312,
102
+ "elapsed time": 348.1469791559939
103
+ },
104
+ {
105
+ "step": 1000,
106
+ "valid accuracy": 0.0,
107
+ "train loss": 1.3111453976631164,
108
+ "train samples": 4000,
109
+ "train time": 52.95546973698947,
110
+ "eval time": 19.472347582006478,
111
+ "tokens / sec": 3934.1733919976355,
112
+ "mem allocated avg": 6776025266.176,
113
+ "mem reserved avg": 13077269446.656,
114
+ "elapsed time": 461.81266678999236
115
+ },
116
+ {
117
+ "step": 1250,
118
+ "valid accuracy": 0.0,
119
+ "train loss": 1.299716483592987,
120
+ "train samples": 5000,
121
+ "train time": 52.12036712520057,
122
+ "eval time": 19.626158429004136,
123
+ "tokens / sec": 4001.0846335572023,
124
+ "mem allocated avg": 6775331573.76,
125
+ "mem reserved avg": 13063344357.376,
126
+ "elapsed time": 574.6407375999988
127
+ },
128
+ {
129
+ "step": 1500,
130
+ "valid accuracy": 0.0,
131
+ "train loss": 1.2867344057559966,
132
+ "train samples": 6000,
133
+ "train time": 52.594848359090975,
134
+ "eval time": 19.54386943600548,
135
+ "tokens / sec": 3980.0666135738998,
136
+ "mem allocated avg": 6776458844.16,
137
+ "mem reserved avg": 13093568512.0,
138
+ "elapsed time": 688.0431025519938
139
+ },
140
+ {
141
+ "step": 1750,
142
+ "valid accuracy": 0.0,
143
+ "train loss": 1.2803141210079194,
144
+ "train samples": 7000,
145
+ "train time": 52.98738884186605,
146
+ "eval time": 19.568909612993593,
147
+ "tokens / sec": 3951.0344739725274,
148
+ "mem allocated avg": 6778496358.4,
149
+ "mem reserved avg": 13108768669.696,
150
+ "elapsed time": 801.9154772249894
151
+ },
152
+ {
153
+ "step": 2000,
154
+ "valid accuracy": 0.0,
155
+ "train loss": 1.2766506419181824,
156
+ "train samples": 8000,
157
+ "train time": 52.03297274692159,
158
+ "eval time": 19.525613270001486,
159
+ "tokens / sec": 3991.62279292005,
160
+ "mem allocated avg": 6774647097.344,
161
+ "mem reserved avg": 13051189264.384,
162
+ "elapsed time": 914.5343848449993
163
+ },
164
+ {
165
+ "step": 2250,
166
+ "valid accuracy": 0.0,
167
+ "train loss": 1.2596003375053406,
168
+ "train samples": 9000,
169
+ "train time": 53.934016149127274,
170
+ "eval time": 19.535415460006334,
171
+ "tokens / sec": 3985.388356870549,
172
+ "mem allocated avg": 6785830477.824,
173
+ "mem reserved avg": 13237223424.0,
174
+ "elapsed time": 1029.9007452719961
175
+ },
176
+ {
177
+ "step": 2500,
178
+ "valid accuracy": 0.0,
179
+ "train loss": 1.2684449093341827,
180
+ "train samples": 10000,
181
+ "train time": 52.006629903029534,
182
+ "eval time": 19.470633051998448,
183
+ "tokens / sec": 3960.3989026791724,
184
+ "mem allocated avg": 6771212331.008,
185
+ "mem reserved avg": 12996118052.864,
186
+ "elapsed time": 1142.5889472209965
187
+ },
188
+ {
189
+ "step": 2750,
190
+ "valid accuracy": 0.0,
191
+ "train loss": 1.2548872971534728,
192
+ "train samples": 11000,
193
+ "train time": 53.403087337108445,
194
+ "eval time": 19.463876378998975,
195
+ "tokens / sec": 3967.579601952513,
196
+ "mem allocated avg": 6781916252.16,
197
+ "mem reserved avg": 13168084516.864,
198
+ "elapsed time": 1257.0122518049902
199
+ },
200
+ {
201
+ "step": 3000,
202
+ "valid accuracy": 0.0,
203
+ "train loss": 1.253697858095169,
204
+ "train samples": 12000,
205
+ "train time": 53.20096563108382,
206
+ "eval time": 19.472515105997445,
207
+ "tokens / sec": 3923.443823321214,
208
+ "mem allocated avg": 6777045135.36,
209
+ "mem reserved avg": 13084844359.68,
210
+ "elapsed time": 1370.94780872899
211
+ },
212
+ {
213
+ "step": 3250,
214
+ "valid accuracy": 0.0,
215
+ "train loss": 1.248513156414032,
216
+ "train samples": 13000,
217
+ "train time": 52.962746563891415,
218
+ "eval time": 19.54665829600708,
219
+ "tokens / sec": 3982.06312328573,
220
+ "mem allocated avg": 6779038627.84,
221
+ "mem reserved avg": 13110345728.0,
222
+ "elapsed time": 1484.7621198889974
223
+ },
224
+ {
225
+ "step": 3500,
226
+ "valid accuracy": 0.0,
227
+ "train loss": 1.2477959940433503,
228
+ "train samples": 14000,
229
+ "train time": 52.93443578510778,
230
+ "eval time": 19.444701158994576,
231
+ "tokens / sec": 3962.4489595298505,
232
+ "mem allocated avg": 6776803573.76,
233
+ "mem reserved avg": 13097142059.008,
234
+ "elapsed time": 1598.8772237269877
235
+ },
236
+ {
237
+ "step": 3750,
238
+ "valid accuracy": 0.0,
239
+ "train loss": 1.228544222354889,
240
+ "train samples": 15000,
241
+ "train time": 53.31031796212483,
242
+ "eval time": 19.472959079008433,
243
+ "tokens / sec": 4064.9354249577,
244
+ "mem allocated avg": 6788200585.216,
245
+ "mem reserved avg": 13268999471.104,
246
+ "elapsed time": 1713.6814467679942
247
+ },
248
+ {
249
+ "step": 4000,
250
+ "valid accuracy": 0.0,
251
+ "train loss": 1.2609001460075377,
252
+ "train samples": 16000,
253
+ "train time": 51.9827769130934,
254
+ "eval time": 19.473652824002784,
255
+ "tokens / sec": 3931.552182017475,
256
+ "mem allocated avg": 6770180233.216,
257
+ "mem reserved avg": 12983610638.336,
258
+ "elapsed time": 1826.5604049959948
259
+ },
260
+ {
261
+ "step": 4250,
262
+ "valid accuracy": 0.0,
263
+ "train loss": 1.227214762210846,
264
+ "train samples": 17000,
265
+ "train time": 53.09942602888623,
266
+ "eval time": 19.547112297004787,
267
+ "tokens / sec": 3981.0034836347163,
268
+ "mem allocated avg": 6779591426.048,
269
+ "mem reserved avg": 13132760088.576,
270
+ "elapsed time": 1940.5098487799987
271
+ },
272
+ {
273
+ "step": 4500,
274
+ "valid accuracy": 0.0,
275
+ "train loss": 1.2504195840358734,
276
+ "train samples": 18000,
277
+ "train time": 52.23909889203787,
278
+ "eval time": 19.522137050997117,
279
+ "tokens / sec": 3978.207978462565,
280
+ "mem allocated avg": 6775933241.344,
281
+ "mem reserved avg": 13056079822.848,
282
+ "elapsed time": 2053.2267840139975
283
+ },
284
+ {
285
+ "step": 4750,
286
+ "valid accuracy": 0.0,
287
+ "train loss": 1.2349513354301453,
288
+ "train samples": 19000,
289
+ "train time": 53.36620609794045,
290
+ "eval time": 19.541859832999762,
291
+ "tokens / sec": 3933.931514912433,
292
+ "mem allocated avg": 6777532579.84,
293
+ "mem reserved avg": 13101604798.464,
294
+ "elapsed time": 2167.8329333979927
295
+ },
296
+ {
297
+ "step": 5000,
298
+ "valid accuracy": 0.0,
299
+ "train loss": 1.2480293517112733,
300
+ "train samples": 20000,
301
+ "train time": 52.46977503092785,
302
+ "eval time": 19.44991449599911,
303
+ "tokens / sec": 3969.5234042309344,
304
+ "mem allocated avg": 6773533165.568,
305
+ "mem reserved avg": 13049645760.512,
306
+ "elapsed time": 2281.220151823989
307
+ },
308
+ {
309
+ "step": 5000,
310
+ "test accuracy": 0.000758150113722517,
311
+ "train loss": 1.2480293517112733,
312
+ "train samples": 20000,
313
+ "train total tokens": 4198051
314
+ }
315
+ ]
316
+ },
317
+ "meta_info": {
318
+ "model_info": {
319
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
320
+ "created_at": "2024-09-18T15:23:48+00:00"
321
+ },
322
+ "dataset_info": {
323
+ "metamath": {
324
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
325
+ "created_at": "2023-09-21T17:22:46+00:00"
326
+ },
327
+ "gsm8k": {
328
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
329
+ "created_at": "2022-04-12T10:22:10+00:00"
330
+ }
331
+ },
332
+ "package_info": {
333
+ "transformers-version": "4.52.4",
334
+ "transformers-commit-hash": null,
335
+ "peft-version": "0.15.2.dev0",
336
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
337
+ "datasets-version": "3.6.0",
338
+ "datasets-commit-hash": null,
339
+ "bitsandbytes-version": "0.46.0",
340
+ "bitsandbytes-commit-hash": null,
341
+ "torch-version": "2.7.1+cu126",
342
+ "torch-commit-hash": null
343
+ },
344
+ "system_info": {
345
+ "system": "Linux",
346
+ "release": "6.8.0-1029-aws",
347
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
348
+ "machine": "x86_64",
349
+ "processor": "x86_64",
350
+ "gpu": "NVIDIA L40S"
351
+ },
352
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
353
+ }
354
+ }
MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T09:31:48+00:00",
4
+ "total_time": 2824.376998209991,
5
+ "experiment_name": "fourierft/llama-3.2-3B-n_frequency-5000",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "FOURIERFT",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "n_frequency": 5000,
41
+ "scaling": 300,
42
+ "random_loc_seed": 777,
43
+ "fan_in_fan_out": false,
44
+ "target_modules": [
45
+ "v_proj",
46
+ "q_proj"
47
+ ],
48
+ "exclude_modules": null,
49
+ "bias": "none",
50
+ "modules_to_save": null,
51
+ "layers_to_transform": null,
52
+ "layers_pattern": null,
53
+ "n_frequency_pattern": {},
54
+ "init_weights": false
55
+ },
56
+ "error_msg": ""
57
+ },
58
+ "train_info": {
59
+ "cuda_memory_reserved_avg": 13111221498,
60
+ "cuda_memory_max": 23681040384,
61
+ "cuda_memory_reserved_99th": 19054869872,
62
+ "train_time": 2421.913372163006,
63
+ "file_size": 1127472,
64
+ "num_trainable_params": 280000,
65
+ "num_total_params": 3213029824,
66
+ "status": "success",
67
+ "metrics": [
68
+ {
69
+ "step": 250,
70
+ "valid accuracy": 0.0,
71
+ "train loss": 1.3800132541656494,
72
+ "train samples": 1000,
73
+ "train time": 53.57064967796032,
74
+ "eval time": 19.631924207002157,
75
+ "tokens / sec": 3952.1454616053315,
76
+ "mem allocated avg": 6784830552.064,
77
+ "mem reserved avg": 13158731218.944,
78
+ "elapsed time": 119.20255395398999
79
+ },
80
+ {
81
+ "step": 500,
82
+ "valid accuracy": 0.0,
83
+ "train loss": 1.3702282276153563,
84
+ "train samples": 2000,
85
+ "train time": 53.00863014489005,
86
+ "eval time": 19.629790833001607,
87
+ "tokens / sec": 3923.7950392508,
88
+ "mem allocated avg": 6777176354.816,
89
+ "mem reserved avg": 13048941117.44,
90
+ "elapsed time": 232.4386439989903
91
+ },
92
+ {
93
+ "step": 750,
94
+ "valid accuracy": 0.0,
95
+ "train loss": 1.3024170677661895,
96
+ "train samples": 3000,
97
+ "train time": 53.97298614999454,
98
+ "eval time": 19.64192995200574,
99
+ "tokens / sec": 3972.3760957780855,
100
+ "mem allocated avg": 6787548153.856,
101
+ "mem reserved avg": 13211654946.816,
102
+ "elapsed time": 346.9217278779979
103
+ },
104
+ {
105
+ "step": 1000,
106
+ "valid accuracy": 0.0,
107
+ "train loss": 1.2704877371788026,
108
+ "train samples": 4000,
109
+ "train time": 52.95541349705309,
110
+ "eval time": 19.62998814698949,
111
+ "tokens / sec": 3934.1775701854103,
112
+ "mem allocated avg": 6779591346.176,
113
+ "mem reserved avg": 13082126450.688,
114
+ "elapsed time": 460.14450727400254
115
+ },
116
+ {
117
+ "step": 1250,
118
+ "valid accuracy": 0.0,
119
+ "train loss": 1.2236453666687013,
120
+ "train samples": 5000,
121
+ "train time": 53.36593960013124,
122
+ "eval time": 19.652927816001466,
123
+ "tokens / sec": 3907.698460152047,
124
+ "mem allocated avg": 6779029788.672,
125
+ "mem reserved avg": 13073486184.448,
126
+ "elapsed time": 573.5348878969962
127
+ },
128
+ {
129
+ "step": 1500,
130
+ "valid accuracy": 0.0,
131
+ "train loss": 1.1792121708393097,
132
+ "train samples": 6000,
133
+ "train time": 53.3776921518147,
134
+ "eval time": 19.616937039012555,
135
+ "tokens / sec": 3921.69446750581,
136
+ "mem allocated avg": 6779851802.624,
137
+ "mem reserved avg": 13098995941.376,
138
+ "elapsed time": 686.9838123609952
139
+ },
140
+ {
141
+ "step": 1750,
142
+ "valid accuracy": 0.02,
143
+ "train loss": 1.1485692322254182,
144
+ "train samples": 7000,
145
+ "train time": 53.188338823019876,
146
+ "eval time": 19.653264298991417,
147
+ "tokens / sec": 3936.1071361264494,
148
+ "mem allocated avg": 6782223466.496,
149
+ "mem reserved avg": 13116058370.048,
150
+ "elapsed time": 800.3354816049978
151
+ },
152
+ {
153
+ "step": 2000,
154
+ "valid accuracy": 0.06,
155
+ "train loss": 1.1230667443275453,
156
+ "train samples": 8000,
157
+ "train time": 53.074023688037414,
158
+ "eval time": 19.656479785000556,
159
+ "tokens / sec": 3913.3268135239105,
160
+ "mem allocated avg": 6778141935.616,
161
+ "mem reserved avg": 13055400345.6,
162
+ "elapsed time": 913.367253695993
163
+ },
164
+ {
165
+ "step": 2250,
166
+ "valid accuracy": 0.1,
167
+ "train loss": 1.094045166015625,
168
+ "train samples": 9000,
169
+ "train time": 54.34830153394432,
170
+ "eval time": 19.628162662993418,
171
+ "tokens / sec": 3955.008600696563,
172
+ "mem allocated avg": 6789509545.984,
173
+ "mem reserved avg": 13248556433.408,
174
+ "elapsed time": 1028.463336018991
175
+ },
176
+ {
177
+ "step": 2500,
178
+ "valid accuracy": 0.12,
179
+ "train loss": 1.077717797279358,
180
+ "train samples": 10000,
181
+ "train time": 52.1458756570355,
182
+ "eval time": 19.611369335994823,
183
+ "tokens / sec": 3949.823402231256,
184
+ "mem allocated avg": 6775024920.576,
185
+ "mem reserved avg": 13002233348.096,
186
+ "elapsed time": 1140.4990660109906
187
+ },
188
+ {
189
+ "step": 2750,
190
+ "valid accuracy": 0.12,
191
+ "train loss": 1.0569540388584138,
192
+ "train samples": 11000,
193
+ "train time": 53.227410834049806,
194
+ "eval time": 19.625236430001678,
195
+ "tokens / sec": 3980.6745562092756,
196
+ "mem allocated avg": 6785537161.216,
197
+ "mem reserved avg": 13177051938.816,
198
+ "elapsed time": 1254.066401210992
199
+ },
200
+ {
201
+ "step": 3000,
202
+ "valid accuracy": 0.12,
203
+ "train loss": 1.0361379137039184,
204
+ "train samples": 12000,
205
+ "train time": 53.65395914198598,
206
+ "eval time": 19.719437510997523,
207
+ "tokens / sec": 3890.3186892066865,
208
+ "mem allocated avg": 6780720910.336,
209
+ "mem reserved avg": 13092201168.896,
210
+ "elapsed time": 1367.8724600419955
211
+ },
212
+ {
213
+ "step": 3250,
214
+ "valid accuracy": 0.16,
215
+ "train loss": 1.0240549674034118,
216
+ "train samples": 13000,
217
+ "train time": 52.97706237102102,
218
+ "eval time": 19.7029277440015,
219
+ "tokens / sec": 3980.9870642311216,
220
+ "mem allocated avg": 6782688188.416,
221
+ "mem reserved avg": 13119816466.432,
222
+ "elapsed time": 1481.1549517469975
223
+ },
224
+ {
225
+ "step": 3500,
226
+ "valid accuracy": 0.18,
227
+ "train loss": 1.0098259932994842,
228
+ "train samples": 14000,
229
+ "train time": 52.869576787008555,
230
+ "eval time": 19.597270865997416,
231
+ "tokens / sec": 3967.3099870839346,
232
+ "mem allocated avg": 6780575592.448,
233
+ "mem reserved avg": 13102678540.288,
234
+ "elapsed time": 1594.3849144269916
235
+ },
236
+ {
237
+ "step": 3750,
238
+ "valid accuracy": 0.22,
239
+ "train loss": 0.9942408270835876,
240
+ "train samples": 15000,
241
+ "train time": 54.702630093932385,
242
+ "eval time": 19.623511597994366,
243
+ "tokens / sec": 3961.4731435744384,
244
+ "mem allocated avg": 6792074147.84,
245
+ "mem reserved avg": 13278612815.872,
246
+ "elapsed time": 1709.9712875620025
247
+ },
248
+ {
249
+ "step": 4000,
250
+ "valid accuracy": 0.16,
251
+ "train loss": 1.0123027296066285,
252
+ "train samples": 16000,
253
+ "train time": 52.456372838059906,
254
+ "eval time": 19.68401901901234,
255
+ "tokens / sec": 3896.056645603915,
256
+ "mem allocated avg": 6773958766.592,
257
+ "mem reserved avg": 12989172285.44,
258
+ "elapsed time": 1822.6668115109933
259
+ },
260
+ {
261
+ "step": 4250,
262
+ "valid accuracy": 0.24,
263
+ "train loss": 0.9849327182769776,
264
+ "train samples": 17000,
265
+ "train time": 53.25562528491719,
266
+ "eval time": 19.648335694990237,
267
+ "tokens / sec": 3969.3271625123257,
268
+ "mem allocated avg": 6783509901.312,
269
+ "mem reserved avg": 13139588415.488,
270
+ "elapsed time": 1936.0694442329986
271
+ },
272
+ {
273
+ "step": 4500,
274
+ "valid accuracy": 0.18,
275
+ "train loss": 0.9994378657341003,
276
+ "train samples": 18000,
277
+ "train time": 53.01732904899109,
278
+ "eval time": 19.688141086997348,
279
+ "tokens / sec": 3919.8127051621955,
280
+ "mem allocated avg": 6779470948.352,
281
+ "mem reserved avg": 13063528906.752,
282
+ "elapsed time": 2048.985867203999
283
+ },
284
+ {
285
+ "step": 4750,
286
+ "valid accuracy": 0.16,
287
+ "train loss": 0.9892346875667573,
288
+ "train samples": 19000,
289
+ "train time": 53.11992502908106,
290
+ "eval time": 19.68838914000662,
291
+ "tokens / sec": 3952.1704875348883,
292
+ "mem allocated avg": 6781060145.152,
293
+ "mem reserved avg": 13109733359.616,
294
+ "elapsed time": 2162.7099456459982
295
+ },
296
+ {
297
+ "step": 5000,
298
+ "valid accuracy": 0.2,
299
+ "train loss": 0.9978675174713135,
300
+ "train samples": 20000,
301
+ "train time": 52.76285280592856,
302
+ "eval time": 19.634052573994268,
303
+ "tokens / sec": 3947.4741967818154,
304
+ "mem allocated avg": 6777472888.832,
305
+ "mem reserved avg": 13055861719.04,
306
+ "elapsed time": 2275.669019541994
307
+ },
308
+ {
309
+ "step": 5000,
310
+ "test accuracy": 0.1197877179681577,
311
+ "train loss": 0.9978675174713135,
312
+ "train samples": 20000,
313
+ "train total tokens": 4198051
314
+ }
315
+ ]
316
+ },
317
+ "meta_info": {
318
+ "model_info": {
319
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
320
+ "created_at": "2024-09-18T15:23:48+00:00"
321
+ },
322
+ "dataset_info": {
323
+ "metamath": {
324
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
325
+ "created_at": "2023-09-21T17:22:46+00:00"
326
+ },
327
+ "gsm8k": {
328
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
329
+ "created_at": "2022-04-12T10:22:10+00:00"
330
+ }
331
+ },
332
+ "package_info": {
333
+ "transformers-version": "4.52.4",
334
+ "transformers-commit-hash": null,
335
+ "peft-version": "0.15.2.dev0",
336
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
337
+ "datasets-version": "3.6.0",
338
+ "datasets-commit-hash": null,
339
+ "bitsandbytes-version": "0.46.0",
340
+ "bitsandbytes-commit-hash": null,
341
+ "torch-version": "2.7.1+cu126",
342
+ "torch-commit-hash": null
343
+ },
344
+ "system_info": {
345
+ "system": "Linux",
346
+ "release": "6.8.0-1029-aws",
347
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
348
+ "machine": "x86_64",
349
+ "processor": "x86_64",
350
+ "gpu": "NVIDIA L40S"
351
+ },
352
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
353
+ }
354
+ }
MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-20T18:02:43+00:00",
4
+ "total_time": 3274.9747593409993,
5
+ "experiment_name": "full-finetuning/llama-3.2-3B-lr_0.00001",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 1e-05
22
+ },
23
+ "lr_scheduler": "cosine",
24
+ "use_amp": false,
25
+ "autocast_adapter_dtype": true,
26
+ "generation_kwargs": {
27
+ "max_length": 800,
28
+ "max_new_tokens": 300
29
+ },
30
+ "attn_implementation": null
31
+ },
32
+ "peft_config": null,
33
+ "error_msg": ""
34
+ },
35
+ "train_info": {
36
+ "cuda_memory_reserved_avg": 33098872284,
37
+ "cuda_memory_max": 37241225216,
38
+ "cuda_memory_reserved_99th": 33573390254,
39
+ "train_time": 3111.3685010060144,
40
+ "file_size": 6425499648,
41
+ "num_trainable_params": 3212749824,
42
+ "num_total_params": 3212749824,
43
+ "status": "success",
44
+ "metrics": [
45
+ {
46
+ "step": 250,
47
+ "valid accuracy": 0.3,
48
+ "train loss": 1.0749022357463838,
49
+ "train samples": 1000,
50
+ "train time": 90.81602771116013,
51
+ "eval time": 10.388541491003707,
52
+ "tokens / sec": 2331.295535996918,
53
+ "mem allocated avg": 26069449254.912,
54
+ "mem reserved avg": 33116739600.384,
55
+ "elapsed time": 162.0596859770012
56
+ },
57
+ {
58
+ "step": 500,
59
+ "valid accuracy": 0.4,
60
+ "train loss": 0.7238605101108551,
61
+ "train samples": 2000,
62
+ "train time": 90.41340426202805,
63
+ "eval time": 10.403155545005575,
64
+ "tokens / sec": 2300.488535938847,
65
+ "mem allocated avg": 26062513567.744,
66
+ "mem reserved avg": 33090961408.0,
67
+ "elapsed time": 315.86630137299653
68
+ },
69
+ {
70
+ "step": 750,
71
+ "valid accuracy": 0.42,
72
+ "train loss": 0.6648618497848511,
73
+ "train samples": 3000,
74
+ "train time": 91.4961106939445,
75
+ "eval time": 5.590419113999815,
76
+ "tokens / sec": 2343.27993150631,
77
+ "mem allocated avg": 26071394062.336,
78
+ "mem reserved avg": 33094367182.848,
79
+ "elapsed time": 465.79339110500587
80
+ },
81
+ {
82
+ "step": 1000,
83
+ "valid accuracy": 0.42,
84
+ "train loss": 0.6407654472589492,
85
+ "train samples": 4000,
86
+ "train time": 89.8546926038689,
87
+ "eval time": 10.434167744999286,
88
+ "tokens / sec": 2318.5878662838986,
89
+ "mem allocated avg": 26063373086.72,
90
+ "mem reserved avg": 33094367182.848,
91
+ "elapsed time": 618.5050604129938
92
+ },
93
+ {
94
+ "step": 1250,
95
+ "valid accuracy": 0.46,
96
+ "train loss": 0.6343449921607971,
97
+ "train samples": 5000,
98
+ "train time": 90.3596406209981,
99
+ "eval time": 5.810965301003307,
100
+ "tokens / sec": 2307.86663787969,
101
+ "mem allocated avg": 26063789404.16,
102
+ "mem reserved avg": 33081876545.536,
103
+ "elapsed time": 766.6042792719963
104
+ },
105
+ {
106
+ "step": 1500,
107
+ "valid accuracy": 0.54,
108
+ "train loss": 0.6249808443784713,
109
+ "train samples": 6000,
110
+ "train time": 90.81503154609527,
111
+ "eval time": 10.435444819988334,
112
+ "tokens / sec": 2305.025901948283,
113
+ "mem allocated avg": 26066218485.76,
114
+ "mem reserved avg": 33089409515.52,
115
+ "elapsed time": 920.292813491993
116
+ },
117
+ {
118
+ "step": 1750,
119
+ "valid accuracy": 0.46,
120
+ "train loss": 0.6174132014513016,
121
+ "train samples": 7000,
122
+ "train time": 90.68820026615867,
123
+ "eval time": 10.286707318999106,
124
+ "tokens / sec": 2308.5142210956765,
125
+ "mem allocated avg": 26065828059.136,
126
+ "mem reserved avg": 33101774323.712,
127
+ "elapsed time": 1073.8488811849966
128
+ },
129
+ {
130
+ "step": 2000,
131
+ "valid accuracy": 0.42,
132
+ "train loss": 0.618268838763237,
133
+ "train samples": 8000,
134
+ "train time": 90.44998777209548,
135
+ "eval time": 10.380125819006935,
136
+ "tokens / sec": 2296.252383398064,
137
+ "mem allocated avg": 26062920781.824,
138
+ "mem reserved avg": 33096330117.12,
139
+ "elapsed time": 1227.2062568730034
140
+ },
141
+ {
142
+ "step": 2250,
143
+ "valid accuracy": 0.5,
144
+ "train loss": 0.6107994567155838,
145
+ "train samples": 9000,
146
+ "train time": 91.58726547904371,
147
+ "eval time": 10.372407121991273,
148
+ "tokens / sec": 2346.920162707366,
149
+ "mem allocated avg": 26073357961.216,
150
+ "mem reserved avg": 33114382401.536,
151
+ "elapsed time": 1381.3805919409933
152
+ },
153
+ {
154
+ "step": 2500,
155
+ "valid accuracy": 0.54,
156
+ "train loss": 0.6089532144069671,
157
+ "train samples": 10000,
158
+ "train time": 89.29193754095468,
159
+ "eval time": 10.391672718993505,
160
+ "tokens / sec": 2306.6696240691504,
161
+ "mem allocated avg": 26059719045.12,
162
+ "mem reserved avg": 33086842601.472,
163
+ "elapsed time": 1533.778675338006
164
+ },
165
+ {
166
+ "step": 2750,
167
+ "valid accuracy": 0.52,
168
+ "train loss": 0.6020698472261429,
169
+ "train samples": 11000,
170
+ "train time": 90.41624103189679,
171
+ "eval time": 10.369720178001444,
172
+ "tokens / sec": 2343.3953632871467,
173
+ "mem allocated avg": 26070059464.704,
174
+ "mem reserved avg": 33107805732.864,
175
+ "elapsed time": 1686.671367884992
176
+ },
177
+ {
178
+ "step": 3000,
179
+ "valid accuracy": 0.5,
180
+ "train loss": 0.5949549045562744,
181
+ "train samples": 12000,
182
+ "train time": 90.9437831780233,
183
+ "eval time": 7.315949440002441,
184
+ "tokens / sec": 2295.165130654474,
185
+ "mem allocated avg": 26064854972.416,
186
+ "mem reserved avg": 33098074947.584,
187
+ "elapsed time": 1837.2926549609983
188
+ },
189
+ {
190
+ "step": 3250,
191
+ "valid accuracy": 0.48,
192
+ "train loss": 0.6066494225263596,
193
+ "train samples": 13000,
194
+ "train time": 90.87308476005273,
195
+ "eval time": 5.963120047992561,
196
+ "tokens / sec": 2320.8302057410824,
197
+ "mem allocated avg": 26066388537.344,
198
+ "mem reserved avg": 33098318217.216,
199
+ "elapsed time": 1986.6408478410012
200
+ },
201
+ {
202
+ "step": 3500,
203
+ "valid accuracy": 0.48,
204
+ "train loss": 0.592242598772049,
205
+ "train samples": 14000,
206
+ "train time": 90.65281462905114,
207
+ "eval time": 7.1309342330059735,
208
+ "tokens / sec": 2313.7726154261322,
209
+ "mem allocated avg": 26065652588.544,
210
+ "mem reserved avg": 33100457312.256,
211
+ "elapsed time": 2137.073564691993
212
+ },
213
+ {
214
+ "step": 3750,
215
+ "valid accuracy": 0.48,
216
+ "train loss": 0.5925718579292297,
217
+ "train samples": 15000,
218
+ "train time": 91.80342563094746,
219
+ "eval time": 5.844810517999576,
220
+ "tokens / sec": 2360.5110431407275,
221
+ "mem allocated avg": 26075058659.328,
222
+ "mem reserved avg": 33131771985.92,
223
+ "elapsed time": 2287.0305021950044
224
+ },
225
+ {
226
+ "step": 4000,
227
+ "valid accuracy": 0.5,
228
+ "train loss": 0.6050453131198883,
229
+ "train samples": 16000,
230
+ "train time": 89.85742108603881,
231
+ "eval time": 5.86809825799719,
232
+ "tokens / sec": 2274.414261280792,
233
+ "mem allocated avg": 26058425257.984,
234
+ "mem reserved avg": 33098662150.144,
235
+ "elapsed time": 2435.1958582270017
236
+ },
237
+ {
238
+ "step": 4250,
239
+ "valid accuracy": 0.48,
240
+ "train loss": 0.5929686036109925,
241
+ "train samples": 17000,
242
+ "train time": 90.97368233802263,
243
+ "eval time": 5.8907580230006715,
244
+ "tokens / sec": 2323.6280489841133,
245
+ "mem allocated avg": 26067367372.8,
246
+ "mem reserved avg": 33099207409.664,
247
+ "elapsed time": 2584.8373482140014
248
+ },
249
+ {
250
+ "step": 4500,
251
+ "valid accuracy": 0.48,
252
+ "train loss": 0.6010294322967529,
253
+ "train samples": 18000,
254
+ "train time": 90.13679483698797,
255
+ "eval time": 6.106882603999111,
256
+ "tokens / sec": 2305.5845326632484,
257
+ "mem allocated avg": 26064599832.576,
258
+ "mem reserved avg": 33092253253.632,
259
+ "elapsed time": 2733.494644669001
260
+ },
261
+ {
262
+ "step": 4750,
263
+ "valid accuracy": 0.5,
264
+ "train loss": 0.5936577550172806,
265
+ "train samples": 19000,
266
+ "train time": 90.74229130300228,
267
+ "eval time": 5.885364143003244,
268
+ "tokens / sec": 2313.5739354319567,
269
+ "mem allocated avg": 26065537388.544,
270
+ "mem reserved avg": 33100717359.104,
271
+ "elapsed time": 2882.6415541759925
272
+ },
273
+ {
274
+ "step": 5000,
275
+ "valid accuracy": 0.5,
276
+ "train loss": 0.5987544150352478,
277
+ "train samples": 20000,
278
+ "train time": 90.54863398504676,
279
+ "eval time": 5.88336711798911,
280
+ "tokens / sec": 2300.2003545895063,
281
+ "mem allocated avg": 26062803286.016,
282
+ "mem reserved avg": 33083126448.128,
283
+ "elapsed time": 3031.523533478001
284
+ },
285
+ {
286
+ "step": 5000,
287
+ "test accuracy": 0.5003790750568613,
288
+ "train loss": 0.5987544150352478,
289
+ "train samples": 20000,
290
+ "train total tokens": 4198051
291
+ }
292
+ ]
293
+ },
294
+ "meta_info": {
295
+ "model_info": {
296
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
297
+ "created_at": "2024-09-18T15:23:48+00:00"
298
+ },
299
+ "dataset_info": {
300
+ "metamath": {
301
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
302
+ "created_at": "2023-09-21T17:22:46+00:00"
303
+ },
304
+ "gsm8k": {
305
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
306
+ "created_at": "2022-04-12T10:22:10+00:00"
307
+ }
308
+ },
309
+ "package_info": {
310
+ "transformers-version": "4.52.4",
311
+ "transformers-commit-hash": null,
312
+ "peft-version": "0.15.2.dev0",
313
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
314
+ "datasets-version": "3.6.0",
315
+ "datasets-commit-hash": null,
316
+ "bitsandbytes-version": "0.46.0",
317
+ "bitsandbytes-commit-hash": null,
318
+ "torch-version": "2.7.1+cu126",
319
+ "torch-commit-hash": null
320
+ },
321
+ "system_info": {
322
+ "system": "Linux",
323
+ "release": "6.8.0-1029-aws",
324
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
325
+ "machine": "x86_64",
326
+ "processor": "x86_64",
327
+ "gpu": "NVIDIA L40S"
328
+ },
329
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
330
+ }
331
+ }
MetaMathQA/results/ia3--llama-3.2-3B-default.json ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_info": {
3
+ "created_at": "2025-06-19T21:59:33+00:00",
4
+ "total_time": 2004.8640038169979,
5
+ "experiment_name": "ia3/llama-3.2-3B-default",
6
+ "peft_branch": "main",
7
+ "train_config": {
8
+ "model_id": "meta-llama/Llama-3.2-3B",
9
+ "dtype": "bfloat16",
10
+ "max_seq_length": 768,
11
+ "batch_size": 4,
12
+ "batch_size_eval": 50,
13
+ "max_steps": 5000,
14
+ "eval_steps": 250,
15
+ "compile": false,
16
+ "query_template": "Question: {query} Think step by step.\nAnswer:",
17
+ "seed": 0,
18
+ "grad_norm_clip": 1.0,
19
+ "optimizer_type": "AdamW",
20
+ "optimizer_kwargs": {
21
+ "lr": 0.0001,
22
+ "weight_decay": 0.1
23
+ },
24
+ "lr_scheduler": "cosine",
25
+ "use_amp": false,
26
+ "autocast_adapter_dtype": true,
27
+ "generation_kwargs": {
28
+ "max_length": 800,
29
+ "max_new_tokens": 300
30
+ },
31
+ "attn_implementation": null
32
+ },
33
+ "peft_config": {
34
+ "task_type": null,
35
+ "peft_type": "IA3",
36
+ "auto_mapping": null,
37
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
38
+ "revision": null,
39
+ "inference_mode": false,
40
+ "target_modules": [
41
+ "down_proj",
42
+ "v_proj",
43
+ "k_proj"
44
+ ],
45
+ "exclude_modules": null,
46
+ "feedforward_modules": [
47
+ "down_proj"
48
+ ],
49
+ "fan_in_fan_out": false,
50
+ "modules_to_save": null,
51
+ "init_ia3_weights": true
52
+ },
53
+ "error_msg": ""
54
+ },
55
+ "train_info": {
56
+ "cuda_memory_reserved_avg": 12023227429,
57
+ "cuda_memory_max": 23137878016,
58
+ "cuda_memory_reserved_99th": 18398566154,
59
+ "train_time": 1782.9318781230104,
60
+ "file_size": 1157064,
61
+ "num_trainable_params": 286720,
62
+ "num_total_params": 3213036544,
63
+ "status": "success",
64
+ "metrics": [
65
+ {
66
+ "step": 250,
67
+ "valid accuracy": 0.0,
68
+ "train loss": 1.3155810165405273,
69
+ "train samples": 1000,
70
+ "train time": 30.56459548201383,
71
+ "eval time": 10.972947114001727,
72
+ "tokens / sec": 6926.936105684404,
73
+ "mem allocated avg": 6780994971.648,
74
+ "mem reserved avg": 12076433014.784,
75
+ "elapsed time": 90.53726772200025
76
+ },
77
+ {
78
+ "step": 500,
79
+ "valid accuracy": 0.0,
80
+ "train loss": 1.205229633808136,
81
+ "train samples": 2000,
82
+ "train time": 30.221456803970796,
83
+ "eval time": 10.954313254995213,
84
+ "tokens / sec": 6882.361805029583,
85
+ "mem allocated avg": 6773721065.472,
86
+ "mem reserved avg": 11963673346.048,
87
+ "elapsed time": 175.07058417100052
88
+ },
89
+ {
90
+ "step": 750,
91
+ "valid accuracy": 0.1,
92
+ "train loss": 1.0194582087993622,
93
+ "train samples": 3000,
94
+ "train time": 30.774312397006724,
95
+ "eval time": 10.944943730006344,
96
+ "tokens / sec": 6966.881899231445,
97
+ "mem allocated avg": 6784231882.752,
98
+ "mem reserved avg": 12126680776.704,
99
+ "elapsed time": 260.540154495
100
+ },
101
+ {
102
+ "step": 1000,
103
+ "valid accuracy": 0.24,
104
+ "train loss": 0.9196457831859589,
105
+ "train samples": 4000,
106
+ "train time": 30.61534244806535,
107
+ "eval time": 10.960088267995161,
108
+ "tokens / sec": 6804.95409624808,
109
+ "mem allocated avg": 6775492155.392,
110
+ "mem reserved avg": 11986893012.992,
111
+ "elapsed time": 345.30987053900026
112
+ },
113
+ {
114
+ "step": 1250,
115
+ "valid accuracy": 0.32,
116
+ "train loss": 0.8685842225551605,
117
+ "train samples": 5000,
118
+ "train time": 29.97266351111466,
119
+ "eval time": 10.924794500999269,
120
+ "tokens / sec": 6957.606551138459,
121
+ "mem allocated avg": 6775089207.296,
122
+ "mem reserved avg": 11983428517.888,
123
+ "elapsed time": 429.5542291879974
124
+ },
125
+ {
126
+ "step": 1500,
127
+ "valid accuracy": 0.32,
128
+ "train loss": 0.8332846148014068,
129
+ "train samples": 6000,
130
+ "train time": 29.98314001694962,
131
+ "eval time": 10.942266878999362,
132
+ "tokens / sec": 6981.6236685572,
133
+ "mem allocated avg": 6776724867.072,
134
+ "mem reserved avg": 12008594341.888,
135
+ "elapsed time": 513.8152235820016
136
+ },
137
+ {
138
+ "step": 1750,
139
+ "valid accuracy": 0.32,
140
+ "train loss": 0.8169269208908081,
141
+ "train samples": 7000,
142
+ "train time": 30.245623568014707,
143
+ "eval time": 10.940915298000618,
144
+ "tokens / sec": 6921.8278647558345,
145
+ "mem allocated avg": 6777912934.4,
146
+ "mem reserved avg": 12032065667.072,
147
+ "elapsed time": 598.2868188970024
148
+ },
149
+ {
150
+ "step": 2000,
151
+ "valid accuracy": 0.32,
152
+ "train loss": 0.8072074156999588,
153
+ "train samples": 8000,
154
+ "train time": 30.292844633964705,
155
+ "eval time": 10.95617212200159,
156
+ "tokens / sec": 6856.272578875894,
157
+ "mem allocated avg": 6775099170.816,
158
+ "mem reserved avg": 11967473385.472,
159
+ "elapsed time": 682.7948923380027
160
+ },
161
+ {
162
+ "step": 2250,
163
+ "valid accuracy": 0.32,
164
+ "train loss": 0.7952859619855881,
165
+ "train samples": 9000,
166
+ "train time": 31.20892413101683,
167
+ "eval time": 10.942549917002907,
168
+ "tokens / sec": 6887.388975590319,
169
+ "mem allocated avg": 6786161477.632,
170
+ "mem reserved avg": 12167709458.432,
171
+ "elapsed time": 768.9645714229991
172
+ },
173
+ {
174
+ "step": 2500,
175
+ "valid accuracy": 0.28,
176
+ "train loss": 0.7890167078971863,
177
+ "train samples": 10000,
178
+ "train time": 30.187670495011844,
179
+ "eval time": 10.954304017002869,
180
+ "tokens / sec": 6822.884860692832,
181
+ "mem allocated avg": 6771082014.72,
182
+ "mem reserved avg": 11910984499.2,
183
+ "elapsed time": 853.427360558002
184
+ },
185
+ {
186
+ "step": 2750,
187
+ "valid accuracy": 0.3,
188
+ "train loss": 0.7823473591804504,
189
+ "train samples": 11000,
190
+ "train time": 30.410061570059042,
191
+ "eval time": 10.93302121299348,
192
+ "tokens / sec": 6967.4636965751015,
193
+ "mem allocated avg": 6782254225.408,
194
+ "mem reserved avg": 12090903363.584,
195
+ "elapsed time": 938.3584665200033
196
+ },
197
+ {
198
+ "step": 3000,
199
+ "valid accuracy": 0.24,
200
+ "train loss": 0.7709820411205291,
201
+ "train samples": 12000,
202
+ "train time": 30.02989622000314,
203
+ "eval time": 10.940404225999373,
204
+ "tokens / sec": 6950.773271769175,
205
+ "mem allocated avg": 6776725577.728,
206
+ "mem reserved avg": 12003133358.08,
207
+ "elapsed time": 1022.4627897890023
208
+ },
209
+ {
210
+ "step": 3250,
211
+ "valid accuracy": 0.3,
212
+ "train loss": 0.7755767168998718,
213
+ "train samples": 13000,
214
+ "train time": 30.172652364024543,
215
+ "eval time": 10.940153044000908,
216
+ "tokens / sec": 6989.806446431653,
217
+ "mem allocated avg": 6778589339.648,
218
+ "mem reserved avg": 12038298402.816,
219
+ "elapsed time": 1107.0076802080002
220
+ },
221
+ {
222
+ "step": 3500,
223
+ "valid accuracy": 0.34,
224
+ "train loss": 0.7658302361965179,
225
+ "train samples": 14000,
226
+ "train time": 30.384311634006735,
227
+ "eval time": 10.941136569999799,
228
+ "tokens / sec": 6903.233567590308,
229
+ "mem allocated avg": 6777534660.608,
230
+ "mem reserved avg": 12020623605.76,
231
+ "elapsed time": 1191.893303306002
232
+ },
233
+ {
234
+ "step": 3750,
235
+ "valid accuracy": 0.34,
236
+ "train loss": 0.7585167481899261,
237
+ "train samples": 15000,
238
+ "train time": 31.250990667955193,
239
+ "eval time": 10.924158087997057,
240
+ "tokens / sec": 6934.276173913666,
241
+ "mem allocated avg": 6788426940.416,
242
+ "mem reserved avg": 12209652498.432,
243
+ "elapsed time": 1278.4574160839984
244
+ },
245
+ {
246
+ "step": 4000,
247
+ "valid accuracy": 0.26,
248
+ "train loss": 0.7766438691616059,
249
+ "train samples": 16000,
250
+ "train time": 30.222231689898763,
251
+ "eval time": 10.98030305699649,
252
+ "tokens / sec": 6762.339793335249,
253
+ "mem allocated avg": 6769563977.728,
254
+ "mem reserved avg": 11885533462.528,
255
+ "elapsed time": 1362.9405450319973
256
+ },
257
+ {
258
+ "step": 4250,
259
+ "valid accuracy": 0.34,
260
+ "train loss": 0.7542061095237732,
261
+ "train samples": 17000,
262
+ "train time": 30.273203028933494,
263
+ "eval time": 10.948997009996674,
264
+ "tokens / sec": 6982.710081849145,
265
+ "mem allocated avg": 6780103426.048,
266
+ "mem reserved avg": 12047483928.576,
267
+ "elapsed time": 1447.661586811002
268
+ },
269
+ {
270
+ "step": 4500,
271
+ "valid accuracy": 0.32,
272
+ "train loss": 0.7659628703594208,
273
+ "train samples": 18000,
274
+ "train time": 29.84466753601737,
275
+ "eval time": 10.942651322002348,
276
+ "tokens / sec": 6963.320993581165,
277
+ "mem allocated avg": 6775043430.4,
278
+ "mem reserved avg": 11968387743.744,
279
+ "elapsed time": 1531.5572027719973
280
+ },
281
+ {
282
+ "step": 4750,
283
+ "valid accuracy": 0.28,
284
+ "train loss": 0.7580052223205567,
285
+ "train samples": 19000,
286
+ "train time": 30.03731635398435,
287
+ "eval time": 10.927273799999966,
288
+ "tokens / sec": 6989.272860661278,
289
+ "mem allocated avg": 6776962899.968,
290
+ "mem reserved avg": 12017695981.568,
291
+ "elapsed time": 1615.9832882379997
292
+ },
293
+ {
294
+ "step": 5000,
295
+ "valid accuracy": 0.36,
296
+ "train loss": 0.7657463653087616,
297
+ "train samples": 20000,
298
+ "train time": 30.07570726004633,
299
+ "eval time": 10.953207714999735,
300
+ "tokens / sec": 6925.19042691597,
301
+ "mem allocated avg": 6774270615.552,
302
+ "mem reserved avg": 11958900228.096,
303
+ "elapsed time": 1700.4354192270039
304
+ },
305
+ {
306
+ "step": 5000,
307
+ "test accuracy": 0.34495830174374525,
308
+ "train loss": 0.7657463653087616,
309
+ "train samples": 20000,
310
+ "train total tokens": 4198051
311
+ }
312
+ ]
313
+ },
314
+ "meta_info": {
315
+ "model_info": {
316
+ "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
317
+ "created_at": "2024-09-18T15:23:48+00:00"
318
+ },
319
+ "dataset_info": {
320
+ "metamath": {
321
+ "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
322
+ "created_at": "2023-09-21T17:22:46+00:00"
323
+ },
324
+ "gsm8k": {
325
+ "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
326
+ "created_at": "2022-04-12T10:22:10+00:00"
327
+ }
328
+ },
329
+ "package_info": {
330
+ "transformers-version": "4.52.4",
331
+ "transformers-commit-hash": null,
332
+ "peft-version": "0.15.2.dev0",
333
+ "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
334
+ "datasets-version": "3.6.0",
335
+ "datasets-commit-hash": null,
336
+ "bitsandbytes-version": "0.46.0",
337
+ "bitsandbytes-commit-hash": null,
338
+ "torch-version": "2.7.1+cu126",
339
+ "torch-commit-hash": null
340
+ },
341
+ "system_info": {
342
+ "system": "Linux",
343
+ "release": "6.8.0-1029-aws",
344
+ "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
345
+ "machine": "x86_64",
346
+ "processor": "x86_64",
347
+ "gpu": "NVIDIA L40S"
348
+ },
349
+ "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 
\n"
350
+ }
351
+ }