github-actions[bot] committed · Commit a76607e · 0 parent(s)

🚀 Deploy method comparison app from GH action

This view is limited to 50 files because it contains too many changes. See raw diff.
- MetaMathQA/Makefile +90 -0
- MetaMathQA/README.md +241 -0
- MetaMathQA/cancelled_results/.gitkeep +0 -0
- MetaMathQA/data.py +109 -0
- MetaMathQA/default_training_params.json +26 -0
- MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json +39 -0
- MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json +11 -0
- MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json +6 -0
- MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json +20 -0
- MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json +19 -0
- MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json +19 -0
- MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json +21 -0
- MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json +6 -0
- MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json +23 -0
- MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json +23 -0
- MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json +6 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json +14 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json +14 -0
- MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json +11 -0
- MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json +24 -0
- MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json +27 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json +9 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json +30 -0
- MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json +30 -0
- MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json +27 -0
- MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json +15 -0
- MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json +17 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json +17 -0
- MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json +6 -0
- MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json +17 -0
- MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json +22 -0
- MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json +26 -0
- MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json +20 -0
- MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json +6 -0
- MetaMathQA/requirements.txt +4 -0
- MetaMathQA/results/.gitkeep +0 -0
- MetaMathQA/results/adalora--llama-3.2-3B-rank32.json +4071 -0
- MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json +341 -0
- MetaMathQA/results/boft--llama-3.2-3B-default.json +354 -0
- MetaMathQA/results/bone--llama-3.2-3B-bat.json +350 -0
- MetaMathQA/results/bone--llama-3.2-3B-default.json +350 -0
- MetaMathQA/results/fourierft--llama-3.2-3B-default.json +354 -0
- MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json +354 -0
- MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json +331 -0
- MetaMathQA/results/ia3--llama-3.2-3B-default.json +351 -0
MetaMathQA/Makefile
ADDED
@@ -0,0 +1,90 @@
# Makefile for running MetaMathQA experiments.

# --- Configuration ---
PYTHON := python
RUN_SCRIPT := run.py
EXPERIMENTS_DIR := experiments
RESULTS_DIR := results

# --- Automatic Experiment and Result Discovery ---

# 1. Find all experiment directories by looking for adapter_config.json files.
# This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ...
EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \
		-name "adapter_config.json" -or \
		-name "training_params.json" | xargs dirname | sort -u)

# 2. Define a function to replace all occurrences of a character in a string.
# This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora--foo").
# Usage: $(call replace-all, string, char_to_replace, replacement_char)
replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1))

# 3. Define a function to convert an experiment path to its flat result file path.
# e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora--llama-3.2-3B-rank32.json"
exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json

# 4. Generate the list of all target result files we want to build.
RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp)))


# --- Main Rules ---

# The default 'all' target depends on all possible result files.
# Running `make` or `make all` will check and run any outdated or missing experiments.
all: $(RESULT_FILES)


# --- Dynamic Rule Generation ---

# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found.
# This avoids a complex pattern rule and makes the logic clearer.
define EXPERIMENT_template
# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32)

# Define the rule:
# The target is the result file (e.g., results/lora--llama-3.2-3B-rank32.json).
# The dependencies are its config files; code changes need to be audited manually since they can
# vary in degree of importance. Note that we explicitly ignore when the script fails to run
# so that the other experiments still have a chance to run.
$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json)
	@echo "---"
	@echo "Running experiment: $(1)"
	-$(PYTHON) $(RUN_SCRIPT) -v $(1)
	@echo "Finished: $$@"
	@echo "---"

endef

# This command iterates through every found experiment path and evaluates the template,
# effectively stamping out a unique, explicit rule for each one.
$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path))))


# --- Utility Rules ---

.PHONY: all clean list dump_rules

# The 'clean' rule removes all generated results.
clean:
	@echo "Cleaning results directory..."
	@([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0

# The 'list' rule is for debugging. It shows the discovered experiments
# and the result files the Makefile expects to create for them.
list:
	@echo "Discovered experiment configurations:"
	@$(foreach exp,$(EXPERIMENT_PATHS),echo "  - $(exp)/adapter_config.json";)
	@echo "\nTarget result files:"
	@$(foreach res,$(RESULT_FILES),echo "  - $(res)";)

# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules.
define newline


endef
define DUMPED_RULES
$(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path)))
endef

dump_rules:
	@echo -e "$(subst $(newline),\n,${DUMPED_RULES})"
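To see what `replace-all` and `exp_to_res` compute, here is the same naming logic sketched in Python. This is a hypothetical mirror of the flattening that run.py is said to perform, added for illustration; it is not part of the repo:

```python
def exp_to_res(experiment_path: str) -> str:
    # "experiments/lora/llama-3.2-3B-rank32" -> "results/lora--llama-3.2-3B-rank32.json"
    name = experiment_path.removeprefix("experiments/").replace("/", "--")
    return f"results/{name}.json"


assert exp_to_res("experiments/lora/llama-3.2-3B-rank32") == "results/lora--llama-3.2-3B-rank32.json"
```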
MetaMathQA/README.md
ADDED
@@ -0,0 +1,241 @@
# PEFT method comparison on the MetaMathQA and GSM8K datasets

## Goal

The goal is to provide a benchmarking framework for the different PEFT methods that are implemented. It is important that evaluating different PEFT methods is reproducible, idempotent, and version-controlled. Results for more PEFT methods can be added over time.

## Dataset

This task trains on the [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset and validates/tests on the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset ("main").

For the model to attain good accuracy, it needs to learn to adhere to the output format and it must express basic chain-of-thought reasoning capabilities to get to the correct result in the first place. The task is challenging for models in the sub-7B parameter range.

The train set uses the whole of MetaMathQA. The validation set is a random sample from the GSM8K train set. The test set is the whole of the GSM8K test set.

## Running

Create an experiment in the `experiments/<peft-method>` folder of your choice and give it a name (the name itself does not matter but helps identify the experiment). An example would be `experiments/lora/llama-3.2-3B-rank32/`. Inside that directory, create these files:

- `adapter_config.json`
- Optional: `training_params.json`

Once you have created these files, you can either

- run the whole suite by simply calling `make` (takes >24h)
- run one specific experiment by calling `make results/<experiment_name>--<experiment_variation>.json`,
  for example `results/vblora--llama-3.2-3B-default.json`

You can get a list of all runnable experiments by running `make list`, e.g.:

```
% make list
Discovered experiment configurations:
  - experiments/ptuning/llama-3.2-3B-default/adapter_config.json
  [...]
  - experiments/vblora/llama-3.2-3B-default/adapter_config.json

Target result files:
  - results/ptuning--llama-3.2-3B-default.json
  [...]
  - results/vblora--llama-3.2-3B-default.json
```

In case you want to force the execution of an experiment, you can simply `touch` the respective adapter config without modifying it. For example:

    touch experiments/vblora/llama-3.2-3B-default/adapter_config.json
    make

to run the VBLoRA default experiment again.

### `adapter_config.json`

This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.:

```python
from peft import LoraConfig

config = LoraConfig(...)
config.save_pretrained(<path-to-experiment>)
```

### `training_params.json`

There is a default file for the non-PEFT parameters: `default_training_params.json`. This contains all the other parameters that are relevant for training, e.g. the base model id, number of steps, batch size, learning rate, etc. If parameters that differ from the defaults are needed for a specific experiment, place a `training_params.json` into the experiment directory and adjust the parameters that need changing; for example, `experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json` only overrides `optimizer_kwargs.lr`. The other parameters are taken from the aforementioned default config.

For an overview of all possible arguments, you can also check the `TrainConfig` `dataclass` in `utils.py`.

### Runtime performance

Several factors should be considered to achieve fast runtime performance. Besides the obvious factors like `max_steps` or the base model size, we found the following to have a significant impact:

#### Eval batch size

The `batch_size_eval` parameter is quite critical, since evaluation takes up a significant portion of the training time and batching helps with reducing that. It should be possible to choose a value that is several times higher than the batch size used for training (`batch_size`). You should also pay attention to the size of the validation set -- e.g. if it's 50, don't choose a `batch_size_eval` of 40, as that results in a large batch of 40 and a small batch of 10; 25 might be a better choice. Also, ensure via a quick train run that the batch size does not lead to out-of-memory errors -- getting this error at the very end, on evaluating the test set, would be quite a loss of time.

#### Generation length

During testing, we discovered that the validation time is greatly inflated by just a few very long generations. Those can inflate the validation time by a factor of 3 or more. At the same time, we discovered that these long generations do not help with accuracy -- in fact, if they exceed the maximum configured length, they're just cut off mid-sentence and would thus produce an accuracy of 0 anyway.

To remedy this, we now set both `max_length` and `max_new_tokens` for the generation kwargs in the default training parameters. Normally, this is not possible when using transformers, as the latter argument overrides the former. However, we have added special logic inside of `get_generation_config` which takes both and chooses the smaller of the two. This way, we can get rid of these excessively long generations, thus considerably reducing eval times, while still guaranteeing a maximum total generation length to guard against OOM errors. Testing showed that this does not hamper test accuracy. It is therefore recommended not to change these settings.
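To make the interplay of the two limits concrete, here is a minimal sketch of the rule described above; `effective_max_new_tokens` is a hypothetical helper for illustration, not the actual implementation:

```python
def effective_max_new_tokens(prompt_len: int, *, max_length: int = 800, max_new_tokens: int = 300) -> int:
    """How many new tokens a prompt may generate under both limits.

    The total sequence is capped at max_length and the continuation at
    max_new_tokens; the binding constraint is whichever is smaller.
    """
    return max(0, min(max_new_tokens, max_length - prompt_len))


# A short prompt is bound by max_new_tokens, a long one by max_length:
assert effective_max_new_tokens(100) == 300
assert effective_max_new_tokens(600) == 200
```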
#### Bucketing

The length of the sequences in the training data can vary a lot. Therefore, if samples are taken randomly from the training dataset, we will end up with batches containing very short and very long sequences. This is bad because the batch will be padded to the longest sequence, slowing down training. The obvious solution would be to sort the whole dataset by sequence length, but this is also bad because it introduces an order bias (e.g. first training on only short and then on only long answers).

The solution is to find a trade-off between the two factors. This is achieved by the `BucketIterator`. It first creates buckets that contain multiple batches, e.g. 20x the batch size. Each bucket is then sorted by sequence length and batches are yielded from the bucket. Therefore, we have a small order bias within a bucket but not between buckets, striking a good balance between training speed and training loss. A functional sketch of this idea is shown after this section.

From practical experiments, for a batch size of 4, a bucket size of 80 provides a good balance, with only slightly lower training loss but cutting training time by 25%. For eval, we don't use the iterator since there the batch size is relatively big and thus there is little upside.
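Below is a minimal sketch of the bucketing scheme, written as a generator rather than the actual `BucketIterator` class, assuming samples are drawn by index:

```python
import numpy as np


def iter_bucketed_batches(lengths: list[int], *, batch_size: int = 4, bucket_factor: int = 20, seed: int = 0):
    """Yield batches of indices: globally shuffled, length-sorted per bucket.

    Sorting only within buckets of bucket_factor * batch_size samples reduces
    padding without imposing a global short-to-long ordering on training.
    """
    rng = np.random.default_rng(seed)
    order = rng.permutation(len(lengths))
    bucket_size = bucket_factor * batch_size
    for start in range(0, len(order), bucket_size):
        bucket = sorted(order[start:start + bucket_size], key=lambda i: lengths[i])
        for j in range(0, len(bucket), batch_size):
            yield bucket[j:j + batch_size]
```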
### Start a run

Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be:

```sh
python run.py -v experiments/lora/llama-3.2-3B-rank32/
```

By default, the adapter will be saved in a temporary file for further inspection if needed. To prevent this, add the `--clean` flag to the call.

### Run status

The run can be categorized into 3 different states:

1. Main run: You are on the `main` branch and the run ended successfully. The results are stored in the `results` folder and are used for further analysis.
2. Test run: You are not on the `main` branch and the run ended successfully. The results are stored in the `temporary_results` folder and are not used for further analysis.
3. The run was cancelled (`ctrl + c`). The results are stored in the `cancelled_results` folder and are not used for further analysis.

## Outputs

Results are stored in one of the result directories. An example output could look like so:

```js
{
  "run_info": {
    "created_at": "2025-03-05T13:50:05+00:00",
    "total_time": 2711.0915009640157,
    "experiment_name": "ia3/lr_0.001",
    "peft_branch": "ben-method-comparison",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 51,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_kwargs": {
        "lr": 0.001
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "generation_kwargs": {
        "max_length": 800
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "IA3",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "target_modules": [
        "v_proj",
        "k_proj",
        "down_proj"
      ],
      "exclude_modules": null,
      "feedforward_modules": [
        "down_proj"
      ],
      "fan_in_fan_out": false,
      "modules_to_save": null,
      "init_ia3_weights": true
    }
  },
  "train_info": {
    "cuda_memory_reserved_avg": 14229219940,
    "cuda_memory_max": 24847056896,
    "cuda_memory_reserved_99th": 19115624366,
    "train_time": 2238.65277833899,
    "file_size": 1157064,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0784313725490196,
        "train loss": 1.1336498007774354,
        "train samples": 1000
      },
      [...]
      {
        "step": 5000,
        "valid accuracy": 0.21568627450980393,
        "train loss": 0.6345920492410659,
        "train samples": 20000
      },
      {
        "step": 5000,
        "test accuracy": 0.35129740518962077,
        "train loss": 0.6345920492410659,
        "train samples": 20000,
        "train total tokens": 4197579
      }
    ]
  },
  "meta_info": {
    "model_sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
    "model_created_at": "2024-09-18T15:23:48+00:00",
    "dataset_sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
    "dataset_created_at": "2023-09-21T17:22:46+00:00",
    "package_info": {
      "transformers-version": "4.50.0.dev0",
      "transformers-commit-hash": "752ef3fd4e70869626ec70657a770a85c0ad9219",
      "peft-version": "0.14.1.dev0",
      "peft-commit-hash": "a447a4e5ecd87b7d57733f4df9616a328cf130f4",
      "datasets-version": "3.3.2",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.45.2",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.6.0+cu124",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.11.0-17-generic",
      "version": "#17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA GeForce RTX 4090"
    },
    "pytorch_info": "PyTorch built with: [...]"
  }
}
```

## Dependencies

Apart from the normal PEFT dependencies, ensure that the packages in `requirements.txt` are installed, e.g. via:

```sh
python -m pip install -r requirements.txt
```

Python 3.12+ is required.

## Open tasks

- consider using `DataLoader`
- consider adding https://github.com/huggingface/Math-Verify
- consider adding a `weight` argument to the cross-entropy calculation to downweight the EOS token, but it would require calculating the loss manually instead of relying on transformers (see https://github.com/huggingface/transformers/blob/6a876462c308bd7cd7d3ca8e93abaa7d5b02e90e/src/transformers/loss/loss_utils.py#L24-L48)
- do a sanity check against/comparison with the transformers Trainer
- consider using vLLM to potentially speed up generations, at least for the test set
- using `torch.compile` leads to a huge slowdown, investigate (maybe recompiles), although it does save memory
- AMP does not appear to help, investigate
- packing of sequences (but this probably requires adjusting the attention matrix)
- clean up what gets printed and where (stdout, stderr)
MetaMathQA/cancelled_results/.gitkeep
ADDED
File without changes
MetaMathQA/data.py
ADDED
@@ -0,0 +1,109 @@
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
All utilities related to data handling.
"""

from functools import partial
from typing import Callable

import datasets
import numpy as np
from datasets import Dataset, load_dataset


# with a token limit of 768 for query + response, we have to exclude all texts with length > 1300; this leaves 93.8% of
# the dataset
CHAR_LIMIT = 1300
# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
# since it's run multiple times during training; test is only run once at the end and thus can be larger
VALID_SIZE = 50


def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
    """Return the filtered dataset, with long queries removed.

    We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is
    a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each
    model, but we want the same filter for each model.

    """
    char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])]
    idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT]
    print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset")
    return ds.select(idx_filtered)


def get_train_valid_test_datasets(
    *, tokenizer, query_template: str, print_fn: Callable[..., None]
) -> tuple[Dataset, Dataset, Dataset]:
    """
    Return the train, valid, and test splits of the dataset.

    We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives:

    > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value.

    even after calling ds_filtered.class_encode_column("type"). Thus, the validation sample is drawn with a seeded
    NumPy shuffle instead.
    """
    metamath = load_dataset("meta-math/MetaMathQA")["train"]
    metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)

    # gsm8k does not need to be filtered as query and response are short enough
    gsm8k = load_dataset("openai/gsm8k", "main")
    gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
    gsm8k_train = gsm8k["train"]
    gsm8k_test = gsm8k["test"]

    np.random.seed(0)
    indices = np.arange(len(gsm8k_train))
    np.random.shuffle(indices)
    idx_valid = indices[:VALID_SIZE]

    ds_train = metamath
    ds_valid = gsm8k_train.select(idx_valid)
    ds_test = gsm8k_test

    print_fn(f"Train size: {len(ds_train)}")
    print_fn(f"Valid size: {len(ds_valid)}")
    print_fn(f"Test size: {len(ds_test)}")

    tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
    tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
    ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
    ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
    ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])

    return ds_train, ds_valid, ds_test


def tokenize_with_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])]
    tokenized = tokenizer(queries)
    # truncate both fields to the model's maximum sequence length
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized


def tokenize_wo_answer(samples, tokenizer, template):
    queries = [template.format(query=sample) for sample in samples["query"]]
    tokenized = tokenizer(queries)
    tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]]
    tokenized["attention_mask"] = [
        attention_mask[: tokenizer.model_max_length] for attention_mask in tokenized["attention_mask"]
    ]
    return tokenized
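As a usage sketch of the module's entry point (the model id and query template here mirror `default_training_params.json`; the actual wiring lives in `run.py`):

```python
from transformers import AutoTokenizer

from data import get_train_valid_test_datasets

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
    tokenizer=tokenizer,
    query_template="Question: {query} Think step by step.\nAnswer:",
    print_fn=print,
)
```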
MetaMathQA/default_training_params.json
ADDED
@@ -0,0 +1,26 @@
{
  "model_id": "meta-llama/Llama-3.2-3B",
  "dtype": "bfloat16",
  "max_seq_length": 768,
  "batch_size": 4,
  "batch_size_eval": 50,
  "max_steps": 5000,
  "eval_steps": 250,
  "compile": false,
  "seed": 0,
  "grad_norm_clip": 1.0,
  "optimizer_type": "AdamW",
  "optimizer_kwargs": {
    "lr": 1e-4,
    "weight_decay": 0.1
  },
  "lr_scheduler": "cosine",
  "use_amp": false,
  "autocast_adapter_dtype": true,
  "attn_implementation": null,
  "generation_kwargs": {
    "max_length": 800,
    "max_new_tokens": 300
  },
  "query_template": "Question: {query} Think step by step.\nAnswer:"
}
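The README describes how a per-experiment `training_params.json` overrides these defaults. A minimal sketch of that merge, assuming a shallow dict update; `load_train_config` is a hypothetical helper, not the actual code in `run.py`:

```python
import json
import os


def load_train_config(experiment_dir: str) -> dict:
    # Hypothetical sketch: start from the defaults and update with any
    # experiment-specific overrides, as described in the README.
    with open("default_training_params.json") as f:
        params = json.load(f)
    override_path = os.path.join(experiment_dir, "training_params.json")
    if os.path.exists(override_path):
        with open(override_path) as f:
            params.update(json.load(f))
    return params
```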
MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,39 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "beta1": 0.85,
  "beta2": 0.85,
  "bias": "none",
  "corda_config": null,
  "deltaT": 1,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "init_r": 64,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 8,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "orth_reg_weight": 0.5,
  "peft_type": "ADALORA",
  "r": 8,
  "rank_pattern": null,
  "revision": null,
  "target_modules": null,
  "target_r": 32,
  "task_type": null,
  "tfinal": 500,
  "tinit": 200,
  "total_step": 5000,
  "use_dora": false,
  "use_rslora": false
}
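As the README recommends, such a config is easiest to create programmatically. A sketch that would reproduce the non-default values above, assuming the remaining fields keep their `AdaLoraConfig` defaults:

```python
from peft import AdaLoraConfig

config = AdaLoraConfig(
    init_r=64,       # starting rank before pruning
    target_r=32,     # rank budget after the schedule finishes
    tinit=200,       # warmup steps before rank pruning starts
    tfinal=500,      # final steps with a fixed rank pattern
    total_step=5000, # matches max_steps in the training params
)
config.save_pretrained("experiments/adalora/llama-3.2-3B-rank32")
```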
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json
ADDED
@@ -0,0 +1,11 @@
{
  "adapter_layers": 28,
  "adapter_len": 100,
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "inference_mode": false,
  "peft_type": "ADAPTION_PROMPT",
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM"
}
MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 5e-4
  }
}
MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "boft_block_num": 0,
  "boft_block_size": 4,
  "boft_dropout": 0.0,
  "boft_n_butterfly_factor": 1,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "BOFT",
  "revision": null,
  "target_modules": null,
  "task_type": null
}
MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "inference_mode": false,
  "init_weights": "bat",
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "BONE",
  "r": 64,
  "revision": null,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "BONE",
  "r": 64,
  "revision": null,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,21 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": false,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "block_size": 64,
  "block_size_pattern": {},
  "peft_type": "C3A",
  "revision": null,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 3e-1,
    "weight_decay": 1e-5
  }
}
MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,23 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": false,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "n_frequency": 1000,
  "n_frequency_pattern": {},
  "peft_type": "FOURIERFT",
  "random_loc_seed": 777,
  "revision": null,
  "scaling": 300,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json
ADDED
@@ -0,0 +1,23 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": false,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "n_frequency": 5000,
  "n_frequency_pattern": {},
  "peft_type": "FOURIERFT",
  "random_loc_seed": 777,
  "revision": null,
  "scaling": 300,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 1e-5
  }
}
MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,14 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "feedforward_modules": null,
  "inference_mode": false,
  "init_ia3_weights": true,
  "modules_to_save": null,
  "peft_type": "IA3",
  "revision": null,
  "target_modules": null,
  "task_type": null
}
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,14 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "feedforward_modules": null,
  "inference_mode": false,
  "init_ia3_weights": true,
  "modules_to_save": null,
  "peft_type": "IA3",
  "revision": null,
  "target_modules": null,
  "task_type": null
}
MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 1e-3
  }
}
MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,11 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "exclude_modules": null,
  "inference_mode": false,
  "modules_to_save": null,
  "peft_type": "LN_TUNING",
  "revision": null,
  "target_modules": null,
  "task_type": null
}
MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,24 @@
{
  "alpha": 64,
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "exclude_modules": null,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "module_dropout": 0.0,
  "modules_to_save": null,
  "peft_type": "LOHA",
  "r": 32,
  "rank_dropout": 0.0,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "v_proj"
  ],
  "task_type": null,
  "use_effective_conv2d": false
}
MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
{
  "alpha": 64,
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "decompose_both": false,
  "decompose_factor": -1,
  "exclude_modules": null,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "module_dropout": 0.0,
  "modules_to_save": null,
  "peft_type": "LOKR",
  "r": 32,
  "rank_dropout": 0.0,
  "rank_dropout_scale": false,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "v_proj"
  ],
  "task_type": null,
  "use_effective_conv2d": false
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM",
  "use_dora": true,
  "use_rslora": false
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json
ADDED
@@ -0,0 +1,9 @@
{
  "optimizer_type": "lora-fa",
  "optimizer_kwargs": {
    "r": 32,
    "lora_alpha": 64,
    "lr": 1e-4,
    "weight_decay": 0.1
  }
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": true
}
MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 128,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": null,
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json
ADDED
@@ -0,0 +1,27 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "block_share": false,
  "coft": false,
  "eps": 6e-05,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "module_dropout": 0.0,
  "modules_to_save": null,
  "oft_block_size": 0,
  "peft_type": "OFT",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "v_proj"
  ],
  "task_type": null
}
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "encoder_hidden_size": 3072,
  "inference_mode": false,
  "num_attention_heads": 24,
  "num_layers": 28,
  "num_transformer_submodules": 1,
  "num_virtual_tokens": 200,
  "peft_type": "PREFIX_TUNING",
  "prefix_projection": false,
  "revision": null,
  "task_type": "CAUSAL_LM",
  "token_dim": 3072
}
MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 1e-3
  }
}
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "inference_mode": false,
  "num_attention_heads": 24,
  "num_layers": 28,
  "num_transformer_submodules": 1,
  "num_virtual_tokens": 200,
  "peft_type": "PROMPT_TUNING",
  "prompt_tuning_init": "RANDOM",
  "prompt_tuning_init_text": null,
  "revision": null,
  "task_type": "CAUSAL_LM",
  "token_dim": 3072,
  "tokenizer_kwargs": null,
  "tokenizer_name_or_path": null
}
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "inference_mode": false,
  "num_attention_heads": 24,
  "num_layers": 28,
  "num_transformer_submodules": 1,
  "num_virtual_tokens": 200,
  "peft_type": "PROMPT_TUNING",
  "prompt_tuning_init": "RANDOM",
  "prompt_tuning_init_text": null,
  "revision": null,
  "task_type": "CAUSAL_LM",
  "token_dim": 3072,
  "tokenizer_kwargs": null,
  "tokenizer_name_or_path": null
}
MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 1e-3
  }
}
MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,17 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "encoder_dropout": 0.0,
  "encoder_hidden_size": 3072,
  "encoder_num_layers": 2,
  "encoder_reparameterization_type": "MLP",
  "inference_mode": false,
  "num_attention_heads": 24,
  "num_layers": 28,
  "num_transformer_submodules": 1,
  "num_virtual_tokens": 20,
  "peft_type": "P_TUNING",
  "revision": null,
  "task_type": "CAUSAL_LM",
  "token_dim": 3072
}
MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "RANDLORA",
  "projection_prng_key": 0,
  "r": 32,
  "randlora_alpha": 640,
  "randlora_dropout": 0.0,
  "revision": null,
  "save_projection": true,
  "sparse": false,
  "target_modules": null,
  "task_type": null,
  "very_sparse": false
}
MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,26 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_logits_std": 0.1,
  "init_vector_bank_bound": 0.02,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "num_vectors": 256,
  "peft_type": "VBLORA",
  "r": 4,
  "revision": null,
  "save_only_topk_weights": false,
  "target_modules": [
    "v_proj",
    "q_proj"
  ],
  "task_type": null,
  "topk": 2,
  "vblora_dropout": 0.0,
  "vector_length": 256
}
MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "auto_mapping": null,
  "base_model_name_or_path": null,
  "bias": "none",
  "d_initial": 0.1,
  "fan_in_fan_out": false,
  "inference_mode": false,
  "init_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "modules_to_save": null,
  "peft_type": "VERA",
  "projection_prng_key": 0,
  "r": 256,
  "revision": null,
  "save_projection": true,
  "target_modules": null,
  "task_type": null,
  "vera_dropout": 0.0
}
MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json
ADDED
@@ -0,0 +1,6 @@
{
  "optimizer_kwargs": {
    "lr": 1e-3
  }
}
MetaMathQA/requirements.txt
ADDED
@@ -0,0 +1,4 @@
bitsandbytes
datasets
numpy
tqdm
MetaMathQA/results/.gitkeep
ADDED
File without changes
MetaMathQA/results/adalora--llama-3.2-3B-rank32.json
ADDED
@@ -0,0 +1,4071 @@
+{
+  "run_info": {
+    "created_at": "2025-06-19T23:12:19+00:00",
+    "total_time": 2209.243281380004,
+    "experiment_name": "adalora/llama-3.2-3B-rank32",
+    "peft_branch": "main",
+    "train_config": {
+      "model_id": "meta-llama/Llama-3.2-3B",
+      "dtype": "bfloat16",
+      "max_seq_length": 768,
+      "batch_size": 4,
+      "batch_size_eval": 50,
+      "max_steps": 5000,
+      "eval_steps": 250,
+      "compile": false,
+      "query_template": "Question: {query} Think step by step.\nAnswer:",
+      "seed": 0,
+      "grad_norm_clip": 1.0,
+      "optimizer_type": "AdamW",
+      "optimizer_kwargs": {
+        "lr": 0.0001,
+        "weight_decay": 0.1
+      },
+      "lr_scheduler": "cosine",
+      "use_amp": false,
+      "autocast_adapter_dtype": true,
+      "generation_kwargs": {
+        "max_length": 800,
+        "max_new_tokens": 300
+      },
+      "attn_implementation": null
+    },
+    "peft_config": {
+      "task_type": null,
+      "peft_type": "ADALORA",
+      "auto_mapping": null,
+      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+      "revision": null,
+      "inference_mode": false,
+      "r": 8,
+      "target_modules": ["q_proj", "v_proj"],
+      "exclude_modules": null,
+      "lora_alpha": 8,
+      "lora_dropout": 0.0,
+      "fan_in_fan_out": false,
+      "bias": "none",
+      "use_rslora": false,
+      "modules_to_save": null,
+      "init_lora_weights": true,
+      "layers_to_transform": null,
+      "layers_pattern": null,
+      "rank_pattern": {
+        "model.layers.0.self_attn.q_proj.lora_E": [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+        "model.layers.0.self_attn.v_proj.lora_E": [true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, false, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true],
+        "model.layers.1.self_attn.q_proj.lora_E": [false, false, true, true, false, true, true, false, false, false, false, true, false, false, true, true, true, true, false, false, false, false, false, false, true, false, true, true, false, false, true, true, true, false, true, true, false, false, true, true, true, false, false, false, true, false, true, true, true, true, false, true, true, true, false, false, true, true, false, false, true, true, false, false],
+        "model.layers.1.self_attn.v_proj.lora_E": [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true],
+        "model.layers.2.self_attn.q_proj.lora_E": [true, false, true, false, false, false, true, true, true, true, false, true, true, true, false, false, true, false, false, true, false, false, false, false, true, true, false, false, false, false, true, false, false, false, false, false, false, true, true, true, false, false, false, true, true, false, false, false, true, false, true, true, false, true, false, false, false, true, true, false, true, true, false, false],
+        "model.layers.2.self_attn.v_proj.lora_E": [true, false, false, false, true, true, true, true, false, true, true, true, false, true, false, true, false, true, false, true, false, true, true, true, true, true, false, true, false, false, false, false, true, false, false, false, false, false, true, true, false, false, true, true, false, true, true, true, true, false, false, true, false, true, false, false, false, true, true, false, false, false, true, true],
+        "model.layers.3.self_attn.q_proj.lora_E": [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+        "model.layers.3.self_attn.v_proj.lora_E": [false, false, false, false, false, true, false, true, false, false, false, false, true, false, false, false, false, true, true, true, true, false, true, false, false, false, false, false, false, false, false, false, true, false, false, true, false, false, true, false, true, false, true, true, false, true, false, false, true, false, false, false, false, false, true, false, true, false, false, false, false, true, true, true],
+        "model.layers.4.self_attn.q_proj.lora_E": [false, false, false, false, false, true, false, false, true, true, false, true, false, false, false, false, false, false, false, true, false, false, true, false, true, false, false, false, false, false, true, false, false, false, false, false, false, false, true, true, false, false, true, false, false, false, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, false, false],
+        "model.layers.4.self_attn.v_proj.lora_E": [true, false, true, true, false, false, true, false, false, false, true, false, true, true, false, true, false, true, true, false, true, true, false, false, true, true, true, true, false, false, false, false, false, false, true, false, true, false, false, true, true, true, true, true, false, false, false, false, false, true, false, true, true, true, true, true, false, true, true, false, true, true, true, true],
+        "model.layers.5.self_attn.q_proj.lora_E": [false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false],
+        "model.layers.5.self_attn.v_proj.lora_E": [true, true, true, true, true, true, true, true, false, true, false, false, true, false, false, true, false, true, false, false, false, false, true, true, false, false, false, false, true, false, true, false, true, true, false, false, true, true, true, true, false, false, true, false, true, false, false, true, true, true, false, true, false, false, false, true, true, true, true, false, false, false, true, true],
+        "model.layers.6.self_attn.q_proj.lora_E": [false, false, true, true, false, false, true, true, false, false, false, true, false, true, false, true, false, false, false, false, true, true, true, true, false, true, false, true, false, true, false, false, false, true, true, false, false, false, false, true, true, true, false, false, true, false, false, false, false, true, true, false, false, false, true, false, false, false, false, false, false, false, false, false],
+        "model.layers.6.self_attn.v_proj.lora_E": [false, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, false, true, true, true, false, false, false, false, true, true, false, false, false, false, true, true, false, true, true, true, false, true, true, true, false, true, true, true, true, false, false, false, true, true, false, false, true, false, true, false, true, true, false, true, false, true, false, true],
+        "model.layers.7.self_attn.q_proj.lora_E": [false, false, false, false, false, true, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false],
+        "model.layers.7.self_attn.v_proj.lora_E": [false, false, true, true, false, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, false, false, true, true, true, true, true, false, true, false, false, true, true, true, true, false, false, false, true, false, false, true, true, true, false, true, true, true, true, true, true, false, true, true, true, true, true, false, false, false, true, true, true, true],
+        "model.layers.8.self_attn.q_proj.lora_E": [false, true, false, false, false, false, false, true, false, false, false, false, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, true, false, false, true, false, false, false, false, true, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, true],
+        "model.layers.8.self_attn.v_proj.lora_E": [false, true, false, false, false, true, false, false, false, false, true, true, true, true, true, false, false, true, true, true, false, true, true, true, true, true, false, true, true, false, false, true, true, false, false, true, false, true, false, true, true, false, true, false, true, true, true, false, true, false, false, true, true, true, false, true, true, true, true, false, false, false, false, true],
+        "model.layers.9.self_attn.q_proj.lora_E": [true, false, true, true, false, false, true, true, false, false, true, false, false, false, false, true, false, true, false, true, false, false, false, true, false, true, false, true, false, true, false, true, true, false, false, true, true, false, false, false, false, true, true, true, false, false, false, false, true, true, true, false, false, false, false, false, true, false, true, false, false, true, false, true],
+        "model.layers.9.self_attn.v_proj.lora_E": [true, true, false, true, true, true, true, true, true, false, true, true, true, true, true, false, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, false, true, true, false, true, true, true, true, true, true, false, false, true, false, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false],
+        "model.layers.10.self_attn.q_proj.lora_E": [false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false],
+        "model.layers.10.self_attn.v_proj.lora_E": [true, true, false, false, false, true, true, false, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, true, true, true, true, false, false, false, false, true, false, false, false, true, true, true, false, true, true, false, true, false, false, true, true, false, false, true, true, true, true, true, true, true, false, true],
+        "model.layers.11.self_attn.q_proj.lora_E": [true, false, false, false, false, true, false, false, false, true, true, false, true, false, false, false, false, false, false, false, false, true, false, false, true, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, true, true, true, false, true, false, false, false, true, true, false, false, false, false, true, true, false, true, false, true, true, false, false, false],
+        "model.layers.11.self_attn.v_proj.lora_E": [false, true, true, true, false, true, true, false, true, false, true, true, true, true, true, true, true, false, false, true, false, true, true, true, true, false, false, true, true, true, true, false, true, true, true, true, true, true, true, true, true, false, false, false, false, false, true, false, false, true, true, false, false, true, true, true, true, true, false, true, true, true, false, false],
+        "model.layers.12.self_attn.q_proj.lora_E": [false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, true, false, false, false, false, false, true, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, true, true, false, true, false, false, false, false, true, false, true, false, false, true, false, true, false, false, true, false, false, false, false, false, false, false, false],
+        "model.layers.12.self_attn.v_proj.lora_E": [true, true, true, true, false, true, true, false, true, true, true, true, true, true, true, true, true, false, false, false, false, true, true, false, false, true, true, true, true, true, true, true, true, true, true, false, false, true, false, true, true, true, true, true, false, false, true, false, true, true, true, true, false, true, false, true, false, true, false, true, false, true, true, false],
+        "model.layers.13.self_attn.q_proj.lora_E": [true, true, false, true, true, true, false, false, true, true, false, true, false, true, false, true, false, false, true, true, false, true, false, true, true, true, true, false, false, true, true, false, false, true, false, true, false, true, true, true, false, false, false, false, true, true, true, true, false, true, false, true, true, true, false, true, false, true, true, false, false, false, true, false],
+        "model.layers.13.self_attn.v_proj.lora_E": [true, false, true, true, true, false, true, true, true, false, true, true, true, false, true, false, true, false, true, true, true, true, true, true, true, true, true, false, true, true, false, false, true, true, false, false, true, false, false, true, false, false, true, true, true, true, true, true, true, false, true, false, false, true, true, true, false, true, true, false, true, true, true, true],
+        "model.layers.14.self_attn.q_proj.lora_E": [false, true, false, true, true, false, false, false, true, false, false, true, false, false, true, true, false, true, true, true, false, false, false, true, false, true, false, true, false, false, true, true, true, true, true, false, false, true, true, false, true, true, false, false, true, false, false, false, true, false, true, true, true, false, true, true, true, false, false, true, false, true, true, false],
+        "model.layers.14.self_attn.v_proj.lora_E": [true, true, true, false, false, false, true, false, false, false, false, true, true, false, false, true, false, true, true, true, false, true, false, false, true, false, true, false, true, true, false, true, false, true, false, false, true, false, false, true, false, true, true, false, true, false, true, false, true, true, true, true, true, true, false, false, true, true, false, true, true, true, true, false],
+        "model.layers.15.self_attn.q_proj.lora_E": [false, true, true, true, true, true, false, true, false, true, false, true, false, true, true, true, true, true, true, false, true, true, false, true, false, true, false, true, true, true, false, true, false, false, false, true, true, true, true, false, true, true, false, true, false, true, false, false, true, true, false, true, false, true, false, true, true, true, true, true, false, true, true, true],
+        "model.layers.15.self_attn.v_proj.lora_E": [true, true, true, true, false, true, true, true, true, false, true, true, false, true, true, false, false, true, false, true, true, true, true, true, false, true, true, true, true, true, false, false, false, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, false, true, true, true, true, false, false, false, true, true, true, true, true, true, true],
+        "model.layers.16.self_attn.q_proj.lora_E": [false, false, false, false, false, false, false, true, true, true, true, false, true, false, true, true, true, false, true, false, true, true, true, false, false, false, true, false, false, false, true, false, true, true, true, true, false, true, true, false, true, true, false, true, true, true, true, false, true, true, false, true, true, false, true, false, false, true, true, true, false, false, true, true],
+        "model.layers.16.self_attn.v_proj.lora_E": [true, false, true, false, true, false, true, false, false, true, false, true, true, true, true, true, true, false, true, false, true, true, true, false, true, true, false, false, false, true, true, true, false, true, false, false, true, false, false, false, true, false, true, false, true, true, true, false, true, true, false, false, true, true, true, false, true, true, false, true, false, true, false, false],
+        "model.layers.17.self_attn.q_proj.lora_E": [true, true, true, true, false, true, false, true, false, false, true, true, true, false, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, true, true, false, true, true, true, true, false, true, true, false, true, true, true, false, true, false, true, true, true, true, false, true, true, true, true, true, true, true, true],
+        "model.layers.17.self_attn.v_proj.lora_E": [false, true, true, true, true, true, true, true, true, true, false, true, true, false, false, false, true, true, false, true, true, false, false, true, true, false, false, false, false, true, true, true, false, true, false, true, false, true, true, true, true, false, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, false, false, true, true, true, true],
+        "model.layers.18.self_attn.q_proj.lora_E": [false, true, false, true, false, true, false, true, true, true, false, true, true, true, false, true, true, false, true, false, false, false, true, true, false, true, true, true, false, true, true, true, true, true, true, false, true, true, true, true, true, true, false, false, true, true, true, true, true, false, true, false, false, false, false, true, false, false, true, false, true, false, true, true],
+        "model.layers.18.self_attn.v_proj.lora_E": [true, true, true, true, true, true, true, true, false, true, true, false, false, false, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, false, true, true, true, true, true, true, true, true, false, true, true, false, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, false, false, true],
+        "model.layers.19.self_attn.q_proj.lora_E": [false, true, false, true, false, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, false, false, true, true, true, true, false, true, false, true, false, true, false, false, false, true, true, true, true, true, false, false, false, false, true, true, true, true, true, true, false, false, true, true, false, true, true],
+        "model.layers.19.self_attn.v_proj.lora_E": [false, true, false, true, true, true, false, true, false, false, false, false, true, true, true, false, true, false, false, false,
+
false,
|
2652 |
+
true,
|
2653 |
+
false,
|
2654 |
+
true,
|
2655 |
+
true,
|
2656 |
+
false,
|
2657 |
+
true,
|
2658 |
+
true,
|
2659 |
+
false,
|
2660 |
+
false,
|
2661 |
+
true,
|
2662 |
+
true,
|
2663 |
+
true,
|
2664 |
+
true,
|
2665 |
+
false,
|
2666 |
+
false,
|
2667 |
+
false,
|
2668 |
+
false,
|
2669 |
+
false,
|
2670 |
+
true,
|
2671 |
+
false,
|
2672 |
+
false,
|
2673 |
+
true,
|
2674 |
+
false,
|
2675 |
+
false,
|
2676 |
+
false,
|
2677 |
+
true,
|
2678 |
+
true,
|
2679 |
+
false,
|
2680 |
+
false,
|
2681 |
+
false,
|
2682 |
+
true,
|
2683 |
+
true,
|
2684 |
+
true,
|
2685 |
+
true,
|
2686 |
+
false,
|
2687 |
+
true,
|
2688 |
+
true,
|
2689 |
+
false,
|
2690 |
+
true,
|
2691 |
+
false,
|
2692 |
+
true,
|
2693 |
+
true,
|
2694 |
+
true
|
2695 |
+
],
|
2696 |
+
"model.layers.20.self_attn.q_proj.lora_E": [
|
2697 |
+
false,
|
2698 |
+
true,
|
2699 |
+
false,
|
2700 |
+
false,
|
2701 |
+
false,
|
2702 |
+
false,
|
2703 |
+
true,
|
2704 |
+
false,
|
2705 |
+
false,
|
2706 |
+
false,
|
2707 |
+
false,
|
2708 |
+
false,
|
2709 |
+
false,
|
2710 |
+
false,
|
2711 |
+
false,
|
2712 |
+
true,
|
2713 |
+
false,
|
2714 |
+
false,
|
2715 |
+
false,
|
2716 |
+
false,
|
2717 |
+
false,
|
2718 |
+
false,
|
2719 |
+
false,
|
2720 |
+
false,
|
2721 |
+
false,
|
2722 |
+
false,
|
2723 |
+
false,
|
2724 |
+
true,
|
2725 |
+
false,
|
2726 |
+
false,
|
2727 |
+
false,
|
2728 |
+
false,
|
2729 |
+
false,
|
2730 |
+
true,
|
2731 |
+
false,
|
2732 |
+
true,
|
2733 |
+
true,
|
2734 |
+
false,
|
2735 |
+
false,
|
2736 |
+
false,
|
2737 |
+
false,
|
2738 |
+
true,
|
2739 |
+
false,
|
2740 |
+
true,
|
2741 |
+
false,
|
2742 |
+
true,
|
2743 |
+
false,
|
2744 |
+
false,
|
2745 |
+
false,
|
2746 |
+
false,
|
2747 |
+
true,
|
2748 |
+
true,
|
2749 |
+
false,
|
2750 |
+
false,
|
2751 |
+
true,
|
2752 |
+
true,
|
2753 |
+
false,
|
2754 |
+
false,
|
2755 |
+
false,
|
2756 |
+
false,
|
2757 |
+
false,
|
2758 |
+
true,
|
2759 |
+
false,
|
2760 |
+
false
|
2761 |
+
],
|
2762 |
+
"model.layers.20.self_attn.v_proj.lora_E": [
|
2763 |
+
true,
|
2764 |
+
false,
|
2765 |
+
true,
|
2766 |
+
true,
|
2767 |
+
false,
|
2768 |
+
false,
|
2769 |
+
false,
|
2770 |
+
true,
|
2771 |
+
true,
|
2772 |
+
false,
|
2773 |
+
false,
|
2774 |
+
true,
|
2775 |
+
true,
|
2776 |
+
true,
|
2777 |
+
false,
|
2778 |
+
true,
|
2779 |
+
false,
|
2780 |
+
true,
|
2781 |
+
false,
|
2782 |
+
false,
|
2783 |
+
false,
|
2784 |
+
false,
|
2785 |
+
true,
|
2786 |
+
false,
|
2787 |
+
false,
|
2788 |
+
false,
|
2789 |
+
true,
|
2790 |
+
false,
|
2791 |
+
true,
|
2792 |
+
false,
|
2793 |
+
true,
|
2794 |
+
true,
|
2795 |
+
true,
|
2796 |
+
false,
|
2797 |
+
true,
|
2798 |
+
false,
|
2799 |
+
true,
|
2800 |
+
false,
|
2801 |
+
true,
|
2802 |
+
false,
|
2803 |
+
true,
|
2804 |
+
true,
|
2805 |
+
true,
|
2806 |
+
true,
|
2807 |
+
false,
|
2808 |
+
false,
|
2809 |
+
false,
|
2810 |
+
false,
|
2811 |
+
false,
|
2812 |
+
false,
|
2813 |
+
false,
|
2814 |
+
false,
|
2815 |
+
true,
|
2816 |
+
false,
|
2817 |
+
false,
|
2818 |
+
false,
|
2819 |
+
true,
|
2820 |
+
false,
|
2821 |
+
false,
|
2822 |
+
true,
|
2823 |
+
false,
|
2824 |
+
false,
|
2825 |
+
true,
|
2826 |
+
true
|
2827 |
+
],
|
2828 |
+
"model.layers.21.self_attn.q_proj.lora_E": [
|
2829 |
+
false,
|
2830 |
+
false,
|
2831 |
+
true,
|
2832 |
+
false,
|
2833 |
+
true,
|
2834 |
+
true,
|
2835 |
+
true,
|
2836 |
+
true,
|
2837 |
+
true,
|
2838 |
+
false,
|
2839 |
+
true,
|
2840 |
+
true,
|
2841 |
+
true,
|
2842 |
+
true,
|
2843 |
+
true,
|
2844 |
+
true,
|
2845 |
+
false,
|
2846 |
+
false,
|
2847 |
+
false,
|
2848 |
+
false,
|
2849 |
+
true,
|
2850 |
+
true,
|
2851 |
+
false,
|
2852 |
+
true,
|
2853 |
+
true,
|
2854 |
+
true,
|
2855 |
+
true,
|
2856 |
+
false,
|
2857 |
+
false,
|
2858 |
+
false,
|
2859 |
+
false,
|
2860 |
+
false,
|
2861 |
+
false,
|
2862 |
+
false,
|
2863 |
+
true,
|
2864 |
+
true,
|
2865 |
+
true,
|
2866 |
+
false,
|
2867 |
+
true,
|
2868 |
+
false,
|
2869 |
+
true,
|
2870 |
+
false,
|
2871 |
+
true,
|
2872 |
+
false,
|
2873 |
+
false,
|
2874 |
+
false,
|
2875 |
+
true,
|
2876 |
+
false,
|
2877 |
+
true,
|
2878 |
+
true,
|
2879 |
+
true,
|
2880 |
+
true,
|
2881 |
+
true,
|
2882 |
+
false,
|
2883 |
+
false,
|
2884 |
+
true,
|
2885 |
+
true,
|
2886 |
+
false,
|
2887 |
+
true,
|
2888 |
+
true,
|
2889 |
+
false,
|
2890 |
+
false,
|
2891 |
+
true,
|
2892 |
+
true
|
2893 |
+
],
|
2894 |
+
"model.layers.21.self_attn.v_proj.lora_E": [
|
2895 |
+
true,
|
2896 |
+
true,
|
2897 |
+
true,
|
2898 |
+
true,
|
2899 |
+
true,
|
2900 |
+
false,
|
2901 |
+
true,
|
2902 |
+
true,
|
2903 |
+
false,
|
2904 |
+
true,
|
2905 |
+
true,
|
2906 |
+
false,
|
2907 |
+
false,
|
2908 |
+
true,
|
2909 |
+
true,
|
2910 |
+
false,
|
2911 |
+
true,
|
2912 |
+
true,
|
2913 |
+
false,
|
2914 |
+
true,
|
2915 |
+
true,
|
2916 |
+
true,
|
2917 |
+
true,
|
2918 |
+
false,
|
2919 |
+
false,
|
2920 |
+
false,
|
2921 |
+
false,
|
2922 |
+
false,
|
2923 |
+
true,
|
2924 |
+
true,
|
2925 |
+
true,
|
2926 |
+
false,
|
2927 |
+
true,
|
2928 |
+
true,
|
2929 |
+
false,
|
2930 |
+
true,
|
2931 |
+
true,
|
2932 |
+
false,
|
2933 |
+
true,
|
2934 |
+
true,
|
2935 |
+
false,
|
2936 |
+
false,
|
2937 |
+
false,
|
2938 |
+
true,
|
2939 |
+
true,
|
2940 |
+
false,
|
2941 |
+
false,
|
2942 |
+
false,
|
2943 |
+
true,
|
2944 |
+
true,
|
2945 |
+
false,
|
2946 |
+
true,
|
2947 |
+
true,
|
2948 |
+
false,
|
2949 |
+
false,
|
2950 |
+
false,
|
2951 |
+
false,
|
2952 |
+
true,
|
2953 |
+
false,
|
2954 |
+
false,
|
2955 |
+
false,
|
2956 |
+
false,
|
2957 |
+
false,
|
2958 |
+
false
|
2959 |
+
],
|
2960 |
+
"model.layers.22.self_attn.q_proj.lora_E": [
|
2961 |
+
false,
|
2962 |
+
true,
|
2963 |
+
false,
|
2964 |
+
true,
|
2965 |
+
true,
|
2966 |
+
false,
|
2967 |
+
true,
|
2968 |
+
false,
|
2969 |
+
false,
|
2970 |
+
true,
|
2971 |
+
false,
|
2972 |
+
false,
|
2973 |
+
false,
|
2974 |
+
false,
|
2975 |
+
false,
|
2976 |
+
false,
|
2977 |
+
true,
|
2978 |
+
false,
|
2979 |
+
true,
|
2980 |
+
true,
|
2981 |
+
false,
|
2982 |
+
false,
|
2983 |
+
false,
|
2984 |
+
false,
|
2985 |
+
true,
|
2986 |
+
true,
|
2987 |
+
true,
|
2988 |
+
false,
|
2989 |
+
false,
|
2990 |
+
true,
|
2991 |
+
false,
|
2992 |
+
false,
|
2993 |
+
false,
|
2994 |
+
false,
|
2995 |
+
false,
|
2996 |
+
true,
|
2997 |
+
false,
|
2998 |
+
false,
|
2999 |
+
false,
|
3000 |
+
false,
|
3001 |
+
true,
|
3002 |
+
false,
|
3003 |
+
false,
|
3004 |
+
false,
|
3005 |
+
true,
|
3006 |
+
false,
|
3007 |
+
true,
|
3008 |
+
false,
|
3009 |
+
false,
|
3010 |
+
false,
|
3011 |
+
true,
|
3012 |
+
false,
|
3013 |
+
true,
|
3014 |
+
true,
|
3015 |
+
true,
|
3016 |
+
false,
|
3017 |
+
false,
|
3018 |
+
true,
|
3019 |
+
false,
|
3020 |
+
false,
|
3021 |
+
true,
|
3022 |
+
true,
|
3023 |
+
false,
|
3024 |
+
true
|
3025 |
+
],
|
3026 |
+
"model.layers.22.self_attn.v_proj.lora_E": [
|
3027 |
+
false,
|
3028 |
+
true,
|
3029 |
+
true,
|
3030 |
+
true,
|
3031 |
+
true,
|
3032 |
+
false,
|
3033 |
+
false,
|
3034 |
+
true,
|
3035 |
+
true,
|
3036 |
+
true,
|
3037 |
+
true,
|
3038 |
+
false,
|
3039 |
+
true,
|
3040 |
+
true,
|
3041 |
+
false,
|
3042 |
+
true,
|
3043 |
+
true,
|
3044 |
+
true,
|
3045 |
+
true,
|
3046 |
+
true,
|
3047 |
+
true,
|
3048 |
+
true,
|
3049 |
+
true,
|
3050 |
+
true,
|
3051 |
+
false,
|
3052 |
+
false,
|
3053 |
+
true,
|
3054 |
+
true,
|
3055 |
+
false,
|
3056 |
+
true,
|
3057 |
+
false,
|
3058 |
+
false,
|
3059 |
+
true,
|
3060 |
+
false,
|
3061 |
+
false,
|
3062 |
+
false,
|
3063 |
+
false,
|
3064 |
+
true,
|
3065 |
+
false,
|
3066 |
+
false,
|
3067 |
+
false,
|
3068 |
+
true,
|
3069 |
+
true,
|
3070 |
+
false,
|
3071 |
+
false,
|
3072 |
+
false,
|
3073 |
+
true,
|
3074 |
+
true,
|
3075 |
+
false,
|
3076 |
+
false,
|
3077 |
+
true,
|
3078 |
+
true,
|
3079 |
+
true,
|
3080 |
+
true,
|
3081 |
+
true,
|
3082 |
+
true,
|
3083 |
+
true,
|
3084 |
+
true,
|
3085 |
+
false,
|
3086 |
+
false,
|
3087 |
+
true,
|
3088 |
+
false,
|
3089 |
+
true,
|
3090 |
+
true
|
3091 |
+
],
|
3092 |
+
"model.layers.23.self_attn.q_proj.lora_E": [
|
3093 |
+
true,
|
3094 |
+
false,
|
3095 |
+
true,
|
3096 |
+
false,
|
3097 |
+
true,
|
3098 |
+
true,
|
3099 |
+
true,
|
3100 |
+
true,
|
3101 |
+
true,
|
3102 |
+
false,
|
3103 |
+
true,
|
3104 |
+
true,
|
3105 |
+
true,
|
3106 |
+
true,
|
3107 |
+
true,
|
3108 |
+
true,
|
3109 |
+
true,
|
3110 |
+
true,
|
3111 |
+
false,
|
3112 |
+
false,
|
3113 |
+
true,
|
3114 |
+
true,
|
3115 |
+
true,
|
3116 |
+
true,
|
3117 |
+
true,
|
3118 |
+
false,
|
3119 |
+
true,
|
3120 |
+
true,
|
3121 |
+
true,
|
3122 |
+
true,
|
3123 |
+
true,
|
3124 |
+
true,
|
3125 |
+
true,
|
3126 |
+
true,
|
3127 |
+
true,
|
3128 |
+
true,
|
3129 |
+
true,
|
3130 |
+
true,
|
3131 |
+
true,
|
3132 |
+
true,
|
3133 |
+
false,
|
3134 |
+
true,
|
3135 |
+
true,
|
3136 |
+
false,
|
3137 |
+
true,
|
3138 |
+
false,
|
3139 |
+
true,
|
3140 |
+
false,
|
3141 |
+
true,
|
3142 |
+
true,
|
3143 |
+
true,
|
3144 |
+
true,
|
3145 |
+
true,
|
3146 |
+
true,
|
3147 |
+
true,
|
3148 |
+
false,
|
3149 |
+
true,
|
3150 |
+
false,
|
3151 |
+
true,
|
3152 |
+
true,
|
3153 |
+
true,
|
3154 |
+
false,
|
3155 |
+
true,
|
3156 |
+
true
|
3157 |
+
],
|
3158 |
+
"model.layers.23.self_attn.v_proj.lora_E": [
|
3159 |
+
false,
|
3160 |
+
true,
|
3161 |
+
false,
|
3162 |
+
true,
|
3163 |
+
false,
|
3164 |
+
false,
|
3165 |
+
true,
|
3166 |
+
false,
|
3167 |
+
true,
|
3168 |
+
false,
|
3169 |
+
true,
|
3170 |
+
true,
|
3171 |
+
true,
|
3172 |
+
true,
|
3173 |
+
true,
|
3174 |
+
true,
|
3175 |
+
false,
|
3176 |
+
true,
|
3177 |
+
true,
|
3178 |
+
true,
|
3179 |
+
true,
|
3180 |
+
true,
|
3181 |
+
true,
|
3182 |
+
true,
|
3183 |
+
true,
|
3184 |
+
true,
|
3185 |
+
true,
|
3186 |
+
true,
|
3187 |
+
true,
|
3188 |
+
false,
|
3189 |
+
false,
|
3190 |
+
true,
|
3191 |
+
false,
|
3192 |
+
true,
|
3193 |
+
true,
|
3194 |
+
false,
|
3195 |
+
true,
|
3196 |
+
false,
|
3197 |
+
true,
|
3198 |
+
true,
|
3199 |
+
true,
|
3200 |
+
true,
|
3201 |
+
true,
|
3202 |
+
true,
|
3203 |
+
true,
|
3204 |
+
true,
|
3205 |
+
true,
|
3206 |
+
true,
|
3207 |
+
true,
|
3208 |
+
true,
|
3209 |
+
false,
|
3210 |
+
true,
|
3211 |
+
true,
|
3212 |
+
true,
|
3213 |
+
false,
|
3214 |
+
false,
|
3215 |
+
false,
|
3216 |
+
true,
|
3217 |
+
true,
|
3218 |
+
false,
|
3219 |
+
true,
|
3220 |
+
false,
|
3221 |
+
true,
|
3222 |
+
true
|
3223 |
+
],
|
3224 |
+
"model.layers.24.self_attn.q_proj.lora_E": [
|
3225 |
+
true,
|
3226 |
+
true,
|
3227 |
+
true,
|
3228 |
+
true,
|
3229 |
+
true,
|
3230 |
+
true,
|
3231 |
+
false,
|
3232 |
+
true,
|
3233 |
+
true,
|
3234 |
+
true,
|
3235 |
+
true,
|
3236 |
+
false,
|
3237 |
+
true,
|
3238 |
+
true,
|
3239 |
+
true,
|
3240 |
+
false,
|
3241 |
+
true,
|
3242 |
+
true,
|
3243 |
+
true,
|
3244 |
+
true,
|
3245 |
+
true,
|
3246 |
+
true,
|
3247 |
+
false,
|
3248 |
+
false,
|
3249 |
+
true,
|
3250 |
+
true,
|
3251 |
+
true,
|
3252 |
+
true,
|
3253 |
+
false,
|
3254 |
+
true,
|
3255 |
+
true,
|
3256 |
+
true,
|
3257 |
+
true,
|
3258 |
+
true,
|
3259 |
+
true,
|
3260 |
+
true,
|
3261 |
+
true,
|
3262 |
+
true,
|
3263 |
+
false,
|
3264 |
+
true,
|
3265 |
+
true,
|
3266 |
+
true,
|
3267 |
+
false,
|
3268 |
+
true,
|
3269 |
+
true,
|
3270 |
+
true,
|
3271 |
+
true,
|
3272 |
+
false,
|
3273 |
+
true,
|
3274 |
+
false,
|
3275 |
+
true,
|
3276 |
+
true,
|
3277 |
+
true,
|
3278 |
+
true,
|
3279 |
+
false,
|
3280 |
+
false,
|
3281 |
+
false,
|
3282 |
+
true,
|
3283 |
+
true,
|
3284 |
+
true,
|
3285 |
+
true,
|
3286 |
+
false,
|
3287 |
+
false,
|
3288 |
+
true
|
3289 |
+
],
|
3290 |
+
"model.layers.24.self_attn.v_proj.lora_E": [
|
3291 |
+
true,
|
3292 |
+
true,
|
3293 |
+
true,
|
3294 |
+
false,
|
3295 |
+
true,
|
3296 |
+
false,
|
3297 |
+
false,
|
3298 |
+
true,
|
3299 |
+
true,
|
3300 |
+
true,
|
3301 |
+
false,
|
3302 |
+
true,
|
3303 |
+
true,
|
3304 |
+
false,
|
3305 |
+
false,
|
3306 |
+
true,
|
3307 |
+
false,
|
3308 |
+
false,
|
3309 |
+
false,
|
3310 |
+
false,
|
3311 |
+
true,
|
3312 |
+
true,
|
3313 |
+
true,
|
3314 |
+
false,
|
3315 |
+
true,
|
3316 |
+
false,
|
3317 |
+
false,
|
3318 |
+
true,
|
3319 |
+
false,
|
3320 |
+
true,
|
3321 |
+
false,
|
3322 |
+
true,
|
3323 |
+
true,
|
3324 |
+
false,
|
3325 |
+
true,
|
3326 |
+
true,
|
3327 |
+
false,
|
3328 |
+
false,
|
3329 |
+
false,
|
3330 |
+
true,
|
3331 |
+
false,
|
3332 |
+
false,
|
3333 |
+
true,
|
3334 |
+
true,
|
3335 |
+
false,
|
3336 |
+
true,
|
3337 |
+
true,
|
3338 |
+
false,
|
3339 |
+
false,
|
3340 |
+
true,
|
3341 |
+
true,
|
3342 |
+
true,
|
3343 |
+
true,
|
3344 |
+
false,
|
3345 |
+
false,
|
3346 |
+
true,
|
3347 |
+
true,
|
3348 |
+
true,
|
3349 |
+
false,
|
3350 |
+
true,
|
3351 |
+
false,
|
3352 |
+
true,
|
3353 |
+
true,
|
3354 |
+
true
|
3355 |
+
],
|
3356 |
+
"model.layers.25.self_attn.q_proj.lora_E": [
|
3357 |
+
false,
|
3358 |
+
false,
|
3359 |
+
false,
|
3360 |
+
false,
|
3361 |
+
true,
|
3362 |
+
true,
|
3363 |
+
false,
|
3364 |
+
true,
|
3365 |
+
true,
|
3366 |
+
false,
|
3367 |
+
false,
|
3368 |
+
false,
|
3369 |
+
false,
|
3370 |
+
false,
|
3371 |
+
false,
|
3372 |
+
false,
|
3373 |
+
true,
|
3374 |
+
false,
|
3375 |
+
false,
|
3376 |
+
true,
|
3377 |
+
false,
|
3378 |
+
false,
|
3379 |
+
true,
|
3380 |
+
false,
|
3381 |
+
false,
|
3382 |
+
false,
|
3383 |
+
true,
|
3384 |
+
false,
|
3385 |
+
false,
|
3386 |
+
false,
|
3387 |
+
false,
|
3388 |
+
false,
|
3389 |
+
false,
|
3390 |
+
false,
|
3391 |
+
false,
|
3392 |
+
false,
|
3393 |
+
false,
|
3394 |
+
false,
|
3395 |
+
false,
|
3396 |
+
false,
|
3397 |
+
false,
|
3398 |
+
false,
|
3399 |
+
false,
|
3400 |
+
false,
|
3401 |
+
true,
|
3402 |
+
false,
|
3403 |
+
false,
|
3404 |
+
false,
|
3405 |
+
false,
|
3406 |
+
false,
|
3407 |
+
true,
|
3408 |
+
false,
|
3409 |
+
false,
|
3410 |
+
false,
|
3411 |
+
true,
|
3412 |
+
true,
|
3413 |
+
true,
|
3414 |
+
false,
|
3415 |
+
false,
|
3416 |
+
false,
|
3417 |
+
false,
|
3418 |
+
false,
|
3419 |
+
false,
|
3420 |
+
false
|
3421 |
+
],
|
3422 |
+
"model.layers.25.self_attn.v_proj.lora_E": [
|
3423 |
+
false,
|
3424 |
+
false,
|
3425 |
+
false,
|
3426 |
+
true,
|
3427 |
+
false,
|
3428 |
+
false,
|
3429 |
+
false,
|
3430 |
+
true,
|
3431 |
+
true,
|
3432 |
+
false,
|
3433 |
+
false,
|
3434 |
+
true,
|
3435 |
+
false,
|
3436 |
+
true,
|
3437 |
+
true,
|
3438 |
+
true,
|
3439 |
+
false,
|
3440 |
+
false,
|
3441 |
+
false,
|
3442 |
+
false,
|
3443 |
+
true,
|
3444 |
+
false,
|
3445 |
+
false,
|
3446 |
+
false,
|
3447 |
+
true,
|
3448 |
+
true,
|
3449 |
+
true,
|
3450 |
+
true,
|
3451 |
+
false,
|
3452 |
+
false,
|
3453 |
+
false,
|
3454 |
+
false,
|
3455 |
+
true,
|
3456 |
+
false,
|
3457 |
+
false,
|
3458 |
+
false,
|
3459 |
+
false,
|
3460 |
+
true,
|
3461 |
+
false,
|
3462 |
+
true,
|
3463 |
+
false,
|
3464 |
+
false,
|
3465 |
+
false,
|
3466 |
+
false,
|
3467 |
+
false,
|
3468 |
+
false,
|
3469 |
+
false,
|
3470 |
+
false,
|
3471 |
+
false,
|
3472 |
+
false,
|
3473 |
+
false,
|
3474 |
+
false,
|
3475 |
+
false,
|
3476 |
+
false,
|
3477 |
+
true,
|
3478 |
+
false,
|
3479 |
+
false,
|
3480 |
+
false,
|
3481 |
+
false,
|
3482 |
+
false,
|
3483 |
+
false,
|
3484 |
+
false,
|
3485 |
+
false,
|
3486 |
+
false
|
3487 |
+
],
|
3488 |
+
"model.layers.26.self_attn.q_proj.lora_E": [
|
3489 |
+
true,
|
3490 |
+
false,
|
3491 |
+
false,
|
3492 |
+
true,
|
3493 |
+
false,
|
3494 |
+
false,
|
3495 |
+
false,
|
3496 |
+
false,
|
3497 |
+
false,
|
3498 |
+
false,
|
3499 |
+
true,
|
3500 |
+
false,
|
3501 |
+
true,
|
3502 |
+
false,
|
3503 |
+
true,
|
3504 |
+
true,
|
3505 |
+
true,
|
3506 |
+
false,
|
3507 |
+
false,
|
3508 |
+
true,
|
3509 |
+
true,
|
3510 |
+
true,
|
3511 |
+
false,
|
3512 |
+
false,
|
3513 |
+
true,
|
3514 |
+
true,
|
3515 |
+
false,
|
3516 |
+
false,
|
3517 |
+
true,
|
3518 |
+
false,
|
3519 |
+
true,
|
3520 |
+
true,
|
3521 |
+
false,
|
3522 |
+
false,
|
3523 |
+
false,
|
3524 |
+
true,
|
3525 |
+
true,
|
3526 |
+
false,
|
3527 |
+
false,
|
3528 |
+
false,
|
3529 |
+
true,
|
3530 |
+
false,
|
3531 |
+
false,
|
3532 |
+
false,
|
3533 |
+
true,
|
3534 |
+
true,
|
3535 |
+
false,
|
3536 |
+
false,
|
3537 |
+
true,
|
3538 |
+
false,
|
3539 |
+
true,
|
3540 |
+
true,
|
3541 |
+
false,
|
3542 |
+
true,
|
3543 |
+
false,
|
3544 |
+
false,
|
3545 |
+
true,
|
3546 |
+
true,
|
3547 |
+
true,
|
3548 |
+
false,
|
3549 |
+
true,
|
3550 |
+
true,
|
3551 |
+
true,
|
3552 |
+
true
|
3553 |
+
],
|
3554 |
+
"model.layers.26.self_attn.v_proj.lora_E": [
|
3555 |
+
false,
|
3556 |
+
false,
|
3557 |
+
true,
|
3558 |
+
false,
|
3559 |
+
true,
|
3560 |
+
false,
|
3561 |
+
false,
|
3562 |
+
false,
|
3563 |
+
true,
|
3564 |
+
false,
|
3565 |
+
false,
|
3566 |
+
false,
|
3567 |
+
false,
|
3568 |
+
false,
|
3569 |
+
true,
|
3570 |
+
false,
|
3571 |
+
false,
|
3572 |
+
false,
|
3573 |
+
false,
|
3574 |
+
false,
|
3575 |
+
false,
|
3576 |
+
false,
|
3577 |
+
true,
|
3578 |
+
false,
|
3579 |
+
false,
|
3580 |
+
true,
|
3581 |
+
false,
|
3582 |
+
true,
|
3583 |
+
false,
|
3584 |
+
true,
|
3585 |
+
true,
|
3586 |
+
false,
|
3587 |
+
false,
|
3588 |
+
false,
|
3589 |
+
false,
|
3590 |
+
false,
|
3591 |
+
false,
|
3592 |
+
false,
|
3593 |
+
false,
|
3594 |
+
false,
|
3595 |
+
true,
|
3596 |
+
false,
|
3597 |
+
false,
|
3598 |
+
false,
|
3599 |
+
false,
|
3600 |
+
false,
|
3601 |
+
false,
|
3602 |
+
false,
|
3603 |
+
true,
|
3604 |
+
true,
|
3605 |
+
false,
|
3606 |
+
false,
|
3607 |
+
false,
|
3608 |
+
true,
|
3609 |
+
false,
|
3610 |
+
true,
|
3611 |
+
false,
|
3612 |
+
true,
|
3613 |
+
false,
|
3614 |
+
false,
|
3615 |
+
false,
|
3616 |
+
true,
|
3617 |
+
false,
|
3618 |
+
false
|
3619 |
+
],
|
3620 |
+
"model.layers.27.self_attn.q_proj.lora_E": [
|
3621 |
+
true,
|
3622 |
+
false,
|
3623 |
+
false,
|
3624 |
+
true,
|
3625 |
+
true,
|
3626 |
+
false,
|
3627 |
+
false,
|
3628 |
+
true,
|
3629 |
+
true,
|
3630 |
+
false,
|
3631 |
+
false,
|
3632 |
+
false,
|
3633 |
+
true,
|
3634 |
+
true,
|
3635 |
+
false,
|
3636 |
+
true,
|
3637 |
+
false,
|
3638 |
+
false,
|
3639 |
+
true,
|
3640 |
+
false,
|
3641 |
+
false,
|
3642 |
+
true,
|
3643 |
+
true,
|
3644 |
+
true,
|
3645 |
+
true,
|
3646 |
+
false,
|
3647 |
+
false,
|
3648 |
+
true,
|
3649 |
+
true,
|
3650 |
+
false,
|
3651 |
+
false,
|
3652 |
+
false,
|
3653 |
+
false,
|
3654 |
+
true,
|
3655 |
+
true,
|
3656 |
+
true,
|
3657 |
+
false,
|
3658 |
+
true,
|
3659 |
+
false,
|
3660 |
+
false,
|
3661 |
+
false,
|
3662 |
+
true,
|
3663 |
+
false,
|
3664 |
+
true,
|
3665 |
+
true,
|
3666 |
+
true,
|
3667 |
+
false,
|
3668 |
+
false,
|
3669 |
+
false,
|
3670 |
+
true,
|
3671 |
+
true,
|
3672 |
+
true,
|
3673 |
+
true,
|
3674 |
+
true,
|
3675 |
+
false,
|
3676 |
+
false,
|
3677 |
+
false,
|
3678 |
+
false,
|
3679 |
+
true,
|
3680 |
+
false,
|
3681 |
+
false,
|
3682 |
+
false,
|
3683 |
+
true,
|
3684 |
+
false
|
3685 |
+
],
|
3686 |
+
"model.layers.27.self_attn.v_proj.lora_E": [
|
3687 |
+
false,
|
3688 |
+
false,
|
3689 |
+
true,
|
3690 |
+
true,
|
3691 |
+
true,
|
3692 |
+
true,
|
3693 |
+
true,
|
3694 |
+
true,
|
3695 |
+
true,
|
3696 |
+
false,
|
3697 |
+
false,
|
3698 |
+
false,
|
3699 |
+
true,
|
3700 |
+
false,
|
3701 |
+
false,
|
3702 |
+
false,
|
3703 |
+
true,
|
3704 |
+
true,
|
3705 |
+
false,
|
3706 |
+
false,
|
3707 |
+
false,
|
3708 |
+
true,
|
3709 |
+
false,
|
3710 |
+
true,
|
3711 |
+
true,
|
3712 |
+
true,
|
3713 |
+
true,
|
3714 |
+
true,
|
3715 |
+
false,
|
3716 |
+
true,
|
3717 |
+
true,
|
3718 |
+
false,
|
3719 |
+
true,
|
3720 |
+
false,
|
3721 |
+
true,
|
3722 |
+
true,
|
3723 |
+
false,
|
3724 |
+
true,
|
3725 |
+
true,
|
3726 |
+
false,
|
3727 |
+
false,
|
3728 |
+
true,
|
3729 |
+
false,
|
3730 |
+
true,
|
3731 |
+
true,
|
3732 |
+
false,
|
3733 |
+
false,
|
3734 |
+
true,
|
3735 |
+
false,
|
3736 |
+
true,
|
3737 |
+
true,
|
3738 |
+
true,
|
3739 |
+
false,
|
3740 |
+
false,
|
3741 |
+
true,
|
3742 |
+
false,
|
3743 |
+
false,
|
3744 |
+
true,
|
3745 |
+
true,
|
3746 |
+
true,
|
3747 |
+
true,
|
3748 |
+
true,
|
3749 |
+
false,
|
3750 |
+
true
|
3751 |
+
]
|
3752 |
+
},
      "alpha_pattern": {},
      "megatron_config": null,
      "megatron_core": "megatron.core",
      "trainable_token_indices": null,
      "loftq_config": {},
      "eva_config": null,
      "corda_config": null,
      "use_dora": false,
      "layer_replication": null,
      "lora_bias": false,
      "target_r": 32,
      "init_r": 64,
      "tinit": 200,
      "tfinal": 500,
      "deltaT": 1,
      "beta1": 0.85,
      "beta2": 0.85,
      "orth_reg_weight": 0.5,
      "total_step": 5000
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 12361399900,
    "cuda_memory_max": 22793945088,
    "cuda_memory_reserved_99th": 18203426160,
    "train_time": 1986.3603882369862,
    "file_size": 35147440,
    "num_trainable_params": 18353664,
    "num_total_params": 3231103544,
    "status": "success",
    "metrics": [
      {"step": 250, "valid accuracy": 0.0, "train loss": 1.3241184422969818, "train samples": 1000, "train time": 35.95594502204767, "eval time": 11.413120707002236, "tokens / sec": 5888.289123542072, "mem allocated avg": 7292959393.792, "mem reserved avg": 12441731727.36, "elapsed time": 100.98083375500573},
      {"step": 500, "valid accuracy": 0.38, "train loss": 1.0195633232593537, "train samples": 2000, "train time": 37.64258231502754, "eval time": 11.37802824100072, "tokens / sec": 5525.524212428035, "mem allocated avg": 7285510731.776, "mem reserved avg": 12328493907.968, "elapsed time": 197.93603045200143},
      {"step": 750, "valid accuracy": 0.28, "train loss": 0.7883218789100647, "train samples": 3000, "train time": 37.909325722001086, "eval time": 11.385932488003164, "tokens / sec": 5655.626838954038, "mem allocated avg": 7296095842.304, "mem reserved avg": 12484438130.688, "elapsed time": 295.9188707240028},
      {"step": 1000, "valid accuracy": 0.3, "train loss": 0.7408825470209122, "train samples": 4000, "train time": 37.79932949803333, "eval time": 11.34964040399791, "tokens / sec": 5511.6321576772825, "mem allocated avg": 7286506670.08, "mem reserved avg": 12351948455.936, "elapsed time": 393.33776786700037},
      {"step": 1250, "valid accuracy": 0.36, "train loss": 0.7282904219627381, "train samples": 5000, "train time": 37.475317073069164, "eval time": 11.342822429993248, "tokens / sec": 5564.676066473135, "mem allocated avg": 7287005519.872, "mem reserved avg": 12349910024.192, "elapsed time": 490.5430299360014},
      {"step": 1500, "valid accuracy": 0.38, "train loss": 0.7161256531476975, "train samples": 6000, "train time": 37.660518338059774, "eval time": 11.34013032400253, "tokens / sec": 5558.367469107556, "mem allocated avg": 7287642494.976, "mem reserved avg": 12380570386.432, "elapsed time": 588.017992052999},
      {"step": 1750, "valid accuracy": 0.34, "train loss": 0.7056601424217224, "train samples": 7000, "train time": 37.636171496975294, "eval time": 11.3171367870018, "tokens / sec": 5562.600861695649, "mem allocated avg": 7289782888.448, "mem reserved avg": 12389051269.12, "elapsed time": 685.2421731229988},
      {"step": 2000, "valid accuracy": 0.34, "train loss": 0.7058932571411133, "train samples": 8000, "train time": 37.505602380944765, "eval time": 11.37751964799827, "tokens / sec": 5537.732680318789, "mem allocated avg": 7287054886.912, "mem reserved avg": 12336119152.64, "elapsed time": 782.1823508529997},
      {"step": 2250, "valid accuracy": 0.3, "train loss": 0.700018577337265, "train samples": 9000, "train time": 38.06487834800646, "eval time": 11.33160761000181, "tokens / sec": 5646.885247730137, "mem allocated avg": 7297638139.904, "mem reserved avg": 12521129902.08, "elapsed time": 880.444039299},
      {"step": 2500, "valid accuracy": 0.34, "train loss": 0.6984639673233032, "train samples": 10000, "train time": 37.400825600088865, "eval time": 7.680036880999978, "tokens / sec": 5507.017470745635, "mem allocated avg": 7283608303.616, "mem reserved avg": 12278598467.584, "elapsed time": 973.4031999860017},
      {"step": 2750, "valid accuracy": 0.32, "train loss": 0.691307947397232, "train samples": 11000, "train time": 37.97861938195274, "eval time": 11.376824188999308, "tokens / sec": 5578.954776346737, "mem allocated avg": 7293332232.192, "mem reserved avg": 12452821467.136, "elapsed time": 1071.2981272770048},
      {"step": 3000, "valid accuracy": 0.3, "train loss": 0.6851879090070725, "train samples": 12000, "train time": 37.862704559986014, "eval time": 11.377599911000289, "tokens / sec": 5512.839149387935, "mem allocated avg": 7288929478.656, "mem reserved avg": 12371468746.752, "elapsed time": 1168.7257358770003},
      {"step": 3250, "valid accuracy": 0.34, "train loss": 0.6939580011367797, "train samples": 13000, "train time": 37.79518606400961, "eval time": 7.2029460159974406, "tokens / sec": 5580.102176050141, "mem allocated avg": 7290687285.248, "mem reserved avg": 12403068633.088, "elapsed time": 1261.9857917680056},
      {"step": 3500, "valid accuracy": 0.4, "train loss": 0.6825792235136032, "train samples": 14000, "train time": 37.73422463506722, "eval time": 11.28984081800445, "tokens / sec": 5558.614282617983, "mem allocated avg": 7289277476.864, "mem reserved avg": 12381820289.024, "elapsed time": 1359.695578400002},
      {"step": 3750, "valid accuracy": 0.34, "train loss": 0.6795008780956269, "train samples": 15000, "train time": 38.156728624038806, "eval time": 11.362600938999094, "tokens / sec": 5679.286663570962, "mem allocated avg": 7299185600.512, "mem reserved avg": 12562561236.992, "elapsed time": 1458.6053942910003},
      {"step": 4000, "valid accuracy": 0.32, "train loss": 0.6967895623445511, "train samples": 16000, "train time": 37.352128309052205, "eval time": 11.363241717001074, "tokens / sec": 5471.522219805362, "mem allocated avg": 7281535514.624, "mem reserved avg": 12256066666.496, "elapsed time": 1555.2909630150025},
      {"step": 4250, "valid accuracy": 0.34, "train loss": 0.6776066061258316, "train samples": 17000, "train time": 37.65609644694632, "eval time": 11.334564828997827, "tokens / sec": 5613.672683726684, "mem allocated avg": 7291894349.824, "mem reserved avg": 12418562392.064, "elapsed time": 1652.928281804001},
      {"step": 4500, "valid accuracy": 0.34, "train loss": 0.6868188911676407, "train samples": 18000, "train time": 37.48494880297949, "eval time": 11.33762150000257, "tokens / sec": 5544.038517760537, "mem allocated avg": 7285549684.736, "mem reserved avg": 12333837451.264, "elapsed time": 1749.9311109990012},
      {"step": 4750, "valid accuracy": 0.34, "train loss": 0.6806062284708023, "train samples": 19000, "train time": 33.62080936400889, "eval time": 11.34113016500487, "tokens / sec": 6244.31725384755, "mem allocated avg": 7068488509.44, "mem reserved avg": 12120833916.928, "elapsed time": 1843.633759463999},
      {"step": 5000, "valid accuracy": 0.28, "train loss": 0.6862971596717834, "train samples": 20000, "train time": 33.47089828590106, "eval time": 11.363945298006001, "tokens / sec": 6222.7191580255185, "mem allocated avg": 7065409925.12, "mem reserved avg": 12064965787.648, "elapsed time": 1937.0431615920024},
      {"step": 5000, "test accuracy": 0.3904473085670963, "train loss": 0.6862971596717834, "train samples": 20000, "train total tokens": 4198051}
    ]
  },
  "meta_info": {
    "model_info": {"sha": "13afe5124825b4f3751f836b40dafda64c1ed062", "created_at": "2024-09-18T15:23:48+00:00"},
    "dataset_info": {
      "metamath": {"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", "created_at": "2023-09-21T17:22:46+00:00"},
      "gsm8k": {"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", "created_at": "2022-04-12T10:22:10+00:00"}
    },
    "package_info": {"transformers-version": "4.52.4", "transformers-commit-hash": null, "peft-version": "0.15.2.dev0", "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", "datasets-version": "3.6.0", "datasets-commit-hash": null, "bitsandbytes-version": "0.46.0", "bitsandbytes-commit-hash": null, "torch-version": "2.7.1+cu126", "torch-commit-hash": null},
    "system_info": {"system": "Linux", "release": "6.8.0-1029-aws", "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", "machine": "x86_64", "processor": "x86_64", "gpu": "NVIDIA L40S"},
"pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
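The `rank_pattern` block in the AdaLoRA result above records, per targeted module, which of the `init_r = 64` singular-value slots in `lora_E` survived rank pruning; the allocator prunes the budget toward `target_r = 32` between step `tinit = 200` and step `total_step - tfinal`, then freezes the pattern for the last `tfinal = 500` steps. A minimal sketch of how this configuration maps onto peft, assuming the `AdaLoraConfig` API of the peft version logged above and inferring `target_modules` from the `rank_pattern` keys:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import AdaLoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B", torch_dtype=torch.bfloat16
)

config = AdaLoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # assumption: inferred from the rank_pattern keys
    init_r=64,            # every adapter starts with 64 singular-value slots
    target_r=32,          # average rank budget the allocator prunes toward
    tinit=200,            # warmup steps before pruning starts
    tfinal=500,           # final steps with the rank pattern frozen
    deltaT=1,             # allocation interval between tinit and total_step - tfinal
    beta1=0.85,
    beta2=0.85,
    orth_reg_weight=0.5,  # weight of the orthogonality regularizer on lora_A/lora_B
    total_step=5000,
)
model = get_peft_model(base, config)
model.print_trainable_parameters()
```

During training the allocator is advanced by calling `model.base_model.update_and_allocate(global_step)` after each optimizer step; the boolean masks stored in the config above are what that process leaves behind.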
MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json
ADDED
@@ -0,0 +1,341 @@
{
  "run_info": {
    "created_at": "2025-06-20T04:48:22+00:00",
    "total_time": 2260.6744696069945,
    "experiment_name": "adaptionprompt/llama-3.2-3B-lr_0.0005",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {"lr": 0.0005},
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {"max_length": 800, "max_new_tokens": 300},
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": "CAUSAL_LM",
      "peft_type": "ADAPTION_PROMPT",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "target_modules": "self_attn",
      "adapter_len": 100,
      "adapter_layers": 28
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 11893757234,
    "cuda_memory_max": 22410166272,
    "cuda_memory_reserved_99th": 17907664814,
    "train_time": 1989.2834085189897,
    "file_size": 17210384,
    "num_trainable_params": 8601628,
    "num_total_params": 3221351452,
    "status": "success",
    "metrics": [
      {"step": 250, "valid accuracy": 0.0, "train loss": 1.3201356165409088, "train samples": 1000, "train time": 36.18721537806414, "eval time": 13.46754032199533, "tokens / sec": 5850.657415556191, "mem allocated avg": 6848060076.032, "mem reserved avg": 11943163199.488, "elapsed time": 99.94861951399798},
      {"step": 500, "valid accuracy": 0.1, "train loss": 1.153662922859192, "train samples": 2000, "train time": 35.6493088029747, "eval time": 13.314302301005227, "tokens / sec": 5834.474972559473, "mem allocated avg": 6840933136.384, "mem reserved avg": 11833045942.272, "elapsed time": 193.4177081749949},
      {"step": 750, "valid accuracy": 0.22, "train loss": 0.9016587936878204, "train samples": 3000, "train time": 36.424757257977035, "eval time": 13.392894379001518, "tokens / sec": 5886.133941305707, "mem allocated avg": 6851972698.112, "mem reserved avg": 11989870968.832, "elapsed time": 288.2962625699947},
      {"step": 1000, "valid accuracy": 0.2, "train loss": 0.8571369113922119, "train samples": 4000, "train time": 35.59983186099271, "eval time": 13.363479856001504, "tokens / sec": 5852.1624712581015, "mem allocated avg": 6842572642.304, "mem reserved avg": 11863001661.44, "elapsed time": 381.66334240599826},
      {"step": 1250, "valid accuracy": 0.18, "train loss": 0.84929132604599, "train samples": 5000, "train time": 35.52914607799903, "eval time": 13.408120855005109, "tokens / sec": 5869.490911551474, "mem allocated avg": 6843078866.944, "mem reserved avg": 11855409971.2, "elapsed time": 475.2031378399988},
      {"step": 1500, "valid accuracy": 0.18, "train loss": 0.8379741818904877, "train samples": 6000, "train time": 35.84657208897261, "eval time": 13.451748254003178, "tokens / sec": 5839.637873335062, "mem allocated avg": 6844234328.064, "mem reserved avg": 11880013758.464, "elapsed time": 568.970056428996},
      {"step": 1750, "valid accuracy": 0.2, "train loss": 0.8320568509101868, "train samples": 7000, "train time": 36.04748217701126, "eval time": 13.354637482996623, "tokens / sec": 5807.756529900249, "mem allocated avg": 6845049858.048, "mem reserved avg": 11894333112.32, "elapsed time": 663.2131869919976},
      {"step": 2000, "valid accuracy": 0.2, "train loss": 0.83651398563385, "train samples": 8000, "train time": 35.70882848704787, "eval time": 13.407459709997056, "tokens / sec": 5816.376756110452, "mem allocated avg": 6842067818.496, "mem reserved avg": 11843724640.256, "elapsed time": 756.9679808469955},
      {"step": 2250, "valid accuracy": 0.18, "train loss": 0.8321560187339783, "train samples": 9000, "train time": 36.077689886013104, "eval time": 13.313609958000598, "tokens / sec": 5957.92027369615, "mem allocated avg": 6853360060.416, "mem reserved avg": 12025841319.936, "elapsed time": 851.5264306229947},
      {"step": 2500, "valid accuracy": 0.22, "train loss": 0.830465945482254, "train samples": 10000, "train time": 35.51607862501987, "eval time": 13.570960901000944, "tokens / sec": 5799.260728488849, "mem allocated avg": 6838232895.488, "mem reserved avg": 11785499312.128, "elapsed time": 945.1205676109967},
      {"step": 2750, "valid accuracy": 0.2, "train loss": 0.8323929319381714, "train samples": 11000, "train time": 36.33290277811466, "eval time": 13.340032396001334, "tokens / sec": 5831.6562619276265, "mem allocated avg": 6849506107.392, "mem reserved avg": 11957667102.72, "elapsed time": 1039.698461469001},
      {"step": 3000, "valid accuracy": 0.22, "train loss": 0.8273163681030273, "train samples": 12000, "train time": 36.133581758025684, "eval time": 13.486512909999874, "tokens / sec": 5776.648476140576, "mem allocated avg": 6844330549.248, "mem reserved avg": 11874754101.248, "elapsed time": 1134.0729920019949},
      {"step": 3250, "valid accuracy": 0.18, "train loss": 0.8321007430553437, "train samples": 13000, "train time": 35.81564853595046, "eval time": 13.383609317002993, "tokens / sec": 5888.515456820645, "mem allocated avg": 6845503963.136, "mem reserved avg": 11903065653.248, "elapsed time": 1228.1345331240009},
      {"step": 3500, "valid accuracy": 0.18, "train loss": 0.8267617487907409, "train samples": 14000, "train time": 35.759473790014454, "eval time": 13.568141147006827, "tokens / sec": 5865.578482269809, "mem allocated avg": 6844375582.72, "mem reserved avg": 11893385199.616, "elapsed time": 1322.3741278140005},
      {"step": 3750, "valid accuracy": 0.18, "train loss": 0.822540352344513, "train samples": 15000, "train time": 36.6447854490616, "eval time": 13.383382205000089, "tokens / sec": 5913.610827418539, "mem allocated avg": 6855454945.28, "mem reserved avg": 12064244367.36, "elapsed time": 1417.8726171529997},
      {"step": 4000, "valid accuracy": 0.22, "train loss": 0.842738341331482, "train samples": 16000, "train time": 35.83419257100468, "eval time": 13.484180120998644, "tokens / sec": 5703.295800373884, "mem allocated avg": 6837201041.408, "mem reserved avg": 11769015697.408, "elapsed time": 1511.8286734409994},
      {"step": 4250, "valid accuracy": 0.24, "train loss": 0.8195172207355499, "train samples": 17000, "train time": 36.032976000991766, "eval time": 13.43221827600064, "tokens / sec": 5866.542913196561, "mem allocated avg": 6847173238.784, "mem reserved avg": 11924070727.68, "elapsed time": 1606.2413196950001},
      {"step": 4500, "valid accuracy": 0.22, "train loss": 0.8333091423511505, "train samples": 18000, "train time": 35.92476197002543, "eval time": 13.364069708994066, "tokens / sec": 5784.812163081199, "mem allocated avg": 6842308513.792, "mem reserved avg": 11840637632.512, "elapsed time": 1700.1633438569988},
      {"step": 4750, "valid accuracy": 0.24, "train loss": 0.8247289218902588, "train samples": 19000, "train time": 36.319470202004595, "eval time": 13.367499373998726, "tokens / sec": 5780.343128144329, "mem allocated avg": 6845010323.456, "mem reserved avg": 11893443919.872, "elapsed time": 1795.0117048679967},
      {"step": 5000, "valid accuracy": 0.24, "train loss": 0.8317011270523071, "train samples": 20000, "train time": 35.778475134953624, "eval time": 13.382634160996531, "tokens / sec": 5821.377216731123, "mem allocated avg": 6841479706.624, "mem reserved avg": 11840956399.616, "elapsed time": 1888.9356832179983},
      {"step": 5000, "test accuracy": 0.22062168309325247, "train loss": 0.8317011270523071, "train samples": 20000, "train total tokens": 4198051}
    ]
  },
  "meta_info": {
    "model_info": {"sha": "13afe5124825b4f3751f836b40dafda64c1ed062", "created_at": "2024-09-18T15:23:48+00:00"},
    "dataset_info": {
      "metamath": {"sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", "created_at": "2023-09-21T17:22:46+00:00"},
      "gsm8k": {"sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", "created_at": "2022-04-12T10:22:10+00:00"}
    },
    "package_info": {"transformers-version": "4.52.4", "transformers-commit-hash": null, "peft-version": "0.15.2.dev0", "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", "datasets-version": "3.6.0", "datasets-commit-hash": null, "bitsandbytes-version": "0.46.0", "bitsandbytes-commit-hash": null, "torch-version": "2.7.1+cu126", "torch-commit-hash": null},
    "system_info": {"system": "Linux", "release": "6.8.0-1029-aws", "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", "machine": "x86_64", "processor": "x86_64", "gpu": "NVIDIA L40S"},
"pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
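Adaption prompt takes a different route from AdaLoRA: the base weights stay frozen and each of the `adapter_layers = 28` attention blocks learns `adapter_len = 100` prompt tokens plus a zero-initialized gate, which is consistent with the much smaller `num_trainable_params` (~8.6M) and `file_size` logged above. A minimal sketch, assuming peft's `AdaptionPromptConfig` API at the versions recorded in this file:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import AdaptionPromptConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B", torch_dtype=torch.bfloat16
)

config = AdaptionPromptConfig(
    task_type="CAUSAL_LM",
    adapter_len=100,    # learned prompt tokens prepended inside self-attention
    adapter_layers=28,  # applied to every decoder layer of the 3B model
)
model = get_peft_model(base, config)
model.print_trainable_parameters()  # roughly 8.6M of ~3.2B parameters, matching the log
```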
MetaMathQA/results/boft--llama-3.2-3B-default.json
ADDED
@@ -0,0 +1,354 @@
{
  "run_info": {
    "created_at": "2025-06-20T00:26:06+00:00",
    "total_time": 11113.556226242006,
    "experiment_name": "boft/llama-3.2-3B-default",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "BOFT",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "boft_block_size": 4,
      "boft_block_num": 0,
      "boft_n_butterfly_factor": 1,
      "target_modules": [
        "q_proj",
        "v_proj"
      ],
      "exclude_modules": null,
      "boft_dropout": 0.0,
      "fan_in_fan_out": false,
      "bias": "none",
      "modules_to_save": null,
      "init_weights": true,
      "layers_to_transform": null,
      "layers_pattern": null
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 14814855089,
    "cuda_memory_max": 24427626496,
    "cuda_memory_reserved_99th": 20103445872,
    "train_time": 8291.859631775995,
    "file_size": 3225360,
    "num_trainable_params": 802816,
    "num_total_params": 3213552640,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0,
        "train loss": 1.291453486919403,
        "train samples": 1000,
        "train time": 168.6401632970519,
        "eval time": 140.71104099299555,
        "tokens / sec": 1255.4482625059293,
        "mem allocated avg": 6794374191.104,
        "mem reserved avg": 14862272954.368,
        "elapsed time": 378.35506656600046
      },
      {
        "step": 500,
        "valid accuracy": 0.12,
        "train loss": 1.0658165102005004,
        "train samples": 2000,
        "train time": 168.0782826189752,
        "eval time": 140.55351014900225,
        "tokens / sec": 1237.4888460248842,
        "mem allocated avg": 6786098696.192,
        "mem reserved avg": 14759126630.4,
        "elapsed time": 750.4153373740046
      },
      {
        "step": 750,
        "valid accuracy": 0.38,
        "train loss": 0.8760707340240479,
        "train samples": 3000,
        "train time": 168.35559053501493,
        "eval time": 140.5371915020005,
        "tokens / sec": 1273.5009233649919,
        "mem allocated avg": 6796379451.392,
        "mem reserved avg": 14898109087.744,
        "elapsed time": 1123.1088362480004
      },
      {
        "step": 1000,
        "valid accuracy": 0.42,
        "train loss": 0.8187176239490509,
        "train samples": 4000,
        "train time": 168.23626853094902,
        "eval time": 140.51234973900137,
        "tokens / sec": 1238.3536666570453,
        "mem allocated avg": 6788017170.432,
        "mem reserved avg": 14785978564.608,
        "elapsed time": 1495.2035204040003
      },
      {
        "step": 1250,
        "valid accuracy": 0.44,
        "train loss": 0.7968595073223114,
        "train samples": 5000,
        "train time": 168.06973706404096,
        "eval time": 140.56398986800195,
        "tokens / sec": 1240.7825682534333,
        "mem allocated avg": 6786994073.6,
        "mem reserved avg": 14784728662.016,
        "elapsed time": 1867.293767313
      },
      {
        "step": 1500,
        "valid accuracy": 0.3,
        "train loss": 0.7768308148384094,
        "train samples": 6000,
        "train time": 168.12391281103191,
        "eval time": 140.47015122300218,
        "tokens / sec": 1245.0995013141533,
        "mem allocated avg": 6790023022.592,
        "mem reserved avg": 14800616685.568,
        "elapsed time": 2239.2391544300044
      },
      {
        "step": 1750,
        "valid accuracy": 0.34,
        "train loss": 0.7639130955934524,
        "train samples": 7000,
        "train time": 168.4569528100401,
        "eval time": 140.76006173399946,
        "tokens / sec": 1242.780404772479,
        "mem allocated avg": 6790166409.216,
        "mem reserved avg": 14820103421.952,
        "elapsed time": 2611.854956449002
      },
      {
        "step": 2000,
        "valid accuracy": 0.28,
        "train loss": 0.7575103138685226,
        "train samples": 8000,
        "train time": 168.38565446306166,
        "eval time": 140.82750502999988,
        "tokens / sec": 1233.4542432506432,
        "mem allocated avg": 6787659706.368,
        "mem reserved avg": 14766038843.392,
        "elapsed time": 2984.338527646003
      },
      {
        "step": 2250,
        "valid accuracy": 0.36,
        "train loss": 0.7480558000802994,
        "train samples": 9000,
        "train time": 168.98983921804756,
        "eval time": 140.92262020800263,
        "tokens / sec": 1271.9581307054364,
        "mem allocated avg": 6798715979.776,
        "mem reserved avg": 14937929809.92,
        "elapsed time": 3357.8442202950027
      },
      {
        "step": 2500,
        "valid accuracy": 0.36,
        "train loss": 0.7452825582027436,
        "train samples": 10000,
        "train time": 168.30827127001976,
        "eval time": 140.89225408899802,
        "tokens / sec": 1223.7485326527044,
        "mem allocated avg": 6783722676.224,
        "mem reserved avg": 14710111993.856,
        "elapsed time": 3730.0927005050034
      },
      {
        "step": 2750,
        "valid accuracy": 0.4,
        "train loss": 0.7368131847381592,
        "train samples": 11000,
        "train time": 168.8352410539519,
        "eval time": 140.97951381299936,
        "tokens / sec": 1254.9571918595636,
        "mem allocated avg": 6794155292.672,
        "mem reserved avg": 14876869132.288,
        "elapsed time": 4103.762088249001
      },
      {
        "step": 3000,
        "valid accuracy": 0.38,
        "train loss": 0.7284122853279114,
        "train samples": 12000,
        "train time": 168.7332625999261,
        "eval time": 140.92822863799665,
        "tokens / sec": 1237.0471404616308,
        "mem allocated avg": 6789107718.144,
        "mem reserved avg": 14802571231.232,
        "elapsed time": 4477.013831755001
      },
      {
        "step": 3250,
        "valid accuracy": 0.34,
        "train loss": 0.7360657904148101,
        "train samples": 13000,
        "train time": 168.6564349730761,
        "eval time": 140.91345744199498,
        "tokens / sec": 1250.4770424779092,
        "mem allocated avg": 6791307786.24,
        "mem reserved avg": 14825665069.056,
        "elapsed time": 4850.336532419002
      },
      {
        "step": 3500,
        "valid accuracy": 0.34,
        "train loss": 0.7245372575521469,
        "train samples": 14000,
        "train time": 168.69712368501496,
        "eval time": 141.10813598799723,
        "tokens / sec": 1243.3525564528145,
        "mem allocated avg": 6789542191.104,
        "mem reserved avg": 14803175211.008,
        "elapsed time": 5223.900597244006
      },
      {
        "step": 3750,
        "valid accuracy": 0.36,
        "train loss": 0.7196882257461548,
        "train samples": 15000,
        "train time": 169.02741387199057,
        "eval time": 140.85168583100312,
        "tokens / sec": 1282.0583066135978,
        "mem allocated avg": 6800711397.376,
        "mem reserved avg": 14974772576.256,
        "elapsed time": 5597.923287113001
      },
      {
        "step": 4000,
        "valid accuracy": 0.4,
        "train loss": 0.7386573747396469,
        "train samples": 16000,
        "train time": 168.47688378201565,
        "eval time": 141.17620621900278,
        "tokens / sec": 1213.062560347618,
        "mem allocated avg": 6781920968.704,
        "mem reserved avg": 14703241723.904,
        "elapsed time": 5970.573302798002
      },
      {
        "step": 4250,
        "valid accuracy": 0.36,
        "train loss": 0.7167660998106002,
        "train samples": 17000,
        "train time": 168.66243355697225,
        "eval time": 141.03309625500697,
        "tokens / sec": 1253.3259217358275,
        "mem allocated avg": 6792739334.144,
        "mem reserved avg": 14838457696.256,
        "elapsed time": 6343.574297415005
      },
      {
        "step": 4500,
        "valid accuracy": 0.36,
        "train loss": 0.7278824989795685,
        "train samples": 18000,
        "train time": 168.825120675996,
        "eval time": 141.10180295899772,
        "tokens / sec": 1230.966097745832,
        "mem allocated avg": 6787403542.528,
        "mem reserved avg": 14768026943.488,
        "elapsed time": 6716.868663600006
      },
      {
        "step": 4750,
        "valid accuracy": 0.34,
        "train loss": 0.7206774606704712,
        "train samples": 19000,
        "train time": 168.64492384497134,
        "eval time": 140.88104952100548,
        "tokens / sec": 1244.8581031290848,
        "mem allocated avg": 6790186668.032,
        "mem reserved avg": 14817972715.52,
        "elapsed time": 7090.485984892002
      },
      {
        "step": 5000,
        "valid accuracy": 0.34,
        "train loss": 0.7268091850280761,
        "train samples": 20000,
        "train time": 168.56219975605927,
        "eval time": 140.98389447200316,
        "tokens / sec": 1235.6269691628356,
        "mem allocated avg": 6787183779.84,
        "mem reserved avg": 14761332834.304,
        "elapsed time": 7463.428281595006
      },
      {
        "step": 5000,
        "test accuracy": 0.3646702047005307,
        "train loss": 0.7268091850280761,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
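For reference, the `peft_config` block above maps directly onto peft's `BOFTConfig`. A minimal sketch with the logged values (all other fields left at their library defaults; this is a reconstruction from the result file, not the benchmark's training script):

```python
# Rebuild the BOFT adapter config recorded in the result above.
from peft import BOFTConfig

boft_config = BOFTConfig(
    boft_block_size=4,            # "boft_block_size": 4 ("boft_block_num" stays 0)
    boft_n_butterfly_factor=1,    # single butterfly factor
    target_modules=["q_proj", "v_proj"],
    boft_dropout=0.0,
    bias="none",
    init_weights=True,            # start from identity rotations
)
```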
MetaMathQA/results/bone--llama-3.2-3B-bat.json
ADDED
@@ -0,0 +1,350 @@
{
  "run_info": {
    "created_at": "2025-06-20T03:31:24+00:00",
    "total_time": 2742.3845372959986,
    "experiment_name": "bone/llama-3.2-3B-bat",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "BONE",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "r": 64,
      "target_modules": [
        "v_proj",
        "q_proj"
      ],
      "exclude_modules": null,
      "init_weights": "bat",
      "layers_to_transform": null,
      "layers_pattern": null,
      "bias": "none",
      "modules_to_save": null
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 14713983755,
    "cuda_memory_max": 25251807232,
    "cuda_memory_reserved_99th": 20472733368,
    "train_time": 2430.7548372539895,
    "file_size": 29367552,
    "num_trainable_params": 7340032,
    "num_total_params": 3220089856,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.34,
        "train loss": 0.8741071329116822,
        "train samples": 1000,
        "train time": 44.769113782072964,
        "eval time": 16.53786130100343,
        "tokens / sec": 4729.130914464948,
        "mem allocated avg": 6898425409.536,
        "mem reserved avg": 14773294989.312,
        "elapsed time": 124.73039968500234
      },
      {
        "step": 500,
        "valid accuracy": 0.42,
        "train loss": 0.6946564470529556,
        "train samples": 2000,
        "train time": 43.747789238033874,
        "eval time": 16.4541177170031,
        "tokens / sec": 4754.4116770858745,
        "mem allocated avg": 6890118709.248,
        "mem reserved avg": 14662749913.088,
        "elapsed time": 242.48505929599924
      },
      {
        "step": 750,
        "valid accuracy": 0.42,
        "train loss": 0.6668610339164733,
        "train samples": 3000,
        "train time": 44.788394879076805,
        "eval time": 8.99262467600056,
        "tokens / sec": 4786.9766393472355,
        "mem allocated avg": 6900886024.192,
        "mem reserved avg": 14820195696.64,
        "elapsed time": 354.3122298879971
      },
      {
        "step": 1000,
        "valid accuracy": 0.42,
        "train loss": 0.6476555281877517,
        "train samples": 4000,
        "train time": 43.08444309095648,
        "eval time": 14.581032188005338,
        "tokens / sec": 4835.527282090601,
        "mem allocated avg": 6892210176.0,
        "mem reserved avg": 14677799075.84,
        "elapsed time": 469.41999823199876
      },
      {
        "step": 1250,
        "valid accuracy": 0.38,
        "train loss": 0.6442477897405624,
        "train samples": 5000,
        "train time": 43.81069704208494,
        "eval time": 16.504536090003967,
        "tokens / sec": 4759.979048031958,
        "mem allocated avg": 6892437598.208,
        "mem reserved avg": 14675995525.12,
        "elapsed time": 587.4669312400001
      },
      {
        "step": 1500,
        "valid accuracy": 0.48,
        "train loss": 0.6370412122011184,
        "train samples": 6000,
        "train time": 44.041188616007275,
        "eval time": 11.50742915799492,
        "tokens / sec": 4753.07335197389,
        "mem allocated avg": 6893869041.664,
        "mem reserved avg": 14704349020.16,
        "elapsed time": 700.887209352004
      },
      {
        "step": 1750,
        "valid accuracy": 0.44,
        "train loss": 0.6277673766613007,
        "train samples": 7000,
        "train time": 44.32280573899334,
        "eval time": 16.494074002999696,
        "tokens / sec": 4723.414876595195,
        "mem allocated avg": 6895170344.96,
        "mem reserved avg": 14718215389.184,
        "elapsed time": 819.4313268580008
      },
      {
        "step": 2000,
        "valid accuracy": 0.48,
        "train loss": 0.6278820457458496,
        "train samples": 8000,
        "train time": 43.325528461049544,
        "eval time": 16.452074027998606,
        "tokens / sec": 4793.848047040501,
        "mem allocated avg": 6891568050.176,
        "mem reserved avg": 14656710115.328,
        "elapsed time": 936.9070930559974
      },
      {
        "step": 2250,
        "valid accuracy": 0.44,
        "train loss": 0.6160005252361298,
        "train samples": 9000,
        "train time": 45.04456213898811,
        "eval time": 16.52133422600309,
        "tokens / sec": 4771.896757188206,
        "mem allocated avg": 6903412344.832,
        "mem reserved avg": 14851812360.192,
        "elapsed time": 1056.8185863660037
      },
      {
        "step": 2500,
        "valid accuracy": 0.5,
        "train loss": 0.6121727240085602,
        "train samples": 10000,
        "train time": 43.16439942702709,
        "eval time": 16.356938169003115,
        "tokens / sec": 4771.686916395162,
        "mem allocated avg": 6888002562.048,
        "mem reserved avg": 14598350569.472,
        "elapsed time": 1173.7929829869972
      },
      {
        "step": 2750,
        "valid accuracy": 0.52,
        "train loss": 0.6007345867156982,
        "train samples": 11000,
        "train time": 44.3066304440581,
        "eval time": 16.514935120998416,
        "tokens / sec": 4782.151065798665,
        "mem allocated avg": 6899352545.28,
        "mem reserved avg": 14785458470.912,
        "elapsed time": 1292.7444534430033
      },
      {
        "step": 3000,
        "valid accuracy": 0.52,
        "train loss": 0.5899704934358597,
        "train samples": 12000,
        "train time": 44.07467572299356,
        "eval time": 16.412788394998643,
        "tokens / sec": 4735.848796979486,
        "mem allocated avg": 6894036676.608,
        "mem reserved avg": 14687865405.44,
        "elapsed time": 1411.115336062001
      },
      {
        "step": 3250,
        "valid accuracy": 0.48,
        "train loss": 0.5988378477096558,
        "train samples": 13000,
        "train time": 44.070030323957326,
        "eval time": 10.250203846997465,
        "tokens / sec": 4785.587812163363,
        "mem allocated avg": 6895260303.36,
        "mem reserved avg": 14725043716.096,
        "elapsed time": 1523.332073521
      },
      {
        "step": 3500,
        "valid accuracy": 0.5,
        "train loss": 0.5801258901357651,
        "train samples": 14000,
        "train time": 43.991991777089424,
        "eval time": 16.38271237299341,
        "tokens / sec": 4767.913238909897,
        "mem allocated avg": 6893688922.112,
        "mem reserved avg": 14703484993.536,
        "elapsed time": 1641.7187374700006
      },
      {
        "step": 3750,
        "valid accuracy": 0.5,
        "train loss": 0.5768071869611741,
        "train samples": 15000,
        "train time": 45.04501243098639,
        "eval time": 16.454509290000715,
        "tokens / sec": 4810.810083180938,
        "mem allocated avg": 6905122422.784,
        "mem reserved avg": 14891314315.264,
        "elapsed time": 1761.645320085001
      },
      {
        "step": 4000,
        "valid accuracy": 0.52,
        "train loss": 0.5858320169448853,
        "train samples": 16000,
        "train time": 42.547905418032315,
        "eval time": 16.350580427999375,
        "tokens / sec": 4803.36218650576,
        "mem allocated avg": 6886491265.024,
        "mem reserved avg": 14582730981.376,
        "elapsed time": 1878.0724109930015
      },
      {
        "step": 4250,
        "valid accuracy": 0.54,
        "train loss": 0.5723247408866883,
        "train samples": 17000,
        "train time": 44.19116178697732,
        "eval time": 16.508775556001638,
        "tokens / sec": 4783.513070305705,
        "mem allocated avg": 6897152284.672,
        "mem reserved avg": 14738381602.816,
        "elapsed time": 1996.8971549050038
      },
      {
        "step": 4500,
        "valid accuracy": 0.48,
        "train loss": 0.5789256048202515,
        "train samples": 18000,
        "train time": 43.87211918797402,
        "eval time": 16.414912490006827,
        "tokens / sec": 4736.903615473535,
        "mem allocated avg": 6893093124.096,
        "mem reserved avg": 14658832433.152,
        "elapsed time": 2114.9650602839974
      },
      {
        "step": 4750,
        "valid accuracy": 0.48,
        "train loss": 0.568240401506424,
        "train samples": 19000,
        "train time": 43.939464293958736,
        "eval time": 16.460097985000175,
        "tokens / sec": 4777.914418698651,
        "mem allocated avg": 6894218592.256,
        "mem reserved avg": 14710372040.704,
        "elapsed time": 2233.517725938
      },
      {
        "step": 5000,
        "valid accuracy": 0.5,
        "train loss": 0.57634852206707,
        "train samples": 20000,
        "train time": 42.787552905057964,
        "eval time": 16.445046182001533,
        "tokens / sec": 4867.770785166333,
        "mem allocated avg": 6890906441.728,
        "mem reserved avg": 14656718503.936,
        "elapsed time": 2350.279711092
      },
      {
        "step": 5000,
        "test accuracy": 0.5170583775587566,
        "train loss": 0.57634852206707,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
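The only knob that distinguishes this run from the default Bone run below is `"init_weights": "bat"`, which selects the Block-Affine Transformation variant. A minimal sketch of the corresponding peft `BoneConfig` (values copied from the `peft_config` block above; remaining fields default):

```python
# Rebuild the Bone ("bat" variant) adapter config recorded above.
from peft import BoneConfig

bone_bat_config = BoneConfig(
    r=64,                                 # block size of the Bone update
    target_modules=["v_proj", "q_proj"],
    init_weights="bat",                   # "bat" variant instead of plain Bone (True)
)
```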
MetaMathQA/results/bone--llama-3.2-3B-default.json
ADDED
@@ -0,0 +1,350 @@
{
  "run_info": {
    "created_at": "2025-06-20T04:17:11+00:00",
    "total_time": 1867.121674144997,
    "experiment_name": "bone/llama-3.2-3B-default",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "BONE",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "r": 64,
      "target_modules": [
        "v_proj",
        "q_proj"
      ],
      "exclude_modules": null,
      "init_weights": true,
      "layers_to_transform": null,
      "layers_pattern": null,
      "bias": "none",
      "modules_to_save": null
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 11170837063,
    "cuda_memory_max": 20248002560,
    "cuda_memory_reserved_99th": 16303469363,
    "train_time": 1664.0814183089897,
    "file_size": 29367496,
    "num_trainable_params": 7340032,
    "num_total_params": 3220089856,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.34,
        "train loss": 0.8771067566871643,
        "train samples": 1000,
        "train time": 29.468342912026856,
        "eval time": 11.086663477995899,
        "tokens / sec": 7184.625230948821,
        "mem allocated avg": 6894354876.416,
        "mem reserved avg": 11212691603.456,
        "elapsed time": 88.56553585999791
      },
      {
        "step": 500,
        "valid accuracy": 0.38,
        "train loss": 0.6947847135066986,
        "train samples": 2000,
        "train time": 29.13603712292388,
        "eval time": 11.12908834600239,
        "tokens / sec": 7138.753946615206,
        "mem allocated avg": 6887297284.096,
        "mem reserved avg": 11116172279.808,
        "elapsed time": 169.94219922799675
      },
      {
        "step": 750,
        "valid accuracy": 0.42,
        "train loss": 0.6673308206796646,
        "train samples": 3000,
        "train time": 29.74789179801155,
        "eval time": 6.2111000180011615,
        "tokens / sec": 7207.267037805055,
        "mem allocated avg": 6897885888.512,
        "mem reserved avg": 11257109282.816,
        "elapsed time": 247.40845895299572
      },
      {
        "step": 1000,
        "valid accuracy": 0.44,
        "train loss": 0.6480507221221924,
        "train samples": 4000,
        "train time": 29.01437903306214,
        "eval time": 11.063560270995367,
        "tokens / sec": 7180.439731713689,
        "mem allocated avg": 6888501639.168,
        "mem reserved avg": 11141564596.224,
        "elapsed time": 328.43337820599845
      },
      {
        "step": 1250,
        "valid accuracy": 0.42,
        "train loss": 0.6442041766643524,
        "train samples": 5000,
        "train time": 28.86099356606428,
        "eval time": 11.061821620001865,
        "tokens / sec": 7225.600169399779,
        "mem allocated avg": 6888334700.544,
        "mem reserved avg": 11139123511.296,
        "elapsed time": 409.5306018880001
      },
      {
        "step": 1500,
        "valid accuracy": 0.52,
        "train loss": 0.6375475705862045,
        "train samples": 6000,
        "train time": 29.36598393299937,
        "eval time": 6.896059851998871,
        "tokens / sec": 7128.349606047729,
        "mem allocated avg": 6890338080.768,
        "mem reserved avg": 11164893315.072,
        "elapsed time": 487.1438905899995
      },
      {
        "step": 1750,
        "valid accuracy": 0.42,
        "train loss": 0.6282199568748474,
        "train samples": 7000,
        "train time": 29.2208460940019,
        "eval time": 11.139122824002698,
        "tokens / sec": 7164.576936838726,
        "mem allocated avg": 6891485964.288,
        "mem reserved avg": 11174582157.312,
        "elapsed time": 568.6407176649955
      },
      {
        "step": 2000,
        "valid accuracy": 0.44,
        "train loss": 0.628275181055069,
        "train samples": 8000,
        "train time": 28.774674860083906,
        "eval time": 11.096917715003656,
        "tokens / sec": 7218.013791986054,
        "mem allocated avg": 6889055956.992,
        "mem reserved avg": 11126481879.04,
        "elapsed time": 649.4662010969987
      },
      {
        "step": 2250,
        "valid accuracy": 0.5,
        "train loss": 0.6164452042579651,
        "train samples": 9000,
        "train time": 29.666104338008154,
        "eval time": 6.740810982002586,
        "tokens / sec": 7245.575541396888,
        "mem allocated avg": 6899385456.64,
        "mem reserved avg": 11287358603.264,
        "elapsed time": 727.5584506419982
      },
      {
        "step": 2500,
        "valid accuracy": 0.52,
        "train loss": 0.6124898854494095,
        "train samples": 10000,
        "train time": 28.952800227045373,
        "eval time": 11.054138113999215,
        "tokens / sec": 7113.888756349109,
        "mem allocated avg": 6884753041.408,
        "mem reserved avg": 11077492408.32,
        "elapsed time": 808.6757636719994
      },
      {
        "step": 2750,
        "valid accuracy": 0.48,
        "train loss": 0.6010023313760757,
        "train samples": 11000,
        "train time": 29.36040201097785,
        "eval time": 5.933361176998005,
        "tokens / sec": 7216.556500853691,
        "mem allocated avg": 6895703631.872,
        "mem reserved avg": 11229007446.016,
        "elapsed time": 885.2688505609985
      },
      {
        "step": 3000,
        "valid accuracy": 0.36,
        "train loss": 0.590470621585846,
        "train samples": 12000,
        "train time": 29.152743853985157,
        "eval time": 11.051910919995862,
        "tokens / sec": 7159.909236861306,
        "mem allocated avg": 6890226739.2,
        "mem reserved avg": 11156563427.328,
        "elapsed time": 966.2876440099935
      },
      {
        "step": 3250,
        "valid accuracy": 0.46,
        "train loss": 0.5996054347753524,
        "train samples": 13000,
        "train time": 29.23224936202314,
        "eval time": 11.06002619300125,
        "tokens / sec": 7214.668888053154,
        "mem allocated avg": 6892138940.416,
        "mem reserved avg": 11182651998.208,
        "elapsed time": 1047.7634995759945
      },
      {
        "step": 3500,
        "valid accuracy": 0.46,
        "train loss": 0.5810788285732269,
        "train samples": 14000,
        "train time": 29.556202010979177,
        "eval time": 7.767598452002858,
        "tokens / sec": 7096.649289448104,
        "mem allocated avg": 6891370110.976,
        "mem reserved avg": 11166763974.656,
        "elapsed time": 1126.3068484049945
      },
      {
        "step": 3750,
        "valid accuracy": 0.5,
        "train loss": 0.5778432558774949,
        "train samples": 15000,
        "train time": 30.077826159038523,
        "eval time": 11.010653469995304,
        "tokens / sec": 7204.742751493022,
        "mem allocated avg": 6901065279.488,
        "mem reserved avg": 11319788961.792,
        "elapsed time": 1209.0550349339974
      },
      {
        "step": 4000,
        "valid accuracy": 0.4,
        "train loss": 0.5869229323863984,
        "train samples": 16000,
        "train time": 29.213863794990175,
        "eval time": 11.144038623999222,
        "tokens / sec": 6995.753845988955,
        "mem allocated avg": 6883645001.728,
        "mem reserved avg": 11058953584.64,
        "elapsed time": 1290.3985370609953
      },
      {
        "step": 4250,
        "valid accuracy": 0.46,
        "train loss": 0.5733816763162612,
        "train samples": 17000,
        "train time": 29.18649683901458,
        "eval time": 11.153094029003114,
        "tokens / sec": 7242.698607029438,
        "mem allocated avg": 6893432758.272,
        "mem reserved avg": 11193884344.32,
        "elapsed time": 1372.1237251569983
      },
      {
        "step": 4500,
        "valid accuracy": 0.48,
        "train loss": 0.5803762240409851,
        "train samples": 18000,
        "train time": 29.077459994943638,
        "eval time": 11.118935573998897,
        "tokens / sec": 7147.047920834147,
        "mem allocated avg": 6888416004.096,
        "mem reserved avg": 11124485390.336,
        "elapsed time": 1453.4214935309938
      },
      {
        "step": 4750,
        "valid accuracy": 0.48,
        "train loss": 0.5692038584947586,
        "train samples": 19000,
        "train time": 29.40723867896304,
        "eval time": 11.099454375005735,
        "tokens / sec": 7139.024588193769,
        "mem allocated avg": 6890813089.792,
        "mem reserved avg": 11168844349.44,
        "elapsed time": 1535.6791463129994
      },
      {
        "step": 5000,
        "valid accuracy": 0.48,
        "train loss": 0.5775641392469406,
        "train samples": 20000,
        "train time": 28.941933833950316,
        "eval time": 11.18307958800142,
        "tokens / sec": 7196.47834159849,
        "mem allocated avg": 6887869800.448,
        "mem reserved avg": 11118328152.064,
        "elapsed time": 1617.277517963994
      },
      {
        "step": 5000,
        "test accuracy": 0.5079605761940864,
        "train loss": 0.5775641392469406,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
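Read side by side, the two Bone runs above show the "bat" variant reaching slightly higher test accuracy (0.517 vs. 0.508) at roughly 1.5x the training time (2431 s vs. 1664 s) and noticeably higher reserved GPU memory. A minimal sketch of pulling those numbers out of the result files (paths assumed relative to the repo root):

```python
# Compare the two Bone runs by reading their result JSONs.
import json

for name in ("bone--llama-3.2-3B-bat", "bone--llama-3.2-3B-default"):
    with open(f"MetaMathQA/results/{name}.json") as f:
        result = json.load(f)
    train_info = result["train_info"]
    final = train_info["metrics"][-1]      # last entry holds the test metrics
    print(
        name,
        f"test acc={final['test accuracy']:.4f}",
        f"train time={train_info['train_time']:.0f}s",
        f"mem max={train_info['cuda_memory_max'] / 2**30:.1f} GiB",
    )
```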
MetaMathQA/results/fourierft--llama-3.2-3B-default.json
ADDED
@@ -0,0 +1,354 @@
{
  "run_info": {
    "created_at": "2025-06-20T10:18:57+00:00",
    "total_time": 2823.832106703994,
    "experiment_name": "fourierft/llama-3.2-3B-default",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "FOURIERFT",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "n_frequency": 1000,
      "scaling": 300,
      "random_loc_seed": 777,
      "fan_in_fan_out": false,
      "target_modules": [
        "q_proj",
        "v_proj"
      ],
      "exclude_modules": null,
      "bias": "none",
      "modules_to_save": null,
      "layers_to_transform": null,
      "layers_pattern": null,
      "n_frequency_pattern": {},
      "init_weights": false
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 13104129350,
    "cuda_memory_max": 23653777408,
    "cuda_memory_reserved_99th": 19017267937,
    "train_time": 2424.3862988609762,
    "file_size": 231416,
    "num_trainable_params": 56000,
    "num_total_params": 3212805824,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0,
        "train loss": 1.3263031902313231,
        "train samples": 1000,
        "train time": 53.55340486107161,
        "eval time": 19.578013352002017,
        "tokens / sec": 3953.4180982374883,
        "mem allocated avg": 6781303625.728,
        "mem reserved avg": 13152850804.736,
        "elapsed time": 119.84825310099404
      },
      {
        "step": 500,
        "valid accuracy": 0.0,
        "train loss": 1.3399862418174744,
        "train samples": 2000,
        "train time": 52.85717789203045,
        "eval time": 19.544192551999004,
        "tokens / sec": 3935.03793231005,
        "mem allocated avg": 6774035257.344,
        "mem reserved avg": 13043463356.416,
        "elapsed time": 233.5829256769939
      },
      {
        "step": 750,
        "valid accuracy": 0.0,
        "train loss": 1.3045952091217041,
        "train samples": 3000,
        "train time": 53.35706212905643,
        "eval time": 19.607110917990212,
        "tokens / sec": 4018.2309790861696,
        "mem allocated avg": 6783920330.752,
        "mem reserved avg": 13205673869.312,
        "elapsed time": 348.1469791559939
      },
      {
        "step": 1000,
        "valid accuracy": 0.0,
        "train loss": 1.3111453976631164,
        "train samples": 4000,
        "train time": 52.95546973698947,
        "eval time": 19.472347582006478,
        "tokens / sec": 3934.1733919976355,
        "mem allocated avg": 6776025266.176,
        "mem reserved avg": 13077269446.656,
        "elapsed time": 461.81266678999236
      },
      {
        "step": 1250,
        "valid accuracy": 0.0,
        "train loss": 1.299716483592987,
        "train samples": 5000,
        "train time": 52.12036712520057,
        "eval time": 19.626158429004136,
        "tokens / sec": 4001.0846335572023,
        "mem allocated avg": 6775331573.76,
        "mem reserved avg": 13063344357.376,
        "elapsed time": 574.6407375999988
      },
      {
        "step": 1500,
        "valid accuracy": 0.0,
        "train loss": 1.2867344057559966,
        "train samples": 6000,
        "train time": 52.594848359090975,
        "eval time": 19.54386943600548,
        "tokens / sec": 3980.0666135738998,
        "mem allocated avg": 6776458844.16,
        "mem reserved avg": 13093568512.0,
        "elapsed time": 688.0431025519938
      },
      {
        "step": 1750,
        "valid accuracy": 0.0,
        "train loss": 1.2803141210079194,
        "train samples": 7000,
        "train time": 52.98738884186605,
        "eval time": 19.568909612993593,
        "tokens / sec": 3951.0344739725274,
        "mem allocated avg": 6778496358.4,
        "mem reserved avg": 13108768669.696,
        "elapsed time": 801.9154772249894
      },
      {
        "step": 2000,
        "valid accuracy": 0.0,
        "train loss": 1.2766506419181824,
        "train samples": 8000,
        "train time": 52.03297274692159,
        "eval time": 19.525613270001486,
        "tokens / sec": 3991.62279292005,
        "mem allocated avg": 6774647097.344,
        "mem reserved avg": 13051189264.384,
        "elapsed time": 914.5343848449993
      },
      {
        "step": 2250,
        "valid accuracy": 0.0,
        "train loss": 1.2596003375053406,
        "train samples": 9000,
        "train time": 53.934016149127274,
        "eval time": 19.535415460006334,
        "tokens / sec": 3985.388356870549,
        "mem allocated avg": 6785830477.824,
        "mem reserved avg": 13237223424.0,
        "elapsed time": 1029.9007452719961
      },
      {
        "step": 2500,
        "valid accuracy": 0.0,
        "train loss": 1.2684449093341827,
        "train samples": 10000,
        "train time": 52.006629903029534,
        "eval time": 19.470633051998448,
        "tokens / sec": 3960.3989026791724,
        "mem allocated avg": 6771212331.008,
        "mem reserved avg": 12996118052.864,
        "elapsed time": 1142.5889472209965
      },
      {
        "step": 2750,
        "valid accuracy": 0.0,
        "train loss": 1.2548872971534728,
        "train samples": 11000,
        "train time": 53.403087337108445,
        "eval time": 19.463876378998975,
        "tokens / sec": 3967.579601952513,
        "mem allocated avg": 6781916252.16,
        "mem reserved avg": 13168084516.864,
        "elapsed time": 1257.0122518049902
      },
      {
        "step": 3000,
        "valid accuracy": 0.0,
        "train loss": 1.253697858095169,
        "train samples": 12000,
        "train time": 53.20096563108382,
        "eval time": 19.472515105997445,
        "tokens / sec": 3923.443823321214,
        "mem allocated avg": 6777045135.36,
        "mem reserved avg": 13084844359.68,
        "elapsed time": 1370.94780872899
      },
      {
        "step": 3250,
        "valid accuracy": 0.0,
        "train loss": 1.248513156414032,
        "train samples": 13000,
        "train time": 52.962746563891415,
        "eval time": 19.54665829600708,
        "tokens / sec": 3982.06312328573,
        "mem allocated avg": 6779038627.84,
        "mem reserved avg": 13110345728.0,
        "elapsed time": 1484.7621198889974
      },
      {
        "step": 3500,
        "valid accuracy": 0.0,
        "train loss": 1.2477959940433503,
        "train samples": 14000,
        "train time": 52.93443578510778,
        "eval time": 19.444701158994576,
        "tokens / sec": 3962.4489595298505,
        "mem allocated avg": 6776803573.76,
        "mem reserved avg": 13097142059.008,
        "elapsed time": 1598.8772237269877
      },
      {
        "step": 3750,
        "valid accuracy": 0.0,
        "train loss": 1.228544222354889,
        "train samples": 15000,
        "train time": 53.31031796212483,
        "eval time": 19.472959079008433,
        "tokens / sec": 4064.9354249577,
        "mem allocated avg": 6788200585.216,
        "mem reserved avg": 13268999471.104,
        "elapsed time": 1713.6814467679942
      },
      {
        "step": 4000,
        "valid accuracy": 0.0,
        "train loss": 1.2609001460075377,
        "train samples": 16000,
        "train time": 51.9827769130934,
        "eval time": 19.473652824002784,
        "tokens / sec": 3931.552182017475,
        "mem allocated avg": 6770180233.216,
        "mem reserved avg": 12983610638.336,
        "elapsed time": 1826.5604049959948
      },
      {
        "step": 4250,
        "valid accuracy": 0.0,
        "train loss": 1.227214762210846,
        "train samples": 17000,
        "train time": 53.09942602888623,
        "eval time": 19.547112297004787,
        "tokens / sec": 3981.0034836347163,
        "mem allocated avg": 6779591426.048,
        "mem reserved avg": 13132760088.576,
        "elapsed time": 1940.5098487799987
      },
      {
        "step": 4500,
        "valid accuracy": 0.0,
        "train loss": 1.2504195840358734,
        "train samples": 18000,
        "train time": 52.23909889203787,
        "eval time": 19.522137050997117,
        "tokens / sec": 3978.207978462565,
        "mem allocated avg": 6775933241.344,
        "mem reserved avg": 13056079822.848,
        "elapsed time": 2053.2267840139975
      },
      {
        "step": 4750,
        "valid accuracy": 0.0,
        "train loss": 1.2349513354301453,
        "train samples": 19000,
        "train time": 53.36620609794045,
        "eval time": 19.541859832999762,
        "tokens / sec": 3933.931514912433,
        "mem allocated avg": 6777532579.84,
        "mem reserved avg": 13101604798.464,
        "elapsed time": 2167.8329333979927
      },
      {
        "step": 5000,
        "valid accuracy": 0.0,
        "train loss": 1.2480293517112733,
        "train samples": 20000,
        "train time": 52.46977503092785,
        "eval time": 19.44991449599911,
        "tokens / sec": 3969.5234042309344,
        "mem allocated avg": 6773533165.568,
        "mem reserved avg": 13049645760.512,
        "elapsed time": 2281.220151823989
      },
      {
        "step": 5000,
        "test accuracy": 0.000758150113722517,
        "train loss": 1.2480293517112733,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json
ADDED
@@ -0,0 +1,354 @@
{
  "run_info": {
    "created_at": "2025-06-20T09:31:48+00:00",
    "total_time": 2824.376998209991,
    "experiment_name": "fourierft/llama-3.2-3B-n_frequency-5000",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "FOURIERFT",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "n_frequency": 5000,
      "scaling": 300,
      "random_loc_seed": 777,
      "fan_in_fan_out": false,
      "target_modules": [
        "v_proj",
        "q_proj"
      ],
      "exclude_modules": null,
      "bias": "none",
      "modules_to_save": null,
      "layers_to_transform": null,
      "layers_pattern": null,
      "n_frequency_pattern": {},
      "init_weights": false
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 13111221498,
    "cuda_memory_max": 23681040384,
    "cuda_memory_reserved_99th": 19054869872,
    "train_time": 2421.913372163006,
    "file_size": 1127472,
    "num_trainable_params": 280000,
    "num_total_params": 3213029824,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0,
        "train loss": 1.3800132541656494,
        "train samples": 1000,
        "train time": 53.57064967796032,
        "eval time": 19.631924207002157,
        "tokens / sec": 3952.1454616053315,
        "mem allocated avg": 6784830552.064,
        "mem reserved avg": 13158731218.944,
        "elapsed time": 119.20255395398999
      },
      {
        "step": 500,
        "valid accuracy": 0.0,
        "train loss": 1.3702282276153563,
        "train samples": 2000,
        "train time": 53.00863014489005,
        "eval time": 19.629790833001607,
        "tokens / sec": 3923.7950392508,
        "mem allocated avg": 6777176354.816,
        "mem reserved avg": 13048941117.44,
        "elapsed time": 232.4386439989903
      },
      {
        "step": 750,
        "valid accuracy": 0.0,
        "train loss": 1.3024170677661895,
        "train samples": 3000,
        "train time": 53.97298614999454,
        "eval time": 19.64192995200574,
        "tokens / sec": 3972.3760957780855,
        "mem allocated avg": 6787548153.856,
        "mem reserved avg": 13211654946.816,
        "elapsed time": 346.9217278779979
      },
      {
        "step": 1000,
        "valid accuracy": 0.0,
        "train loss": 1.2704877371788026,
        "train samples": 4000,
        "train time": 52.95541349705309,
        "eval time": 19.62998814698949,
        "tokens / sec": 3934.1775701854103,
        "mem allocated avg": 6779591346.176,
        "mem reserved avg": 13082126450.688,
        "elapsed time": 460.14450727400254
      },
      {
        "step": 1250,
        "valid accuracy": 0.0,
        "train loss": 1.2236453666687013,
        "train samples": 5000,
        "train time": 53.36593960013124,
        "eval time": 19.652927816001466,
        "tokens / sec": 3907.698460152047,
        "mem allocated avg": 6779029788.672,
        "mem reserved avg": 13073486184.448,
        "elapsed time": 573.5348878969962
      },
      {
        "step": 1500,
        "valid accuracy": 0.0,
        "train loss": 1.1792121708393097,
        "train samples": 6000,
        "train time": 53.3776921518147,
        "eval time": 19.616937039012555,
        "tokens / sec": 3921.69446750581,
        "mem allocated avg": 6779851802.624,
        "mem reserved avg": 13098995941.376,
        "elapsed time": 686.9838123609952
      },
      {
        "step": 1750,
        "valid accuracy": 0.02,
        "train loss": 1.1485692322254182,
        "train samples": 7000,
        "train time": 53.188338823019876,
        "eval time": 19.653264298991417,
        "tokens / sec": 3936.1071361264494,
        "mem allocated avg": 6782223466.496,
        "mem reserved avg": 13116058370.048,
        "elapsed time": 800.3354816049978
      },
      {
        "step": 2000,
        "valid accuracy": 0.06,
        "train loss": 1.1230667443275453,
        "train samples": 8000,
        "train time": 53.074023688037414,
        "eval time": 19.656479785000556,
        "tokens / sec": 3913.3268135239105,
        "mem allocated avg": 6778141935.616,
        "mem reserved avg": 13055400345.6,
        "elapsed time": 913.367253695993
      },
      {
        "step": 2250,
        "valid accuracy": 0.1,
        "train loss": 1.094045166015625,
        "train samples": 9000,
        "train time": 54.34830153394432,
        "eval time": 19.628162662993418,
        "tokens / sec": 3955.008600696563,
        "mem allocated avg": 6789509545.984,
        "mem reserved avg": 13248556433.408,
        "elapsed time": 1028.463336018991
      },
      {
        "step": 2500,
        "valid accuracy": 0.12,
        "train loss": 1.077717797279358,
        "train samples": 10000,
        "train time": 52.1458756570355,
        "eval time": 19.611369335994823,
        "tokens / sec": 3949.823402231256,
        "mem allocated avg": 6775024920.576,
        "mem reserved avg": 13002233348.096,
        "elapsed time": 1140.4990660109906
      },
      {
        "step": 2750,
        "valid accuracy": 0.12,
        "train loss": 1.0569540388584138,
        "train samples": 11000,
        "train time": 53.227410834049806,
        "eval time": 19.625236430001678,
        "tokens / sec": 3980.6745562092756,
        "mem allocated avg": 6785537161.216,
        "mem reserved avg": 13177051938.816,
        "elapsed time": 1254.066401210992
      },
      {
        "step": 3000,
        "valid accuracy": 0.12,
        "train loss": 1.0361379137039184,
        "train samples": 12000,
        "train time": 53.65395914198598,
        "eval time": 19.719437510997523,
        "tokens / sec": 3890.3186892066865,
        "mem allocated avg": 6780720910.336,
        "mem reserved avg": 13092201168.896,
        "elapsed time": 1367.8724600419955
      },
      {
        "step": 3250,
        "valid accuracy": 0.16,
        "train loss": 1.0240549674034118,
        "train samples": 13000,
        "train time": 52.97706237102102,
        "eval time": 19.7029277440015,
        "tokens / sec": 3980.9870642311216,
        "mem allocated avg": 6782688188.416,
        "mem reserved avg": 13119816466.432,
        "elapsed time": 1481.1549517469975
      },
      {
        "step": 3500,
        "valid accuracy": 0.18,
        "train loss": 1.0098259932994842,
        "train samples": 14000,
        "train time": 52.869576787008555,
        "eval time": 19.597270865997416,
        "tokens / sec": 3967.3099870839346,
        "mem allocated avg": 6780575592.448,
        "mem reserved avg": 13102678540.288,
        "elapsed time": 1594.3849144269916
      },
      {
        "step": 3750,
        "valid accuracy": 0.22,
        "train loss": 0.9942408270835876,
        "train samples": 15000,
        "train time": 54.702630093932385,
        "eval time": 19.623511597994366,
        "tokens / sec": 3961.4731435744384,
        "mem allocated avg": 6792074147.84,
        "mem reserved avg": 13278612815.872,
        "elapsed time": 1709.9712875620025
      },
      {
        "step": 4000,
        "valid accuracy": 0.16,
        "train loss": 1.0123027296066285,
        "train samples": 16000,
        "train time": 52.456372838059906,
        "eval time": 19.68401901901234,
        "tokens / sec": 3896.056645603915,
        "mem allocated avg": 6773958766.592,
        "mem reserved avg": 12989172285.44,
        "elapsed time": 1822.6668115109933
      },
      {
        "step": 4250,
        "valid accuracy": 0.24,
        "train loss": 0.9849327182769776,
        "train samples": 17000,
        "train time": 53.25562528491719,
        "eval time": 19.648335694990237,
        "tokens / sec": 3969.3271625123257,
        "mem allocated avg": 6783509901.312,
        "mem reserved avg": 13139588415.488,
        "elapsed time": 1936.0694442329986
      },
      {
        "step": 4500,
        "valid accuracy": 0.18,
        "train loss": 0.9994378657341003,
        "train samples": 18000,
        "train time": 53.01732904899109,
        "eval time": 19.688141086997348,
        "tokens / sec": 3919.8127051621955,
        "mem allocated avg": 6779470948.352,
        "mem reserved avg": 13063528906.752,
        "elapsed time": 2048.985867203999
      },
      {
        "step": 4750,
        "valid accuracy": 0.16,
        "train loss": 0.9892346875667573,
        "train samples": 19000,
        "train time": 53.11992502908106,
        "eval time": 19.68838914000662,
        "tokens / sec": 3952.1704875348883,
        "mem allocated avg": 6781060145.152,
        "mem reserved avg": 13109733359.616,
        "elapsed time": 2162.7099456459982
      },
      {
        "step": 5000,
        "valid accuracy": 0.2,
        "train loss": 0.9978675174713135,
        "train samples": 20000,
        "train time": 52.76285280592856,
        "eval time": 19.634052573994268,
        "tokens / sec": 3947.4741967818154,
        "mem allocated avg": 6777472888.832,
        "mem reserved avg": 13055861719.04,
        "elapsed time": 2275.669019541994
      },
      {
        "step": 5000,
        "test accuracy": 0.1197877179681577,
        "train loss": 0.9978675174713135,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json
ADDED
@@ -0,0 +1,331 @@
{
  "run_info": {
    "created_at": "2025-06-20T18:02:43+00:00",
    "total_time": 3274.9747593409993,
    "experiment_name": "full-finetuning/llama-3.2-3B-lr_0.00001",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 1e-05
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": null,
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 33098872284,
    "cuda_memory_max": 37241225216,
    "cuda_memory_reserved_99th": 33573390254,
    "train_time": 3111.3685010060144,
    "file_size": 6425499648,
    "num_trainable_params": 3212749824,
    "num_total_params": 3212749824,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.3,
        "train loss": 1.0749022357463838,
        "train samples": 1000,
        "train time": 90.81602771116013,
        "eval time": 10.388541491003707,
        "tokens / sec": 2331.295535996918,
        "mem allocated avg": 26069449254.912,
        "mem reserved avg": 33116739600.384,
        "elapsed time": 162.0596859770012
      },
      {
        "step": 500,
        "valid accuracy": 0.4,
        "train loss": 0.7238605101108551,
        "train samples": 2000,
        "train time": 90.41340426202805,
        "eval time": 10.403155545005575,
        "tokens / sec": 2300.488535938847,
        "mem allocated avg": 26062513567.744,
        "mem reserved avg": 33090961408.0,
        "elapsed time": 315.86630137299653
      },
      {
        "step": 750,
        "valid accuracy": 0.42,
        "train loss": 0.6648618497848511,
        "train samples": 3000,
        "train time": 91.4961106939445,
        "eval time": 5.590419113999815,
        "tokens / sec": 2343.27993150631,
        "mem allocated avg": 26071394062.336,
        "mem reserved avg": 33094367182.848,
        "elapsed time": 465.79339110500587
      },
      {
        "step": 1000,
        "valid accuracy": 0.42,
        "train loss": 0.6407654472589492,
        "train samples": 4000,
        "train time": 89.8546926038689,
        "eval time": 10.434167744999286,
        "tokens / sec": 2318.5878662838986,
        "mem allocated avg": 26063373086.72,
        "mem reserved avg": 33094367182.848,
        "elapsed time": 618.5050604129938
      },
      {
        "step": 1250,
        "valid accuracy": 0.46,
        "train loss": 0.6343449921607971,
        "train samples": 5000,
        "train time": 90.3596406209981,
        "eval time": 5.810965301003307,
        "tokens / sec": 2307.86663787969,
        "mem allocated avg": 26063789404.16,
        "mem reserved avg": 33081876545.536,
        "elapsed time": 766.6042792719963
      },
      {
        "step": 1500,
        "valid accuracy": 0.54,
        "train loss": 0.6249808443784713,
        "train samples": 6000,
        "train time": 90.81503154609527,
        "eval time": 10.435444819988334,
        "tokens / sec": 2305.025901948283,
        "mem allocated avg": 26066218485.76,
        "mem reserved avg": 33089409515.52,
        "elapsed time": 920.292813491993
      },
      {
        "step": 1750,
        "valid accuracy": 0.46,
        "train loss": 0.6174132014513016,
        "train samples": 7000,
        "train time": 90.68820026615867,
        "eval time": 10.286707318999106,
        "tokens / sec": 2308.5142210956765,
        "mem allocated avg": 26065828059.136,
        "mem reserved avg": 33101774323.712,
        "elapsed time": 1073.8488811849966
      },
      {
        "step": 2000,
        "valid accuracy": 0.42,
        "train loss": 0.618268838763237,
        "train samples": 8000,
        "train time": 90.44998777209548,
        "eval time": 10.380125819006935,
        "tokens / sec": 2296.252383398064,
        "mem allocated avg": 26062920781.824,
        "mem reserved avg": 33096330117.12,
        "elapsed time": 1227.2062568730034
      },
      {
        "step": 2250,
        "valid accuracy": 0.5,
        "train loss": 0.6107994567155838,
        "train samples": 9000,
        "train time": 91.58726547904371,
        "eval time": 10.372407121991273,
        "tokens / sec": 2346.920162707366,
        "mem allocated avg": 26073357961.216,
        "mem reserved avg": 33114382401.536,
        "elapsed time": 1381.3805919409933
      },
      {
        "step": 2500,
        "valid accuracy": 0.54,
        "train loss": 0.6089532144069671,
        "train samples": 10000,
        "train time": 89.29193754095468,
        "eval time": 10.391672718993505,
        "tokens / sec": 2306.6696240691504,
        "mem allocated avg": 26059719045.12,
        "mem reserved avg": 33086842601.472,
        "elapsed time": 1533.778675338006
      },
      {
        "step": 2750,
        "valid accuracy": 0.52,
        "train loss": 0.6020698472261429,
        "train samples": 11000,
        "train time": 90.41624103189679,
        "eval time": 10.369720178001444,
        "tokens / sec": 2343.3953632871467,
        "mem allocated avg": 26070059464.704,
        "mem reserved avg": 33107805732.864,
        "elapsed time": 1686.671367884992
      },
      {
        "step": 3000,
        "valid accuracy": 0.5,
        "train loss": 0.5949549045562744,
        "train samples": 12000,
        "train time": 90.9437831780233,
        "eval time": 7.315949440002441,
        "tokens / sec": 2295.165130654474,
        "mem allocated avg": 26064854972.416,
        "mem reserved avg": 33098074947.584,
        "elapsed time": 1837.2926549609983
      },
      {
        "step": 3250,
        "valid accuracy": 0.48,
        "train loss": 0.6066494225263596,
        "train samples": 13000,
        "train time": 90.87308476005273,
        "eval time": 5.963120047992561,
        "tokens / sec": 2320.8302057410824,
        "mem allocated avg": 26066388537.344,
        "mem reserved avg": 33098318217.216,
        "elapsed time": 1986.6408478410012
      },
      {
        "step": 3500,
        "valid accuracy": 0.48,
        "train loss": 0.592242598772049,
        "train samples": 14000,
        "train time": 90.65281462905114,
        "eval time": 7.1309342330059735,
        "tokens / sec": 2313.7726154261322,
        "mem allocated avg": 26065652588.544,
        "mem reserved avg": 33100457312.256,
        "elapsed time": 2137.073564691993
      },
      {
        "step": 3750,
        "valid accuracy": 0.48,
        "train loss": 0.5925718579292297,
        "train samples": 15000,
        "train time": 91.80342563094746,
        "eval time": 5.844810517999576,
        "tokens / sec": 2360.5110431407275,
        "mem allocated avg": 26075058659.328,
        "mem reserved avg": 33131771985.92,
        "elapsed time": 2287.0305021950044
      },
      {
        "step": 4000,
        "valid accuracy": 0.5,
        "train loss": 0.6050453131198883,
        "train samples": 16000,
        "train time": 89.85742108603881,
        "eval time": 5.86809825799719,
        "tokens / sec": 2274.414261280792,
        "mem allocated avg": 26058425257.984,
        "mem reserved avg": 33098662150.144,
        "elapsed time": 2435.1958582270017
      },
      {
        "step": 4250,
        "valid accuracy": 0.48,
        "train loss": 0.5929686036109925,
        "train samples": 17000,
        "train time": 90.97368233802263,
        "eval time": 5.8907580230006715,
        "tokens / sec": 2323.6280489841133,
        "mem allocated avg": 26067367372.8,
        "mem reserved avg": 33099207409.664,
        "elapsed time": 2584.8373482140014
      },
      {
        "step": 4500,
        "valid accuracy": 0.48,
        "train loss": 0.6010294322967529,
        "train samples": 18000,
        "train time": 90.13679483698797,
        "eval time": 6.106882603999111,
        "tokens / sec": 2305.5845326632484,
        "mem allocated avg": 26064599832.576,
        "mem reserved avg": 33092253253.632,
        "elapsed time": 2733.494644669001
      },
      {
        "step": 4750,
        "valid accuracy": 0.5,
        "train loss": 0.5936577550172806,
        "train samples": 19000,
        "train time": 90.74229130300228,
        "eval time": 5.885364143003244,
        "tokens / sec": 2313.5739354319567,
        "mem allocated avg": 26065537388.544,
        "mem reserved avg": 33100717359.104,
        "elapsed time": 2882.6415541759925
      },
      {
        "step": 5000,
        "valid accuracy": 0.5,
        "train loss": 0.5987544150352478,
        "train samples": 20000,
        "train time": 90.54863398504676,
        "eval time": 5.88336711798911,
        "tokens / sec": 2300.2003545895063,
        "mem allocated avg": 26062803286.016,
        "mem reserved avg": 33083126448.128,
        "elapsed time": 3031.523533478001
      },
      {
        "step": 5000,
        "test accuracy": 0.5003790750568613,
        "train loss": 0.5987544150352478,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}
MetaMathQA/results/ia3--llama-3.2-3B-default.json
ADDED
@@ -0,0 +1,351 @@
{
  "run_info": {
    "created_at": "2025-06-19T21:59:33+00:00",
    "total_time": 2004.8640038169979,
    "experiment_name": "ia3/llama-3.2-3B-default",
    "peft_branch": "main",
    "train_config": {
      "model_id": "meta-llama/Llama-3.2-3B",
      "dtype": "bfloat16",
      "max_seq_length": 768,
      "batch_size": 4,
      "batch_size_eval": 50,
      "max_steps": 5000,
      "eval_steps": 250,
      "compile": false,
      "query_template": "Question: {query} Think step by step.\nAnswer:",
      "seed": 0,
      "grad_norm_clip": 1.0,
      "optimizer_type": "AdamW",
      "optimizer_kwargs": {
        "lr": 0.0001,
        "weight_decay": 0.1
      },
      "lr_scheduler": "cosine",
      "use_amp": false,
      "autocast_adapter_dtype": true,
      "generation_kwargs": {
        "max_length": 800,
        "max_new_tokens": 300
      },
      "attn_implementation": null
    },
    "peft_config": {
      "task_type": null,
      "peft_type": "IA3",
      "auto_mapping": null,
      "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
      "revision": null,
      "inference_mode": false,
      "target_modules": [
        "down_proj",
        "v_proj",
        "k_proj"
      ],
      "exclude_modules": null,
      "feedforward_modules": [
        "down_proj"
      ],
      "fan_in_fan_out": false,
      "modules_to_save": null,
      "init_ia3_weights": true
    },
    "error_msg": ""
  },
  "train_info": {
    "cuda_memory_reserved_avg": 12023227429,
    "cuda_memory_max": 23137878016,
    "cuda_memory_reserved_99th": 18398566154,
    "train_time": 1782.9318781230104,
    "file_size": 1157064,
    "num_trainable_params": 286720,
    "num_total_params": 3213036544,
    "status": "success",
    "metrics": [
      {
        "step": 250,
        "valid accuracy": 0.0,
        "train loss": 1.3155810165405273,
        "train samples": 1000,
        "train time": 30.56459548201383,
        "eval time": 10.972947114001727,
        "tokens / sec": 6926.936105684404,
        "mem allocated avg": 6780994971.648,
        "mem reserved avg": 12076433014.784,
        "elapsed time": 90.53726772200025
      },
      {
        "step": 500,
        "valid accuracy": 0.0,
        "train loss": 1.205229633808136,
        "train samples": 2000,
        "train time": 30.221456803970796,
        "eval time": 10.954313254995213,
        "tokens / sec": 6882.361805029583,
        "mem allocated avg": 6773721065.472,
        "mem reserved avg": 11963673346.048,
        "elapsed time": 175.07058417100052
      },
      {
        "step": 750,
        "valid accuracy": 0.1,
        "train loss": 1.0194582087993622,
        "train samples": 3000,
        "train time": 30.774312397006724,
        "eval time": 10.944943730006344,
        "tokens / sec": 6966.881899231445,
        "mem allocated avg": 6784231882.752,
        "mem reserved avg": 12126680776.704,
        "elapsed time": 260.540154495
      },
      {
        "step": 1000,
        "valid accuracy": 0.24,
        "train loss": 0.9196457831859589,
        "train samples": 4000,
        "train time": 30.61534244806535,
        "eval time": 10.960088267995161,
        "tokens / sec": 6804.95409624808,
        "mem allocated avg": 6775492155.392,
        "mem reserved avg": 11986893012.992,
        "elapsed time": 345.30987053900026
      },
      {
        "step": 1250,
        "valid accuracy": 0.32,
        "train loss": 0.8685842225551605,
        "train samples": 5000,
        "train time": 29.97266351111466,
        "eval time": 10.924794500999269,
        "tokens / sec": 6957.606551138459,
        "mem allocated avg": 6775089207.296,
        "mem reserved avg": 11983428517.888,
        "elapsed time": 429.5542291879974
      },
      {
        "step": 1500,
        "valid accuracy": 0.32,
        "train loss": 0.8332846148014068,
        "train samples": 6000,
        "train time": 29.98314001694962,
        "eval time": 10.942266878999362,
        "tokens / sec": 6981.6236685572,
        "mem allocated avg": 6776724867.072,
        "mem reserved avg": 12008594341.888,
        "elapsed time": 513.8152235820016
      },
      {
        "step": 1750,
        "valid accuracy": 0.32,
        "train loss": 0.8169269208908081,
        "train samples": 7000,
        "train time": 30.245623568014707,
        "eval time": 10.940915298000618,
        "tokens / sec": 6921.8278647558345,
        "mem allocated avg": 6777912934.4,
        "mem reserved avg": 12032065667.072,
        "elapsed time": 598.2868188970024
      },
      {
        "step": 2000,
        "valid accuracy": 0.32,
        "train loss": 0.8072074156999588,
        "train samples": 8000,
        "train time": 30.292844633964705,
        "eval time": 10.95617212200159,
        "tokens / sec": 6856.272578875894,
        "mem allocated avg": 6775099170.816,
        "mem reserved avg": 11967473385.472,
        "elapsed time": 682.7948923380027
      },
      {
        "step": 2250,
        "valid accuracy": 0.32,
        "train loss": 0.7952859619855881,
        "train samples": 9000,
        "train time": 31.20892413101683,
        "eval time": 10.942549917002907,
        "tokens / sec": 6887.388975590319,
        "mem allocated avg": 6786161477.632,
        "mem reserved avg": 12167709458.432,
        "elapsed time": 768.9645714229991
      },
      {
        "step": 2500,
        "valid accuracy": 0.28,
        "train loss": 0.7890167078971863,
        "train samples": 10000,
        "train time": 30.187670495011844,
        "eval time": 10.954304017002869,
        "tokens / sec": 6822.884860692832,
        "mem allocated avg": 6771082014.72,
        "mem reserved avg": 11910984499.2,
        "elapsed time": 853.427360558002
      },
      {
        "step": 2750,
        "valid accuracy": 0.3,
        "train loss": 0.7823473591804504,
        "train samples": 11000,
        "train time": 30.410061570059042,
        "eval time": 10.93302121299348,
        "tokens / sec": 6967.4636965751015,
        "mem allocated avg": 6782254225.408,
        "mem reserved avg": 12090903363.584,
        "elapsed time": 938.3584665200033
      },
      {
        "step": 3000,
        "valid accuracy": 0.24,
        "train loss": 0.7709820411205291,
        "train samples": 12000,
        "train time": 30.02989622000314,
        "eval time": 10.940404225999373,
        "tokens / sec": 6950.773271769175,
        "mem allocated avg": 6776725577.728,
        "mem reserved avg": 12003133358.08,
        "elapsed time": 1022.4627897890023
      },
      {
        "step": 3250,
        "valid accuracy": 0.3,
        "train loss": 0.7755767168998718,
        "train samples": 13000,
        "train time": 30.172652364024543,
        "eval time": 10.940153044000908,
        "tokens / sec": 6989.806446431653,
        "mem allocated avg": 6778589339.648,
        "mem reserved avg": 12038298402.816,
        "elapsed time": 1107.0076802080002
      },
      {
        "step": 3500,
        "valid accuracy": 0.34,
        "train loss": 0.7658302361965179,
        "train samples": 14000,
        "train time": 30.384311634006735,
        "eval time": 10.941136569999799,
        "tokens / sec": 6903.233567590308,
        "mem allocated avg": 6777534660.608,
        "mem reserved avg": 12020623605.76,
        "elapsed time": 1191.893303306002
      },
      {
        "step": 3750,
        "valid accuracy": 0.34,
        "train loss": 0.7585167481899261,
        "train samples": 15000,
        "train time": 31.250990667955193,
        "eval time": 10.924158087997057,
        "tokens / sec": 6934.276173913666,
        "mem allocated avg": 6788426940.416,
        "mem reserved avg": 12209652498.432,
        "elapsed time": 1278.4574160839984
      },
      {
        "step": 4000,
        "valid accuracy": 0.26,
        "train loss": 0.7766438691616059,
        "train samples": 16000,
        "train time": 30.222231689898763,
        "eval time": 10.98030305699649,
        "tokens / sec": 6762.339793335249,
        "mem allocated avg": 6769563977.728,
        "mem reserved avg": 11885533462.528,
        "elapsed time": 1362.9405450319973
      },
      {
        "step": 4250,
        "valid accuracy": 0.34,
        "train loss": 0.7542061095237732,
        "train samples": 17000,
        "train time": 30.273203028933494,
        "eval time": 10.948997009996674,
        "tokens / sec": 6982.710081849145,
        "mem allocated avg": 6780103426.048,
        "mem reserved avg": 12047483928.576,
        "elapsed time": 1447.661586811002
      },
      {
        "step": 4500,
        "valid accuracy": 0.32,
        "train loss": 0.7659628703594208,
        "train samples": 18000,
        "train time": 29.84466753601737,
        "eval time": 10.942651322002348,
        "tokens / sec": 6963.320993581165,
        "mem allocated avg": 6775043430.4,
        "mem reserved avg": 11968387743.744,
        "elapsed time": 1531.5572027719973
      },
      {
        "step": 4750,
        "valid accuracy": 0.28,
        "train loss": 0.7580052223205567,
        "train samples": 19000,
        "train time": 30.03731635398435,
        "eval time": 10.927273799999966,
        "tokens / sec": 6989.272860661278,
        "mem allocated avg": 6776962899.968,
        "mem reserved avg": 12017695981.568,
        "elapsed time": 1615.9832882379997
      },
      {
        "step": 5000,
        "valid accuracy": 0.36,
        "train loss": 0.7657463653087616,
        "train samples": 20000,
        "train time": 30.07570726004633,
        "eval time": 10.953207714999735,
        "tokens / sec": 6925.19042691597,
        "mem allocated avg": 6774270615.552,
        "mem reserved avg": 11958900228.096,
        "elapsed time": 1700.4354192270039
      },
      {
        "step": 5000,
        "test accuracy": 0.34495830174374525,
        "train loss": 0.7657463653087616,
        "train samples": 20000,
        "train total tokens": 4198051
      }
    ]
  },
  "meta_info": {
    "model_info": {
      "sha": "13afe5124825b4f3751f836b40dafda64c1ed062",
      "created_at": "2024-09-18T15:23:48+00:00"
    },
    "dataset_info": {
      "metamath": {
        "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18",
        "created_at": "2023-09-21T17:22:46+00:00"
      },
      "gsm8k": {
        "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee",
        "created_at": "2022-04-12T10:22:10+00:00"
      }
    },
    "package_info": {
      "transformers-version": "4.52.4",
      "transformers-commit-hash": null,
      "peft-version": "0.15.2.dev0",
      "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf",
      "datasets-version": "3.6.0",
      "datasets-commit-hash": null,
      "bitsandbytes-version": "0.46.0",
      "bitsandbytes-commit-hash": null,
      "torch-version": "2.7.1+cu126",
      "torch-commit-hash": null
    },
    "system_info": {
      "system": "Linux",
      "release": "6.8.0-1029-aws",
      "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025",
      "machine": "x86_64",
      "processor": "x86_64",
      "gpu": "NVIDIA L40S"
    },
    "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n"
  }
}