diff --git a/MetaMathQA/Makefile b/MetaMathQA/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..47ce7ab90d5c75e52b57d73a34b35a9056229d83 --- /dev/null +++ b/MetaMathQA/Makefile @@ -0,0 +1,90 @@ +# Makefile for running MetaMathQA experiments. + +# --- Configuration --- +PYTHON := python +RUN_SCRIPT := run.py +EXPERIMENTS_DIR := experiments +RESULTS_DIR := results + +# --- Automatic Experiment and Result Discovery --- + +# 1. Find all experiment directories by looking for adapter_config.json or training_params.json files. +# This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ... +EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \ + -name "adapter_config.json" -or \ + -name "training_params.json" | xargs dirname | sort -u) + +# 2. Define a function to replace all occurrences of a character in a string. +# This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora--foo"). +# Usage: $(call replace-all, string, char_to_replace, replacement_char) +replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1)) + +# 3. Define a function to convert an experiment path to its flat result file path. +# e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora--llama-3.2-3B-rank32.json" +exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json + +# 4. Generate the list of all target result files we want to build. +RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp))) + + +# --- Main Rules --- + +# The default 'all' target depends on all possible result files. +# Running `make` or `make all` will check and run any outdated or missing experiments. +all: $(RESULT_FILES) + + +# --- Dynamic Rule Generation --- + +# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found. +# This avoids a complex pattern rule and makes the logic clearer.
+define EXPERIMENT_template +# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32) + +# Define the rule: +# The target is the result file (e.g., results/lora-llama-3.2-3B-rank32.json). +# The dependencies are its config files, code changes need to be audited manually since they can +# vary in degree of importance. Note that we explicitly ignore when the script fails to run +# so that the other experiments still have a chance to run. +$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json) + @echo "---" + @echo "Running experiment: $(1)" + -$(PYTHON) $(RUN_SCRIPT) -v $(1) + @echo "Finished: $$@" + @echo "---" + +endef + +# This command iterates through every found experiment path and evaluates the template, +# effectively stamping out a unique, explicit rule for each one. +$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path)))) + + +# --- Utility Rules --- + +.PHONY: all clean list dump_rules + +# The 'clean' rule removes all generated results. +clean: + @echo "Cleaning results directory..." + @([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0 + +# The 'list' rule is for debugging. It shows the discovered experiments +# and the result files the Makefile expects to create for them. +list: + @echo "Discovered experiment configurations:" + @$(foreach exp,$(EXPERIMENT_PATHS),echo " - $(exp)/adapter_config.json";) + @echo "\nTarget result files:" + @$(foreach res,$(RESULT_FILES),echo " - $(res)";) + +# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules. 
+define newline + + +endef +define DUMPED_RULES + $(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path))) +endef + +dump_rules: + @echo -e "$(subst $(newline),\n,${DUMPED_RULES})" diff --git a/MetaMathQA/README.md b/MetaMathQA/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20679f5aceff6349cadaf5b21bd7bcc92b579b5e --- /dev/null +++ b/MetaMathQA/README.md @@ -0,0 +1,241 @@ +# PEFT method comparison on the MetaMathQA and GSM8K datasets + +## Goal + +The goal is to provide a benchmarking framework for the different PEFT methods that are implemented. It is important that evaluating different PEFT methods is reproducible, idempotent, and version-controlled. Results for more PEFT methods can be added over time. + +## Dataset + +This task trains on the [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset and validates/tests on the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset ("main"). + +For the model to attain good accuracy, it needs to learn to adhere to the output format and it must express basic chain of thought reasoning capabilities to get to the correct result in the first place. The task is challenging for models in the sub 7B parameter range. + +The train set uses the whole of MetaMathQA. The validation set is a random sample from the train set of GSM8K. The test set is the whole of the GSM8K test set. + +## Running + +Create an experiment in the `experiments/` folder of your choice and give it a name (the name itself does not matter but helps identify the experiment). An example would be `experiments/lora/llama-3.2-3B-rank32/`.
Inside that directory, create 2 files: + +- `adapter_config.json` +- Optional: `training_params.json` + +Once you have created these files, you can either + +- run the whole suite by simply calling `make` (takes >24h) +- run one specific experiment by calling `make results/<experiment_name>--<experiment_variation>.json`, + for example `results/vblora--llama-3.2-3B-default.json` + +You can get a list of all runnable experiments by running `make list`, e.g.: +``` +% make list (git)-[method-comparison-results] ⛓ peft +Discovered experiment configurations: + - experiments/ptuning/llama-3.2-3B-default/adapter_config.json + [...] + - experiments/vblora/llama-3.2-3B-default/adapter_config.json + +Target result files: + - results/ptuning--llama-3.2-3B-default.json + [...] + - results/vblora--llama-3.2-3B-default.json +``` + +In case you want to force the execution of an experiment, you can simply `touch` the respective adapter config +without modifying it. For example: + + touch experiments/vblora/llama-3.2-3B-default/adapter_config.json + make + +to run the VBLoRA default experiment again. + +### `adapter_config.json` + +This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.: + +```python +from peft import LoraConfig + +config = LoraConfig(...) +config.save_pretrained(<path-to-experiment>) +``` + +### `training_params.json` + +There is a default file for the non-PEFT parameters: `default_training_params.json`. This contains all the other parameters that are relevant for training, e.g. the base model id, number of steps, batch size, learning rate, etc. If parameters that differ from the defaults are needed for a specific experiment, place a `training_params.json` into the experiment directory and adjust the parameters that need changing. The other parameters are taken from the aforementioned default config. + +For an overview of all possible arguments, you can also check the `TrainConfig` `dataclass` in `utils.py`.
+ +### Runtime performance + +Several factors should be considered to achieve a fast runtime performance. Besides the obvious factors like `max_steps` or the base model size, we found the following factors to have a significant impact: + +#### Eval batch size + +Regarding the `batch_size_eval` parameter, it is quite critical since evaluation takes up a significant portion of the training time and batching helps with reducing that. It should be possible to choose a value that is multiple times higher than the batch size used for training (`batch_size`). You should also pay attention to the size of the validation set -- e.g. if it's 50, don't choose a `batch_size_eval` of 40, as that results in a large batch of 30 and a small batch of 10. 25 might be a better choice. Also, ensure via a quick train run that the batch size does not lead to out of memory errors -- getting this error at the very end on evaluating the test set would be quite a loss of time. + +#### Generation length + +During testing, we discovered that the validation time is greatly inflated by just a few very long generations. Those can inflate the validation time by a factor of 3 or more. At the same time, we discovered that these long generations do not help with accuracy -- in fact, if they exceed the maximum configured length, they're just cut off mid sentence and would thus produce an accuracy of 0 anyway. + +To remedy this, we now set both `max_length` and `max_new_tokens` for the generation kwargs in the default training parameters. Normally, this is not possible when using transformers, as the latter argument overrides the former. However, we have added special logic inside of `get_generation_config` which takes both and chooses the smaller of the two. This way, we can get rid of these excessively long generations, thus considerably reducing eval times, while still guaranteeing a maximum total generation length to guard against OOM errors. Testing showed that this does not hamper test accuracy. 
It is therefore recommended not to change these settings. + +#### Bucketing + +The length of the sequences in the training data can vary a lot. Therefore, if samples are taken randomly from the training dataset, we will end up with batches containing very short and very long sequences. This is bad because the batch will be padded to the longest sequence, slowing down training. The obvious solution would be to sort the whole dataset by sequence length, but this is also bad because it introduces an order bias (e.g. first training on only short and then on only long answers). + +The solution is to find a trade-off between the two factors. This is achieved by the `BucketIterator`. It first creates buckets that contain multiple batches, e.g. 20x the batch size. The bucket is then sorted by sequence length and then batches are yielded from the bucket. Therefore, we have a small order bias within a bucket but not between buckets, striking a good balance between training speed and training loss. + +From practical experiments, for a batch size of 4, a bucket size of 80 provides a good balance with only slightly lower training loss but cutting training time by 25%. For eval, we don't use the iterator since there, the batch size is relatively big and thus there is little upside. + +### Start a run + +Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be: + +```sh +python run.py -v experiments/lora/llama-3.2-3B-rank32/ +``` + +By default, the adapter will be saved in a temporary file for further inspection if needed. To prevent this, add the `--clean` flag to the call. + +### Run status + +The run can be categorized into 3 different states: + +1. Main run: You are on the `main` branch and the run ended successfully.
The results are stored in the `results` folder and are used for further analysis. +2. Test run: You are not on the `main` branch and the run ended successfully. The results are stored in the `temporary_results` folder and are not used for further analysis. +3. The run was cancelled (`ctrl + c`). The results are stored in the `cancelled_results` folder and are not used for further analysis. + +## Outputs + +Results are stored in one of the result directories. An example output could look like so: + +```js +{ + "run_info": { + "created_at": "2025-03-05T13:50:05+00:00", + "total_time": 2711.0915009640157, + "experiment_name": "ia3/lr_0.001", + "peft_branch": "ben-method-comparison", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 51, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "generation_kwargs": { + "max_length": 800 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "v_proj", + "k_proj", + "down_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + } + }, + "train_info": { + "cuda_memory_reserved_avg": 14229219940, + "cuda_memory_max": 24847056896, + "cuda_memory_reserved_99th": 19115624366, + "train_time": 2238.65277833899, + "file_size": 1157064, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0784313725490196, + "train loss": 1.1336498007774354, + "train samples": 1000 + }, + [...] 
+ { + "step": 5000, + "valid accuracy": 0.21568627450980393, + "train loss": 0.6345920492410659, + "train samples": 20000 + }, + { + "step": 5000, + "test accuracy": 0.35129740518962077, + "train loss": 0.6345920492410659, + "train samples": 20000, + "train total tokens": 4197579 + } + ] + }, + "meta_info": { + "model_sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "model_created_at": "2024-09-18T15:23:48+00:00", + "dataset_sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "dataset_created_at": "2023-09-21T17:22:46+00:00", + "package_info": { + "transformers-version": "4.50.0.dev0", + "transformers-commit-hash": "752ef3fd4e70869626ec70657a770a85c0ad9219", + "peft-version": "0.14.1.dev0", + "peft-commit-hash": "a447a4e5ecd87b7d57733f4df9616a328cf130f4", + "datasets-version": "3.3.2", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.45.2", + "bitsandbytes-commit-hash": null, + "torch-version": "2.6.0+cu124", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.11.0-17-generic", + "version": "#17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA GeForce RTX 4090" + }, + "pytorch_info": "PyTorch built with: [...]" + } +} +``` + +## Dependencies + +Apart from the normal PEFT dependencies, ensure that the packages in the `requirements.txt` are installed, e.g. via: + +```sh +python -m pip install -r requirements.txt +``` + +Python 3.12+ is required. 
+ +## Open tasks + +- consider using `DataLoader` +- consider adding https://github.com/huggingface/Math-Verify +- consider adding `weight` argument to cross entropy calculation to downweight the EOS token, but it would require calculating the loss manually instead of relying on transformers (see https://github.com/huggingface/transformers/blob/6a876462c308bd7cd7d3ca8e93abaa7d5b02e90e/src/transformers/loss/loss_utils.py#L24-L48) +- do a sanity check against/comparison with transformers Trainer +- consider using vLLM to potentially speed up generations, at least for the test set +- using `torch.compile` leads to a huge slowdown, investigate (maybe recompiles), although it does save memory +- AMP does not appear to help, investigate +- packing of sequences (but this probably requires adjusting the attention matrix) +- clean up what gets printed and where (stdout, stderr) diff --git a/MetaMathQA/cancelled_results/.gitkeep b/MetaMathQA/cancelled_results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MetaMathQA/data.py b/MetaMathQA/data.py new file mode 100644 index 0000000000000000000000000000000000000000..be3ace83cfa83c211f5f41086fee9f36660363f7 --- /dev/null +++ b/MetaMathQA/data.py @@ -0,0 +1,109 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +All utilities related to data handling. 
+""" + +from functools import partial +from typing import Callable + +import datasets +import numpy as np +from datasets import Dataset, load_dataset + + +# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of +# the dataset +CHAR_LIMIT = 1300 +# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set, +# since it's run multiple times during training; test is only run once at the end and thus can be larger +VALID_SIZE = 50 + + +def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset: + """Return the filtered dataset, with long queries removed. + + We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is + a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each + model, but we want the same filter for each model. + + """ + char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])] + idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT] + print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset") + return ds.select(idx_filtered) + + +def get_train_valid_test_datasets( + *, tokenizer, query_template: str, print_fn: Callable[..., None] +) -> tuple[Dataset, Dataset, Dataset]: + """ + Return the indices of the train, valid, and test splits of the dataset. + + We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives: + + > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value. + + even after calling ds_filtered.class_encode_column("type"). Thus, using sklearn's StratifiedKFold instead. 
+ """ + metamath = load_dataset("meta-math/MetaMathQA")["train"] + metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn) + + # gsmk8k does not need to be filtered as query and response are short enough + gsm8k = load_dataset("openai/gsm8k", "main") + gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"}) + gsm8k_train = gsm8k["train"] + gsm8k_test = gsm8k["test"] + + np.random.seed(0) + indices = np.arange(len(gsm8k_train)) + np.random.shuffle(indices) + idx_valid = indices[:VALID_SIZE] + + ds_train = metamath + ds_valid = gsm8k_train.select(idx_valid) + ds_test = gsm8k_test + + print_fn(f"Train size: {len(ds_train)}") + print_fn(f"Valid size: {len(ds_valid)}") + print_fn(f"Test size: {len(ds_test)}") + + tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template) + tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template) + ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"]) + ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"]) + ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"]) + + return ds_train, ds_valid, ds_test + + +def tokenize_with_answer(samples, tokenizer, template): + queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])] + tokenized = tokenizer(queries) + tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]] + tokenized["attention_mask"] = [ + input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"] + ] + return tokenized + + +def tokenize_wo_answer(samples, tokenizer, template): + queries = [template.format(query=sample) for sample in samples["query"]] + tokenized = tokenizer(queries) + tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in 
tokenized["input_ids"]] + tokenized["attention_mask"] = [ + input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"] + ] + return tokenized diff --git a/MetaMathQA/default_training_params.json b/MetaMathQA/default_training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..a200a41ed96409033011b7e4fc33e05fe9c61162 --- /dev/null +++ b/MetaMathQA/default_training_params.json @@ -0,0 +1,26 @@ +{ + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 1e-4, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "attn_implementation": null, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "query_template": "Question: {query} Think step by step.\nAnswer:" +} diff --git a/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json b/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d20357b52d92ad65b3af6e932c9dd8d16b47bcb4 --- /dev/null +++ b/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "corda_config": null, + "deltaT": 1, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "init_r": 64, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 8, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": 
null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": null, + "target_r": 32, + "task_type": null, + "tfinal": 500, + "tinit": 200, + "total_step": 5000, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json b/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..367bea4cf187d10c96a4d8b53f355bfd269a1e6a --- /dev/null +++ b/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json @@ -0,0 +1,11 @@ +{ + "adapter_layers": 28, + "adapter_len": 100, + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "peft_type": "ADAPTION_PROMPT", + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json b/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..e8106a88d0de4099e2cbd2648abbe43bdebe6091 --- /dev/null +++ b/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 5e-4 + } +} + diff --git a/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..44d50893ff7f19c1851a2150e879c444ec134fe1 --- /dev/null +++ b/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "boft_block_num": 0, + "boft_block_size": 4, + "boft_dropout": 0.0, + "boft_n_butterfly_factor": 1, + 
"exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BOFT", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json b/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd69e4389edfe1738ceec6c42be177dd17d924c6 --- /dev/null +++ b/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json @@ -0,0 +1,19 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": "bat", + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BONE", + "r": 64, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..abc68802718821c659614b5fdeabb45db2df824b --- /dev/null +++ b/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,19 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BONE", + "r": 64, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json new file mode 100644 
index 0000000000000000000000000000000000000000..170c4bb33e558339b07da8044fb3cb2093d2e4eb --- /dev/null +++ b/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "block_size": 64, + "block_size_pattern": {}, + "peft_type": "C3A", + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} diff --git a/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json b/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..a39b9dc8a825e9b79ace91032d0755835548eb44 --- /dev/null +++ b/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 3e-1, + "weight_decay": 1e-5 + } +} diff --git a/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a379f07427f4c68eeaf06756004ddaa377f96b --- /dev/null +++ b/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,23 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "n_frequency": 1000, + "n_frequency_pattern": {}, + "peft_type": "FOURIERFT", + "random_loc_seed": 777, + "revision": null, + "scaling": 300, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git 
a/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json b/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40d40246c48487419ea0d21eb369bea60c729496 --- /dev/null +++ b/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json @@ -0,0 +1,23 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "n_frequency": 5000, + "n_frequency_pattern": {}, + "peft_type": "FOURIERFT", + "random_loc_seed": 777, + "revision": null, + "scaling": 300, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json b/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..6d6c3b0f9114a63d0739eef0c996f4c1c0c0e36c --- /dev/null +++ b/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-5 + } +} + diff --git a/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8e514faa808ac0874e71f21bad7a576d15349d --- /dev/null +++ b/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,14 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "feedforward_modules": null, + "inference_mode": false, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + 
"revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json b/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8e514faa808ac0874e71f21bad7a576d15349d --- /dev/null +++ b/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,14 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "feedforward_modules": null, + "inference_mode": false, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json b/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70b7363d3ac83e8ff2ee85634baafbee1f42b56a --- /dev/null +++ b/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,11 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "inference_mode": false, + "modules_to_save": null, + "peft_type": "LN_TUNING", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json 
b/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1137259fa26b3abeac269c7bdda56dbeb29e34f7 --- /dev/null +++ b/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,24 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null, + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json b/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d30dd77a4c5f185fde99a5d60f381961ac7c522 --- /dev/null +++ b/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,27 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "decompose_both": false, + "decompose_factor": -1, + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOKR", + "r": 32, + "rank_dropout": 0.0, + "rank_dropout_scale": false, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null, + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..255d09d2508a603fd8eea98152025c6cd8f0a789 --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": true, + "use_rslora": false +} diff --git a/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8832c108fac1825c52774517fd3e5bf0fc7d8d64 --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git 
a/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..985db872405905c31c93a10aa9cd3f77ed223437 --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json @@ -0,0 +1,9 @@ +{ + "optimizer_type": "lora-fa", + "optimizer_kwargs": { + "r": 32, + "lora_alpha": 64, + "lr": 1e-4, + "weight_decay": 0.1 + } +} diff --git a/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8832c108fac1825c52774517fd3e5bf0fc7d8d64 --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc1f8039ab02888675a12a1a1a017ebdd196b9d4 --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + 
"auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": true +} diff --git a/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json b/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..75890c9dce9fef14eee47ce19f3baa86d4d4168a --- /dev/null +++ b/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json b/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8cdb86ecd110a0176dd42b34e25b3c133cbab4a --- 
/dev/null +++ b/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,27 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "block_share": false, + "coft": false, + "eps": 6e-05, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "oft_block_size": 0, + "peft_type": "OFT", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json b/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..77bff7fd16cd3f675655221218e69a55eaead91f --- /dev/null +++ b/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,15 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "encoder_hidden_size": 3072, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PREFIX_TUNING", + "prefix_projection": false, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072 +} \ No newline at end of file diff --git a/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json b/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git 
a/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..efa055b03d6f3a6c6d0f7df76f11550891919b0a --- /dev/null +++ b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072, + "tokenizer_kwargs": null, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..efa055b03d6f3a6c6d0f7df76f11550891919b0a --- /dev/null +++ b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072, + "tokenizer_kwargs": null, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 
--- /dev/null +++ b/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..54469edf776f3de255d054c317887b1312aa7791 --- /dev/null +++ b/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "encoder_dropout": 0.0, + "encoder_hidden_size": 3072, + "encoder_num_layers": 2, + "encoder_reparameterization_type": "MLP", + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 20, + "peft_type": "P_TUNING", + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072 +} \ No newline at end of file diff --git a/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3dbdfaa6b123a057774dc5c46d86bfe4d4e35b55 --- /dev/null +++ b/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "RANDLORA", + "projection_prng_key": 0, + "r": 32, + "randlora_alpha": 640, + "randlora_dropout": 0.0, + "revision": null, + "save_projection": true, + "sparse": false, + "target_modules": null, + "task_type": null, + "very_sparse": false +} \ No newline at end of file diff --git a/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json 
b/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6cbc59e57c07e6b883ff34ed98090d51916d652 --- /dev/null +++ b/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_logits_std": 0.1, + "init_vector_bank_bound": 0.02, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "num_vectors": 256, + "peft_type": "VBLORA", + "r": 4, + "revision": null, + "save_only_topk_weights": false, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null, + "topk": 2, + "vblora_dropout": 0.0, + "vector_length": 256 +} \ No newline at end of file diff --git a/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json b/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f4962c1b4fa1266ba29f31559fa3260483d8fac7 --- /dev/null +++ b/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "d_initial": 0.1, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "VERA", + "projection_prng_key": 0, + "r": 256, + "revision": null, + "save_projection": true, + "target_modules": null, + "task_type": null, + "vera_dropout": 0.0 +} \ No newline at end of file diff --git a/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json b/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ 
b/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/MetaMathQA/requirements.txt b/MetaMathQA/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee25a1ced129a9c13938c22922aa0514230af60b --- /dev/null +++ b/MetaMathQA/requirements.txt @@ -0,0 +1,4 @@ +bitsandbytes +datasets +numpy +tqdm diff --git a/MetaMathQA/results/.gitkeep b/MetaMathQA/results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json b/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..cfad82b31779866b0c730b6c6c3da14c1ea136bb --- /dev/null +++ b/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json @@ -0,0 +1,4071 @@ +{ + "run_info": { + "created_at": "2025-06-19T23:12:19+00:00", + "total_time": 2209.243281380004, + "experiment_name": "adalora/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "ADALORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + 
"lora_alpha": 8, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": { + "model.layers.0.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.0.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.1.self_attn.q_proj.lora_E": [ + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + 
true, + false, + false + ], + "model.layers.1.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.2.self_attn.q_proj.lora_E": [ + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false + ], + "model.layers.2.self_attn.v_proj.lora_E": [ + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true + ], + "model.layers.3.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + 
false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.3.self_attn.v_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true + ], + "model.layers.4.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false + ], + "model.layers.4.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + 
false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.5.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.5.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true + ], + "model.layers.6.self_attn.q_proj.lora_E": [ + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + false, + false, + false, 
+ false, + false, + false, + false, + false + ], + "model.layers.6.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + false, + true + ], + "model.layers.7.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.7.self_attn.v_proj.lora_E": [ + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true + ], + "model.layers.8.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + 
true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true + ], + "model.layers.8.self_attn.v_proj.lora_E": [ + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true + ], + "model.layers.9.self_attn.q_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + true, + false, + true + ], + "model.layers.9.self_attn.v_proj.lora_E": [ + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, 
+ true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false + ], + "model.layers.10.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.10.self_attn.v_proj.lora_E": [ + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true + ], + "model.layers.11.self_attn.q_proj.lora_E": [ + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + false, + true, + 
true, + false, + false, + false + ], + "model.layers.11.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false + ], + "model.layers.12.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.12.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + false + ], + "model.layers.13.self_attn.q_proj.lora_E": [ + true, + true, + false, + true, + true, + true, + false, + false, + true, + true, + false, + true, + false, + true, + false, + 
true, + false, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + false, + false, + false, + true, + false + ], + "model.layers.13.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.14.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + true, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true, + false, + true, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + false, + true, + true, + false + ], + "model.layers.14.self_attn.v_proj.lora_E": [ + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + 
true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false + ], + "model.layers.15.self_attn.q_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true + ], + "model.layers.15.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.16.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + false, + true, + true + ], + "model.layers.16.self_attn.v_proj.lora_E": [ + true, + 
false, + true, + false, + true, + false, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false + ], + "model.layers.17.self_attn.q_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.17.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true + ], + "model.layers.18.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + true, + true, + true, + false, + true, + true, 
+ true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true + ], + "model.layers.18.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true + ], + "model.layers.19.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true + ], + "model.layers.19.self_attn.v_proj.lora_E": [ + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + 
false, + true, + true, + true + ], + "model.layers.20.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + false, + true, + false, + false + ], + "model.layers.20.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true + ], + "model.layers.21.self_attn.q_proj.lora_E": [ + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true + ], + "model.layers.21.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, 
+ true, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false + ], + "model.layers.22.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + true, + false, + true + ], + "model.layers.22.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true + ], + "model.layers.23.self_attn.q_proj.lora_E": [ + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + 
false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true + ], + "model.layers.23.self_attn.v_proj.lora_E": [ + false, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + false, + true, + true, + false, + true, + false, + true, + true + ], + "model.layers.24.self_attn.q_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + true + ], + "model.layers.24.self_attn.v_proj.lora_E": [ + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + false, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true + ], + "model.layers.25.self_attn.q_proj.lora_E": [ + 
false, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.25.self_attn.v_proj.lora_E": [ + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.26.self_attn.q_proj.lora_E": [ + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + false, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.26.self_attn.v_proj.lora_E": [ + false, + false, + true, + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + 
false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + false + ], + "model.layers.27.self_attn.q_proj.lora_E": [ + true, + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false + ], + "model.layers.27.self_attn.v_proj.lora_E": [ + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true + ] + }, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false, + "target_r": 32, + "init_r": 64, + "tinit": 200, + "tfinal": 500, + "deltaT": 1, + "beta1": 0.85, + "beta2": 0.85, + "orth_reg_weight": 0.5, + "total_step": 
5000 + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12361399900, + "cuda_memory_max": 22793945088, + "cuda_memory_reserved_99th": 18203426160, + "train_time": 1986.3603882369862, + "file_size": 35147440, + "num_trainable_params": 18353664, + "num_total_params": 3231103544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3241184422969818, + "train samples": 1000, + "train time": 35.95594502204767, + "eval time": 11.413120707002236, + "tokens / sec": 5888.289123542072, + "mem allocated avg": 7292959393.792, + "mem reserved avg": 12441731727.36, + "elapsed time": 100.98083375500573 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 1.0195633232593537, + "train samples": 2000, + "train time": 37.64258231502754, + "eval time": 11.37802824100072, + "tokens / sec": 5525.524212428035, + "mem allocated avg": 7285510731.776, + "mem reserved avg": 12328493907.968, + "elapsed time": 197.93603045200143 + }, + { + "step": 750, + "valid accuracy": 0.28, + "train loss": 0.7883218789100647, + "train samples": 3000, + "train time": 37.909325722001086, + "eval time": 11.385932488003164, + "tokens / sec": 5655.626838954038, + "mem allocated avg": 7296095842.304, + "mem reserved avg": 12484438130.688, + "elapsed time": 295.9188707240028 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7408825470209122, + "train samples": 4000, + "train time": 37.79932949803333, + "eval time": 11.34964040399791, + "tokens / sec": 5511.6321576772825, + "mem allocated avg": 7286506670.08, + "mem reserved avg": 12351948455.936, + "elapsed time": 393.33776786700037 + }, + { + "step": 1250, + "valid accuracy": 0.36, + "train loss": 0.7282904219627381, + "train samples": 5000, + "train time": 37.475317073069164, + "eval time": 11.342822429993248, + "tokens / sec": 5564.676066473135, + "mem allocated avg": 7287005519.872, + "mem reserved avg": 12349910024.192, + "elapsed time": 490.5430299360014 + 
}, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.7161256531476975, + "train samples": 6000, + "train time": 37.660518338059774, + "eval time": 11.34013032400253, + "tokens / sec": 5558.367469107556, + "mem allocated avg": 7287642494.976, + "mem reserved avg": 12380570386.432, + "elapsed time": 588.017992052999 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.7056601424217224, + "train samples": 7000, + "train time": 37.636171496975294, + "eval time": 11.3171367870018, + "tokens / sec": 5562.600861695649, + "mem allocated avg": 7289782888.448, + "mem reserved avg": 12389051269.12, + "elapsed time": 685.2421731229988 + }, + { + "step": 2000, + "valid accuracy": 0.34, + "train loss": 0.7058932571411133, + "train samples": 8000, + "train time": 37.505602380944765, + "eval time": 11.37751964799827, + "tokens / sec": 5537.732680318789, + "mem allocated avg": 7287054886.912, + "mem reserved avg": 12336119152.64, + "elapsed time": 782.1823508529997 + }, + { + "step": 2250, + "valid accuracy": 0.3, + "train loss": 0.700018577337265, + "train samples": 9000, + "train time": 38.06487834800646, + "eval time": 11.33160761000181, + "tokens / sec": 5646.885247730137, + "mem allocated avg": 7297638139.904, + "mem reserved avg": 12521129902.08, + "elapsed time": 880.444039299 + }, + { + "step": 2500, + "valid accuracy": 0.34, + "train loss": 0.6984639673233032, + "train samples": 10000, + "train time": 37.400825600088865, + "eval time": 7.680036880999978, + "tokens / sec": 5507.017470745635, + "mem allocated avg": 7283608303.616, + "mem reserved avg": 12278598467.584, + "elapsed time": 973.4031999860017 + }, + { + "step": 2750, + "valid accuracy": 0.32, + "train loss": 0.691307947397232, + "train samples": 11000, + "train time": 37.97861938195274, + "eval time": 11.376824188999308, + "tokens / sec": 5578.954776346737, + "mem allocated avg": 7293332232.192, + "mem reserved avg": 12452821467.136, + "elapsed time": 1071.2981272770048 + }, + { + 
"step": 3000, + "valid accuracy": 0.3, + "train loss": 0.6851879090070725, + "train samples": 12000, + "train time": 37.862704559986014, + "eval time": 11.377599911000289, + "tokens / sec": 5512.839149387935, + "mem allocated avg": 7288929478.656, + "mem reserved avg": 12371468746.752, + "elapsed time": 1168.7257358770003 + }, + { + "step": 3250, + "valid accuracy": 0.34, + "train loss": 0.6939580011367797, + "train samples": 13000, + "train time": 37.79518606400961, + "eval time": 7.2029460159974406, + "tokens / sec": 5580.102176050141, + "mem allocated avg": 7290687285.248, + "mem reserved avg": 12403068633.088, + "elapsed time": 1261.9857917680056 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6825792235136032, + "train samples": 14000, + "train time": 37.73422463506722, + "eval time": 11.28984081800445, + "tokens / sec": 5558.614282617983, + "mem allocated avg": 7289277476.864, + "mem reserved avg": 12381820289.024, + "elapsed time": 1359.695578400002 + }, + { + "step": 3750, + "valid accuracy": 0.34, + "train loss": 0.6795008780956269, + "train samples": 15000, + "train time": 38.156728624038806, + "eval time": 11.362600938999094, + "tokens / sec": 5679.286663570962, + "mem allocated avg": 7299185600.512, + "mem reserved avg": 12562561236.992, + "elapsed time": 1458.6053942910003 + }, + { + "step": 4000, + "valid accuracy": 0.32, + "train loss": 0.6967895623445511, + "train samples": 16000, + "train time": 37.352128309052205, + "eval time": 11.363241717001074, + "tokens / sec": 5471.522219805362, + "mem allocated avg": 7281535514.624, + "mem reserved avg": 12256066666.496, + "elapsed time": 1555.2909630150025 + }, + { + "step": 4250, + "valid accuracy": 0.34, + "train loss": 0.6776066061258316, + "train samples": 17000, + "train time": 37.65609644694632, + "eval time": 11.334564828997827, + "tokens / sec": 5613.672683726684, + "mem allocated avg": 7291894349.824, + "mem reserved avg": 12418562392.064, + "elapsed time": 1652.928281804001 + 
}, + { + "step": 4500, + "valid accuracy": 0.34, + "train loss": 0.6868188911676407, + "train samples": 18000, + "train time": 37.48494880297949, + "eval time": 11.33762150000257, + "tokens / sec": 5544.038517760537, + "mem allocated avg": 7285549684.736, + "mem reserved avg": 12333837451.264, + "elapsed time": 1749.9311109990012 + }, + { + "step": 4750, + "valid accuracy": 0.34, + "train loss": 0.6806062284708023, + "train samples": 19000, + "train time": 33.62080936400889, + "eval time": 11.34113016500487, + "tokens / sec": 6244.31725384755, + "mem allocated avg": 7068488509.44, + "mem reserved avg": 12120833916.928, + "elapsed time": 1843.633759463999 + }, + { + "step": 5000, + "valid accuracy": 0.28, + "train loss": 0.6862971596717834, + "train samples": 20000, + "train time": 33.47089828590106, + "eval time": 11.363945298006001, + "tokens / sec": 6222.7191580255185, + "mem allocated avg": 7065409925.12, + "mem reserved avg": 12064965787.648, + "elapsed time": 1937.0431615920024 + }, + { + "step": 5000, + "test accuracy": 0.3904473085670963, + "train loss": 0.6862971596717834, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + 
"release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, 
USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json b/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json new file mode 100644 index 0000000000000000000000000000000000000000..87127c7ef3f9aa7fedffd566149f8b5264e448b9 --- /dev/null +++ b/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json @@ -0,0 +1,341 @@ +{ + "run_info": { + "created_at": "2025-06-20T04:48:22+00:00", + "total_time": 2260.6744696069945, + "experiment_name": "adaptionprompt/llama-3.2-3B-lr_0.0005", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0005 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "ADAPTION_PROMPT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": "self_attn", + "adapter_len": 100, + "adapter_layers": 28 + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11893757234, + "cuda_memory_max": 22410166272, + "cuda_memory_reserved_99th": 17907664814, + "train_time": 1989.2834085189897, + "file_size": 17210384, + "num_trainable_params": 8601628, + "num_total_params": 3221351452, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + 
"train loss": 1.3201356165409088, + "train samples": 1000, + "train time": 36.18721537806414, + "eval time": 13.46754032199533, + "tokens / sec": 5850.657415556191, + "mem allocated avg": 6848060076.032, + "mem reserved avg": 11943163199.488, + "elapsed time": 99.94861951399798 + }, + { + "step": 500, + "valid accuracy": 0.1, + "train loss": 1.153662922859192, + "train samples": 2000, + "train time": 35.6493088029747, + "eval time": 13.314302301005227, + "tokens / sec": 5834.474972559473, + "mem allocated avg": 6840933136.384, + "mem reserved avg": 11833045942.272, + "elapsed time": 193.4177081749949 + }, + { + "step": 750, + "valid accuracy": 0.22, + "train loss": 0.9016587936878204, + "train samples": 3000, + "train time": 36.424757257977035, + "eval time": 13.392894379001518, + "tokens / sec": 5886.133941305707, + "mem allocated avg": 6851972698.112, + "mem reserved avg": 11989870968.832, + "elapsed time": 288.2962625699947 + }, + { + "step": 1000, + "valid accuracy": 0.2, + "train loss": 0.8571369113922119, + "train samples": 4000, + "train time": 35.59983186099271, + "eval time": 13.363479856001504, + "tokens / sec": 5852.1624712581015, + "mem allocated avg": 6842572642.304, + "mem reserved avg": 11863001661.44, + "elapsed time": 381.66334240599826 + }, + { + "step": 1250, + "valid accuracy": 0.18, + "train loss": 0.84929132604599, + "train samples": 5000, + "train time": 35.52914607799903, + "eval time": 13.408120855005109, + "tokens / sec": 5869.490911551474, + "mem allocated avg": 6843078866.944, + "mem reserved avg": 11855409971.2, + "elapsed time": 475.2031378399988 + }, + { + "step": 1500, + "valid accuracy": 0.18, + "train loss": 0.8379741818904877, + "train samples": 6000, + "train time": 35.84657208897261, + "eval time": 13.451748254003178, + "tokens / sec": 5839.637873335062, + "mem allocated avg": 6844234328.064, + "mem reserved avg": 11880013758.464, + "elapsed time": 568.970056428996 + }, + { + "step": 1750, + "valid accuracy": 0.2, + "train 
loss": 0.8320568509101868, + "train samples": 7000, + "train time": 36.04748217701126, + "eval time": 13.354637482996623, + "tokens / sec": 5807.756529900249, + "mem allocated avg": 6845049858.048, + "mem reserved avg": 11894333112.32, + "elapsed time": 663.2131869919976 + }, + { + "step": 2000, + "valid accuracy": 0.2, + "train loss": 0.83651398563385, + "train samples": 8000, + "train time": 35.70882848704787, + "eval time": 13.407459709997056, + "tokens / sec": 5816.376756110452, + "mem allocated avg": 6842067818.496, + "mem reserved avg": 11843724640.256, + "elapsed time": 756.9679808469955 + }, + { + "step": 2250, + "valid accuracy": 0.18, + "train loss": 0.8321560187339783, + "train samples": 9000, + "train time": 36.077689886013104, + "eval time": 13.313609958000598, + "tokens / sec": 5957.92027369615, + "mem allocated avg": 6853360060.416, + "mem reserved avg": 12025841319.936, + "elapsed time": 851.5264306229947 + }, + { + "step": 2500, + "valid accuracy": 0.22, + "train loss": 0.830465945482254, + "train samples": 10000, + "train time": 35.51607862501987, + "eval time": 13.570960901000944, + "tokens / sec": 5799.260728488849, + "mem allocated avg": 6838232895.488, + "mem reserved avg": 11785499312.128, + "elapsed time": 945.1205676109967 + }, + { + "step": 2750, + "valid accuracy": 0.2, + "train loss": 0.8323929319381714, + "train samples": 11000, + "train time": 36.33290277811466, + "eval time": 13.340032396001334, + "tokens / sec": 5831.6562619276265, + "mem allocated avg": 6849506107.392, + "mem reserved avg": 11957667102.72, + "elapsed time": 1039.698461469001 + }, + { + "step": 3000, + "valid accuracy": 0.22, + "train loss": 0.8273163681030273, + "train samples": 12000, + "train time": 36.133581758025684, + "eval time": 13.486512909999874, + "tokens / sec": 5776.648476140576, + "mem allocated avg": 6844330549.248, + "mem reserved avg": 11874754101.248, + "elapsed time": 1134.0729920019949 + }, + { + "step": 3250, + "valid accuracy": 0.18, + "train 
loss": 0.8321007430553437, + "train samples": 13000, + "train time": 35.81564853595046, + "eval time": 13.383609317002993, + "tokens / sec": 5888.515456820645, + "mem allocated avg": 6845503963.136, + "mem reserved avg": 11903065653.248, + "elapsed time": 1228.1345331240009 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 0.8267617487907409, + "train samples": 14000, + "train time": 35.759473790014454, + "eval time": 13.568141147006827, + "tokens / sec": 5865.578482269809, + "mem allocated avg": 6844375582.72, + "mem reserved avg": 11893385199.616, + "elapsed time": 1322.3741278140005 + }, + { + "step": 3750, + "valid accuracy": 0.18, + "train loss": 0.822540352344513, + "train samples": 15000, + "train time": 36.6447854490616, + "eval time": 13.383382205000089, + "tokens / sec": 5913.610827418539, + "mem allocated avg": 6855454945.28, + "mem reserved avg": 12064244367.36, + "elapsed time": 1417.8726171529997 + }, + { + "step": 4000, + "valid accuracy": 0.22, + "train loss": 0.842738341331482, + "train samples": 16000, + "train time": 35.83419257100468, + "eval time": 13.484180120998644, + "tokens / sec": 5703.295800373884, + "mem allocated avg": 6837201041.408, + "mem reserved avg": 11769015697.408, + "elapsed time": 1511.8286734409994 + }, + { + "step": 4250, + "valid accuracy": 0.24, + "train loss": 0.8195172207355499, + "train samples": 17000, + "train time": 36.032976000991766, + "eval time": 13.43221827600064, + "tokens / sec": 5866.542913196561, + "mem allocated avg": 6847173238.784, + "mem reserved avg": 11924070727.68, + "elapsed time": 1606.2413196950001 + }, + { + "step": 4500, + "valid accuracy": 0.22, + "train loss": 0.8333091423511505, + "train samples": 18000, + "train time": 35.92476197002543, + "eval time": 13.364069708994066, + "tokens / sec": 5784.812163081199, + "mem allocated avg": 6842308513.792, + "mem reserved avg": 11840637632.512, + "elapsed time": 1700.1633438569988 + }, + { + "step": 4750, + "valid accuracy": 0.24, + 
"train loss": 0.8247289218902588, + "train samples": 19000, + "train time": 36.319470202004595, + "eval time": 13.367499373998726, + "tokens / sec": 5780.343128144329, + "mem allocated avg": 6845010323.456, + "mem reserved avg": 11893443919.872, + "elapsed time": 1795.0117048679967 + }, + { + "step": 5000, + "valid accuracy": 0.24, + "train loss": 0.8317011270523071, + "train samples": 20000, + "train time": 35.778475134953624, + "eval time": 13.382634160996531, + "tokens / sec": 5821.377216731123, + "mem allocated avg": 6841479706.624, + "mem reserved avg": 11840956399.616, + "elapsed time": 1888.9356832179983 + }, + { + "step": 5000, + "test accuracy": 0.22062168309325247, + "train loss": 0.8317011270523071, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) 
MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/boft--llama-3.2-3B-default.json b/MetaMathQA/results/boft--llama-3.2-3B-default.json new file mode 100644 index 
0000000000000000000000000000000000000000..7b8de14f0396f2688d9b9e82200a7016ecc1fc50 --- /dev/null +++ b/MetaMathQA/results/boft--llama-3.2-3B-default.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T00:26:06+00:00", + "total_time": 11113.556226242006, + "experiment_name": "boft/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BOFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "boft_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 14814855089, + "cuda_memory_max": 24427626496, + "cuda_memory_reserved_99th": 20103445872, + "train_time": 8291.859631775995, + "file_size": 3225360, + "num_trainable_params": 802816, + "num_total_params": 3213552640, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.291453486919403, + "train samples": 1000, + "train time": 168.6401632970519, + "eval time": 140.71104099299555, + "tokens / 
sec": 1255.4482625059293, + "mem allocated avg": 6794374191.104, + "mem reserved avg": 14862272954.368, + "elapsed time": 378.35506656600046 + }, + { + "step": 500, + "valid accuracy": 0.12, + "train loss": 1.0658165102005004, + "train samples": 2000, + "train time": 168.0782826189752, + "eval time": 140.55351014900225, + "tokens / sec": 1237.4888460248842, + "mem allocated avg": 6786098696.192, + "mem reserved avg": 14759126630.4, + "elapsed time": 750.4153373740046 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.8760707340240479, + "train samples": 3000, + "train time": 168.35559053501493, + "eval time": 140.5371915020005, + "tokens / sec": 1273.5009233649919, + "mem allocated avg": 6796379451.392, + "mem reserved avg": 14898109087.744, + "elapsed time": 1123.1088362480004 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.8187176239490509, + "train samples": 4000, + "train time": 168.23626853094902, + "eval time": 140.51234973900137, + "tokens / sec": 1238.3536666570453, + "mem allocated avg": 6788017170.432, + "mem reserved avg": 14785978564.608, + "elapsed time": 1495.2035204040003 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.7968595073223114, + "train samples": 5000, + "train time": 168.06973706404096, + "eval time": 140.56398986800195, + "tokens / sec": 1240.7825682534333, + "mem allocated avg": 6786994073.6, + "mem reserved avg": 14784728662.016, + "elapsed time": 1867.293767313 + }, + { + "step": 1500, + "valid accuracy": 0.3, + "train loss": 0.7768308148384094, + "train samples": 6000, + "train time": 168.12391281103191, + "eval time": 140.47015122300218, + "tokens / sec": 1245.0995013141533, + "mem allocated avg": 6790023022.592, + "mem reserved avg": 14800616685.568, + "elapsed time": 2239.2391544300044 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.7639130955934524, + "train samples": 7000, + "train time": 168.4569528100401, + "eval time": 140.76006173399946, + "tokens 
/ sec": 1242.780404772479, + "mem allocated avg": 6790166409.216, + "mem reserved avg": 14820103421.952, + "elapsed time": 2611.854956449002 + }, + { + "step": 2000, + "valid accuracy": 0.28, + "train loss": 0.7575103138685226, + "train samples": 8000, + "train time": 168.38565446306166, + "eval time": 140.82750502999988, + "tokens / sec": 1233.4542432506432, + "mem allocated avg": 6787659706.368, + "mem reserved avg": 14766038843.392, + "elapsed time": 2984.338527646003 + }, + { + "step": 2250, + "valid accuracy": 0.36, + "train loss": 0.7480558000802994, + "train samples": 9000, + "train time": 168.98983921804756, + "eval time": 140.92262020800263, + "tokens / sec": 1271.9581307054364, + "mem allocated avg": 6798715979.776, + "mem reserved avg": 14937929809.92, + "elapsed time": 3357.8442202950027 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.7452825582027436, + "train samples": 10000, + "train time": 168.30827127001976, + "eval time": 140.89225408899802, + "tokens / sec": 1223.7485326527044, + "mem allocated avg": 6783722676.224, + "mem reserved avg": 14710111993.856, + "elapsed time": 3730.0927005050034 + }, + { + "step": 2750, + "valid accuracy": 0.4, + "train loss": 0.7368131847381592, + "train samples": 11000, + "train time": 168.8352410539519, + "eval time": 140.97951381299936, + "tokens / sec": 1254.9571918595636, + "mem allocated avg": 6794155292.672, + "mem reserved avg": 14876869132.288, + "elapsed time": 4103.762088249001 + }, + { + "step": 3000, + "valid accuracy": 0.38, + "train loss": 0.7284122853279114, + "train samples": 12000, + "train time": 168.7332625999261, + "eval time": 140.92822863799665, + "tokens / sec": 1237.0471404616308, + "mem allocated avg": 6789107718.144, + "mem reserved avg": 14802571231.232, + "elapsed time": 4477.013831755001 + }, + { + "step": 3250, + "valid accuracy": 0.34, + "train loss": 0.7360657904148101, + "train samples": 13000, + "train time": 168.6564349730761, + "eval time": 140.91345744199498, 
+ "tokens / sec": 1250.4770424779092, + "mem allocated avg": 6791307786.24, + "mem reserved avg": 14825665069.056, + "elapsed time": 4850.336532419002 + }, + { + "step": 3500, + "valid accuracy": 0.34, + "train loss": 0.7245372575521469, + "train samples": 14000, + "train time": 168.69712368501496, + "eval time": 141.10813598799723, + "tokens / sec": 1243.3525564528145, + "mem allocated avg": 6789542191.104, + "mem reserved avg": 14803175211.008, + "elapsed time": 5223.900597244006 + }, + { + "step": 3750, + "valid accuracy": 0.36, + "train loss": 0.7196882257461548, + "train samples": 15000, + "train time": 169.02741387199057, + "eval time": 140.85168583100312, + "tokens / sec": 1282.0583066135978, + "mem allocated avg": 6800711397.376, + "mem reserved avg": 14974772576.256, + "elapsed time": 5597.923287113001 + }, + { + "step": 4000, + "valid accuracy": 0.4, + "train loss": 0.7386573747396469, + "train samples": 16000, + "train time": 168.47688378201565, + "eval time": 141.17620621900278, + "tokens / sec": 1213.062560347618, + "mem allocated avg": 6781920968.704, + "mem reserved avg": 14703241723.904, + "elapsed time": 5970.573302798002 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.7167660998106002, + "train samples": 17000, + "train time": 168.66243355697225, + "eval time": 141.03309625500697, + "tokens / sec": 1253.3259217358275, + "mem allocated avg": 6792739334.144, + "mem reserved avg": 14838457696.256, + "elapsed time": 6343.574297415005 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.7278824989795685, + "train samples": 18000, + "train time": 168.825120675996, + "eval time": 141.10180295899772, + "tokens / sec": 1230.966097745832, + "mem allocated avg": 6787403542.528, + "mem reserved avg": 14768026943.488, + "elapsed time": 6716.868663600006 + }, + { + "step": 4750, + "valid accuracy": 0.34, + "train loss": 0.7206774606704712, + "train samples": 19000, + "train time": 168.64492384497134, + "eval time": 
140.88104952100548, + "tokens / sec": 1244.8581031290848, + "mem allocated avg": 6790186668.032, + "mem reserved avg": 14817972715.52, + "elapsed time": 7090.485984892002 + }, + { + "step": 5000, + "valid accuracy": 0.34, + "train loss": 0.7268091850280761, + "train samples": 20000, + "train time": 168.56219975605927, + "eval time": 140.98389447200316, + "tokens / sec": 1235.6269691628356, + "mem allocated avg": 6787183779.84, + "mem reserved avg": 14761332834.304, + "elapsed time": 7463.428281595006 + }, + { + "step": 5000, + "test accuracy": 0.3646702047005307, + "train loss": 0.7268091850280761, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/bone--llama-3.2-3B-bat.json b/MetaMathQA/results/bone--llama-3.2-3B-bat.json new file mode 100644 index 0000000000000000000000000000000000000000..4c684dc5739b8ad0d45d2dcf23abe52ecfa3f20a --- /dev/null +++ 
b/MetaMathQA/results/bone--llama-3.2-3B-bat.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-20T03:31:24+00:00", + "total_time": 2742.3845372959986, + "experiment_name": "bone/llama-3.2-3B-bat", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BONE", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "init_weights": "bat", + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 14713983755, + "cuda_memory_max": 25251807232, + "cuda_memory_reserved_99th": 20472733368, + "train_time": 2430.7548372539895, + "file_size": 29367552, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.8741071329116822, + "train samples": 1000, + "train time": 44.769113782072964, + "eval time": 16.53786130100343, + "tokens / sec": 4729.130914464948, + "mem allocated avg": 6898425409.536, + "mem reserved avg": 14773294989.312, + "elapsed time": 124.73039968500234 + }, + { + "step": 500, + "valid accuracy": 0.42, + "train loss": 
0.6946564470529556, + "train samples": 2000, + "train time": 43.747789238033874, + "eval time": 16.4541177170031, + "tokens / sec": 4754.4116770858745, + "mem allocated avg": 6890118709.248, + "mem reserved avg": 14662749913.088, + "elapsed time": 242.48505929599924 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6668610339164733, + "train samples": 3000, + "train time": 44.788394879076805, + "eval time": 8.99262467600056, + "tokens / sec": 4786.9766393472355, + "mem allocated avg": 6900886024.192, + "mem reserved avg": 14820195696.64, + "elapsed time": 354.3122298879971 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6476555281877517, + "train samples": 4000, + "train time": 43.08444309095648, + "eval time": 14.581032188005338, + "tokens / sec": 4835.527282090601, + "mem allocated avg": 6892210176.0, + "mem reserved avg": 14677799075.84, + "elapsed time": 469.41999823199876 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6442477897405624, + "train samples": 5000, + "train time": 43.81069704208494, + "eval time": 16.504536090003967, + "tokens / sec": 4759.979048031958, + "mem allocated avg": 6892437598.208, + "mem reserved avg": 14675995525.12, + "elapsed time": 587.4669312400001 + }, + { + "step": 1500, + "valid accuracy": 0.48, + "train loss": 0.6370412122011184, + "train samples": 6000, + "train time": 44.041188616007275, + "eval time": 11.50742915799492, + "tokens / sec": 4753.07335197389, + "mem allocated avg": 6893869041.664, + "mem reserved avg": 14704349020.16, + "elapsed time": 700.887209352004 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.6277673766613007, + "train samples": 7000, + "train time": 44.32280573899334, + "eval time": 16.494074002999696, + "tokens / sec": 4723.414876595195, + "mem allocated avg": 6895170344.96, + "mem reserved avg": 14718215389.184, + "elapsed time": 819.4313268580008 + }, + { + "step": 2000, + "valid accuracy": 0.48, + "train loss": 
0.6278820457458496, + "train samples": 8000, + "train time": 43.325528461049544, + "eval time": 16.452074027998606, + "tokens / sec": 4793.848047040501, + "mem allocated avg": 6891568050.176, + "mem reserved avg": 14656710115.328, + "elapsed time": 936.9070930559974 + }, + { + "step": 2250, + "valid accuracy": 0.44, + "train loss": 0.6160005252361298, + "train samples": 9000, + "train time": 45.04456213898811, + "eval time": 16.52133422600309, + "tokens / sec": 4771.896757188206, + "mem allocated avg": 6903412344.832, + "mem reserved avg": 14851812360.192, + "elapsed time": 1056.8185863660037 + }, + { + "step": 2500, + "valid accuracy": 0.5, + "train loss": 0.6121727240085602, + "train samples": 10000, + "train time": 43.16439942702709, + "eval time": 16.356938169003115, + "tokens / sec": 4771.686916395162, + "mem allocated avg": 6888002562.048, + "mem reserved avg": 14598350569.472, + "elapsed time": 1173.7929829869972 + }, + { + "step": 2750, + "valid accuracy": 0.52, + "train loss": 0.6007345867156982, + "train samples": 11000, + "train time": 44.3066304440581, + "eval time": 16.514935120998416, + "tokens / sec": 4782.151065798665, + "mem allocated avg": 6899352545.28, + "mem reserved avg": 14785458470.912, + "elapsed time": 1292.7444534430033 + }, + { + "step": 3000, + "valid accuracy": 0.52, + "train loss": 0.5899704934358597, + "train samples": 12000, + "train time": 44.07467572299356, + "eval time": 16.412788394998643, + "tokens / sec": 4735.848796979486, + "mem allocated avg": 6894036676.608, + "mem reserved avg": 14687865405.44, + "elapsed time": 1411.115336062001 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.5988378477096558, + "train samples": 13000, + "train time": 44.070030323957326, + "eval time": 10.250203846997465, + "tokens / sec": 4785.587812163363, + "mem allocated avg": 6895260303.36, + "mem reserved avg": 14725043716.096, + "elapsed time": 1523.332073521 + }, + { + "step": 3500, + "valid accuracy": 0.5, + "train loss": 
0.5801258901357651, + "train samples": 14000, + "train time": 43.991991777089424, + "eval time": 16.38271237299341, + "tokens / sec": 4767.913238909897, + "mem allocated avg": 6893688922.112, + "mem reserved avg": 14703484993.536, + "elapsed time": 1641.7187374700006 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5768071869611741, + "train samples": 15000, + "train time": 45.04501243098639, + "eval time": 16.454509290000715, + "tokens / sec": 4810.810083180938, + "mem allocated avg": 6905122422.784, + "mem reserved avg": 14891314315.264, + "elapsed time": 1761.645320085001 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5858320169448853, + "train samples": 16000, + "train time": 42.547905418032315, + "eval time": 16.350580427999375, + "tokens / sec": 4803.36218650576, + "mem allocated avg": 6886491265.024, + "mem reserved avg": 14582730981.376, + "elapsed time": 1878.0724109930015 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5723247408866883, + "train samples": 17000, + "train time": 44.19116178697732, + "eval time": 16.508775556001638, + "tokens / sec": 4783.513070305705, + "mem allocated avg": 6897152284.672, + "mem reserved avg": 14738381602.816, + "elapsed time": 1996.8971549050038 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5789256048202515, + "train samples": 18000, + "train time": 43.87211918797402, + "eval time": 16.414912490006827, + "tokens / sec": 4736.903615473535, + "mem allocated avg": 6893093124.096, + "mem reserved avg": 14658832433.152, + "elapsed time": 2114.9650602839974 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.568240401506424, + "train samples": 19000, + "train time": 43.939464293958736, + "eval time": 16.460097985000175, + "tokens / sec": 4777.914418698651, + "mem allocated avg": 6894218592.256, + "mem reserved avg": 14710372040.704, + "elapsed time": 2233.517725938 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train 
loss": 0.57634852206707, + "train samples": 20000, + "train time": 42.787552905057964, + "eval time": 16.445046182001533, + "tokens / sec": 4867.770785166333, + "mem allocated avg": 6890906441.728, + "mem reserved avg": 14656718503.936, + "elapsed time": 2350.279711092 + }, + { + "step": 5000, + "test accuracy": 0.5170583775587566, + "train loss": 0.57634852206707, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/bone--llama-3.2-3B-default.json b/MetaMathQA/results/bone--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..ec4da35505397250f98ac3bdd140f0c6e44fc008 --- /dev/null +++ 
b/MetaMathQA/results/bone--llama-3.2-3B-default.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-20T04:17:11+00:00", + "total_time": 1867.121674144997, + "experiment_name": "bone/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BONE", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11170837063, + "cuda_memory_max": 20248002560, + "cuda_memory_reserved_99th": 16303469363, + "train_time": 1664.0814183089897, + "file_size": 29367496, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.8771067566871643, + "train samples": 1000, + "train time": 29.468342912026856, + "eval time": 11.086663477995899, + "tokens / sec": 7184.625230948821, + "mem allocated avg": 6894354876.416, + "mem reserved avg": 11212691603.456, + "elapsed time": 88.56553585999791 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 
0.6947847135066986, + "train samples": 2000, + "train time": 29.13603712292388, + "eval time": 11.12908834600239, + "tokens / sec": 7138.753946615206, + "mem allocated avg": 6887297284.096, + "mem reserved avg": 11116172279.808, + "elapsed time": 169.94219922799675 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6673308206796646, + "train samples": 3000, + "train time": 29.74789179801155, + "eval time": 6.2111000180011615, + "tokens / sec": 7207.267037805055, + "mem allocated avg": 6897885888.512, + "mem reserved avg": 11257109282.816, + "elapsed time": 247.40845895299572 + }, + { + "step": 1000, + "valid accuracy": 0.44, + "train loss": 0.6480507221221924, + "train samples": 4000, + "train time": 29.01437903306214, + "eval time": 11.063560270995367, + "tokens / sec": 7180.439731713689, + "mem allocated avg": 6888501639.168, + "mem reserved avg": 11141564596.224, + "elapsed time": 328.43337820599845 + }, + { + "step": 1250, + "valid accuracy": 0.42, + "train loss": 0.6442041766643524, + "train samples": 5000, + "train time": 28.86099356606428, + "eval time": 11.061821620001865, + "tokens / sec": 7225.600169399779, + "mem allocated avg": 6888334700.544, + "mem reserved avg": 11139123511.296, + "elapsed time": 409.5306018880001 + }, + { + "step": 1500, + "valid accuracy": 0.52, + "train loss": 0.6375475705862045, + "train samples": 6000, + "train time": 29.36598393299937, + "eval time": 6.896059851998871, + "tokens / sec": 7128.349606047729, + "mem allocated avg": 6890338080.768, + "mem reserved avg": 11164893315.072, + "elapsed time": 487.1438905899995 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6282199568748474, + "train samples": 7000, + "train time": 29.2208460940019, + "eval time": 11.139122824002698, + "tokens / sec": 7164.576936838726, + "mem allocated avg": 6891485964.288, + "mem reserved avg": 11174582157.312, + "elapsed time": 568.6407176649955 + }, + { + "step": 2000, + "valid accuracy": 0.44, + "train loss": 
0.628275181055069, + "train samples": 8000, + "train time": 28.774674860083906, + "eval time": 11.096917715003656, + "tokens / sec": 7218.013791986054, + "mem allocated avg": 6889055956.992, + "mem reserved avg": 11126481879.04, + "elapsed time": 649.4662010969987 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6164452042579651, + "train samples": 9000, + "train time": 29.666104338008154, + "eval time": 6.740810982002586, + "tokens / sec": 7245.575541396888, + "mem allocated avg": 6899385456.64, + "mem reserved avg": 11287358603.264, + "elapsed time": 727.5584506419982 + }, + { + "step": 2500, + "valid accuracy": 0.52, + "train loss": 0.6124898854494095, + "train samples": 10000, + "train time": 28.952800227045373, + "eval time": 11.054138113999215, + "tokens / sec": 7113.888756349109, + "mem allocated avg": 6884753041.408, + "mem reserved avg": 11077492408.32, + "elapsed time": 808.6757636719994 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.6010023313760757, + "train samples": 11000, + "train time": 29.36040201097785, + "eval time": 5.933361176998005, + "tokens / sec": 7216.556500853691, + "mem allocated avg": 6895703631.872, + "mem reserved avg": 11229007446.016, + "elapsed time": 885.2688505609985 + }, + { + "step": 3000, + "valid accuracy": 0.36, + "train loss": 0.590470621585846, + "train samples": 12000, + "train time": 29.152743853985157, + "eval time": 11.051910919995862, + "tokens / sec": 7159.909236861306, + "mem allocated avg": 6890226739.2, + "mem reserved avg": 11156563427.328, + "elapsed time": 966.2876440099935 + }, + { + "step": 3250, + "valid accuracy": 0.46, + "train loss": 0.5996054347753524, + "train samples": 13000, + "train time": 29.23224936202314, + "eval time": 11.06002619300125, + "tokens / sec": 7214.668888053154, + "mem allocated avg": 6892138940.416, + "mem reserved avg": 11182651998.208, + "elapsed time": 1047.7634995759945 + }, + { + "step": 3500, + "valid accuracy": 0.46, + "train loss": 
0.5810788285732269, + "train samples": 14000, + "train time": 29.556202010979177, + "eval time": 7.767598452002858, + "tokens / sec": 7096.649289448104, + "mem allocated avg": 6891370110.976, + "mem reserved avg": 11166763974.656, + "elapsed time": 1126.3068484049945 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5778432558774949, + "train samples": 15000, + "train time": 30.077826159038523, + "eval time": 11.010653469995304, + "tokens / sec": 7204.742751493022, + "mem allocated avg": 6901065279.488, + "mem reserved avg": 11319788961.792, + "elapsed time": 1209.0550349339974 + }, + { + "step": 4000, + "valid accuracy": 0.4, + "train loss": 0.5869229323863984, + "train samples": 16000, + "train time": 29.213863794990175, + "eval time": 11.144038623999222, + "tokens / sec": 6995.753845988955, + "mem allocated avg": 6883645001.728, + "mem reserved avg": 11058953584.64, + "elapsed time": 1290.3985370609953 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.5733816763162612, + "train samples": 17000, + "train time": 29.18649683901458, + "eval time": 11.153094029003114, + "tokens / sec": 7242.698607029438, + "mem allocated avg": 6893432758.272, + "mem reserved avg": 11193884344.32, + "elapsed time": 1372.1237251569983 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5803762240409851, + "train samples": 18000, + "train time": 29.077459994943638, + "eval time": 11.118935573998897, + "tokens / sec": 7147.047920834147, + "mem allocated avg": 6888416004.096, + "mem reserved avg": 11124485390.336, + "elapsed time": 1453.4214935309938 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.5692038584947586, + "train samples": 19000, + "train time": 29.40723867896304, + "eval time": 11.099454375005735, + "tokens / sec": 7139.024588193769, + "mem allocated avg": 6890813089.792, + "mem reserved avg": 11168844349.44, + "elapsed time": 1535.6791463129994 + }, + { + "step": 5000, + "valid accuracy": 0.48, + 
"train loss": 0.5775641392469406, + "train samples": 20000, + "train time": 28.941933833950316, + "eval time": 11.18307958800142, + "tokens / sec": 7196.47834159849, + "mem allocated avg": 6887869800.448, + "mem reserved avg": 11118328152.064, + "elapsed time": 1617.277517963994 + }, + { + "step": 5000, + "test accuracy": 0.5079605761940864, + "train loss": 0.5775641392469406, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/fourierft--llama-3.2-3B-default.json b/MetaMathQA/results/fourierft--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa2edcfa9ad98958d40cd6fe3b78a6d671f57b5 --- /dev/null +++ 
b/MetaMathQA/results/fourierft--llama-3.2-3B-default.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T10:18:57+00:00", + "total_time": 2823.832106703994, + "experiment_name": "fourierft/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "FOURIERFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "n_frequency": 1000, + "scaling": 300, + "random_loc_seed": 777, + "fan_in_fan_out": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "n_frequency_pattern": {}, + "init_weights": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 13104129350, + "cuda_memory_max": 23653777408, + "cuda_memory_reserved_99th": 19017267937, + "train_time": 2424.3862988609762, + "file_size": 231416, + "num_trainable_params": 56000, + "num_total_params": 3212805824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3263031902313231, + "train samples": 1000, + "train time": 53.55340486107161, + "eval time": 19.578013352002017, + "tokens / sec": 3953.4180982374883, + "mem allocated avg": 6781303625.728, + "mem reserved avg": 
13152850804.736, + "elapsed time": 119.84825310099404 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 1.3399862418174744, + "train samples": 2000, + "train time": 52.85717789203045, + "eval time": 19.544192551999004, + "tokens / sec": 3935.03793231005, + "mem allocated avg": 6774035257.344, + "mem reserved avg": 13043463356.416, + "elapsed time": 233.5829256769939 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.3045952091217041, + "train samples": 3000, + "train time": 53.35706212905643, + "eval time": 19.607110917990212, + "tokens / sec": 4018.2309790861696, + "mem allocated avg": 6783920330.752, + "mem reserved avg": 13205673869.312, + "elapsed time": 348.1469791559939 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.3111453976631164, + "train samples": 4000, + "train time": 52.95546973698947, + "eval time": 19.472347582006478, + "tokens / sec": 3934.1733919976355, + "mem allocated avg": 6776025266.176, + "mem reserved avg": 13077269446.656, + "elapsed time": 461.81266678999236 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.299716483592987, + "train samples": 5000, + "train time": 52.12036712520057, + "eval time": 19.626158429004136, + "tokens / sec": 4001.0846335572023, + "mem allocated avg": 6775331573.76, + "mem reserved avg": 13063344357.376, + "elapsed time": 574.6407375999988 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.2867344057559966, + "train samples": 6000, + "train time": 52.594848359090975, + "eval time": 19.54386943600548, + "tokens / sec": 3980.0666135738998, + "mem allocated avg": 6776458844.16, + "mem reserved avg": 13093568512.0, + "elapsed time": 688.0431025519938 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 1.2803141210079194, + "train samples": 7000, + "train time": 52.98738884186605, + "eval time": 19.568909612993593, + "tokens / sec": 3951.0344739725274, + "mem allocated avg": 6778496358.4, + "mem reserved avg": 
13108768669.696, + "elapsed time": 801.9154772249894 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 1.2766506419181824, + "train samples": 8000, + "train time": 52.03297274692159, + "eval time": 19.525613270001486, + "tokens / sec": 3991.62279292005, + "mem allocated avg": 6774647097.344, + "mem reserved avg": 13051189264.384, + "elapsed time": 914.5343848449993 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2596003375053406, + "train samples": 9000, + "train time": 53.934016149127274, + "eval time": 19.535415460006334, + "tokens / sec": 3985.388356870549, + "mem allocated avg": 6785830477.824, + "mem reserved avg": 13237223424.0, + "elapsed time": 1029.9007452719961 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.2684449093341827, + "train samples": 10000, + "train time": 52.006629903029534, + "eval time": 19.470633051998448, + "tokens / sec": 3960.3989026791724, + "mem allocated avg": 6771212331.008, + "mem reserved avg": 12996118052.864, + "elapsed time": 1142.5889472209965 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2548872971534728, + "train samples": 11000, + "train time": 53.403087337108445, + "eval time": 19.463876378998975, + "tokens / sec": 3967.579601952513, + "mem allocated avg": 6781916252.16, + "mem reserved avg": 13168084516.864, + "elapsed time": 1257.0122518049902 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.253697858095169, + "train samples": 12000, + "train time": 53.20096563108382, + "eval time": 19.472515105997445, + "tokens / sec": 3923.443823321214, + "mem allocated avg": 6777045135.36, + "mem reserved avg": 13084844359.68, + "elapsed time": 1370.94780872899 + }, + { + "step": 3250, + "valid accuracy": 0.0, + "train loss": 1.248513156414032, + "train samples": 13000, + "train time": 52.962746563891415, + "eval time": 19.54665829600708, + "tokens / sec": 3982.06312328573, + "mem allocated avg": 6779038627.84, + "mem reserved avg": 
13110345728.0, + "elapsed time": 1484.7621198889974 + }, + { + "step": 3500, + "valid accuracy": 0.0, + "train loss": 1.2477959940433503, + "train samples": 14000, + "train time": 52.93443578510778, + "eval time": 19.444701158994576, + "tokens / sec": 3962.4489595298505, + "mem allocated avg": 6776803573.76, + "mem reserved avg": 13097142059.008, + "elapsed time": 1598.8772237269877 + }, + { + "step": 3750, + "valid accuracy": 0.0, + "train loss": 1.228544222354889, + "train samples": 15000, + "train time": 53.31031796212483, + "eval time": 19.472959079008433, + "tokens / sec": 4064.9354249577, + "mem allocated avg": 6788200585.216, + "mem reserved avg": 13268999471.104, + "elapsed time": 1713.6814467679942 + }, + { + "step": 4000, + "valid accuracy": 0.0, + "train loss": 1.2609001460075377, + "train samples": 16000, + "train time": 51.9827769130934, + "eval time": 19.473652824002784, + "tokens / sec": 3931.552182017475, + "mem allocated avg": 6770180233.216, + "mem reserved avg": 12983610638.336, + "elapsed time": 1826.5604049959948 + }, + { + "step": 4250, + "valid accuracy": 0.0, + "train loss": 1.227214762210846, + "train samples": 17000, + "train time": 53.09942602888623, + "eval time": 19.547112297004787, + "tokens / sec": 3981.0034836347163, + "mem allocated avg": 6779591426.048, + "mem reserved avg": 13132760088.576, + "elapsed time": 1940.5098487799987 + }, + { + "step": 4500, + "valid accuracy": 0.0, + "train loss": 1.2504195840358734, + "train samples": 18000, + "train time": 52.23909889203787, + "eval time": 19.522137050997117, + "tokens / sec": 3978.207978462565, + "mem allocated avg": 6775933241.344, + "mem reserved avg": 13056079822.848, + "elapsed time": 2053.2267840139975 + }, + { + "step": 4750, + "valid accuracy": 0.0, + "train loss": 1.2349513354301453, + "train samples": 19000, + "train time": 53.36620609794045, + "eval time": 19.541859832999762, + "tokens / sec": 3933.931514912433, + "mem allocated avg": 6777532579.84, + "mem reserved avg": 
13101604798.464, + "elapsed time": 2167.8329333979927 + }, + { + "step": 5000, + "valid accuracy": 0.0, + "train loss": 1.2480293517112733, + "train samples": 20000, + "train time": 52.46977503092785, + "eval time": 19.44991449599911, + "tokens / sec": 3969.5234042309344, + "mem allocated avg": 6773533165.568, + "mem reserved avg": 13049645760.512, + "elapsed time": 2281.220151823989 + }, + { + "step": 5000, + "test accuracy": 0.000758150113722517, + "train loss": 1.2480293517112733, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json b/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json new file mode 100644 index 0000000000000000000000000000000000000000..24096854af5978d81ab3d14161eb2904ead8a492 --- 
/dev/null +++ b/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T09:31:48+00:00", + "total_time": 2824.376998209991, + "experiment_name": "fourierft/llama-3.2-3B-n_frequency-5000", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "FOURIERFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "n_frequency": 5000, + "scaling": 300, + "random_loc_seed": 777, + "fan_in_fan_out": false, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "n_frequency_pattern": {}, + "init_weights": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 13111221498, + "cuda_memory_max": 23681040384, + "cuda_memory_reserved_99th": 19054869872, + "train_time": 2421.913372163006, + "file_size": 1127472, + "num_trainable_params": 280000, + "num_total_params": 3213029824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3800132541656494, + "train samples": 1000, + "train time": 53.57064967796032, + "eval time": 19.631924207002157, + "tokens / sec": 3952.1454616053315, + "mem allocated avg": 
6784830552.064, + "mem reserved avg": 13158731218.944, + "elapsed time": 119.20255395398999 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 1.3702282276153563, + "train samples": 2000, + "train time": 53.00863014489005, + "eval time": 19.629790833001607, + "tokens / sec": 3923.7950392508, + "mem allocated avg": 6777176354.816, + "mem reserved avg": 13048941117.44, + "elapsed time": 232.4386439989903 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.3024170677661895, + "train samples": 3000, + "train time": 53.97298614999454, + "eval time": 19.64192995200574, + "tokens / sec": 3972.3760957780855, + "mem allocated avg": 6787548153.856, + "mem reserved avg": 13211654946.816, + "elapsed time": 346.9217278779979 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.2704877371788026, + "train samples": 4000, + "train time": 52.95541349705309, + "eval time": 19.62998814698949, + "tokens / sec": 3934.1775701854103, + "mem allocated avg": 6779591346.176, + "mem reserved avg": 13082126450.688, + "elapsed time": 460.14450727400254 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.2236453666687013, + "train samples": 5000, + "train time": 53.36593960013124, + "eval time": 19.652927816001466, + "tokens / sec": 3907.698460152047, + "mem allocated avg": 6779029788.672, + "mem reserved avg": 13073486184.448, + "elapsed time": 573.5348878969962 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.1792121708393097, + "train samples": 6000, + "train time": 53.3776921518147, + "eval time": 19.616937039012555, + "tokens / sec": 3921.69446750581, + "mem allocated avg": 6779851802.624, + "mem reserved avg": 13098995941.376, + "elapsed time": 686.9838123609952 + }, + { + "step": 1750, + "valid accuracy": 0.02, + "train loss": 1.1485692322254182, + "train samples": 7000, + "train time": 53.188338823019876, + "eval time": 19.653264298991417, + "tokens / sec": 3936.1071361264494, + "mem allocated avg": 
6782223466.496, + "mem reserved avg": 13116058370.048, + "elapsed time": 800.3354816049978 + }, + { + "step": 2000, + "valid accuracy": 0.06, + "train loss": 1.1230667443275453, + "train samples": 8000, + "train time": 53.074023688037414, + "eval time": 19.656479785000556, + "tokens / sec": 3913.3268135239105, + "mem allocated avg": 6778141935.616, + "mem reserved avg": 13055400345.6, + "elapsed time": 913.367253695993 + }, + { + "step": 2250, + "valid accuracy": 0.1, + "train loss": 1.094045166015625, + "train samples": 9000, + "train time": 54.34830153394432, + "eval time": 19.628162662993418, + "tokens / sec": 3955.008600696563, + "mem allocated avg": 6789509545.984, + "mem reserved avg": 13248556433.408, + "elapsed time": 1028.463336018991 + }, + { + "step": 2500, + "valid accuracy": 0.12, + "train loss": 1.077717797279358, + "train samples": 10000, + "train time": 52.1458756570355, + "eval time": 19.611369335994823, + "tokens / sec": 3949.823402231256, + "mem allocated avg": 6775024920.576, + "mem reserved avg": 13002233348.096, + "elapsed time": 1140.4990660109906 + }, + { + "step": 2750, + "valid accuracy": 0.12, + "train loss": 1.0569540388584138, + "train samples": 11000, + "train time": 53.227410834049806, + "eval time": 19.625236430001678, + "tokens / sec": 3980.6745562092756, + "mem allocated avg": 6785537161.216, + "mem reserved avg": 13177051938.816, + "elapsed time": 1254.066401210992 + }, + { + "step": 3000, + "valid accuracy": 0.12, + "train loss": 1.0361379137039184, + "train samples": 12000, + "train time": 53.65395914198598, + "eval time": 19.719437510997523, + "tokens / sec": 3890.3186892066865, + "mem allocated avg": 6780720910.336, + "mem reserved avg": 13092201168.896, + "elapsed time": 1367.8724600419955 + }, + { + "step": 3250, + "valid accuracy": 0.16, + "train loss": 1.0240549674034118, + "train samples": 13000, + "train time": 52.97706237102102, + "eval time": 19.7029277440015, + "tokens / sec": 3980.9870642311216, + "mem allocated 
avg": 6782688188.416, + "mem reserved avg": 13119816466.432, + "elapsed time": 1481.1549517469975 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 1.0098259932994842, + "train samples": 14000, + "train time": 52.869576787008555, + "eval time": 19.597270865997416, + "tokens / sec": 3967.3099870839346, + "mem allocated avg": 6780575592.448, + "mem reserved avg": 13102678540.288, + "elapsed time": 1594.3849144269916 + }, + { + "step": 3750, + "valid accuracy": 0.22, + "train loss": 0.9942408270835876, + "train samples": 15000, + "train time": 54.702630093932385, + "eval time": 19.623511597994366, + "tokens / sec": 3961.4731435744384, + "mem allocated avg": 6792074147.84, + "mem reserved avg": 13278612815.872, + "elapsed time": 1709.9712875620025 + }, + { + "step": 4000, + "valid accuracy": 0.16, + "train loss": 1.0123027296066285, + "train samples": 16000, + "train time": 52.456372838059906, + "eval time": 19.68401901901234, + "tokens / sec": 3896.056645603915, + "mem allocated avg": 6773958766.592, + "mem reserved avg": 12989172285.44, + "elapsed time": 1822.6668115109933 + }, + { + "step": 4250, + "valid accuracy": 0.24, + "train loss": 0.9849327182769776, + "train samples": 17000, + "train time": 53.25562528491719, + "eval time": 19.648335694990237, + "tokens / sec": 3969.3271625123257, + "mem allocated avg": 6783509901.312, + "mem reserved avg": 13139588415.488, + "elapsed time": 1936.0694442329986 + }, + { + "step": 4500, + "valid accuracy": 0.18, + "train loss": 0.9994378657341003, + "train samples": 18000, + "train time": 53.01732904899109, + "eval time": 19.688141086997348, + "tokens / sec": 3919.8127051621955, + "mem allocated avg": 6779470948.352, + "mem reserved avg": 13063528906.752, + "elapsed time": 2048.985867203999 + }, + { + "step": 4750, + "valid accuracy": 0.16, + "train loss": 0.9892346875667573, + "train samples": 19000, + "train time": 53.11992502908106, + "eval time": 19.68838914000662, + "tokens / sec": 3952.1704875348883, + 
"mem allocated avg": 6781060145.152, + "mem reserved avg": 13109733359.616, + "elapsed time": 2162.7099456459982 + }, + { + "step": 5000, + "valid accuracy": 0.2, + "train loss": 0.9978675174713135, + "train samples": 20000, + "train time": 52.76285280592856, + "eval time": 19.634052573994268, + "tokens / sec": 3947.4741967818154, + "mem allocated avg": 6777472888.832, + "mem reserved avg": 13055861719.04, + "elapsed time": 2275.669019541994 + }, + { + "step": 5000, + "test accuracy": 0.1197877179681577, + "train loss": 0.9978675174713135, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json b/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json new file mode 100644 index 0000000000000000000000000000000000000000..7a121108cd36e2ae6a01aedc53e82a4d48b2b3a6 --- 
/dev/null +++ b/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json @@ -0,0 +1,331 @@ +{ + "run_info": { + "created_at": "2025-06-20T18:02:43+00:00", + "total_time": 3274.9747593409993, + "experiment_name": "full-finetuning/llama-3.2-3B-lr_0.00001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 1e-05 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": null, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 33098872284, + "cuda_memory_max": 37241225216, + "cuda_memory_reserved_99th": 33573390254, + "train_time": 3111.3685010060144, + "file_size": 6425499648, + "num_trainable_params": 3212749824, + "num_total_params": 3212749824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.3, + "train loss": 1.0749022357463838, + "train samples": 1000, + "train time": 90.81602771116013, + "eval time": 10.388541491003707, + "tokens / sec": 2331.295535996918, + "mem allocated avg": 26069449254.912, + "mem reserved avg": 33116739600.384, + "elapsed time": 162.0596859770012 + }, + { + "step": 500, + "valid accuracy": 0.4, + "train loss": 0.7238605101108551, + "train samples": 2000, + "train time": 90.41340426202805, + "eval time": 10.403155545005575, + "tokens / sec": 2300.488535938847, + "mem allocated avg": 26062513567.744, + "mem reserved avg": 33090961408.0, + "elapsed time": 315.86630137299653 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6648618497848511, + "train samples": 
3000, + "train time": 91.4961106939445, + "eval time": 5.590419113999815, + "tokens / sec": 2343.27993150631, + "mem allocated avg": 26071394062.336, + "mem reserved avg": 33094367182.848, + "elapsed time": 465.79339110500587 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6407654472589492, + "train samples": 4000, + "train time": 89.8546926038689, + "eval time": 10.434167744999286, + "tokens / sec": 2318.5878662838986, + "mem allocated avg": 26063373086.72, + "mem reserved avg": 33094367182.848, + "elapsed time": 618.5050604129938 + }, + { + "step": 1250, + "valid accuracy": 0.46, + "train loss": 0.6343449921607971, + "train samples": 5000, + "train time": 90.3596406209981, + "eval time": 5.810965301003307, + "tokens / sec": 2307.86663787969, + "mem allocated avg": 26063789404.16, + "mem reserved avg": 33081876545.536, + "elapsed time": 766.6042792719963 + }, + { + "step": 1500, + "valid accuracy": 0.54, + "train loss": 0.6249808443784713, + "train samples": 6000, + "train time": 90.81503154609527, + "eval time": 10.435444819988334, + "tokens / sec": 2305.025901948283, + "mem allocated avg": 26066218485.76, + "mem reserved avg": 33089409515.52, + "elapsed time": 920.292813491993 + }, + { + "step": 1750, + "valid accuracy": 0.46, + "train loss": 0.6174132014513016, + "train samples": 7000, + "train time": 90.68820026615867, + "eval time": 10.286707318999106, + "tokens / sec": 2308.5142210956765, + "mem allocated avg": 26065828059.136, + "mem reserved avg": 33101774323.712, + "elapsed time": 1073.8488811849966 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.618268838763237, + "train samples": 8000, + "train time": 90.44998777209548, + "eval time": 10.380125819006935, + "tokens / sec": 2296.252383398064, + "mem allocated avg": 26062920781.824, + "mem reserved avg": 33096330117.12, + "elapsed time": 1227.2062568730034 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6107994567155838, + "train samples": 9000, 
+ "train time": 91.58726547904371, + "eval time": 10.372407121991273, + "tokens / sec": 2346.920162707366, + "mem allocated avg": 26073357961.216, + "mem reserved avg": 33114382401.536, + "elapsed time": 1381.3805919409933 + }, + { + "step": 2500, + "valid accuracy": 0.54, + "train loss": 0.6089532144069671, + "train samples": 10000, + "train time": 89.29193754095468, + "eval time": 10.391672718993505, + "tokens / sec": 2306.6696240691504, + "mem allocated avg": 26059719045.12, + "mem reserved avg": 33086842601.472, + "elapsed time": 1533.778675338006 + }, + { + "step": 2750, + "valid accuracy": 0.52, + "train loss": 0.6020698472261429, + "train samples": 11000, + "train time": 90.41624103189679, + "eval time": 10.369720178001444, + "tokens / sec": 2343.3953632871467, + "mem allocated avg": 26070059464.704, + "mem reserved avg": 33107805732.864, + "elapsed time": 1686.671367884992 + }, + { + "step": 3000, + "valid accuracy": 0.5, + "train loss": 0.5949549045562744, + "train samples": 12000, + "train time": 90.9437831780233, + "eval time": 7.315949440002441, + "tokens / sec": 2295.165130654474, + "mem allocated avg": 26064854972.416, + "mem reserved avg": 33098074947.584, + "elapsed time": 1837.2926549609983 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.6066494225263596, + "train samples": 13000, + "train time": 90.87308476005273, + "eval time": 5.963120047992561, + "tokens / sec": 2320.8302057410824, + "mem allocated avg": 26066388537.344, + "mem reserved avg": 33098318217.216, + "elapsed time": 1986.6408478410012 + }, + { + "step": 3500, + "valid accuracy": 0.48, + "train loss": 0.592242598772049, + "train samples": 14000, + "train time": 90.65281462905114, + "eval time": 7.1309342330059735, + "tokens / sec": 2313.7726154261322, + "mem allocated avg": 26065652588.544, + "mem reserved avg": 33100457312.256, + "elapsed time": 2137.073564691993 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.5925718579292297, + "train 
samples": 15000, + "train time": 91.80342563094746, + "eval time": 5.844810517999576, + "tokens / sec": 2360.5110431407275, + "mem allocated avg": 26075058659.328, + "mem reserved avg": 33131771985.92, + "elapsed time": 2287.0305021950044 + }, + { + "step": 4000, + "valid accuracy": 0.5, + "train loss": 0.6050453131198883, + "train samples": 16000, + "train time": 89.85742108603881, + "eval time": 5.86809825799719, + "tokens / sec": 2274.414261280792, + "mem allocated avg": 26058425257.984, + "mem reserved avg": 33098662150.144, + "elapsed time": 2435.1958582270017 + }, + { + "step": 4250, + "valid accuracy": 0.48, + "train loss": 0.5929686036109925, + "train samples": 17000, + "train time": 90.97368233802263, + "eval time": 5.8907580230006715, + "tokens / sec": 2323.6280489841133, + "mem allocated avg": 26067367372.8, + "mem reserved avg": 33099207409.664, + "elapsed time": 2584.8373482140014 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.6010294322967529, + "train samples": 18000, + "train time": 90.13679483698797, + "eval time": 6.106882603999111, + "tokens / sec": 2305.5845326632484, + "mem allocated avg": 26064599832.576, + "mem reserved avg": 33092253253.632, + "elapsed time": 2733.494644669001 + }, + { + "step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5936577550172806, + "train samples": 19000, + "train time": 90.74229130300228, + "eval time": 5.885364143003244, + "tokens / sec": 2313.5739354319567, + "mem allocated avg": 26065537388.544, + "mem reserved avg": 33100717359.104, + "elapsed time": 2882.6415541759925 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train loss": 0.5987544150352478, + "train samples": 20000, + "train time": 90.54863398504676, + "eval time": 5.88336711798911, + "tokens / sec": 2300.2003545895063, + "mem allocated avg": 26062803286.016, + "mem reserved avg": 33083126448.128, + "elapsed time": 3031.523533478001 + }, + { + "step": 5000, + "test accuracy": 0.5003790750568613, + "train loss": 
0.5987544150352478, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/ia3--llama-3.2-3B-default.json b/MetaMathQA/results/ia3--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..22ae90406a679eb47fbcc420ac99000c96249472 --- /dev/null +++ 
b/MetaMathQA/results/ia3--llama-3.2-3B-default.json @@ -0,0 +1,351 @@ +{ + "run_info": { + "created_at": "2025-06-19T21:59:33+00:00", + "total_time": 2004.8640038169979, + "experiment_name": "ia3/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12023227429, + "cuda_memory_max": 23137878016, + "cuda_memory_reserved_99th": 18398566154, + "train_time": 1782.9318781230104, + "file_size": 1157064, + "num_trainable_params": 286720, + "num_total_params": 3213036544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3155810165405273, + "train samples": 1000, + "train time": 30.56459548201383, + "eval time": 10.972947114001727, + "tokens / sec": 6926.936105684404, + "mem allocated avg": 6780994971.648, + "mem reserved avg": 12076433014.784, + "elapsed time": 90.53726772200025 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 
1.205229633808136, + "train samples": 2000, + "train time": 30.221456803970796, + "eval time": 10.954313254995213, + "tokens / sec": 6882.361805029583, + "mem allocated avg": 6773721065.472, + "mem reserved avg": 11963673346.048, + "elapsed time": 175.07058417100052 + }, + { + "step": 750, + "valid accuracy": 0.1, + "train loss": 1.0194582087993622, + "train samples": 3000, + "train time": 30.774312397006724, + "eval time": 10.944943730006344, + "tokens / sec": 6966.881899231445, + "mem allocated avg": 6784231882.752, + "mem reserved avg": 12126680776.704, + "elapsed time": 260.540154495 + }, + { + "step": 1000, + "valid accuracy": 0.24, + "train loss": 0.9196457831859589, + "train samples": 4000, + "train time": 30.61534244806535, + "eval time": 10.960088267995161, + "tokens / sec": 6804.95409624808, + "mem allocated avg": 6775492155.392, + "mem reserved avg": 11986893012.992, + "elapsed time": 345.30987053900026 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.8685842225551605, + "train samples": 5000, + "train time": 29.97266351111466, + "eval time": 10.924794500999269, + "tokens / sec": 6957.606551138459, + "mem allocated avg": 6775089207.296, + "mem reserved avg": 11983428517.888, + "elapsed time": 429.5542291879974 + }, + { + "step": 1500, + "valid accuracy": 0.32, + "train loss": 0.8332846148014068, + "train samples": 6000, + "train time": 29.98314001694962, + "eval time": 10.942266878999362, + "tokens / sec": 6981.6236685572, + "mem allocated avg": 6776724867.072, + "mem reserved avg": 12008594341.888, + "elapsed time": 513.8152235820016 + }, + { + "step": 1750, + "valid accuracy": 0.32, + "train loss": 0.8169269208908081, + "train samples": 7000, + "train time": 30.245623568014707, + "eval time": 10.940915298000618, + "tokens / sec": 6921.8278647558345, + "mem allocated avg": 6777912934.4, + "mem reserved avg": 12032065667.072, + "elapsed time": 598.2868188970024 + }, + { + "step": 2000, + "valid accuracy": 0.32, + "train loss": 
0.8072074156999588, + "train samples": 8000, + "train time": 30.292844633964705, + "eval time": 10.95617212200159, + "tokens / sec": 6856.272578875894, + "mem allocated avg": 6775099170.816, + "mem reserved avg": 11967473385.472, + "elapsed time": 682.7948923380027 + }, + { + "step": 2250, + "valid accuracy": 0.32, + "train loss": 0.7952859619855881, + "train samples": 9000, + "train time": 31.20892413101683, + "eval time": 10.942549917002907, + "tokens / sec": 6887.388975590319, + "mem allocated avg": 6786161477.632, + "mem reserved avg": 12167709458.432, + "elapsed time": 768.9645714229991 + }, + { + "step": 2500, + "valid accuracy": 0.28, + "train loss": 0.7890167078971863, + "train samples": 10000, + "train time": 30.187670495011844, + "eval time": 10.954304017002869, + "tokens / sec": 6822.884860692832, + "mem allocated avg": 6771082014.72, + "mem reserved avg": 11910984499.2, + "elapsed time": 853.427360558002 + }, + { + "step": 2750, + "valid accuracy": 0.3, + "train loss": 0.7823473591804504, + "train samples": 11000, + "train time": 30.410061570059042, + "eval time": 10.93302121299348, + "tokens / sec": 6967.4636965751015, + "mem allocated avg": 6782254225.408, + "mem reserved avg": 12090903363.584, + "elapsed time": 938.3584665200033 + }, + { + "step": 3000, + "valid accuracy": 0.24, + "train loss": 0.7709820411205291, + "train samples": 12000, + "train time": 30.02989622000314, + "eval time": 10.940404225999373, + "tokens / sec": 6950.773271769175, + "mem allocated avg": 6776725577.728, + "mem reserved avg": 12003133358.08, + "elapsed time": 1022.4627897890023 + }, + { + "step": 3250, + "valid accuracy": 0.3, + "train loss": 0.7755767168998718, + "train samples": 13000, + "train time": 30.172652364024543, + "eval time": 10.940153044000908, + "tokens / sec": 6989.806446431653, + "mem allocated avg": 6778589339.648, + "mem reserved avg": 12038298402.816, + "elapsed time": 1107.0076802080002 + }, + { + "step": 3500, + "valid accuracy": 0.34, + "train loss": 
0.7658302361965179, + "train samples": 14000, + "train time": 30.384311634006735, + "eval time": 10.941136569999799, + "tokens / sec": 6903.233567590308, + "mem allocated avg": 6777534660.608, + "mem reserved avg": 12020623605.76, + "elapsed time": 1191.893303306002 + }, + { + "step": 3750, + "valid accuracy": 0.34, + "train loss": 0.7585167481899261, + "train samples": 15000, + "train time": 31.250990667955193, + "eval time": 10.924158087997057, + "tokens / sec": 6934.276173913666, + "mem allocated avg": 6788426940.416, + "mem reserved avg": 12209652498.432, + "elapsed time": 1278.4574160839984 + }, + { + "step": 4000, + "valid accuracy": 0.26, + "train loss": 0.7766438691616059, + "train samples": 16000, + "train time": 30.222231689898763, + "eval time": 10.98030305699649, + "tokens / sec": 6762.339793335249, + "mem allocated avg": 6769563977.728, + "mem reserved avg": 11885533462.528, + "elapsed time": 1362.9405450319973 + }, + { + "step": 4250, + "valid accuracy": 0.34, + "train loss": 0.7542061095237732, + "train samples": 17000, + "train time": 30.273203028933494, + "eval time": 10.948997009996674, + "tokens / sec": 6982.710081849145, + "mem allocated avg": 6780103426.048, + "mem reserved avg": 12047483928.576, + "elapsed time": 1447.661586811002 + }, + { + "step": 4500, + "valid accuracy": 0.32, + "train loss": 0.7659628703594208, + "train samples": 18000, + "train time": 29.84466753601737, + "eval time": 10.942651322002348, + "tokens / sec": 6963.320993581165, + "mem allocated avg": 6775043430.4, + "mem reserved avg": 11968387743.744, + "elapsed time": 1531.5572027719973 + }, + { + "step": 4750, + "valid accuracy": 0.28, + "train loss": 0.7580052223205567, + "train samples": 19000, + "train time": 30.03731635398435, + "eval time": 10.927273799999966, + "tokens / sec": 6989.272860661278, + "mem allocated avg": 6776962899.968, + "mem reserved avg": 12017695981.568, + "elapsed time": 1615.9832882379997 + }, + { + "step": 5000, + "valid accuracy": 0.36, + 
"train loss": 0.7657463653087616, + "train samples": 20000, + "train time": 30.07570726004633, + "eval time": 10.953207714999735, + "tokens / sec": 6925.19042691597, + "mem allocated avg": 6774270615.552, + "mem reserved avg": 11958900228.096, + "elapsed time": 1700.4354192270039 + }, + { + "step": 5000, + "test accuracy": 0.34495830174374525, + "train loss": 0.7657463653087616, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json b/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json new file mode 100644 index 0000000000000000000000000000000000000000..51193628f701df4dd026f8d204d4a1ae5d8293a5 --- /dev/null +++ 
b/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-19T21:27:27+00:00", + "total_time": 1921.5641919770023, + "experiment_name": "ia3/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "k_proj", + "down_proj", + "v_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12023331867, + "cuda_memory_max": 23135780864, + "cuda_memory_reserved_99th": 18398356439, + "train_time": 1746.0246657649877, + "file_size": 1157064, + "num_trainable_params": 286720, + "num_total_params": 3213036544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.18, + "train loss": 1.1670710837841034, + "train samples": 1000, + "train time": 30.829080988976784, + "eval time": 10.962777282999014, + "tokens / sec": 6867.509286952213, + "mem allocated avg": 6781095491.584, + "mem reserved avg": 12075594153.984, + "elapsed time": 91.04478788100096 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.8285422480106354, + "train 
samples": 2000, + "train time": 30.237734625952726, + "eval time": 10.93798775599862, + "tokens / sec": 6878.656836331916, + "mem allocated avg": 6773575256.064, + "mem reserved avg": 11961039323.136, + "elapsed time": 175.57074494799963 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.7387537934780121, + "train samples": 3000, + "train time": 30.784141963005823, + "eval time": 10.918857135002327, + "tokens / sec": 6964.657330961239, + "mem allocated avg": 6784163356.672, + "mem reserved avg": 12124793339.904, + "elapsed time": 261.120397177001 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.7030822492837906, + "train samples": 4000, + "train time": 30.625773959025537, + "eval time": 6.545184372997028, + "tokens / sec": 6802.636246147914, + "mem allocated avg": 6775321157.632, + "mem reserved avg": 11986549080.064, + "elapsed time": 341.78445690100125 + }, + { + "step": 1250, + "valid accuracy": 0.34, + "train loss": 0.6953592277765274, + "train samples": 5000, + "train time": 30.090904191973095, + "eval time": 7.180137749001005, + "tokens / sec": 6930.266989305977, + "mem allocated avg": 6774968741.888, + "mem reserved avg": 11983218802.688, + "elapsed time": 422.45400445199994 + }, + { + "step": 1500, + "valid accuracy": 0.34, + "train loss": 0.6861299908161164, + "train samples": 6000, + "train time": 30.086008766014857, + "eval time": 10.923475695002708, + "tokens / sec": 6957.75241003254, + "mem allocated avg": 6776914077.696, + "mem reserved avg": 12007201832.96, + "elapsed time": 506.8615667560007 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.6775313948392868, + "train samples": 7000, + "train time": 30.329398032976314, + "eval time": 7.039293795001868, + "tokens / sec": 6902.708710946855, + "mem allocated avg": 6778176180.224, + "mem reserved avg": 12032417988.608, + "elapsed time": 587.730657346001 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6783386437892914, + "train 
samples": 8000, + "train time": 30.340071335995162, + "eval time": 8.14293124300093, + "tokens / sec": 6845.600252547578, + "mem allocated avg": 6775202904.064, + "mem reserved avg": 11967733432.32, + "elapsed time": 669.6239550099999 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6720720986127854, + "train samples": 9000, + "train time": 31.104124111985584, + "eval time": 7.4280358140022145, + "tokens / sec": 6910.594853149151, + "mem allocated avg": 6785809762.304, + "mem reserved avg": 12167885619.2, + "elapsed time": 752.2532132060005 + }, + { + "step": 2500, + "valid accuracy": 0.46, + "train loss": 0.6705386472940444, + "train samples": 10000, + "train time": 30.09476044199255, + "eval time": 7.5499184540021815, + "tokens / sec": 6843.948812850663, + "mem allocated avg": 6770963554.304, + "mem reserved avg": 11912058241.024, + "elapsed time": 833.2611769709983 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.6631126835346222, + "train samples": 11000, + "train time": 30.640666239018174, + "eval time": 10.92325482400338, + "tokens / sec": 6915.025879241109, + "mem allocated avg": 6781913962.496, + "mem reserved avg": 12090299383.808, + "elapsed time": 918.4276470139994 + }, + { + "step": 3000, + "valid accuracy": 0.38, + "train loss": 0.6557366658449173, + "train samples": 12000, + "train time": 30.612569437977072, + "eval time": 10.933225860997482, + "tokens / sec": 6818.473712992361, + "mem allocated avg": 6776591689.728, + "mem reserved avg": 12003032694.784, + "elapsed time": 1003.438990486 + }, + { + "step": 3250, + "valid accuracy": 0.44, + "train loss": 0.6655691808462143, + "train samples": 13000, + "train time": 30.508301533980557, + "eval time": 7.2082155700009025, + "tokens / sec": 6912.905320707402, + "mem allocated avg": 6778600480.768, + "mem reserved avg": 12040143896.576, + "elapsed time": 1084.7670880670012 + }, + { + "step": 3500, + "valid accuracy": 0.46, + "train loss": 0.6528272937536239, + "train 
samples": 14000, + "train time": 30.571383574966603, + "eval time": 7.452295711998886, + "tokens / sec": 6860.991406740058, + "mem allocated avg": 6777338779.648, + "mem reserved avg": 12021227585.536, + "elapsed time": 1166.2843480039992 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.6513591132164002, + "train samples": 15000, + "train time": 31.176262214954477, + "eval time": 6.50122426100279, + "tokens / sec": 6950.897400909496, + "mem allocated avg": 6788519866.368, + "mem reserved avg": 12209543446.528, + "elapsed time": 1248.1537826940003 + }, + { + "step": 4000, + "valid accuracy": 0.42, + "train loss": 0.6660103598833084, + "train samples": 16000, + "train time": 30.1621740100818, + "eval time": 10.007692241000768, + "tokens / sec": 6775.804686084222, + "mem allocated avg": 6769538811.904, + "mem reserved avg": 11886321991.68, + "elapsed time": 1331.4140659110017 + }, + { + "step": 4250, + "valid accuracy": 0.4, + "train loss": 0.648773505806923, + "train samples": 17000, + "train time": 30.627343150990782, + "eval time": 9.851157391000015, + "tokens / sec": 6901.969882201866, + "mem allocated avg": 6780366684.16, + "mem reserved avg": 12050411552.768, + "elapsed time": 1415.4855422520013 + }, + { + "step": 4500, + "valid accuracy": 0.42, + "train loss": 0.6574939725399017, + "train samples": 18000, + "train time": 30.04905582394713, + "eval time": 6.792122120001295, + "tokens / sec": 6915.957733167199, + "mem allocated avg": 6775072815.104, + "mem reserved avg": 11969042055.168, + "elapsed time": 1495.5897211369993 + }, + { + "step": 4750, + "valid accuracy": 0.4, + "train loss": 0.6505398267507553, + "train samples": 19000, + "train time": 30.326544256924535, + "eval time": 7.6139581239986, + "tokens / sec": 6922.615324100572, + "mem allocated avg": 6777039572.992, + "mem reserved avg": 12019256262.656, + "elapsed time": 1577.114852814997 + }, + { + "step": 5000, + "valid accuracy": 0.42, + "train loss": 0.6568749620914459, + "train 
samples": 20000, + "train time": 30.342653310064634, + "eval time": 6.5661308569979155, + "tokens / sec": 6864.264567492972, + "mem allocated avg": 6774530805.76, + "mem reserved avg": 11958866673.664, + "elapsed time": 1657.5746541439985 + }, + { + "step": 5000, + "test accuracy": 0.41243366186504926, + "train loss": 0.6568749620914459, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json b/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..70ba76cf1ad0f78691ed996c383e12eb4fb7ee3d --- /dev/null +++ 
b/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json @@ -0,0 +1,346 @@ +{ + "run_info": { + "created_at": "2025-06-20T11:06:05+00:00", + "total_time": 1870.2496634349955, + "experiment_name": "ln_tuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LN_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "input_layernorm", + "norm", + "post_attention_layernorm" + ], + "exclude_modules": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11385589622, + "cuda_memory_max": 21177040896, + "cuda_memory_reserved_99th": 16903066091, + "train_time": 1657.2412179829698, + "file_size": 358288, + "num_trainable_params": 175104, + "num_total_params": 3212924928, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3265725662708283, + "train samples": 1000, + "train time": 27.216289801202947, + "eval time": 10.492610957997385, + "tokens / sec": 7779.127924726981, + "mem allocated avg": 6780187711.488, + "mem reserved avg": 11433404268.544, + "elapsed time": 87.52968039299594 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 1.3411514971256255, + "train samples": 2000, + "train time": 
26.650248568999814, + "eval time": 10.469055254012346, + "tokens / sec": 7804.617636547848, + "mem allocated avg": 6772587255.808, + "mem reserved avg": 11331533012.992, + "elapsed time": 165.70980707599665 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.2789560747146607, + "train samples": 3000, + "train time": 27.327283490114496, + "eval time": 10.448594682005933, + "tokens / sec": 7845.6755526965735, + "mem allocated avg": 6783227031.552, + "mem reserved avg": 11478560145.408, + "elapsed time": 245.043655957008 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.263298665046692, + "train samples": 4000, + "train time": 26.423721938888775, + "eval time": 10.48604148600134, + "tokens / sec": 7884.430531089723, + "mem allocated avg": 6773840521.216, + "mem reserved avg": 11353729269.76, + "elapsed time": 322.872066240001 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.2484543447494507, + "train samples": 5000, + "train time": 26.471019316944876, + "eval time": 10.449013539997395, + "tokens / sec": 7877.973926999808, + "mem allocated avg": 6774061203.456, + "mem reserved avg": 11347018383.36, + "elapsed time": 400.9208664790058 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.2315508608818053, + "train samples": 6000, + "train time": 26.742762298934394, + "eval time": 10.483759973009, + "tokens / sec": 7827.575837531978, + "mem allocated avg": 6775242500.096, + "mem reserved avg": 11381990490.112, + "elapsed time": 479.3449222920026 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 1.2309930021762847, + "train samples": 7000, + "train time": 26.920282723149285, + "eval time": 10.450218685000436, + "tokens / sec": 7776.849974163588, + "mem allocated avg": 6777141585.92, + "mem reserved avg": 11390496538.624, + "elapsed time": 558.0740258180012 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 1.2312077372074126, + "train samples": 8000, + "train time": 
26.66103798917902, + "eval time": 10.460793553007534, + "tokens / sec": 7790.244328982917, + "mem allocated avg": 6774060457.984, + "mem reserved avg": 11331650453.504, + "elapsed time": 636.1544087350048 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2250888612270354, + "train samples": 9000, + "train time": 27.347798306160257, + "eval time": 10.455188547988655, + "tokens / sec": 7859.791768011602, + "mem allocated avg": 6784883898.368, + "mem reserved avg": 11515327414.272, + "elapsed time": 715.6408126690076 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.237301394701004, + "train samples": 10000, + "train time": 26.43946731692995, + "eval time": 10.463690039992798, + "tokens / sec": 7790.134253881636, + "mem allocated avg": 6770695682.048, + "mem reserved avg": 11285622161.408, + "elapsed time": 793.5970988850022 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2318837890625, + "train samples": 11000, + "train time": 27.072892207099358, + "eval time": 10.45099154500349, + "tokens / sec": 7826.3156510643585, + "mem allocated avg": 6780353579.008, + "mem reserved avg": 11444477231.104, + "elapsed time": 872.6363771199976 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.2326687624454498, + "train samples": 12000, + "train time": 26.88052615702327, + "eval time": 10.466728950996185, + "tokens / sec": 7765.138181473555, + "mem allocated avg": 6776020297.728, + "mem reserved avg": 11370128998.4, + "elapsed time": 951.0341247300094 + }, + { + "step": 3250, + "valid accuracy": 0.0, + "train loss": 1.2315667741298675, + "train samples": 13000, + "train time": 26.58970486979524, + "eval time": 10.440216451999731, + "tokens / sec": 7931.678859646707, + "mem allocated avg": 6777846503.424, + "mem reserved avg": 11400059551.744, + "elapsed time": 1029.1816804429982 + }, + { + "step": 3500, + "valid accuracy": 0.0, + "train loss": 1.232551732301712, + "train samples": 14000, + "train time": 
26.459182894948754, + "eval time": 10.444537474992103, + "tokens / sec": 7927.304514004579, + "mem allocated avg": 6776805982.208, + "mem reserved avg": 11380858028.032, + "elapsed time": 1107.2912037770002 + }, + { + "step": 3750, + "valid accuracy": 0.0, + "train loss": 1.2162783181667327, + "train samples": 15000, + "train time": 27.070398101161118, + "eval time": 10.439593077011523, + "tokens / sec": 8005.164873829656, + "mem allocated avg": 6786829993.984, + "mem reserved avg": 11549276110.848, + "elapsed time": 1186.5560989549995 + }, + { + "step": 4000, + "valid accuracy": 0.0, + "train loss": 1.2475486118793488, + "train samples": 16000, + "train time": 26.172411711973837, + "eval time": 10.464052501003607, + "tokens / sec": 7808.71867098513, + "mem allocated avg": 6768875591.68, + "mem reserved avg": 11260808658.944, + "elapsed time": 1263.9761855469987 + }, + { + "step": 4250, + "valid accuracy": 0.0, + "train loss": 1.2161538779735566, + "train samples": 17000, + "train time": 26.80681787095091, + "eval time": 10.449379954006872, + "tokens / sec": 7885.643160543526, + "mem allocated avg": 6779425828.864, + "mem reserved avg": 11415653974.016, + "elapsed time": 1342.6529225870036 + }, + { + "step": 4500, + "valid accuracy": 0.0, + "train loss": 1.2418145356178283, + "train samples": 18000, + "train time": 26.542597533072694, + "eval time": 10.46835913900577, + "tokens / sec": 7829.602952049208, + "mem allocated avg": 6773693413.376, + "mem reserved avg": 11331625287.68, + "elapsed time": 1420.7107766840054 + }, + { + "step": 4750, + "valid accuracy": 0.0, + "train loss": 1.2255646660327912, + "train samples": 19000, + "train time": 26.923357297797338, + "eval time": 10.45829652599059, + "tokens / sec": 7797.653081593045, + "mem allocated avg": 6775938277.376, + "mem reserved avg": 11381587836.928, + "elapsed time": 1499.7020156110084 + }, + { + "step": 5000, + "valid accuracy": 0.0, + "train loss": 1.2370348122119903, + "train samples": 20000, + "train 
time": 26.415459764844854, + "eval time": 10.446229163004318, + "tokens / sec": 7884.776636641793, + "mem allocated avg": 6773129859.072, + "mem reserved avg": 11327984631.808, + "elapsed time": 1577.772203030996 + }, + { + "step": 5000, + "test accuracy": 0.0, + "train loss": 1.2370348122119903, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/loha--llama-3.2-3B-rank32.json b/MetaMathQA/results/loha--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..f9a3ad7896e958147a554956cd68c2ce7649817f --- /dev/null +++ 
b/MetaMathQA/results/loha--llama-3.2-3B-rank32.json @@ -0,0 +1,355 @@ +{ + "run_info": { + "created_at": "2025-06-19T16:12:05+00:00", + "total_time": 2590.9341236870005, + "experiment_name": "loha/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LOHA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "rank_pattern": {}, + "alpha_pattern": {}, + "r": 32, + "alpha": 64, + "rank_dropout": 0.0, + "module_dropout": 0.0, + "use_effective_conv2d": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 13446820344, + "cuda_memory_max": 23886561280, + "cuda_memory_reserved_99th": 19247870771, + "train_time": 2340.7451966560056, + "file_size": 73429560, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.2914833688735963, + "train samples": 1000, + "train time": 47.4107696449737, + "eval time": 14.298813604000316, + "tokens / sec": 4465.630943885038, + "mem allocated avg": 7073903032.32, + "mem reserved avg": 
13501707845.632, + "elapsed time": 120.40146815400112 + }, + { + "step": 500, + "valid accuracy": 0.36, + "train loss": 0.9051123185157776, + "train samples": 2000, + "train time": 47.1910586300055, + "eval time": 14.155041256999539, + "tokens / sec": 4407.508668766131, + "mem allocated avg": 7065529796.608, + "mem reserved avg": 13391154380.8, + "elapsed time": 234.63223427900084 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.7515897085666656, + "train samples": 3000, + "train time": 48.203471163995346, + "eval time": 14.26281827999992, + "tokens / sec": 4447.8332124791605, + "mem allocated avg": 7076336949.248, + "mem reserved avg": 13550454046.72, + "elapsed time": 350.3563782780002 + }, + { + "step": 1000, + "valid accuracy": 0.4, + "train loss": 0.7082941273450851, + "train samples": 4000, + "train time": 47.1758063940124, + "eval time": 14.148222272999192, + "tokens / sec": 4416.161925457669, + "mem allocated avg": 7067704416.256, + "mem reserved avg": 13415607173.12, + "elapsed time": 464.4216197680016 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.6994056793451309, + "train samples": 5000, + "train time": 47.32811543400385, + "eval time": 14.265782994998517, + "tokens / sec": 4406.218123998481, + "mem allocated avg": 7067674988.544, + "mem reserved avg": 13411052158.976, + "elapsed time": 578.864341718001 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.6889224811792374, + "train samples": 6000, + "train time": 47.48961014100678, + "eval time": 9.485757000999001, + "tokens / sec": 4407.932585221307, + "mem allocated avg": 7068496666.624, + "mem reserved avg": 13434196328.448, + "elapsed time": 688.8339757740014 + }, + { + "step": 1750, + "valid accuracy": 0.36, + "train loss": 0.6795901688337326, + "train samples": 7000, + "train time": 47.5112849769921, + "eval time": 8.524607335999463, + "tokens / sec": 4406.426812101222, + "mem allocated avg": 7070726457.344, + "mem reserved avg": 
13451493638.144, + "elapsed time": 797.8088079910012 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.680127969622612, + "train samples": 8000, + "train time": 47.15311444799954, + "eval time": 14.09636382700046, + "tokens / sec": 4404.714353047605, + "mem allocated avg": 7067623004.16, + "mem reserved avg": 13389174669.312, + "elapsed time": 911.8939753530012 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6731046036481857, + "train samples": 9000, + "train time": 48.44002798400652, + "eval time": 14.30888277199847, + "tokens / sec": 4437.404538060332, + "mem allocated avg": 7078766321.664, + "mem reserved avg": 13582146207.744, + "elapsed time": 1028.5093922580018 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6711453741788864, + "train samples": 10000, + "train time": 46.86391301901131, + "eval time": 8.751619284999833, + "tokens / sec": 4395.002182520381, + "mem allocated avg": 7063469082.624, + "mem reserved avg": 13336376770.56, + "elapsed time": 1137.0856770440005 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.6645345565080643, + "train samples": 11000, + "train time": 47.92562343400823, + "eval time": 7.835686906000774, + "tokens / sec": 4421.037950434847, + "mem allocated avg": 7074535438.336, + "mem reserved avg": 13512352989.184, + "elapsed time": 1246.1237790820014 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6566170369386672, + "train samples": 12000, + "train time": 47.50991778100797, + "eval time": 14.152554526999666, + "tokens / sec": 4393.419516365485, + "mem allocated avg": 7068629661.696, + "mem reserved avg": 13428215250.944, + "elapsed time": 1360.8028930970013 + }, + { + "step": 3250, + "valid accuracy": 0.42, + "train loss": 0.6667062133550644, + "train samples": 13000, + "train time": 47.62723316902702, + "eval time": 14.332656014001259, + "tokens / sec": 4428.1598146069355, + "mem allocated avg": 7071043653.632, + "mem reserved avg": 
13457114005.504, + "elapsed time": 1476.0946507730005 + }, + { + "step": 3500, + "valid accuracy": 0.42, + "train loss": 0.6537795497179031, + "train samples": 14000, + "train time": 47.07006615899445, + "eval time": 14.135684340000807, + "tokens / sec": 4456.12290604184, + "mem allocated avg": 7069669969.92, + "mem reserved avg": 13439749586.944, + "elapsed time": 1590.4238928290015 + }, + { + "step": 3750, + "valid accuracy": 0.46, + "train loss": 0.6509792991876602, + "train samples": 15000, + "train time": 48.58318820000204, + "eval time": 14.298812560000442, + "tokens / sec": 4460.452432802484, + "mem allocated avg": 7081669246.976, + "mem reserved avg": 13624240242.688, + "elapsed time": 1707.3096692510007 + }, + { + "step": 4000, + "valid accuracy": 0.46, + "train loss": 0.6675102390050888, + "train samples": 16000, + "train time": 46.83876558602242, + "eval time": 14.188353157000165, + "tokens / sec": 4363.330191199334, + "mem allocated avg": 7062227976.192, + "mem reserved avg": 13316957143.04, + "elapsed time": 1821.413719397 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6494157313108444, + "train samples": 17000, + "train time": 46.9989987980116, + "eval time": 8.258924301999286, + "tokens / sec": 4497.7341093688, + "mem allocated avg": 7072862310.4, + "mem reserved avg": 13470619664.384, + "elapsed time": 1930.0706906220003 + }, + { + "step": 4500, + "valid accuracy": 0.44, + "train loss": 0.6580193819999695, + "train samples": 18000, + "train time": 47.171681194990015, + "eval time": 9.717189478000364, + "tokens / sec": 4405.566957449713, + "mem allocated avg": 7068038127.616, + "mem reserved avg": 13393654185.984, + "elapsed time": 2040.1967968460012 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.6511869616508484, + "train samples": 19000, + "train time": 47.517527918005726, + "eval time": 14.28858694399969, + "tokens / sec": 4418.138089217562, + "mem allocated avg": 7069871403.008, + "mem reserved avg": 
13443927113.728, + "elapsed time": 2155.4679406510004 + }, + { + "step": 5000, + "valid accuracy": 0.46, + "train loss": 0.6569721374511719, + "train samples": 20000, + "train time": 46.99870921700858, + "eval time": 9.378413720998651, + "tokens / sec": 4431.6110691104805, + "mem allocated avg": 7066192863.232, + "mem reserved avg": 13386213490.688, + "elapsed time": 2265.1425104650007 + }, + { + "step": 5000, + "test accuracy": 0.4184988627748294, + "train loss": 0.6569721374511719, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json b/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..7ab43febfdae477412f1a945e23b064b703d62f6 --- /dev/null +++ 
b/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json @@ -0,0 +1,358 @@ +{ + "run_info": { + "created_at": "2025-06-19T22:33:02+00:00", + "total_time": 2351.995087948999, + "experiment_name": "lokr/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LOKR", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "rank_pattern": {}, + "alpha_pattern": {}, + "r": 32, + "alpha": 64, + "rank_dropout": 0.0, + "module_dropout": 0.0, + "use_effective_conv2d": false, + "decompose_both": false, + "decompose_factor": -1, + "rank_dropout_scale": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 13173683073, + "cuda_memory_max": 23565697024, + "cuda_memory_reserved_99th": 18987698094, + "train_time": 2152.0406475960117, + "file_size": 1131984, + "num_trainable_params": 279552, + "num_total_params": 3213029376, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.2610720434188842, + "train samples": 1000, + "train time": 43.70352009194903, + "eval time": 12.492729608995432, + "tokens / sec": 
4844.438149479918, + "mem allocated avg": 6786119440.384, + "mem reserved avg": 13227744296.96, + "elapsed time": 111.06800683099573 + }, + { + "step": 500, + "valid accuracy": 0.32, + "train loss": 0.9418708410263061, + "train samples": 2000, + "train time": 42.27786245904281, + "eval time": 12.404362346002017, + "tokens / sec": 4919.714193249426, + "mem allocated avg": 6777965645.824, + "mem reserved avg": 13119581585.408, + "elapsed time": 215.0703402069994 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.7932645809650422, + "train samples": 3000, + "train time": 42.92248660406767, + "eval time": 12.39727676199982, + "tokens / sec": 4995.0740733571965, + "mem allocated avg": 6788190218.24, + "mem reserved avg": 13280533807.104, + "elapsed time": 319.9520932419982 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.7486661098003388, + "train samples": 4000, + "train time": 42.81222543502372, + "eval time": 12.462298920996545, + "tokens / sec": 4866.273544135012, + "mem allocated avg": 6778714585.088, + "mem reserved avg": 13136702734.336, + "elapsed time": 424.5020112219936 + }, + { + "step": 1250, + "valid accuracy": 0.3, + "train loss": 0.7329869548082352, + "train samples": 5000, + "train time": 42.917129570938414, + "eval time": 12.430814264000219, + "tokens / sec": 4859.085453403965, + "mem allocated avg": 6779904688.128, + "mem reserved avg": 13134379089.92, + "elapsed time": 529.3957975729936 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.7207228287458419, + "train samples": 6000, + "train time": 43.03383123301319, + "eval time": 12.441326129999652, + "tokens / sec": 4864.335663412017, + "mem allocated avg": 6779916724.224, + "mem reserved avg": 13160828370.944, + "elapsed time": 634.2579850239999 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.7103905143737793, + "train samples": 7000, + "train time": 42.76188673896104, + "eval time": 12.393813144997694, + "tokens / sec": 
4895.831684836612, + "mem allocated avg": 6782196824.064, + "mem reserved avg": 13176313741.312, + "elapsed time": 738.7873818459993 + }, + { + "step": 2000, + "valid accuracy": 0.34, + "train loss": 0.709170572757721, + "train samples": 8000, + "train time": 42.39193291300762, + "eval time": 12.418300639998051, + "tokens / sec": 4899.422737486692, + "mem allocated avg": 6778828410.88, + "mem reserved avg": 13120303005.696, + "elapsed time": 842.8626621349977 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.7017016235589981, + "train samples": 9000, + "train time": 43.90131158899749, + "eval time": 12.417865242998232, + "tokens / sec": 4896.163513571884, + "mem allocated avg": 6790482182.144, + "mem reserved avg": 13307486404.608, + "elapsed time": 949.0399838449957 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.6999357705116271, + "train samples": 10000, + "train time": 41.90174934701645, + "eval time": 7.302261034004914, + "tokens / sec": 4915.474967268057, + "mem allocated avg": 6775840593.92, + "mem reserved avg": 13059930193.92, + "elapsed time": 1047.236226049994 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.694103113770485, + "train samples": 11000, + "train time": 43.541668150042824, + "eval time": 12.415209386999777, + "tokens / sec": 4866.166341396629, + "mem allocated avg": 6786276190.208, + "mem reserved avg": 13245360373.76, + "elapsed time": 1152.8413292650002 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.686756227850914, + "train samples": 12000, + "train time": 43.03442109594471, + "eval time": 7.144659414996568, + "tokens / sec": 4850.32666140987, + "mem allocated avg": 6781073500.16, + "mem reserved avg": 13155426107.392, + "elapsed time": 1252.4253450399992 + }, + { + "step": 3250, + "valid accuracy": 0.38, + "train loss": 0.6960614495277405, + "train samples": 13000, + "train time": 43.27108911598771, + "eval time": 7.294012983998982, + "tokens / sec": 
4873.947115929577, + "mem allocated avg": 6783027929.088, + "mem reserved avg": 13189282529.28, + "elapsed time": 1352.4968370129936 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6833453825712203, + "train samples": 14000, + "train time": 43.27389094301907, + "eval time": 7.8778488079988165, + "tokens / sec": 4847.03352134867, + "mem allocated avg": 6781185138.688, + "mem reserved avg": 13163252678.656, + "elapsed time": 1453.3269581979985 + }, + { + "step": 3750, + "valid accuracy": 0.38, + "train loss": 0.6804633007049561, + "train samples": 15000, + "train time": 44.034181773953605, + "eval time": 7.058443625996006, + "tokens / sec": 4921.245070759568, + "mem allocated avg": 6792035817.472, + "mem reserved avg": 13346417934.336, + "elapsed time": 1554.5450010249988 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.6990108703374862, + "train samples": 16000, + "train time": 42.4217994650171, + "eval time": 7.152937473998463, + "tokens / sec": 4817.6409906547, + "mem allocated avg": 6773882982.4, + "mem reserved avg": 13037876543.488, + "elapsed time": 1653.5999729619944 + }, + { + "step": 4250, + "valid accuracy": 0.4, + "train loss": 0.6789947774410248, + "train samples": 17000, + "train time": 43.347477565999725, + "eval time": 7.062385851000727, + "tokens / sec": 4876.615938681662, + "mem allocated avg": 6784096524.288, + "mem reserved avg": 13200808476.672, + "elapsed time": 1754.0244643159967 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.6891120710372924, + "train samples": 18000, + "train time": 42.82309688109672, + "eval time": 7.127946433000034, + "tokens / sec": 4852.941873331364, + "mem allocated avg": 6779667169.28, + "mem reserved avg": 13117727703.04, + "elapsed time": 1853.5914762979955 + }, + { + "step": 4750, + "valid accuracy": 0.38, + "train loss": 0.6815101335048676, + "train samples": 19000, + "train time": 43.26974187397718, + "eval time": 7.069867040001554, + "tokens / sec": 
4851.866244347975, + "mem allocated avg": 6780809029.632, + "mem reserved avg": 13175080615.936, + "elapsed time": 1954.106976348994 + }, + { + "step": 5000, + "valid accuracy": 0.36, + "train loss": 0.6876721383333206, + "train samples": 20000, + "train time": 43.18727576800302, + "eval time": 7.149106363998726, + "tokens / sec": 4822.716790909798, + "mem allocated avg": 6778227302.4, + "mem reserved avg": 13118625284.096, + "elapsed time": 2054.7161079569996 + }, + { + "step": 5000, + "test accuracy": 0.3752843062926459, + "train loss": 0.6876721383333206, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json b/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json new file mode 100644 index 0000000000000000000000000000000000000000..ab8251af3e09bc8f85703ffc7af3761154d279b7 --- /dev/null +++ 
b/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T18:37:24+00:00", + "total_time": 2286.5437473089987, + "experiment_name": "lora/llama-3.2-3B-rank32-dora", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": true, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12490471636, + "cuda_memory_max": 24553455616, + "cuda_memory_reserved_99th": 19189150515, + "train_time": 2022.7454924520134, + "file_size": 37181760, + "num_trainable_params": 9289728, + "num_total_params": 3222039552, + "status": "success", + "metrics": [ + { + 
"step": 250, + "valid accuracy": 0.36, + "train loss": 0.9800839998722076, + "train samples": 1000, + "train time": 35.42731901501611, + "eval time": 16.70931195599769, + "tokens / sec": 5976.150775345474, + "mem allocated avg": 6924859500.544, + "mem reserved avg": 12552201306.112, + "elapsed time": 105.33911871900273 + }, + { + "step": 500, + "valid accuracy": 0.44, + "train loss": 0.7162023800611496, + "train samples": 2000, + "train time": 35.53461015297944, + "eval time": 16.7331051809997, + "tokens / sec": 5853.307496678993, + "mem allocated avg": 6917484427.264, + "mem reserved avg": 12427118772.224, + "elapsed time": 204.02196035100133 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6790966511964798, + "train samples": 3000, + "train time": 35.395415813978616, + "eval time": 10.35499690800134, + "tokens / sec": 6057.309825848329, + "mem allocated avg": 6927996166.144, + "mem reserved avg": 12609050902.528, + "elapsed time": 296.3724143870022 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6590274780988693, + "train samples": 4000, + "train time": 35.01134122798976, + "eval time": 16.638093278997985, + "tokens / sec": 5950.528962696411, + "mem allocated avg": 6919690883.072, + "mem reserved avg": 12464313860.096, + "elapsed time": 394.33112582000103 + }, + { + "step": 1250, + "valid accuracy": 0.42, + "train loss": 0.6542477097511291, + "train samples": 5000, + "train time": 34.85555366096378, + "eval time": 16.627405782997812, + "tokens / sec": 5982.920312453697, + "mem allocated avg": 6919055253.504, + "mem reserved avg": 12449952563.2, + "elapsed time": 492.1167898590029 + }, + { + "step": 1500, + "valid accuracy": 0.4, + "train loss": 0.6471435966491699, + "train samples": 6000, + "train time": 35.407848127983016, + "eval time": 10.318167828998412, + "tokens / sec": 5911.994404273457, + "mem allocated avg": 6921185224.704, + "mem reserved avg": 12477500751.872, + "elapsed time": 584.325913470002 + }, + { + "step": 
1750, + "valid accuracy": 0.42, + "train loss": 0.6376023133993148, + "train samples": 7000, + "train time": 35.61810469696138, + "eval time": 10.057756549002079, + "tokens / sec": 5877.7692350896, + "mem allocated avg": 6922196224.0, + "mem reserved avg": 12495888580.608, + "elapsed time": 676.5556904380028 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6404745506048203, + "train samples": 8000, + "train time": 35.01814225999988, + "eval time": 10.846777078000741, + "tokens / sec": 5931.097042724754, + "mem allocated avg": 6919877345.28, + "mem reserved avg": 12428771328.0, + "elapsed time": 768.7593245980024 + }, + { + "step": 2250, + "valid accuracy": 0.48, + "train loss": 0.6327905882596969, + "train samples": 9000, + "train time": 35.941867801058834, + "eval time": 16.654083295998134, + "tokens / sec": 5980.434884178939, + "mem allocated avg": 6930785019.904, + "mem reserved avg": 12637135962.112, + "elapsed time": 868.0876048490027 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6293514591455459, + "train samples": 10000, + "train time": 35.19044898093853, + "eval time": 16.654415837998386, + "tokens / sec": 5852.923334725435, + "mem allocated avg": 6914962546.688, + "mem reserved avg": 12361175924.736, + "elapsed time": 965.9673830700012 + }, + { + "step": 2750, + "valid accuracy": 0.34, + "train loss": 0.6212090995311738, + "train samples": 11000, + "train time": 35.78923041201051, + "eval time": 12.364532577001228, + "tokens / sec": 5920.2446535116005, + "mem allocated avg": 6926067247.104, + "mem reserved avg": 12561110007.808, + "elapsed time": 1060.7434992320013 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.6132309092283249, + "train samples": 12000, + "train time": 35.434680095979274, + "eval time": 10.902270734000922, + "tokens / sec": 5890.585139604081, + "mem allocated avg": 6921261266.944, + "mem reserved avg": 12472811520.0, + "elapsed time": 1153.3681941970026 + }, + { + "step": 
3250, + "valid accuracy": 0.5, + "train loss": 0.6223928620815277, + "train samples": 13000, + "train time": 35.475069620017166, + "eval time": 9.885322058999009, + "tokens / sec": 5945.048234126565, + "mem allocated avg": 6922737405.952, + "mem reserved avg": 12498002509.824, + "elapsed time": 1245.241280964001 + }, + { + "step": 3500, + "valid accuracy": 0.5, + "train loss": 0.605602259516716, + "train samples": 14000, + "train time": 35.607162244014035, + "eval time": 10.090815307001321, + "tokens / sec": 5890.668808780496, + "mem allocated avg": 6920974434.304, + "mem reserved avg": 12474329858.048, + "elapsed time": 1337.4724736530006 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.6031041693687439, + "train samples": 15000, + "train time": 36.209776319014054, + "eval time": 10.371932055000798, + "tokens / sec": 5984.6544781390285, + "mem allocated avg": 6933558140.928, + "mem reserved avg": 12681738190.848, + "elapsed time": 1431.2058649130013 + }, + { + "step": 4000, + "valid accuracy": 0.46, + "train loss": 0.6162525477409363, + "train samples": 16000, + "train time": 35.48366187599095, + "eval time": 12.394127589999698, + "tokens / sec": 5759.636666425441, + "mem allocated avg": 6914222096.384, + "mem reserved avg": 12349406707.712, + "elapsed time": 1525.3134414390006 + }, + { + "step": 4250, + "valid accuracy": 0.5, + "train loss": 0.6013483003377914, + "train samples": 17000, + "train time": 35.15769277801883, + "eval time": 16.63699178299794, + "tokens / sec": 6012.59591562743, + "mem allocated avg": 6924507731.968, + "mem reserved avg": 12521616441.344, + "elapsed time": 1623.6120678540028 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.6073888168334961, + "train samples": 18000, + "train time": 34.98748015804085, + "eval time": 12.561758541996824, + "tokens / sec": 5939.781860861995, + "mem allocated avg": 6918951696.384, + "mem reserved avg": 12432495869.952, + "elapsed time": 1717.352138276001 + }, + { + 
"step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5993685643672944, + "train samples": 19000, + "train time": 35.57701125005042, + "eval time": 13.379837485997996, + "tokens / sec": 5900.973483254653, + "mem allocated avg": 6921678901.248, + "mem reserved avg": 12490880581.632, + "elapsed time": 1812.886111721 + }, + { + "step": 5000, + "valid accuracy": 0.48, + "train loss": 0.6068210340738297, + "train samples": 20000, + "train time": 35.678432397002325, + "eval time": 10.087769599998865, + "tokens / sec": 5837.700425916121, + "mem allocated avg": 6918288025.6, + "mem reserved avg": 12423931101.184, + "elapsed time": 1905.221841073002 + }, + { + "step": 5000, + "test accuracy": 0.4806671721000758, + "train loss": 0.6068210340738297, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture 
applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json b/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json new file mode 
100644 index 0000000000000000000000000000000000000000..e95ab18d7e96829c53c3234d2af29a3bea7af97a --- /dev/null +++ b/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json @@ -0,0 +1,367 @@ +{ + "run_info": { + "created_at": "2025-06-19T17:29:01+00:00", + "total_time": 2025.9028512089972, + "experiment_name": "lora/llama-3.2-3B-rank32-lorafa", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "lora-fa", + "optimizer_kwargs": { + "r": 32, + "lora_alpha": 64, + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11106307276, + "cuda_memory_max": 20187185152, + "cuda_memory_reserved_99th": 16257394933, + "train_time": 
1821.1390361119993, + "file_size": 36715216, + "num_trainable_params": 3670016, + "num_total_params": 3221924864, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.26, + "train loss": 1.13827001953125, + "train samples": 1000, + "train time": 39.487167649953335, + "eval time": 11.352047874999698, + "tokens / sec": 5361.716542367662, + "mem allocated avg": 6857574733.824, + "mem reserved avg": 11147042357.248, + "elapsed time": 95.33382818899918 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.8058450784683228, + "train samples": 2000, + "train time": 38.91575912100234, + "eval time": 11.306344865999563, + "tokens / sec": 5344.749908469542, + "mem allocated avg": 6850229934.08, + "mem reserved avg": 11051613552.64, + "elapsed time": 184.45338391399855 + }, + { + "step": 750, + "valid accuracy": 0.4, + "train loss": 0.725865609407425, + "train samples": 3000, + "train time": 39.53630301699741, + "eval time": 9.965407437997783, + "tokens / sec": 5422.889436774727, + "mem allocated avg": 6861271248.896, + "mem reserved avg": 11192013684.736, + "elapsed time": 273.0429774479999 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.69585602581501, + "train samples": 4000, + "train time": 38.42195282199464, + "eval time": 11.263002069001232, + "tokens / sec": 5422.316792829388, + "mem allocated avg": 6851626665.984, + "mem reserved avg": 11074279571.456, + "elapsed time": 361.40852819099746 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6884716705083848, + "train samples": 5000, + "train time": 38.42177955799343, + "eval time": 11.356440440998995, + "tokens / sec": 5427.598679682052, + "mem allocated avg": 6851712622.592, + "mem reserved avg": 11075865018.368, + "elapsed time": 449.83568274799836 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.6801862429380416, + "train samples": 6000, + "train time": 38.768619330003276, + "eval time": 11.348457601998234, + "tokens / 
sec": 5399.495871084515, + "mem allocated avg": 6853521098.752, + "mem reserved avg": 11096006066.176, + "elapsed time": 538.7312806489972 + }, + { + "step": 1750, + "valid accuracy": 0.38, + "train loss": 0.6713097202777862, + "train samples": 7000, + "train time": 38.99274470796445, + "eval time": 8.222045223003079, + "tokens / sec": 5369.0757490389815, + "mem allocated avg": 6854799144.96, + "mem reserved avg": 11113831858.176, + "elapsed time": 624.957832287997 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6733613710403442, + "train samples": 8000, + "train time": 38.96502619797684, + "eval time": 9.028824541001086, + "tokens / sec": 5330.318500101101, + "mem allocated avg": 6852199981.056, + "mem reserved avg": 11058584485.888, + "elapsed time": 711.7122244169987 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.6658626307249069, + "train samples": 9000, + "train time": 39.83998639498168, + "eval time": 11.38518134900005, + "tokens / sec": 5395.282966940854, + "mem allocated avg": 6862685554.688, + "mem reserved avg": 11223865229.312, + "elapsed time": 802.4390404449987 + }, + { + "step": 2500, + "valid accuracy": 0.38, + "train loss": 0.6645791643857956, + "train samples": 10000, + "train time": 38.493957691986, + "eval time": 11.311897349998617, + "tokens / sec": 5350.631952372099, + "mem allocated avg": 6848127772.672, + "mem reserved avg": 11012925292.544, + "elapsed time": 890.7464078919984 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.658472005367279, + "train samples": 11000, + "train time": 38.51331885699619, + "eval time": 7.521690310000849, + "tokens / sec": 5501.499384842303, + "mem allocated avg": 6858912532.48, + "mem reserved avg": 11161915359.232, + "elapsed time": 975.6010923279973 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6503657740354538, + "train samples": 12000, + "train time": 38.378428091957176, + "eval time": 9.959380172000238, + "tokens / sec": 
5438.758447841249, + "mem allocated avg": 6853735892.992, + "mem reserved avg": 11091962757.12, + "elapsed time": 1062.3718837759989 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.6599743469953537, + "train samples": 13000, + "train time": 38.74303203701493, + "eval time": 9.720565422001528, + "tokens / sec": 5443.585308411229, + "mem allocated avg": 6855708461.056, + "mem reserved avg": 11117246021.632, + "elapsed time": 1149.5592005079998 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6468936309814454, + "train samples": 14000, + "train time": 38.947772975978296, + "eval time": 10.49309463499958, + "tokens / sec": 5385.417033455723, + "mem allocated avg": 6854553325.568, + "mem reserved avg": 11102364631.04, + "elapsed time": 1237.83959684 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.6447412570714951, + "train samples": 15000, + "train time": 39.208677324022574, + "eval time": 11.265130790001422, + "tokens / sec": 5526.914315654032, + "mem allocated avg": 6864447199.232, + "mem reserved avg": 11258292076.544, + "elapsed time": 1327.6204509749987 + }, + { + "step": 4000, + "valid accuracy": 0.48, + "train loss": 0.6609537017345428, + "train samples": 16000, + "train time": 38.373366451996844, + "eval time": 8.435534727999766, + "tokens / sec": 5325.907495128434, + "mem allocated avg": 6846769313.792, + "mem reserved avg": 10994319360.0, + "elapsed time": 1412.8209538019983 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6430994077920914, + "train samples": 17000, + "train time": 38.840016363014, + "eval time": 8.356262703997345, + "tokens / sec": 5442.556924391474, + "mem allocated avg": 6857134465.024, + "mem reserved avg": 11130768457.728, + "elapsed time": 1498.7970963499974 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.6519971441030502, + "train samples": 18000, + "train time": 38.99225058195225, + "eval time": 9.193580140999984, + "tokens / sec": 
5329.725699295479, + "mem allocated avg": 6851737821.184, + "mem reserved avg": 11062996893.696, + "elapsed time": 1585.3292836179971 + }, + { + "step": 4750, + "valid accuracy": 0.42, + "train loss": 0.6448501836061478, + "train samples": 19000, + "train time": 39.31173135296194, + "eval time": 8.482506923999608, + "tokens / sec": 5340.365147366681, + "mem allocated avg": 6853984409.6, + "mem reserved avg": 11104352731.136, + "elapsed time": 1672.2648903240006 + }, + { + "step": 5000, + "valid accuracy": 0.46, + "train loss": 0.6509636770486832, + "train samples": 20000, + "train time": 38.96172000500519, + "eval time": 11.401191647000815, + "tokens / sec": 5345.759888763726, + "mem allocated avg": 6850959237.12, + "mem reserved avg": 11055900131.328, + "elapsed time": 1761.553419697997 + }, + { + "step": 5000, + "test accuracy": 0.42987111448066717, + "train loss": 0.6509636770486832, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built 
with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of 
file diff --git a/MetaMathQA/results/lora--llama-3.2-3B-rank32.json b/MetaMathQA/results/lora--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..b90d9048cbc5c9d65b0d97dbd80348fedc99d0cd --- /dev/null +++ b/MetaMathQA/results/lora--llama-3.2-3B-rank32.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T19:15:35+00:00", + "total_time": 1993.494420946001, + "experiment_name": "lora/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11868689976, + 
"cuda_memory_max": 22273851392, + "cuda_memory_reserved_99th": 17710763212, + "train_time": 1796.1857790169925, + "file_size": 36715216, + "num_trainable_params": 9175040, + "num_total_params": 3221924864, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.9827028260231018, + "train samples": 1000, + "train time": 31.395267726013117, + "eval time": 11.27943390099972, + "tokens / sec": 6743.659644748829, + "mem allocated avg": 6925580957.696, + "mem reserved avg": 11920245522.432, + "elapsed time": 94.68654379600048 + }, + { + "step": 500, + "valid accuracy": 0.44, + "train loss": 0.7164744178056717, + "train samples": 2000, + "train time": 30.728173206967767, + "eval time": 11.244831023999723, + "tokens / sec": 6768.869681873444, + "mem allocated avg": 6918363699.2, + "mem reserved avg": 11811654991.872, + "elapsed time": 182.6767855429971 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6791989279985428, + "train samples": 3000, + "train time": 31.248708018982143, + "eval time": 6.873092081001232, + "tokens / sec": 6861.115661798283, + "mem allocated avg": 6929003134.976, + "mem reserved avg": 11970174517.248, + "elapsed time": 267.2763524209986 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6590347054004669, + "train samples": 4000, + "train time": 31.016855426081747, + "eval time": 7.663122134003061, + "tokens / sec": 6716.864012746194, + "mem allocated avg": 6919503566.848, + "mem reserved avg": 11835008876.544, + "elapsed time": 351.92747904299904 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6547032891511917, + "train samples": 5000, + "train time": 30.914218463025463, + "eval time": 11.249955232000502, + "tokens / sec": 6745.698593332356, + "mem allocated avg": 6919763681.28, + "mem reserved avg": 11832551014.4, + "elapsed time": 440.29597954699784 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.647298491358757, + "train 
samples": 6000, + "train time": 31.093457819981268, + "eval time": 11.25276822899832, + "tokens / sec": 6732.316528188762, + "mem allocated avg": 6920362313.728, + "mem reserved avg": 11859000295.424, + "elapsed time": 529.2981231249978 + }, + { + "step": 1750, + "valid accuracy": 0.46, + "train loss": 0.6378061240911483, + "train samples": 7000, + "train time": 31.079548971014447, + "eval time": 11.2527706639994, + "tokens / sec": 6736.101614449091, + "mem allocated avg": 6922653980.672, + "mem reserved avg": 11870048092.16, + "elapsed time": 617.7172930779998 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.641120473742485, + "train samples": 8000, + "train time": 30.851661891996628, + "eval time": 7.384566520999215, + "tokens / sec": 6732.084667823985, + "mem allocated avg": 6919747647.488, + "mem reserved avg": 11816562327.552, + "elapsed time": 702.0775224069985 + }, + { + "step": 2250, + "valid accuracy": 0.46, + "train loss": 0.6332860335111618, + "train samples": 9000, + "train time": 31.288193090975255, + "eval time": 11.258606130999397, + "tokens / sec": 6869.939704571801, + "mem allocated avg": 6930711803.904, + "mem reserved avg": 12003997384.704, + "elapsed time": 791.1291831710005 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6298432033061981, + "train samples": 10000, + "train time": 30.668521790059458, + "eval time": 11.22552015600013, + "tokens / sec": 6715.908950876132, + "mem allocated avg": 6916224055.296, + "mem reserved avg": 11759050031.104, + "elapsed time": 878.9048607999976 + }, + { + "step": 2750, + "valid accuracy": 0.4, + "train loss": 0.6213459351062774, + "train samples": 11000, + "train time": 31.198134894020768, + "eval time": 7.820672179997928, + "tokens / sec": 6791.463679471677, + "mem allocated avg": 6926273599.488, + "mem reserved avg": 11930135691.264, + "elapsed time": 964.4001106439973 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.6136174714565277, + "train 
samples": 12000, + "train time": 30.652901480014407, + "eval time": 8.59450396900138, + "tokens / sec": 6809.502197894445, + "mem allocated avg": 6921910312.96, + "mem reserved avg": 11851509268.48, + "elapsed time": 1049.6233134680006 + }, + { + "step": 3250, + "valid accuracy": 0.46, + "train loss": 0.6227310271263122, + "train samples": 13000, + "train time": 30.898520497004938, + "eval time": 11.247846516002028, + "tokens / sec": 6825.601893153528, + "mem allocated avg": 6923552774.144, + "mem reserved avg": 11884266782.72, + "elapsed time": 1137.9473550990006 + }, + { + "step": 3500, + "valid accuracy": 0.52, + "train loss": 0.6058980323076248, + "train samples": 14000, + "train time": 31.043968706952, + "eval time": 7.071496761000162, + "tokens / sec": 6756.545916535101, + "mem allocated avg": 6922457063.424, + "mem reserved avg": 11865602129.92, + "elapsed time": 1222.3722963839973 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.6032638043165207, + "train samples": 15000, + "train time": 31.41906641800597, + "eval time": 6.834270917999675, + "tokens / sec": 6897.18138397039, + "mem allocated avg": 6932064409.6, + "mem reserved avg": 12041553182.72, + "elapsed time": 1307.517348808 + }, + { + "step": 4000, + "valid accuracy": 0.48, + "train loss": 0.6166473155021668, + "train samples": 16000, + "train time": 30.82234557695483, + "eval time": 6.627715251001064, + "tokens / sec": 6630.676419149782, + "mem allocated avg": 6914480900.096, + "mem reserved avg": 11738338557.952, + "elapsed time": 1390.9289551410002 + }, + { + "step": 4250, + "valid accuracy": 0.44, + "train loss": 0.601645546555519, + "train samples": 17000, + "train time": 30.811621871023817, + "eval time": 11.241402788000414, + "tokens / sec": 6860.690452611215, + "mem allocated avg": 6925075550.208, + "mem reserved avg": 11899366277.12, + "elapsed time": 1479.325017957999 + }, + { + "step": 4500, + "valid accuracy": 0.46, + "train loss": 0.6076700875759125, + "train samples": 
18000, + "train time": 30.499847401017178, + "eval time": 11.232504903000518, + "tokens / sec": 6813.73900884072, + "mem allocated avg": 6919328847.872, + "mem reserved avg": 11814020579.328, + "elapsed time": 1567.0791362639975 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.5997640329599381, + "train samples": 19000, + "train time": 30.974938084971654, + "eval time": 11.246996836001927, + "tokens / sec": 6777.705234602477, + "mem allocated avg": 6921498724.352, + "mem reserved avg": 11864662605.824, + "elapsed time": 1655.6881185989987 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train loss": 0.6069052599668503, + "train samples": 20000, + "train time": 30.736502733019734, + "eval time": 11.28520023999954, + "tokens / sec": 6776.307695418065, + "mem allocated avg": 6918408683.52, + "mem reserved avg": 11806051401.728, + "elapsed time": 1743.785376189 + }, + { + "step": 5000, + "test accuracy": 0.48218347232752085, + "train loss": 0.6069052599668503, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": 
"x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, 
USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json b/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json new file mode 100644 index 0000000000000000000000000000000000000000..fd499d5401ff62c7662909ccd5a0a1ff6849d391 --- /dev/null +++ b/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T18:02:52+00:00", + "total_time": 2068.5078051540004, + "experiment_name": "lora/llama-3.2-3B-rank64-rslora", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": true, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": 
null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12128059444, + "cuda_memory_max": 22538092544, + "cuda_memory_reserved_99th": 17953927987, + "train_time": 1871.457509397991, + "file_size": 73415408, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.8666043817996979, + "train samples": 1000, + "train time": 31.633613975991466, + "eval time": 11.220254406001914, + "tokens / sec": 6692.848947347132, + "mem allocated avg": 7072427177.984, + "mem reserved avg": 12177985503.232, + "elapsed time": 97.06891104899842 + }, + { + "step": 500, + "valid accuracy": 0.32, + "train loss": 0.697043846487999, + "train samples": 2000, + "train time": 31.400947067988454, + "eval time": 11.24747418500192, + "tokens / sec": 6623.844801548661, + "mem allocated avg": 7064966957.056, + "mem reserved avg": 12070787481.6, + "elapsed time": 188.02626212299947 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6723507121801376, + "train samples": 3000, + "train time": 31.849995732016396, + "eval time": 11.249978227999236, + "tokens / sec": 6731.586459350098, + "mem allocated avg": 7075822055.424, + "mem reserved avg": 12225037205.504, + "elapsed time": 280.16055655299715 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.6529685587882995, + "train samples": 4000, + "train time": 31.612207354013663, + "eval time": 11.24677863100078, + "tokens / sec": 6590.365477074112, + "mem allocated avg": 7066256992.256, + "mem reserved avg": 12092287483.904, + "elapsed time": 371.5183315869981 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6472815409898758, + "train samples": 5000, + "train time": 31.26670297003875, + "eval time": 8.06907803500144, + "tokens / sec": 6669.651104557813, + "mem allocated avg": 7066435080.192, + "mem reserved avg": 12087824744.448, + "elapsed time": 
459.33407214199906 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6395461517572403, + "train samples": 6000, + "train time": 31.471468601008382, + "eval time": 6.4898526670003776, + "tokens / sec": 6651.4531830043925, + "mem allocated avg": 7067292080.128, + "mem reserved avg": 12121664389.12, + "elapsed time": 545.9371380269986 + }, + { + "step": 1750, + "valid accuracy": 0.5, + "train loss": 0.629749027967453, + "train samples": 7000, + "train time": 31.650018079009897, + "eval time": 11.247470542999508, + "tokens / sec": 6614.688164707337, + "mem allocated avg": 7069213276.16, + "mem reserved avg": 12130329821.184, + "elapsed time": 637.2524904149977 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.6293291836977005, + "train samples": 8000, + "train time": 31.45956211398152, + "eval time": 11.187045163998846, + "tokens / sec": 6601.999075749819, + "mem allocated avg": 7066587928.576, + "mem reserved avg": 12076634341.376, + "elapsed time": 728.2233991199973 + }, + { + "step": 2250, + "valid accuracy": 0.4, + "train loss": 0.6171289530992508, + "train samples": 9000, + "train time": 31.87981533700804, + "eval time": 6.866186073002609, + "tokens / sec": 6742.448089104055, + "mem allocated avg": 7077788481.536, + "mem reserved avg": 12265227026.432, + "elapsed time": 815.9717469499992 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6119417071342468, + "train samples": 10000, + "train time": 31.067599171023176, + "eval time": 10.55572699700133, + "tokens / sec": 6629.640058962326, + "mem allocated avg": 7062992943.104, + "mem reserved avg": 12015850487.808, + "elapsed time": 905.6140671029971 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.5985908216238022, + "train samples": 11000, + "train time": 31.864849751029396, + "eval time": 6.1964339680016565, + "tokens / sec": 6649.364477017663, + "mem allocated avg": 7072847513.6, + "mem reserved avg": 12192229359.616, + "elapsed time": 
992.5055643979977 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.5865949945449829, + "train samples": 12000, + "train time": 31.337576934987737, + "eval time": 7.105518241998652, + "tokens / sec": 6660.725570232467, + "mem allocated avg": 7068560369.664, + "mem reserved avg": 12111589670.912, + "elapsed time": 1079.4884613180002 + }, + { + "step": 3250, + "valid accuracy": 0.56, + "train loss": 0.5926763614416123, + "train samples": 13000, + "train time": 31.477448584984813, + "eval time": 11.220603736997873, + "tokens / sec": 6700.066539084199, + "mem allocated avg": 7070318487.552, + "mem reserved avg": 12143046950.912, + "elapsed time": 1171.2174267509981 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5736529529094696, + "train samples": 14000, + "train time": 31.59231336902303, + "eval time": 11.215632880001067, + "tokens / sec": 6639.273216556042, + "mem allocated avg": 7068978995.2, + "mem reserved avg": 12124969500.672, + "elapsed time": 1263.2709914519983 + }, + { + "step": 3750, + "valid accuracy": 0.54, + "train loss": 0.5691816571950913, + "train samples": 15000, + "train time": 31.92663248500321, + "eval time": 6.89942428699942, + "tokens / sec": 6787.530758271833, + "mem allocated avg": 7079011248.128, + "mem reserved avg": 12298890510.336, + "elapsed time": 1351.894684084 + }, + { + "step": 4000, + "valid accuracy": 0.56, + "train loss": 0.5762648656368256, + "train samples": 16000, + "train time": 31.08475098094641, + "eval time": 6.668889390999539, + "tokens / sec": 6574.7028221416895, + "mem allocated avg": 7061224237.056, + "mem reserved avg": 12000969097.216, + "elapsed time": 1437.9300296759975 + }, + { + "step": 4250, + "valid accuracy": 0.52, + "train loss": 0.562865238904953, + "train samples": 17000, + "train time": 31.594970259979164, + "eval time": 11.218020562002494, + "tokens / sec": 6690.590250935068, + "mem allocated avg": 7071853715.456, + "mem reserved avg": 12157852844.032, + "elapsed 
time": 1529.7590418299988 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.568256908416748, + "train samples": 18000, + "train time": 31.334908029966755, + "eval time": 11.24515695700029, + "tokens / sec": 6632.156054240077, + "mem allocated avg": 7066128418.816, + "mem reserved avg": 12073245343.744, + "elapsed time": 1620.988001589998 + }, + { + "step": 4750, + "valid accuracy": 0.52, + "train loss": 0.5585172891616821, + "train samples": 19000, + "train time": 31.425996138961636, + "eval time": 11.202903266999783, + "tokens / sec": 6680.4246736261675, + "mem allocated avg": 7068498065.408, + "mem reserved avg": 12124491350.016, + "elapsed time": 1712.544705993998 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5657225311994553, + "train samples": 20000, + "train time": 31.088545892969705, + "eval time": 11.224285021999094, + "tokens / sec": 6699.573557317776, + "mem allocated avg": 7064964919.296, + "mem reserved avg": 12070275776.512, + "elapsed time": 1803.3861051699969 + }, + { + "step": 5000, + "test accuracy": 0.5299469294920395, + "train loss": 0.5657225311994553, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + 
"system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, 
USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/lora--llama-3.2-3B-rank64.json b/MetaMathQA/results/lora--llama-3.2-3B-rank64.json new file mode 100644 index 0000000000000000000000000000000000000000..d2087db6a18d6b5c7dbb92b2acffcf079607a4ba --- /dev/null +++ b/MetaMathQA/results/lora--llama-3.2-3B-rank64.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T16:55:20+00:00", + "total_time": 2017.2277705579982, + "experiment_name": "lora/llama-3.2-3B-rank64", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 128, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": 
null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12128055669, + "cuda_memory_max": 22540189696, + "cuda_memory_reserved_99th": 17953927987, + "train_time": 1853.4967184819961, + "file_size": 73415408, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.38, + "train loss": 0.9425119986534118, + "train samples": 1000, + "train time": 31.823601707994385, + "eval time": 11.233909951999522, + "tokens / sec": 6652.892464614218, + "mem allocated avg": 7072427177.984, + "mem reserved avg": 12177985503.232, + "elapsed time": 97.04379223199976 + }, + { + "step": 500, + "valid accuracy": 0.4, + "train loss": 0.7080548154115677, + "train samples": 2000, + "train time": 31.45184341498316, + "eval time": 8.232533225000225, + "tokens / sec": 6613.125890767804, + "mem allocated avg": 7065105152.0, + "mem reserved avg": 12072179990.528, + "elapsed time": 184.765658884 + }, + { + "step": 750, + "valid accuracy": 0.48, + "train loss": 0.6735224899053573, + "train samples": 3000, + "train time": 31.813968455000577, + "eval time": 7.057446101998721, + "tokens / sec": 6739.20954888921, + "mem allocated avg": 7075631579.136, + "mem reserved avg": 12224064126.976, + "elapsed time": 272.18349517599927 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.6520720717906952, + "train samples": 4000, + "train time": 31.539530114994704, + "eval time": 6.8677342959999805, + "tokens / sec": 6605.551802464924, + "mem allocated avg": 7066230261.76, + "mem reserved avg": 12094502076.416, + "elapsed time": 358.6604399049993 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.6483409875631332, + "train samples": 5000, + "train time": 31.15382274799049, + "eval time": 6.63156994000019, + "tokens / sec": 
6693.817374737786, + "mem allocated avg": 7066402795.52, + "mem reserved avg": 12090886586.368, + "elapsed time": 444.47985113600043 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6400664356946946, + "train samples": 6000, + "train time": 31.237405868998394, + "eval time": 6.19883855199987, + "tokens / sec": 6701.292702661678, + "mem allocated avg": 7067143219.2, + "mem reserved avg": 12125288267.776, + "elapsed time": 529.970450933999 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6309183040857315, + "train samples": 7000, + "train time": 31.58418034899296, + "eval time": 11.217398733000664, + "tokens / sec": 6628.476588175104, + "mem allocated avg": 7069430339.584, + "mem reserved avg": 12128735985.664, + "elapsed time": 620.7944932609989 + }, + { + "step": 2000, + "valid accuracy": 0.38, + "train loss": 0.6333342634439468, + "train samples": 8000, + "train time": 31.370570010996744, + "eval time": 11.2056582969999, + "tokens / sec": 6620.727641454827, + "mem allocated avg": 7066754975.744, + "mem reserved avg": 12075980029.952, + "elapsed time": 711.143019907 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6244297958612443, + "train samples": 9000, + "train time": 32.090800706988375, + "eval time": 6.320641570999214, + "tokens / sec": 6698.118939524966, + "mem allocated avg": 7077559773.184, + "mem reserved avg": 12266535649.28, + "elapsed time": 798.2718276069991 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6205919095277787, + "train samples": 10000, + "train time": 31.211024427002485, + "eval time": 7.8215817759992206, + "tokens / sec": 6599.1746115775, + "mem allocated avg": 7063100512.256, + "mem reserved avg": 12017771479.04, + "elapsed time": 885.0132823740005 + }, + { + "step": 2750, + "valid accuracy": 0.38, + "train loss": 0.6116842222213745, + "train samples": 11000, + "train time": 31.752687646014238, + "eval time": 11.215984603999459, + "tokens / sec": 
6672.852464084136, + "mem allocated avg": 7072850802.688, + "mem reserved avg": 12190207705.088, + "elapsed time": 976.3748101059991 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.6028307398557663, + "train samples": 12000, + "train time": 31.220882691013685, + "eval time": 10.851913497001078, + "tokens / sec": 6685.621353687066, + "mem allocated avg": 7068516059.136, + "mem reserved avg": 12110624980.992, + "elapsed time": 1066.2028727340003 + }, + { + "step": 3250, + "valid accuracy": 0.54, + "train loss": 0.6109937611818314, + "train samples": 13000, + "train time": 31.23074521200033, + "eval time": 6.857214526000462, + "tokens / sec": 6752.992878279506, + "mem allocated avg": 7070265374.72, + "mem reserved avg": 12142795292.672, + "elapsed time": 1152.2392765660006 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5937278937101365, + "train samples": 14000, + "train time": 31.52822203695905, + "eval time": 6.510061502001918, + "tokens / sec": 6652.7696916787745, + "mem allocated avg": 7069306679.296, + "mem reserved avg": 12124390686.72, + "elapsed time": 1238.6433643029995 + }, + { + "step": 3750, + "valid accuracy": 0.6, + "train loss": 0.5906780579090118, + "train samples": 15000, + "train time": 32.31397023300451, + "eval time": 8.545268227000633, + "tokens / sec": 6706.170688325575, + "mem allocated avg": 7078981097.472, + "mem reserved avg": 12299846811.648, + "elapsed time": 1328.641089326 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.6025177363157272, + "train samples": 16000, + "train time": 31.170676869962335, + "eval time": 6.8420828330017684, + "tokens / sec": 6556.57882735759, + "mem allocated avg": 7061331572.736, + "mem reserved avg": 12001287864.32, + "elapsed time": 1414.365250592 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5884622411727906, + "train samples": 17000, + "train time": 31.543792515007226, + "eval time": 6.748535185997753, + "tokens / sec": 
6701.445297024126, + "mem allocated avg": 7071957172.224, + "mem reserved avg": 12155780857.856, + "elapsed time": 1500.9030026039982 + }, + { + "step": 4500, + "valid accuracy": 0.54, + "train loss": 0.5941844927072525, + "train samples": 18000, + "train time": 31.45958714898734, + "eval time": 6.4977734870008135, + "tokens / sec": 6605.871813123572, + "mem allocated avg": 7066011588.608, + "mem reserved avg": 12069847957.504, + "elapsed time": 1586.8870083309994 + }, + { + "step": 4750, + "valid accuracy": 0.56, + "train loss": 0.5860341912508011, + "train samples": 19000, + "train time": 31.656771414985997, + "eval time": 6.746858504000556, + "tokens / sec": 6631.724923806254, + "mem allocated avg": 7068472178.688, + "mem reserved avg": 12124852060.16, + "elapsed time": 1673.7380427649987 + }, + { + "step": 5000, + "valid accuracy": 0.58, + "train loss": 0.5928755496740341, + "train samples": 20000, + "train time": 31.260896800042246, + "eval time": 6.4877336810022825, + "tokens / sec": 6662.636754545011, + "mem allocated avg": 7065262428.16, + "mem reserved avg": 12067549478.912, + "elapsed time": 1759.6036715839982 + }, + { + "step": 5000, + "test accuracy": 0.4890068233510235, + "train loss": 0.5928755496740341, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + 
"bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format 
-Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/oft--llama-3.2-3B-rank32.json b/MetaMathQA/results/oft--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..1f7f3d5c466095fe2df55eaef2e114948786879b --- /dev/null +++ b/MetaMathQA/results/oft--llama-3.2-3B-rank32.json @@ -0,0 +1,356 @@ +{ + "run_info": { + "created_at": "2025-06-20T05:26:07+00:00", + "total_time": 6852.100763734001, + "experiment_name": "oft/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "OFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "oft_block_size": 0, + "module_dropout": 0.0, + "target_modules": [ + "q_proj", + "v_proj" + ], + "fan_in_fan_out": false, + "bias": "none", + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null, + "coft": false, + "eps": 6e-05, + "block_share": false + }, + "error_msg": "" + 
}, + "train_info": { + "cuda_memory_reserved_avg": 18387461314, + "cuda_memory_max": 28913434624, + "cuda_memory_reserved_99th": 24327110000, + "train_time": 5771.733417916999, + "file_size": 66533232, + "num_trainable_params": 16629760, + "num_total_params": 3229379584, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.9642390298843384, + "train samples": 1000, + "train time": 135.13652412690135, + "eval time": 72.26809900400258, + "tokens / sec": 1566.704496566621, + "mem allocated avg": 7086081564.672, + "mem reserved avg": 18436381999.104, + "elapsed time": 282.8167244100041 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.7172249312400818, + "train samples": 2000, + "train time": 134.07802616302797, + "eval time": 68.8729065929947, + "tokens / sec": 1551.2981951799843, + "mem allocated avg": 7078181507.072, + "mem reserved avg": 18331700559.872, + "elapsed time": 555.3704225070032 + }, + { + "step": 750, + "valid accuracy": 0.54, + "train loss": 0.6725650383234024, + "train samples": 3000, + "train time": 134.206523489971, + "eval time": 51.09546631400008, + "tokens / sec": 1597.5452938099672, + "mem allocated avg": 7088855150.592, + "mem reserved avg": 18489473499.136, + "elapsed time": 810.4512720499988 + }, + { + "step": 1000, + "valid accuracy": 0.46, + "train loss": 0.652460042476654, + "train samples": 4000, + "train time": 133.72679109000455, + "eval time": 48.579141383001115, + "tokens / sec": 1557.9226742962812, + "mem allocated avg": 7079289249.792, + "mem reserved avg": 18361211682.816, + "elapsed time": 1062.2896951580042 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6465949823856354, + "train samples": 5000, + "train time": 134.02142679997633, + "eval time": 50.56292798400682, + "tokens / sec": 1556.004923833842, + "mem allocated avg": 7081315713.024, + "mem reserved avg": 18350482653.184, + "elapsed time": 1316.4387951009994 + }, + { + "step": 1500, + 
"valid accuracy": 0.42, + "train loss": 0.6379977518320084, + "train samples": 6000, + "train time": 133.0496824070433, + "eval time": 53.73340070099948, + "tokens / sec": 1573.329572930409, + "mem allocated avg": 7080897417.216, + "mem reserved avg": 18375531036.672, + "elapsed time": 1573.0179351469997 + }, + { + "step": 1750, + "valid accuracy": 0.5, + "train loss": 0.6291554419994354, + "train samples": 7000, + "train time": 133.47338955492887, + "eval time": 55.15936547000456, + "tokens / sec": 1568.5148979740509, + "mem allocated avg": 7082186061.824, + "mem reserved avg": 18390034939.904, + "elapsed time": 1831.3363242750056 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.6310958023071289, + "train samples": 8000, + "train time": 133.25556700096058, + "eval time": 72.25138587100082, + "tokens / sec": 1558.6290664952317, + "mem allocated avg": 7079499612.16, + "mem reserved avg": 18331633451.008, + "elapsed time": 2106.423032759005 + }, + { + "step": 2250, + "valid accuracy": 0.44, + "train loss": 0.6218489743471146, + "train samples": 9000, + "train time": 135.02108432800014, + "eval time": 44.298431456998514, + "tokens / sec": 1591.958774955749, + "mem allocated avg": 7091123408.896, + "mem reserved avg": 18522440728.576, + "elapsed time": 2356.0777650860036 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6188951053619385, + "train samples": 10000, + "train time": 133.4561369928997, + "eval time": 71.44547970699932, + "tokens / sec": 1543.331049743768, + "mem allocated avg": 7075245447.168, + "mem reserved avg": 18280471330.816, + "elapsed time": 2630.538193225002 + }, + { + "step": 2750, + "valid accuracy": 0.46, + "train loss": 0.6104991520643235, + "train samples": 11000, + "train time": 134.51214510202408, + "eval time": 43.22248092100199, + "tokens / sec": 1575.181184117565, + "mem allocated avg": 7087120197.632, + "mem reserved avg": 18447404630.016, + "elapsed time": 2878.231181973999 + }, + { + "step": 3000, 
+ "valid accuracy": 0.4, + "train loss": 0.6016042430400849, + "train samples": 12000, + "train time": 134.34936329597986, + "eval time": 58.55288808000478, + "tokens / sec": 1553.64338824705, + "mem allocated avg": 7081822816.256, + "mem reserved avg": 18370816638.976, + "elapsed time": 3140.938437593999 + }, + { + "step": 3250, + "valid accuracy": 0.54, + "train loss": 0.610588705778122, + "train samples": 13000, + "train time": 134.2449153930138, + "eval time": 58.681311781998374, + "tokens / sec": 1571.0166704084752, + "mem allocated avg": 7083565283.328, + "mem reserved avg": 18393910476.8, + "elapsed time": 3403.7443569960014 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5950560384988784, + "train samples": 14000, + "train time": 133.02754676800396, + "eval time": 41.66271651999705, + "tokens / sec": 1576.741096833107, + "mem allocated avg": 7081388462.08, + "mem reserved avg": 18383894478.848, + "elapsed time": 3648.4037805520056 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5935565856695175, + "train samples": 15000, + "train time": 134.96303366392385, + "eval time": 46.73629975599761, + "tokens / sec": 1605.6470732542934, + "mem allocated avg": 7093567961.088, + "mem reserved avg": 18554812366.848, + "elapsed time": 3900.5494019100006 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.6046672226190567, + "train samples": 16000, + "train time": 133.7693534100763, + "eval time": 55.96993844499957, + "tokens / sec": 1527.8013595048553, + "mem allocated avg": 7074107404.288, + "mem reserved avg": 18261940895.744, + "elapsed time": 4159.863935929003 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5913312199115753, + "train samples": 17000, + "train time": 134.5800468690286, + "eval time": 45.97034073100076, + "tokens / sec": 1570.7306165951986, + "mem allocated avg": 7084561401.856, + "mem reserved avg": 18416173842.432, + "elapsed time": 4410.347729070003 + }, + { + "step": 
4500, + "valid accuracy": 0.52, + "train loss": 0.5983910497426986, + "train samples": 18000, + "train time": 133.48891869902582, + "eval time": 45.915211307998106, + "tokens / sec": 1556.818363841587, + "mem allocated avg": 7079295299.584, + "mem reserved avg": 18337664860.16, + "elapsed time": 4659.401384982004 + }, + { + "step": 4750, + "valid accuracy": 0.54, + "train loss": 0.5894997611045837, + "train samples": 19000, + "train time": 134.47632633800822, + "eval time": 46.132873725000536, + "tokens / sec": 1561.159541734619, + "mem allocated avg": 7081984223.232, + "mem reserved avg": 18385320542.208, + "elapsed time": 4910.124472488002 + }, + { + "step": 5000, + "valid accuracy": 0.54, + "train loss": 0.5956899918317795, + "train samples": 20000, + "train time": 132.51259567801753, + "eval time": 45.78685289800342, + "tokens / sec": 1571.7751126548303, + "mem allocated avg": 7078406199.296, + "mem reserved avg": 18327925686.272, + "elapsed time": 5158.116421651001 + }, + { + "step": 5000, + "test accuracy": 0.489764973464746, + "train loss": 0.5956899918317795, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": 
"6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, 
USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json b/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json new file mode 100644 index 0000000000000000000000000000000000000000..61a5c49e6e31bcc1988806ce2d74cb2efbc1a5a3 --- /dev/null +++ b/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,345 @@ +{ + "run_info": { + "created_at": "2025-06-19T20:20:55+00:00", + "total_time": 1959.214138561998, + "experiment_name": "prefixtuning/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PREFIX_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 1024, + "num_transformer_submodules": 1, + "num_attention_heads": 8, + "num_layers": 28, + "encoder_hidden_size": 3072, + "prefix_projection": false + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11766684083, + "cuda_memory_max": 20912799744, + "cuda_memory_reserved_99th": 16945051074, + "train_time": 1661.6597991429953, + "file_size": 45875328, + "num_trainable_params": 11468800, + 
"num_total_params": 3224218624, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 7.371294965744019, + "train samples": 1000, + "train time": 27.91846932898261, + "eval time": 15.451216622001084, + "tokens / sec": 7583.474491569318, + "mem allocated avg": 7053410574.336, + "mem reserved avg": 11800925962.24, + "elapsed time": 86.14553656399949 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 3.853111123085022, + "train samples": 2000, + "train time": 27.30431010902612, + "eval time": 15.427179872000124, + "tokens / sec": 7617.661796598262, + "mem allocated avg": 7047124914.176, + "mem reserved avg": 11721854943.232, + "elapsed time": 164.76258564100135 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.7293416724205017, + "train samples": 3000, + "train time": 28.03611285903753, + "eval time": 15.425274275999982, + "tokens / sec": 7647.31548478152, + "mem allocated avg": 7057104787.456, + "mem reserved avg": 11848237711.36, + "elapsed time": 244.72437485599949 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.1541715533733368, + "train samples": 4000, + "train time": 27.01217528603229, + "eval time": 15.417352960001153, + "tokens / sec": 7712.670223479868, + "mem allocated avg": 7050079920.128, + "mem reserved avg": 11745879916.544, + "elapsed time": 322.8701755410002 + }, + { + "step": 1250, + "valid accuracy": 0.08, + "train loss": 1.01127068066597, + "train samples": 5000, + "train time": 27.13179545197636, + "eval time": 15.418993674997182, + "tokens / sec": 7686.111314273877, + "mem allocated avg": 7048705087.488, + "mem reserved avg": 11725462044.672, + "elapsed time": 401.46412933300235 + }, + { + "step": 1500, + "valid accuracy": 0.08, + "train loss": 0.9543052833080292, + "train samples": 6000, + "train time": 27.5597544680204, + "eval time": 15.42255902100078, + "tokens / sec": 7595.532109798078, + "mem allocated avg": 7050476988.416, + "mem reserved avg": 
11746567782.4, + "elapsed time": 480.1674746890021 + }, + { + "step": 1750, + "valid accuracy": 0.18, + "train loss": 0.9019801757335663, + "train samples": 7000, + "train time": 27.391848403010954, + "eval time": 15.41637391599943, + "tokens / sec": 7642.967240465137, + "mem allocated avg": 7051827261.44, + "mem reserved avg": 11769468682.24, + "elapsed time": 559.677030461 + }, + { + "step": 2000, + "valid accuracy": 0.12, + "train loss": 0.8851136872768403, + "train samples": 8000, + "train time": 27.14071328902719, + "eval time": 15.419172215999424, + "tokens / sec": 7652.562325396589, + "mem allocated avg": 7048325701.632, + "mem reserved avg": 11717526421.504, + "elapsed time": 638.8433813260017 + }, + { + "step": 2250, + "valid accuracy": 0.1, + "train loss": 0.8607708604335785, + "train samples": 9000, + "train time": 28.18215358697489, + "eval time": 15.430102889000409, + "tokens / sec": 7627.096323090928, + "mem allocated avg": 7058774517.76, + "mem reserved avg": 11887655780.352, + "elapsed time": 719.1557081280007 + }, + { + "step": 2500, + "valid accuracy": 0.16, + "train loss": 0.8404088478088378, + "train samples": 10000, + "train time": 26.82789152296391, + "eval time": 15.41505262499777, + "tokens / sec": 7677.3457885685175, + "mem allocated avg": 7045414729.728, + "mem reserved avg": 11679693799.424, + "elapsed time": 797.9182705759995 + }, + { + "step": 2750, + "valid accuracy": 0.14, + "train loss": 0.8259119842052459, + "train samples": 11000, + "train time": 27.303442178006662, + "eval time": 15.412094721999892, + "tokens / sec": 7760.230326221408, + "mem allocated avg": 7055038418.944, + "mem reserved avg": 11819196350.464, + "elapsed time": 877.4897030700013 + }, + { + "step": 3000, + "valid accuracy": 0.22, + "train loss": 0.8099327564239502, + "train samples": 12000, + "train time": 27.035110770961182, + "eval time": 12.827202022002894, + "tokens / sec": 7720.737738726083, + "mem allocated avg": 7049757696.0, + "mem reserved avg": 
11756390842.368, + "elapsed time": 953.558885925002 + }, + { + "step": 3250, + "valid accuracy": 0.22, + "train loss": 0.8175602672100067, + "train samples": 13000, + "train time": 27.43706444997588, + "eval time": 15.41863539300175, + "tokens / sec": 7686.718831911532, + "mem allocated avg": 7051605612.544, + "mem reserved avg": 11776833880.064, + "elapsed time": 1033.2767371779992 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 0.7965063021183014, + "train samples": 14000, + "train time": 27.750962379970588, + "eval time": 15.41753050600164, + "tokens / sec": 7558.296434122523, + "mem allocated avg": 7051713462.272, + "mem reserved avg": 11763227557.888, + "elapsed time": 1113.7006878970024 + }, + { + "step": 3750, + "valid accuracy": 0.26, + "train loss": 0.788856605052948, + "train samples": 15000, + "train time": 27.955327479998232, + "eval time": 11.66996129099789, + "tokens / sec": 7751.760381095479, + "mem allocated avg": 7061477945.344, + "mem reserved avg": 11919800926.208, + "elapsed time": 1190.2235273900005 + }, + { + "step": 4000, + "valid accuracy": 0.26, + "train loss": 0.8037499711513519, + "train samples": 16000, + "train time": 26.957003097031702, + "eval time": 15.42233503099851, + "tokens / sec": 7581.443651742726, + "mem allocated avg": 7042861262.848, + "mem reserved avg": 11658604838.912, + "elapsed time": 1268.9300010120023 + }, + { + "step": 4250, + "valid accuracy": 0.28, + "train loss": 0.7835113587379455, + "train samples": 17000, + "train time": 27.92120910200174, + "eval time": 10.70234186000016, + "tokens / sec": 7570.911389537389, + "mem allocated avg": 7053768085.504, + "mem reserved avg": 11783242776.576, + "elapsed time": 1344.1117449459998 + }, + { + "step": 4500, + "valid accuracy": 0.28, + "train loss": 0.7824292014837265, + "train samples": 18000, + "train time": 26.99022845998479, + "eval time": 12.42347607800184, + "tokens / sec": 7699.749570779183, + "mem allocated avg": 7048212195.328, + "mem reserved 
avg": 11725470433.28, + "elapsed time": 1419.3379556770014 + }, + { + "step": 4750, + "valid accuracy": 0.28, + "train loss": 0.7803363995552063, + "train samples": 19000, + "train time": 27.08754148402295, + "eval time": 15.42501401300251, + "tokens / sec": 7750.389607112494, + "mem allocated avg": 7051630567.424, + "mem reserved avg": 11771876212.736, + "elapsed time": 1498.5280245210015 + }, + { + "step": 5000, + "valid accuracy": 0.26, + "train loss": 0.7887116296291351, + "train samples": 20000, + "train time": 26.98893836600837, + "eval time": 15.411349758000142, + "tokens / sec": 7717.235749529201, + "mem allocated avg": 7048690728.96, + "mem reserved avg": 11715764813.824, + "elapsed time": 1577.725424725002 + }, + { + "step": 5000, + "test accuracy": 0.1470811220621683, + "train loss": 0.7887116296291351, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel 
Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git 
a/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json b/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..643caf477be7615ac4ef139d1a9074ddd5bed284 --- /dev/null +++ b/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json @@ -0,0 +1,348 @@ +{ + "run_info": { + "created_at": "2025-06-20T08:46:44+00:00", + "total_time": 2700.1305744579877, + "experiment_name": "prompt_tuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PROMPT_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "tokenizer_name_or_path": null, + "tokenizer_kwargs": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 15297773830, + "cuda_memory_max": 24379392000, + "cuda_memory_reserved_99th": 20669781770, + "train_time": 2379.557773831024, + "file_size": 2457728, + "num_trainable_params": 614400, + "num_total_params": 3213364224, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 
3.462425223350525, + "train samples": 1000, + "train time": 46.206722402057494, + "eval time": 15.901069569998072, + "tokens / sec": 4581.9956273412845, + "mem allocated avg": 7082871494.656, + "mem reserved avg": 15331489742.848, + "elapsed time": 119.40567356300016 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 2.259350722312927, + "train samples": 2000, + "train time": 45.66361523300293, + "eval time": 15.856271529002697, + "tokens / sec": 4554.939396249854, + "mem allocated avg": 7075523266.56, + "mem reserved avg": 15240674672.64, + "elapsed time": 232.12755202699918 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.758247773170471, + "train samples": 3000, + "train time": 46.58154148896574, + "eval time": 15.854417883005226, + "tokens / sec": 4602.70298377282, + "mem allocated avg": 7085465481.216, + "mem reserved avg": 15376771448.832, + "elapsed time": 346.0752758900053 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.6028480381965637, + "train samples": 4000, + "train time": 45.41573346107907, + "eval time": 15.861826895998092, + "tokens / sec": 4587.30893729906, + "mem allocated avg": 7077486481.408, + "mem reserved avg": 15288170971.136, + "elapsed time": 458.6240012299968 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.5049157681465148, + "train samples": 5000, + "train time": 46.04039786210342, + "eval time": 15.877354786993237, + "tokens / sec": 4529.456948321703, + "mem allocated avg": 7076584331.264, + "mem reserved avg": 15265983102.976, + "elapsed time": 571.9228152269934 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.4375499501228333, + "train samples": 6000, + "train time": 45.70124057796784, + "eval time": 15.84707298700232, + "tokens / sec": 4580.4227052190045, + "mem allocated avg": 7078481408.0, + "mem reserved avg": 15279463596.032, + "elapsed time": 684.8850296739984 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 
1.3827230257987977, + "train samples": 7000, + "train time": 44.976750778907444, + "eval time": 15.845691901995451, + "tokens / sec": 4654.7382008346485, + "mem allocated avg": 7079360505.856, + "mem reserved avg": 15298052751.36, + "elapsed time": 796.8428356289951 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 1.3338124525547028, + "train samples": 8000, + "train time": 45.10262611102371, + "eval time": 15.857041016992298, + "tokens / sec": 4604.964675199615, + "mem allocated avg": 7075931449.344, + "mem reserved avg": 15257242173.44, + "elapsed time": 908.9726742479979 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2829065501689911, + "train samples": 9000, + "train time": 46.84363810600189, + "eval time": 15.872781344005489, + "tokens / sec": 4588.627371631486, + "mem allocated avg": 7087554078.72, + "mem reserved avg": 15416986435.584, + "elapsed time": 1023.331907868007 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.2462495183944702, + "train samples": 10000, + "train time": 45.55510413390584, + "eval time": 15.84976143699896, + "tokens / sec": 4521.271631705095, + "mem allocated avg": 7072915062.784, + "mem reserved avg": 15202909159.424, + "elapsed time": 1136.1328145180014 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2045790712833404, + "train samples": 11000, + "train time": 45.34144312601711, + "eval time": 15.8525270359969, + "tokens / sec": 4673.009621928461, + "mem allocated avg": 7083153442.816, + "mem reserved avg": 15344005545.984, + "elapsed time": 1248.7101804669946 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.1678078708648683, + "train samples": 12000, + "train time": 45.599694666831056, + "eval time": 15.870247816987103, + "tokens / sec": 4577.464860786221, + "mem allocated avg": 7077996111.872, + "mem reserved avg": 15283892781.056, + "elapsed time": 1361.5449211609957 + }, + { + "step": 3250, + "valid accuracy": 0.04, + "train loss": 
1.1313301923274994, + "train samples": 13000, + "train time": 45.95094640579191, + "eval time": 15.868188906999421, + "tokens / sec": 4589.698722144641, + "mem allocated avg": 7079686449.152, + "mem reserved avg": 15301248811.008, + "elapsed time": 1474.734694629995 + }, + { + "step": 3500, + "valid accuracy": 0.06, + "train loss": 1.1092858843803406, + "train samples": 14000, + "train time": 45.96525488591578, + "eval time": 15.86030059499899, + "tokens / sec": 4563.229346178814, + "mem allocated avg": 7078805225.472, + "mem reserved avg": 15302347718.656, + "elapsed time": 1588.1363447299955 + }, + { + "step": 3750, + "valid accuracy": 0.06, + "train loss": 1.079538120508194, + "train samples": 15000, + "train time": 46.46510764303093, + "eval time": 15.86466599200503, + "tokens / sec": 4663.779145091515, + "mem allocated avg": 7089610215.424, + "mem reserved avg": 15446287843.328, + "elapsed time": 1702.2553167559963 + }, + { + "step": 4000, + "valid accuracy": 0.04, + "train loss": 1.0899075508117675, + "train samples": 16000, + "train time": 45.08557640206709, + "eval time": 15.860410296008922, + "tokens / sec": 4533.001822521445, + "mem allocated avg": 7071494891.52, + "mem reserved avg": 15189319614.464, + "elapsed time": 1814.3939928110049 + }, + { + "step": 4250, + "valid accuracy": 0.04, + "train loss": 1.0607522547245025, + "train samples": 17000, + "train time": 46.2303190480452, + "eval time": 15.875090683999588, + "tokens / sec": 4572.518735601033, + "mem allocated avg": 7082239875.072, + "mem reserved avg": 15329283538.944, + "elapsed time": 1928.1608909490024 + }, + { + "step": 4500, + "valid accuracy": 0.04, + "train loss": 1.068591582775116, + "train samples": 18000, + "train time": 45.96484722109744, + "eval time": 15.854171614992083, + "tokens / sec": 4521.237697155087, + "mem allocated avg": 7076175783.936, + "mem reserved avg": 15251420479.488, + "elapsed time": 2041.5032397750037 + }, + { + "step": 4750, + "valid accuracy": 0.06, + "train 
loss": 1.0587167317867279, + "train samples": 19000, + "train time": 45.48911916205543, + "eval time": 15.858397545001935, + "tokens / sec": 4615.147619194169, + "mem allocated avg": 7079419088.896, + "mem reserved avg": 15298539290.624, + "elapsed time": 2154.3035376479966 + }, + { + "step": 5000, + "valid accuracy": 0.02, + "train loss": 1.0654937489032745, + "train samples": 20000, + "train time": 45.758550852071494, + "eval time": 15.85034008299408, + "tokens / sec": 4551.7175723796145, + "mem allocated avg": 7075618770.944, + "mem reserved avg": 15251386925.056, + "elapsed time": 2267.4055672899995 + }, + { + "step": 5000, + "test accuracy": 0.050037907505686124, + "train loss": 1.0654937489032745, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN 
v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json b/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json new file mode 100644 index 
0000000000000000000000000000000000000000..527c329ef82d08fac7e82e42e7c1f10aabf3936e --- /dev/null +++ b/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,347 @@ +{ + "run_info": { + "created_at": "2025-06-20T08:01:25+00:00", + "total_time": 2714.5956150429993, + "experiment_name": "prompt_tuning/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PROMPT_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "tokenizer_name_or_path": null, + "tokenizer_kwargs": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 15297364466, + "cuda_memory_max": 24408752128, + "cuda_memory_reserved_99th": 20650676715, + "train_time": 2394.4007484640024, + "file_size": 2457728, + "num_trainable_params": 614400, + "num_total_params": 3213364224, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 2.454602773666382, + "train samples": 1000, + "train time": 46.58359175696387, + "eval time": 15.906975480989786, + "tokens / sec": 4544.926486231061, + "mem allocated avg": 
7082736850.944, + "mem reserved avg": 15330147565.568, + "elapsed time": 120.51601758999459 + }, + { + "step": 500, + "valid accuracy": 0.02, + "train loss": 1.4034885478019714, + "train samples": 2000, + "train time": 45.99672341402038, + "eval time": 15.859127072995761, + "tokens / sec": 4521.952534049426, + "mem allocated avg": 7075398952.96, + "mem reserved avg": 15237637996.544, + "elapsed time": 234.56530582100095 + }, + { + "step": 750, + "valid accuracy": 0.1, + "train loss": 1.051814435005188, + "train samples": 3000, + "train time": 45.34941398198134, + "eval time": 15.839738530004979, + "tokens / sec": 4727.756792738001, + "mem allocated avg": 7085216630.784, + "mem reserved avg": 15378130403.328, + "elapsed time": 347.9996997119888 + }, + { + "step": 1000, + "valid accuracy": 0.2, + "train loss": 0.9425526282787323, + "train samples": 4000, + "train time": 44.85872337181354, + "eval time": 15.849193180998554, + "tokens / sec": 4644.269482954245, + "mem allocated avg": 7077280739.328, + "mem reserved avg": 15280109518.848, + "elapsed time": 460.8599872249906 + }, + { + "step": 1250, + "valid accuracy": 0.2, + "train loss": 0.9085307500362396, + "train samples": 5000, + "train time": 45.535731699026655, + "eval time": 15.864107311004773, + "tokens / sec": 4579.656287909338, + "mem allocated avg": 7076838449.152, + "mem reserved avg": 15263508463.616, + "elapsed time": 574.5614464429964 + }, + { + "step": 1500, + "valid accuracy": 0.18, + "train loss": 0.8753413548469543, + "train samples": 6000, + "train time": 45.47140344994841, + "eval time": 15.851111587006017, + "tokens / sec": 4603.5746451155, + "mem allocated avg": 7078501443.584, + "mem reserved avg": 15280914825.216, + "elapsed time": 688.3081236659928 + }, + { + "step": 1750, + "valid accuracy": 0.18, + "train loss": 0.8501973593235016, + "train samples": 7000, + "train time": 45.876367467062664, + "eval time": 15.86328411300201, + "tokens / sec": 4563.460700115549, + "mem allocated avg": 
7079126001.664, + "mem reserved avg": 15302154780.672, + "elapsed time": 802.3839824919996 + }, + { + "step": 2000, + "valid accuracy": 0.3, + "train loss": 0.8353641645908356, + "train samples": 8000, + "train time": 45.395122604924836, + "eval time": 15.847279680005158, + "tokens / sec": 4575.293293237354, + "mem allocated avg": 7075813670.912, + "mem reserved avg": 15257200230.4, + "elapsed time": 915.8055839799927 + }, + { + "step": 2250, + "valid accuracy": 0.26, + "train loss": 0.8205823216438294, + "train samples": 9000, + "train time": 46.531550297062495, + "eval time": 15.857669960998464, + "tokens / sec": 4619.403364550472, + "mem allocated avg": 7087054014.464, + "mem reserved avg": 15417707855.872, + "elapsed time": 1030.8605109579948 + }, + { + "step": 2500, + "valid accuracy": 0.24, + "train loss": 0.8074139108657837, + "train samples": 10000, + "train time": 45.232053409854416, + "eval time": 15.864067172005889, + "tokens / sec": 4553.562893413265, + "mem allocated avg": 7073174814.72, + "mem reserved avg": 15210467295.232, + "elapsed time": 1144.3065934619954 + }, + { + "step": 2750, + "valid accuracy": 0.22, + "train loss": 0.800323983669281, + "train samples": 11000, + "train time": 46.27672885800712, + "eval time": 15.85089660200174, + "tokens / sec": 4578.564760921707, + "mem allocated avg": 7083499849.728, + "mem reserved avg": 15345020567.552, + "elapsed time": 1258.9190711479896 + }, + { + "step": 3000, + "valid accuracy": 0.28, + "train loss": 0.7878623747825623, + "train samples": 12000, + "train time": 45.57083585388318, + "eval time": 15.872650785997394, + "tokens / sec": 4580.3636490071885, + "mem allocated avg": 7078042595.328, + "mem reserved avg": 15285402730.496, + "elapsed time": 1372.7267461329902 + }, + { + "step": 3250, + "valid accuracy": 0.3, + "train loss": 0.7943042907714843, + "train samples": 13000, + "train time": 45.666222987070796, + "eval time": 15.852009978989372, + "tokens / sec": 4618.314942746877, + "mem allocated 
avg": 7079504875.52, + "mem reserved avg": 15299428483.072, + "elapsed time": 1486.5100108069892 + }, + { + "step": 3500, + "valid accuracy": 0.28, + "train loss": 0.780832305431366, + "train samples": 14000, + "train time": 45.84015418085619, + "eval time": 15.86955204399419, + "tokens / sec": 4575.6826901685245, + "mem allocated avg": 7078824071.168, + "mem reserved avg": 15300871323.648, + "elapsed time": 1600.7413567879994 + }, + { + "step": 3750, + "valid accuracy": 0.32, + "train loss": 0.7758122501373291, + "train samples": 15000, + "train time": 46.99727132692351, + "eval time": 15.8490629579901, + "tokens / sec": 4610.969826153641, + "mem allocated avg": 7089586788.352, + "mem reserved avg": 15444173914.112, + "elapsed time": 1716.2785189549904 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.7912874612808227, + "train samples": 16000, + "train time": 45.15887627698248, + "eval time": 15.855249352011015, + "tokens / sec": 4525.644056031772, + "mem allocated avg": 7071318118.4, + "mem reserved avg": 15188732411.904, + "elapsed time": 1829.5188424160006 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.7664959132671356, + "train samples": 17000, + "train time": 46.26589757904003, + "eval time": 15.853440922001028, + "tokens / sec": 4569.002463182864, + "mem allocated avg": 7081992153.088, + "mem reserved avg": 15327354159.104, + "elapsed time": 1944.2481972599926 + }, + { + "step": 4500, + "valid accuracy": 0.34, + "train loss": 0.7785169410705567, + "train samples": 18000, + "train time": 45.61058669183694, + "eval time": 15.866839458991308, + "tokens / sec": 4556.354457882774, + "mem allocated avg": 7075963725.824, + "mem reserved avg": 15250623561.728, + "elapsed time": 2058.0909812989994 + }, + { + "step": 4750, + "valid accuracy": 0.32, + "train loss": 0.7709811532497406, + "train samples": 19000, + "train time": 45.832340708962874, + "eval time": 15.847010081997723, + "tokens / sec": 4580.586475674911, + "mem 
allocated avg": 7079141249.024, + "mem reserved avg": 15295871713.28, + "elapsed time": 2172.3217773149954 + }, + { + "step": 5000, + "valid accuracy": 0.3, + "train loss": 0.7790318930149078, + "train samples": 20000, + "train time": 44.844002045996604, + "eval time": 15.846091532002902, + "tokens / sec": 4644.545323728393, + "mem allocated avg": 7075675734.016, + "mem reserved avg": 15251831521.28, + "elapsed time": 2285.3788618499966 + }, + { + "step": 5000, + "test accuracy": 0.25246398786959817, + "train loss": 0.7790318930149078, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/ptuning--llama-3.2-3B-default.json b/MetaMathQA/results/ptuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..e111eeafb4670a0a55be6caf1f876bf57d36ee33 --- /dev/null +++ 
b/MetaMathQA/results/ptuning--llama-3.2-3B-default.json @@ -0,0 +1,348 @@ +{ + "run_info": { + "created_at": "2025-06-19T19:48:53+00:00", + "total_time": 1918.2703526590012, + "experiment_name": "ptuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "P_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 20, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "encoder_reparameterization_type": "MLP", + "encoder_hidden_size": 3072, + "encoder_num_layers": 2, + "encoder_dropout": 0.0 + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11867101593, + "cuda_memory_max": 20937965568, + "cuda_memory_reserved_99th": 17215688540, + "train_time": 1707.340225783013, + "file_size": 245880, + "num_trainable_params": 28382208, + "num_total_params": 3241132032, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.06, + "train loss": 0.9461167964935303, + "train samples": 1000, + "train time": 29.476242057011405, + "eval time": 11.075081511000462, + "tokens / sec": 7182.699870305862, + "mem allocated avg": 7263395393.536, + "mem reserved avg": 11910330187.776, + "elapsed time": 89.09710205499869 + }, + { + 
"step": 500, + "valid accuracy": 0.3, + "train loss": 0.7913461194038391, + "train samples": 2000, + "train time": 28.956617519994325, + "eval time": 11.047425028998987, + "tokens / sec": 7182.986751003671, + "mem allocated avg": 7255670497.28, + "mem reserved avg": 11810254094.336, + "elapsed time": 171.9022758780011 + }, + { + "step": 750, + "valid accuracy": 0.26, + "train loss": 0.7562740923166275, + "train samples": 3000, + "train time": 29.73533859500094, + "eval time": 11.056799476999004, + "tokens / sec": 7210.309689765724, + "mem allocated avg": 7266187038.72, + "mem reserved avg": 11954009669.632, + "elapsed time": 255.8485612350014 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7289484927654266, + "train samples": 4000, + "train time": 29.176458327034197, + "eval time": 11.069810884997423, + "tokens / sec": 7140.551387861937, + "mem allocated avg": 7258589235.2, + "mem reserved avg": 11838347542.528, + "elapsed time": 338.5030210529985 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.7231850942373276, + "train samples": 5000, + "train time": 29.15449026899296, + "eval time": 11.055301014999714, + "tokens / sec": 7152.860436794844, + "mem allocated avg": 7257714087.936, + "mem reserved avg": 11824925769.728, + "elapsed time": 421.85765765199903 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.711922277212143, + "train samples": 6000, + "train time": 29.099172437985544, + "eval time": 11.07098460600173, + "tokens / sec": 7193.709733364892, + "mem allocated avg": 7259322730.496, + "mem reserved avg": 11860233420.8, + "elapsed time": 504.97678817400083 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.7051182547807694, + "train samples": 7000, + "train time": 29.301267419017677, + "eval time": 11.044947161997698, + "tokens / sec": 7144.912778213831, + "mem allocated avg": 7260392302.592, + "mem reserved avg": 11872371736.576, + "elapsed time": 588.3443257949984 + }, + { + "step": 
2000, + "valid accuracy": 0.38, + "train loss": 0.7055468891859055, + "train samples": 8000, + "train time": 29.128185330951965, + "eval time": 11.045154800998716, + "tokens / sec": 7130.413296955362, + "mem allocated avg": 7257253203.968, + "mem reserved avg": 11821100564.48, + "elapsed time": 671.2971968860002 + }, + { + "step": 2250, + "valid accuracy": 0.3, + "train loss": 0.699348534822464, + "train samples": 9000, + "train time": 29.44214156106318, + "eval time": 11.039785496999684, + "tokens / sec": 7300.691750095574, + "mem allocated avg": 7268387997.696, + "mem reserved avg": 11993788448.768, + "elapsed time": 755.1838785660002 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.6970288401842117, + "train samples": 10000, + "train time": 28.56064905500898, + "eval time": 11.062792377000733, + "tokens / sec": 7211.565801718971, + "mem allocated avg": 7253500915.712, + "mem reserved avg": 11774535401.472, + "elapsed time": 837.4507786270005 + }, + { + "step": 2750, + "valid accuracy": 0.38, + "train loss": 0.6885807738304138, + "train samples": 11000, + "train time": 29.626391561985656, + "eval time": 11.040969151999889, + "tokens / sec": 7151.765329121947, + "mem allocated avg": 7264164755.456, + "mem reserved avg": 11929330384.896, + "elapsed time": 921.4017121549987 + }, + { + "step": 3000, + "valid accuracy": 0.32, + "train loss": 0.6827223267555237, + "train samples": 12000, + "train time": 29.296160228008375, + "eval time": 11.056816091997462, + "tokens / sec": 7124.85862909926, + "mem allocated avg": 7259324233.728, + "mem reserved avg": 11842046918.656, + "elapsed time": 1004.5840267519998 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6894591153860092, + "train samples": 13000, + "train time": 29.611147850035195, + "eval time": 11.049655115999485, + "tokens / sec": 7122.351388338677, + "mem allocated avg": 7259635709.952, + "mem reserved avg": 11876809310.208, + "elapsed time": 1088.4846693049985 + }, + { + 
"step": 3500, + "valid accuracy": 0.42, + "train loss": 0.6757243422269821, + "train samples": 14000, + "train time": 28.982272775025194, + "eval time": 8.037888349997957, + "tokens / sec": 7237.182591861713, + "mem allocated avg": 7260029884.416, + "mem reserved avg": 11864100569.088, + "elapsed time": 1168.5907526180017 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.6726652181148529, + "train samples": 15000, + "train time": 29.461453213014465, + "eval time": 11.036738884999068, + "tokens / sec": 7355.475591552708, + "mem allocated avg": 7270358327.296, + "mem reserved avg": 12018115411.968, + "elapsed time": 1252.6760096750004 + }, + { + "step": 4000, + "valid accuracy": 0.44, + "train loss": 0.6872537672519684, + "train samples": 16000, + "train time": 28.49340438899526, + "eval time": 11.04012111100019, + "tokens / sec": 7172.642384527876, + "mem allocated avg": 7252451676.16, + "mem reserved avg": 11753454829.568, + "elapsed time": 1334.9961819890013 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6691881531476974, + "train samples": 17000, + "train time": 29.36704957404436, + "eval time": 11.048986494999554, + "tokens / sec": 7198.169481309866, + "mem allocated avg": 7262467567.616, + "mem reserved avg": 11896405098.496, + "elapsed time": 1418.9507249929993 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.6769082483053207, + "train samples": 18000, + "train time": 29.086171291994106, + "eval time": 8.132250926999404, + "tokens / sec": 7144.907382746569, + "mem allocated avg": 7257195100.16, + "mem reserved avg": 11816553938.944, + "elapsed time": 1499.0322536989988 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.6686601461172104, + "train samples": 19000, + "train time": 29.45103387799827, + "eval time": 7.564945229998557, + "tokens / sec": 7128.408492200246, + "mem allocated avg": 7260019183.616, + "mem reserved avg": 11863848910.848, + "elapsed time": 1579.1494789060016 + }, 
+ { + "step": 5000, + "valid accuracy": 0.48, + "train loss": 0.6739867876768112, + "train samples": 20000, + "train time": 29.24236888399173, + "eval time": 6.952750485001161, + "tokens / sec": 7122.541980995923, + "mem allocated avg": 7256318291.968, + "mem reserved avg": 11821469663.232, + "elapsed time": 1658.0220765080012 + }, + { + "step": 5000, + "test accuracy": 0.3707354056103108, + "train loss": 0.6739867876768112, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/randlora--llama-3.2-3B-default.json b/MetaMathQA/results/randlora--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..8c878847194ebd6257116b0c979b6958a5da3264 --- /dev/null +++ 
b/MetaMathQA/results/randlora--llama-3.2-3B-default.json @@ -0,0 +1,356 @@ +{ + "run_info": { + "created_at": "2025-06-20T07:20:24+00:00", + "total_time": 2457.3893872150074, + "experiment_name": "randlora/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "RANDLORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "v_proj", + "q_proj" + ], + "projection_prng_key": 0, + "save_projection": true, + "sparse": false, + "very_sparse": false, + "randlora_dropout": 0.0, + "fan_in_fan_out": false, + "randlora_alpha": 640, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 12743670025, + "cuda_memory_max": 22798139392, + "cuda_memory_reserved_99th": 18436063232, + "train_time": 2213.072415724004, + "file_size": 2211281240, + "num_trainable_params": 9289728, + "num_total_params": 3222039552, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.38, + "train loss": 0.9159075767993927, + "train samples": 1000, + "train time": 50.62416129904159, + "eval time": 13.32173753400275, + "tokens / sec": 4182.172989481373, + "mem allocated avg": 
6983776778.24, + "mem reserved avg": 12791771561.984, + "elapsed time": 114.85611474100733 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.7009325810670852, + "train samples": 2000, + "train time": 49.47734279213182, + "eval time": 13.318595108998124, + "tokens / sec": 4203.843380875268, + "mem allocated avg": 6975756310.528, + "mem reserved avg": 12690437177.344, + "elapsed time": 222.717683150011 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6809726172685623, + "train samples": 3000, + "train time": 50.701564677088754, + "eval time": 6.592474952994962, + "tokens / sec": 4228.6860645325305, + "mem allocated avg": 6985956540.416, + "mem reserved avg": 12840031223.808, + "elapsed time": 325.2694208340108 + }, + { + "step": 1000, + "valid accuracy": 0.32, + "train loss": 0.6661903276443482, + "train samples": 4000, + "train time": 49.452677299879724, + "eval time": 13.326040301006287, + "tokens / sec": 4212.835611238114, + "mem allocated avg": 6977344550.912, + "mem reserved avg": 12711484194.816, + "elapsed time": 432.82023598300293 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.665697453379631, + "train samples": 5000, + "train time": 49.56871296803001, + "eval time": 6.698036557994783, + "tokens / sec": 4207.0489127789, + "mem allocated avg": 6977509738.496, + "mem reserved avg": 12708397187.072, + "elapsed time": 534.2725243740133 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.658678293466568, + "train samples": 6000, + "train time": 49.71162069692218, + "eval time": 13.42559558400535, + "tokens / sec": 4210.906767176883, + "mem allocated avg": 6978434217.984, + "mem reserved avg": 12733680451.584, + "elapsed time": 642.8949007330084 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.6513392345905304, + "train samples": 7000, + "train time": 49.957065908936784, + "eval time": 8.692238900999655, + "tokens / sec": 4190.698476600257, + "mem allocated avg": 
6980155148.288, + "mem reserved avg": 12746875731.968, + "elapsed time": 746.9297674360132 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6511732361316681, + "train samples": 8000, + "train time": 49.75638979690848, + "eval time": 13.350251003997982, + "tokens / sec": 4174.257835983607, + "mem allocated avg": 6976487055.36, + "mem reserved avg": 12692744044.544, + "elapsed time": 855.1831161730079 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.6382467728853226, + "train samples": 9000, + "train time": 51.20128064297023, + "eval time": 13.277926524999202, + "tokens / sec": 4198.098119827237, + "mem allocated avg": 6988260448.256, + "mem reserved avg": 12868644765.696, + "elapsed time": 965.4057929810078 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6324679807424546, + "train samples": 10000, + "train time": 47.79617171884456, + "eval time": 13.268003197008511, + "tokens / sec": 4309.278182603765, + "mem allocated avg": 6973276801.024, + "mem reserved avg": 12640810172.416, + "elapsed time": 1071.8066454930085 + }, + { + "step": 2750, + "valid accuracy": 0.42, + "train loss": 0.6214727911949157, + "train samples": 11000, + "train time": 49.63376283789694, + "eval time": 8.245235234993743, + "tokens / sec": 4268.888512281446, + "mem allocated avg": 6983764305.92, + "mem reserved avg": 12802987130.88, + "elapsed time": 1175.2128759590123 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.6079807863235474, + "train samples": 12000, + "train time": 49.776777152961586, + "eval time": 13.29031453501375, + "tokens / sec": 4193.340990289104, + "mem allocated avg": 6978680711.168, + "mem reserved avg": 12727900700.672, + "elapsed time": 1283.6379908250092 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6133705099821091, + "train samples": 13000, + "train time": 50.014745363077964, + "eval time": 7.092912267995416, + "tokens / sec": 4216.77644200688, + "mem allocated avg": 
6980747913.216, + "mem reserved avg": 12754257707.008, + "elapsed time": 1386.1836155580095 + }, + { + "step": 3500, + "valid accuracy": 0.52, + "train loss": 0.5912622555494308, + "train samples": 14000, + "train time": 49.560089439110016, + "eval time": 13.321606318990234, + "tokens / sec": 4232.236107194697, + "mem allocated avg": 6979099045.888, + "mem reserved avg": 12738579398.656, + "elapsed time": 1494.848658177012 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.5849999967813492, + "train samples": 15000, + "train time": 51.10861245104752, + "eval time": 13.350408840997261, + "tokens / sec": 4240.048586870968, + "mem allocated avg": 6990205292.544, + "mem reserved avg": 12906016014.336, + "elapsed time": 1605.1250539940083 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5914600425958634, + "train samples": 16000, + "train time": 48.92153307204717, + "eval time": 13.309209176004515, + "tokens / sec": 4177.567364845621, + "mem allocated avg": 6971749750.784, + "mem reserved avg": 12621868695.552, + "elapsed time": 1712.7276146870136 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.575433883190155, + "train samples": 17000, + "train time": 50.056106529867975, + "eval time": 13.322275185011677, + "tokens / sec": 4223.041196259767, + "mem allocated avg": 6981706383.36, + "mem reserved avg": 12772226105.344, + "elapsed time": 1821.8377219670074 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5807004086971282, + "train samples": 18000, + "train time": 49.559131018977496, + "eval time": 13.371259606996318, + "tokens / sec": 4193.334219690434, + "mem allocated avg": 6976847835.136, + "mem reserved avg": 12694061056.0, + "elapsed time": 1929.8977656790084 + }, + { + "step": 4750, + "valid accuracy": 0.52, + "train loss": 0.5704656873941422, + "train samples": 19000, + "train time": 49.80019182183605, + "eval time": 13.346957685993402, + "tokens / sec": 4215.62633234572, + "mem 
allocated avg": 6979789905.92, + "mem reserved avg": 12742303940.608, + "elapsed time": 2038.7162974260136 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5784689987897873, + "train samples": 20000, + "train time": 49.38921916205436, + "eval time": 13.307282750000013, + "tokens / sec": 4217.114656471855, + "mem allocated avg": 6976297842.688, + "mem reserved avg": 12688323248.128, + "elapsed time": 2146.6737093550037 + }, + { + "step": 5000, + "test accuracy": 0.5072024260803639, + "train loss": 0.5784689987897873, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/vblora--llama-3.2-3B-default.json b/MetaMathQA/results/vblora--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..818a4491f9bc50ce03efe6d4effb18de2b6bd862 --- /dev/null +++ 
b/MetaMathQA/results/vblora--llama-3.2-3B-default.json @@ -0,0 +1,357 @@ +{ + "run_info": { + "created_at": "2025-06-19T23:49:12+00:00", + "total_time": 2210.184595478997, + "experiment_name": "vblora/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "VBLORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 4, + "num_vectors": 256, + "vector_length": 256, + "topk": 2, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "save_only_topk_weights": false, + "vblora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_vector_bank_bound": 0.02, + "init_logits_std": 0.1, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11735344663, + "cuda_memory_max": 22181576704, + "cuda_memory_reserved_99th": 17635223797, + "train_time": 1961.761054087001, + "file_size": 4864912, + "num_trainable_params": 1212416, + "num_total_params": 3213962240, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.308416832447052, + "train samples": 1000, + "train time": 40.12101128498034, + "eval time": 12.847303112997906, + "tokens / sec": 5277.010554299236, + "mem 
allocated avg": 6798303909.888, + "mem reserved avg": 11786547888.128, + "elapsed time": 101.0704645309961 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 1.0353211228847503, + "train samples": 2000, + "train time": 39.44148263899842, + "eval time": 12.918682472998626, + "tokens / sec": 5273.508653408011, + "mem allocated avg": 6790843463.68, + "mem reserved avg": 11678368399.36, + "elapsed time": 195.96383863499796 + }, + { + "step": 750, + "valid accuracy": 0.3, + "train loss": 0.8149400608539581, + "train samples": 3000, + "train time": 39.73428049797076, + "eval time": 12.816220013999555, + "tokens / sec": 5395.869695210651, + "mem allocated avg": 6801506754.56, + "mem reserved avg": 11833448595.456, + "elapsed time": 291.3562671039981 + }, + { + "step": 1000, + "valid accuracy": 0.34, + "train loss": 0.766725031375885, + "train samples": 4000, + "train time": 39.66955411599338, + "eval time": 12.954798815000686, + "tokens / sec": 5251.785774824381, + "mem allocated avg": 6791902521.344, + "mem reserved avg": 11700715651.072, + "elapsed time": 386.48246048299916 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.7548577107191086, + "train samples": 5000, + "train time": 39.58740361595119, + "eval time": 12.921318173001055, + "tokens / sec": 5267.786744063522, + "mem allocated avg": 6792241373.184, + "mem reserved avg": 11698584944.64, + "elapsed time": 481.6297270110008 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.744083244919777, + "train samples": 6000, + "train time": 39.58100679998461, + "eval time": 7.990361490003124, + "tokens / sec": 5288.672950079719, + "mem allocated avg": 6792975718.4, + "mem reserved avg": 11729908006.912, + "elapsed time": 571.7995823969977 + }, + { + "step": 1750, + "valid accuracy": 0.4, + "train loss": 0.7353366105556488, + "train samples": 7000, + "train time": 39.89848869200068, + "eval time": 9.013003496002057, + "tokens / sec": 5247.191231129864, + "mem allocated 
avg": 6795538806.784, + "mem reserved avg": 11737281593.344, + "elapsed time": 663.3418345429964 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.735884799003601, + "train samples": 8000, + "train time": 39.573877232054656, + "eval time": 13.005171182994673, + "tokens / sec": 5248.3106161699825, + "mem allocated avg": 6792414906.368, + "mem reserved avg": 11683577724.928, + "elapsed time": 758.2818646219966 + }, + { + "step": 2250, + "valid accuracy": 0.34, + "train loss": 0.7294247032403945, + "train samples": 9000, + "train time": 40.16309046502283, + "eval time": 12.827437616993848, + "tokens / sec": 5351.878989172747, + "mem allocated avg": 6803159742.464, + "mem reserved avg": 11872145244.16, + "elapsed time": 854.2901024749954 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.7273153622150421, + "train samples": 10000, + "train time": 39.40831322706072, + "eval time": 12.578817460002028, + "tokens / sec": 5226.48606686793, + "mem allocated avg": 6788682082.304, + "mem reserved avg": 11624815525.888, + "elapsed time": 948.6046375669976 + }, + { + "step": 2750, + "valid accuracy": 0.3, + "train loss": 0.7221734907627105, + "train samples": 11000, + "train time": 39.99460277392063, + "eval time": 12.831681943003787, + "tokens / sec": 5297.739827488966, + "mem allocated avg": 6798795204.608, + "mem reserved avg": 11800045158.4, + "elapsed time": 1044.2291650330008 + }, + { + "step": 3000, + "valid accuracy": 0.44, + "train loss": 0.7163265677690506, + "train samples": 12000, + "train time": 39.72457089692762, + "eval time": 9.721318816998973, + "tokens / sec": 5254.455750864856, + "mem allocated avg": 6794274019.328, + "mem reserved avg": 11717786468.352, + "elapsed time": 1136.276137433997 + }, + { + "step": 3250, + "valid accuracy": 0.24, + "train loss": 0.7239821909666061, + "train samples": 13000, + "train time": 39.57894092098286, + "eval time": 12.939295003001462, + "tokens / sec": 5328.616559524724, + "mem allocated 
avg": 6796102031.36, + "mem reserved avg": 11749923225.6, + "elapsed time": 1231.5789504099957 + }, + { + "step": 3500, + "valid accuracy": 0.3, + "train loss": 0.7123430745601654, + "train samples": 14000, + "train time": 39.774808847985696, + "eval time": 12.81972120499995, + "tokens / sec": 5273.438291096208, + "mem allocated avg": 6794877718.528, + "mem reserved avg": 11727257206.784, + "elapsed time": 1327.2042175199967 + }, + { + "step": 3750, + "valid accuracy": 0.32, + "train loss": 0.7080619329214096, + "train samples": 15000, + "train time": 40.429172475058294, + "eval time": 12.810948685997573, + "tokens / sec": 5360.065188910042, + "mem allocated avg": 6804612114.432, + "mem reserved avg": 11907847159.808, + "elapsed time": 1424.0900874009967 + }, + { + "step": 4000, + "valid accuracy": 0.42, + "train loss": 0.7257569855451584, + "train samples": 16000, + "train time": 39.64596449997771, + "eval time": 12.844434396996803, + "tokens / sec": 5154.950890401844, + "mem allocated avg": 6787030419.456, + "mem reserved avg": 11605689499.648, + "elapsed time": 1519.344677588997 + }, + { + "step": 4250, + "valid accuracy": 0.38, + "train loss": 0.7041294666528701, + "train samples": 17000, + "train time": 39.938396073041076, + "eval time": 12.83599700799823, + "tokens / sec": 5292.876549508964, + "mem allocated avg": 6797280624.64, + "mem reserved avg": 11765333098.496, + "elapsed time": 1614.829351510998 + }, + { + "step": 4500, + "valid accuracy": 0.38, + "train loss": 0.7148806138038635, + "train samples": 18000, + "train time": 39.55479707601626, + "eval time": 12.901207727001747, + "tokens / sec": 5253.926586972907, + "mem allocated avg": 6791958038.528, + "mem reserved avg": 11679299534.848, + "elapsed time": 1710.0793527279966 + }, + { + "step": 4750, + "valid accuracy": 0.32, + "train loss": 0.7083848255872727, + "train samples": 19000, + "train time": 39.88160159892141, + "eval time": 12.8585010780007, + "tokens / sec": 5264.056396513368, + "mem 
allocated avg": 6794248144.896, + "mem reserved avg": 11730923028.48, + "elapsed time": 1806.240128286001 + }, + { + "step": 5000, + "valid accuracy": 0.36, + "train loss": 0.7142883945703506, + "train samples": 20000, + "train time": 39.631631882970396, + "eval time": 12.818420095005422, + "tokens / sec": 5255.398026885119, + "mem allocated avg": 6791235981.312, + "mem reserved avg": 11677395320.832, + "elapsed time": 1901.3912653839943 + }, + { + "step": 5000, + "test accuracy": 0.36997725549658833, + "train loss": 0.7142883945703506, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/results/vera--llama-3.2-3B-default.json b/MetaMathQA/results/vera--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..cd834c83a3ffdcac46eea84f89434579897fef90 --- /dev/null +++ 
b/MetaMathQA/results/vera--llama-3.2-3B-default.json @@ -0,0 +1,353 @@ +{ + "run_info": { + "created_at": "2025-06-19T20:53:39+00:00", + "total_time": 2024.6820476150024, + "experiment_name": "vera/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "VERA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 256, + "target_modules": [ + "v_proj", + "q_proj" + ], + "projection_prng_key": 0, + "save_projection": true, + "vera_dropout": 0.0, + "d_initial": 0.1, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "cuda_memory_reserved_avg": 11489715316, + "cuda_memory_max": 21596471296, + "cuda_memory_reserved_99th": 17291123097, + "train_time": 1819.9693055349999, + "file_size": 6821968, + "num_trainable_params": 129024, + "num_total_params": 3212878848, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3017588877677917, + "train samples": 1000, + "train time": 32.843521857023006, + "eval time": 11.480974874997628, + "tokens / sec": 6446.294064372017, + "mem allocated avg": 6784826523.648, + "mem reserved avg": 11538438029.312, + "elapsed time": 95.45296428899746 + }, 
+ { + "step": 500, + "valid accuracy": 0.28, + "train loss": 1.0202219936847687, + "train samples": 2000, + "train time": 32.35236015598639, + "eval time": 11.4980273259971, + "tokens / sec": 6429.051821788439, + "mem allocated avg": 6777359808.512, + "mem reserved avg": 11429948162.048, + "elapsed time": 183.95939499299857 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.8040032889842987, + "train samples": 3000, + "train time": 32.52055500800634, + "eval time": 11.426841341002728, + "tokens / sec": 6592.784162115804, + "mem allocated avg": 6787965165.568, + "mem reserved avg": 11585061912.576, + "elapsed time": 272.8589564269969 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7544035723209381, + "train samples": 4000, + "train time": 32.27830113501477, + "eval time": 11.54098314699877, + "tokens / sec": 6454.3669485133405, + "mem allocated avg": 6779215933.44, + "mem reserved avg": 11460172316.672, + "elapsed time": 361.1500098109973 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.7379197257757187, + "train samples": 5000, + "train time": 32.060909217962035, + "eval time": 11.406497389998549, + "tokens / sec": 6504.431879404317, + "mem allocated avg": 6779128844.288, + "mem reserved avg": 11454770053.12, + "elapsed time": 449.3482204989996 + }, + { + "step": 1500, + "valid accuracy": 0.4, + "train loss": 0.7252234178781509, + "train samples": 6000, + "train time": 31.98088176901365, + "eval time": 11.480169268001191, + "tokens / sec": 6545.504326988923, + "mem allocated avg": 6780286265.344, + "mem reserved avg": 11479667441.664, + "elapsed time": 537.3097453219998 + }, + { + "step": 1750, + "valid accuracy": 0.4, + "train loss": 0.7148357192277909, + "train samples": 7000, + "train time": 32.29452324002341, + "eval time": 11.44221062500219, + "tokens / sec": 6482.678144650271, + "mem allocated avg": 6782215264.256, + "mem reserved avg": 11493600919.552, + "elapsed time": 625.780868398997 + }, + { + 
"step": 2000, + "valid accuracy": 0.4, + "train loss": 0.7139411936998368, + "train samples": 8000, + "train time": 32.33002986999054, + "eval time": 11.472246884000924, + "tokens / sec": 6424.243987253105, + "mem allocated avg": 6778636718.08, + "mem reserved avg": 11439217573.888, + "elapsed time": 714.3076436519987 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.7067342863082886, + "train samples": 9000, + "train time": 32.69249906902769, + "eval time": 11.424881449998793, + "tokens / sec": 6574.841511692143, + "mem allocated avg": 6789716504.576, + "mem reserved avg": 11617542602.752, + "elapsed time": 803.4051666009982 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.7048580280542374, + "train samples": 10000, + "train time": 31.796681229010574, + "eval time": 11.401134708998143, + "tokens / sec": 6477.625715607085, + "mem allocated avg": 6775192217.6, + "mem reserved avg": 11386755219.456, + "elapsed time": 890.7853266579987 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.6994425257444382, + "train samples": 11000, + "train time": 32.589996781029186, + "eval time": 6.453208308001194, + "tokens / sec": 6501.412118068606, + "mem allocated avg": 6785945655.296, + "mem reserved avg": 11552530890.752, + "elapsed time": 974.6122346880002 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6912879683971405, + "train samples": 12000, + "train time": 32.34826778500428, + "eval time": 11.457833226999355, + "tokens / sec": 6452.617536966281, + "mem allocated avg": 6780318763.008, + "mem reserved avg": 11474030297.088, + "elapsed time": 1062.897270567999 + }, + { + "step": 3250, + "valid accuracy": 0.4, + "train loss": 0.700449936747551, + "train samples": 13000, + "train time": 32.51472582996939, + "eval time": 8.004199169998174, + "tokens / sec": 6486.322569744963, + "mem allocated avg": 6782387701.76, + "mem reserved avg": 11501452656.64, + "elapsed time": 1148.3985279560002 + }, + { + 
"step": 3500, + "valid accuracy": 0.36, + "train loss": 0.6886729755401612, + "train samples": 14000, + "train time": 32.572147220984334, + "eval time": 11.456443364000734, + "tokens / sec": 6439.550901479111, + "mem allocated avg": 6781381988.352, + "mem reserved avg": 11484943876.096, + "elapsed time": 1237.2252680229976 + }, + { + "step": 3750, + "valid accuracy": 0.38, + "train loss": 0.6851948540210724, + "train samples": 15000, + "train time": 32.8770313250061, + "eval time": 8.042231839001033, + "tokens / sec": 6591.318962402083, + "mem allocated avg": 6791807023.104, + "mem reserved avg": 11653781389.312, + "elapsed time": 1323.4750151669978 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.7032276903390884, + "train samples": 16000, + "train time": 31.65130396198947, + "eval time": 7.9955749260007, + "tokens / sec": 6457.016754994822, + "mem allocated avg": 6773653422.08, + "mem reserved avg": 11367989903.36, + "elapsed time": 1407.2714081800004 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.684476065993309, + "train samples": 17000, + "train time": 32.02934406197164, + "eval time": 8.007123895000404, + "tokens / sec": 6599.854170943876, + "mem allocated avg": 6784119472.128, + "mem reserved avg": 11519949537.28, + "elapsed time": 1492.0019941529972 + }, + { + "step": 4500, + "valid accuracy": 0.38, + "train loss": 0.6939880999326706, + "train samples": 18000, + "train time": 31.936327281997364, + "eval time": 9.855819755000994, + "tokens / sec": 6507.260467522446, + "mem allocated avg": 6777879162.88, + "mem reserved avg": 11436331892.736, + "elapsed time": 1578.2498042659972 + }, + { + "step": 4750, + "valid accuracy": 0.36, + "train loss": 0.68637368786335, + "train samples": 19000, + "train time": 32.33460194401778, + "eval time": 6.469711448000453, + "tokens / sec": 6492.704019164238, + "mem allocated avg": 6781104441.344, + "mem reserved avg": 11484004352.0, + "elapsed time": 1662.171022565999 + }, + { + 
"step": 5000, + "valid accuracy": 0.38, + "train loss": 0.6926896897554398, + "train samples": 20000, + "train time": 32.14674746405217, + "eval time": 8.441190715999255, + "tokens / sec": 6479.038049896257, + "mem allocated avg": 6777818853.376, + "mem reserved avg": 11434117300.224, + "elapsed time": 1747.4833575960001 + }, + { + "step": 5000, + "test accuracy": 0.3684609552691433, + "train loss": 0.6926896897554398, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "gpu": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/MetaMathQA/run.py b/MetaMathQA/run.py new file mode 100644 index 0000000000000000000000000000000000000000..467548d3b4cb910b59a64ccc65e6e7a91a1848cf --- /dev/null +++ b/MetaMathQA/run.py @@ -0,0 +1,466 @@ +# Copyright 2025-present the HuggingFace Inc. 
team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Main entry point to run the experiments. Contains general setup and the proper training code. +""" + +import argparse +import datetime as dt +import gc +import json +import os +import random +import sys +import textwrap +import time +from contextlib import ContextManager, nullcontext +from functools import partial +from typing import Any, Callable, Literal, Optional + +import torch +from torch import nn +from torch.amp import GradScaler, autocast +from tqdm import tqdm +from transformers import GenerationConfig, set_seed +from utils import ( + FILE_NAME_TRAIN_PARAMS, + BucketIterator, + TrainResult, + TrainStatus, + get_accuracy, + get_base_model_info, + get_dataset_info, + get_file_size, + get_model, + get_optimizer_and_scheduler, + get_peft_branch, + get_tokenizer, + get_train_config, + init_cuda, + log_results, + validate_experiment_path, +) + +from data import get_train_valid_test_datasets +from peft import AdaLoraConfig, PeftConfig +from peft.utils import CONFIG_NAME + + +# # suppress all warnings +# warnings.filterwarnings("ignore") # FIXME? 

# Number of bytes occupied by a single element of each dtype name (int4 is half a byte).
dtype_to_bytes_linear = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5}
BUCKET_FACTOR = 20  # number of batches per bucket, increasing this further has diminishing returns


def get_generation_config(*, seq_len, generate_kwargs) -> GenerationConfig:
    """Build the ``GenerationConfig`` used for one evaluation batch.

    Args:
        seq_len: Length (in tokens) of the padded prompt batch; used to convert a ``max_new_tokens`` budget into an
            absolute ``max_length``.
        generate_kwargs: Mapping of generation arguments. It is not modified. Entries whose value is ``None`` are
            dropped so that the ``GenerationConfig`` defaults apply for them.

    Returns:
        A ``GenerationConfig`` built from the filtered arguments. If both ``max_length`` and ``max_new_tokens`` were
        given, only ``max_length`` is passed on, set to ``min(max_new_tokens + seq_len, max_length)``.
    """
    # filter out None values so that we don't depend on setting correct defaults in the config
    generation_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}
    if ("max_length" in generation_kwargs) and ("max_new_tokens" in generation_kwargs):
        # transformers does not support setting both max_length and max_new_tokens, but what we want in this case is to
        # take the smaller of the two values
        new_max_length = min(generation_kwargs["max_new_tokens"] + seq_len, generation_kwargs["max_length"])
        del generation_kwargs["max_new_tokens"]
        generation_kwargs["max_length"] = new_max_length
    # Use the filtered/reconciled dict here. Passing the raw `generate_kwargs` would silently discard the None
    # filtering and the max_length reconciliation performed above.
    generation_config = GenerationConfig(**generation_kwargs)
    return generation_config


def evaluate(model, tokenizer, ds, batch_size, generate_kwargs, use_tqdm: bool = False) -> tuple[list[str], list[str]]:
    """Generate a prediction for every sample of ``ds`` and collect the matching reference responses.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to left-pad each batch and to decode the generated ids.
        ds: Dataset that supports ``len`` and slicing; each slice must support ``pop("response")``.
            NOTE(review): presumably a Hugging Face dataset whose slices are dicts of columns - confirm against caller.
        batch_size: Number of samples per generation batch.
        generate_kwargs: Generation arguments, forwarded to ``get_generation_config`` for every batch.
        use_tqdm: Whether to show a progress bar over the batches.

    Returns:
        A tuple ``(predictions, responses)``: the decoded ``generate`` outputs and the reference responses, in
        dataset order.
    """
    with torch.inference_mode():
        predictions = []
        responses = []
        pbar = range(0, len(ds), batch_size)
        if use_tqdm:
            pbar = tqdm(pbar)
        for j in pbar:
            sliced = ds[j : j + batch_size]
            # the reference answers are kept aside; the remaining columns are padded and fed to the model
            responses += sliced.pop("response")
            batch = tokenizer.pad(sliced, return_tensors="pt", padding_side="left").to(model.device)
            # the padded prompt length differs per batch, so the generation config is rebuilt each time
            seq_len = batch["input_ids"].shape[1]
            generation_config = get_generation_config(seq_len=seq_len, generate_kwargs=generate_kwargs)
            outputs = model.generate(**batch, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id)
            predictions += tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return predictions, responses


class DummyGradScaler:
    """No-op stand-in for a gradient scaler."""

    # if no mixed precision is being used
    def scale(self, loss):
        # nothing to scale: hand the loss back unchanged
        return loss

    def unscale_(self, optimizer):
        # gradients were never scaled, so there is nothing to undo
        pass

+ def step(self, optimizer): + optimizer.step() + + def update(self): + pass + + +def train( + *, + model: nn.Module, + max_steps: int, + batch_size: int, + batch_size_eval: int, + tokenizer: Any, + cuda_memory_init: int, + eval_steps: int, + generation_kwargs: dict[str, Any], + grad_norm_clip: float, + optimizer_type: str, + optimizer_kwargs: dict[str, Any], + query_template: str, + lr_scheduler_arg: Optional[Literal["cosine"]], + use_amp: bool, + is_adalora: bool, +) -> TrainResult: + cuda_memory_allocated_log = [] + cuda_memory_reserved_log = [] + losses = [] + durations = [] + metrics = [] + sample = 0 # keep count of the current sample + total_samples = 0 # total number of samples over all epochs + total_tokens = [] # total number of tokens over all epochs + if use_amp: + grad_scaler: GradScaler | DummyGradScaler = GradScaler(device="cuda") + autocast_ctx: Callable[[], ContextManager[Any]] = partial(autocast, device_type="cuda") + else: + grad_scaler = DummyGradScaler() + autocast_ctx = nullcontext + + optimizer, lr_scheduler = get_optimizer_and_scheduler( + model, + optimizer_type=optimizer_type, + max_steps=max_steps, + lr_scheduler_arg=lr_scheduler_arg, + **optimizer_kwargs, + ) + # print this after getting the optimizer, in case it modifies requires_gard + if hasattr(model, "get_nb_trainable_parameters"): + num_trainable_params, num_params = model.get_nb_trainable_parameters() + else: + num_params = model.num_parameters() + num_trainable_params = num_params + print_verbose( + f"trainable params: {num_trainable_params:,d} || all params: {num_params:,d} || " + f"trainable: {100 * num_trainable_params / num_params:.4f}%" + ) + + status = TrainStatus.FAILED + tic_train = time.perf_counter() + eval_time = 0.0 + error_msg = "" + + ds_train, ds_valid, ds_test = get_train_valid_test_datasets( + tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose + ) + # note: bucketing by length is only really worth it for the train dataset, since it's 
length is big compared to the + # batch size + iterator_train = BucketIterator( + ds_train, + batch_size=batch_size, + bucket_factor=BUCKET_FACTOR, + delete_cols=["response"], + ) + try: + pbar = tqdm(range(1, max_steps + 1)) + for step, batch in zip(pbar, iterator_train): + tic = time.perf_counter() + + # create the batch + tokens_per_sample = [len(i) for i in batch["input_ids"]] + total_tokens.append(sum(tokens_per_sample) + len(tokens_per_sample)) # add EOS token + batch = tokenizer.pad(batch, return_tensors="pt").to(model.device) + actual_batch_size = len(batch["input_ids"]) + total_samples += actual_batch_size + sample += batch_size + if sample >= len(ds_train): # new epoch + sample = 0 + + # add labels, they are automatically shifted by transformers + labels = batch["input_ids"].clone() + # We want to ignore the padding tokens except for the first EOS token; if we don't ignore them, the loss + # will be dominated by padding tokens; if we ignore all, the model will not learn to predict the EOS token. + # TODO: Note that the longest sequence in the batch won't have any PAD/EOS token at the end, this is fine if + # the batch size is > 1 but should still be fixed eventually. 
+ for i, num_tokens in enumerate(tokens_per_sample): + labels[i, num_tokens + 1 :] = -100 + batch["labels"] = labels + num_items_in_batch = batch["attention_mask"].sum().item() + + # train step + optimizer.zero_grad() + with autocast_ctx(): + outputs = model(**batch, num_items_in_batch=num_items_in_batch) + loss = outputs.loss + grad_scaler.scale(loss).backward() + if grad_norm_clip: + grad_scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip) + grad_scaler.step(optimizer) + grad_scaler.update() + lr_scheduler.step() + + if is_adalora: + model.base_model.update_and_allocate(step) + + losses.append(loss.item()) + pbar.set_postfix({"loss": loss.item()}) + cuda_memory_allocated_log.append(torch.cuda.memory_allocated() - cuda_memory_init) + cuda_memory_reserved_log.append(torch.cuda.memory_reserved() - cuda_memory_init) + toc = time.perf_counter() + durations.append(toc - tic) + + # every couple of steps, evaluate; this can be slow due to generation + if step % eval_steps == 0: + tic_eval = time.perf_counter() + loss_avg = sum(losses[-eval_steps:]) / eval_steps + memory_allocated_avg = sum(cuda_memory_allocated_log[-eval_steps:]) / eval_steps + memory_reserved_avg = sum(cuda_memory_reserved_log[-eval_steps:]) / eval_steps + token_sum = sum(total_tokens[-eval_steps:]) + dur_train = sum(durations[-eval_steps:]) + tokens_per_sec = token_sum / dur_train + + model.eval() + predictions, responses = evaluate( + model=model, + tokenizer=tokenizer, + ds=ds_valid, + batch_size=batch_size_eval, + generate_kwargs={**generation_kwargs}, + ) + model.train() + + example = random.choice(predictions) + example = textwrap.shorten(example, width=750) + example = textwrap.indent(example, " ") + print_verbose(f"\nExample prediction:\n{example}\n") + accuracy = get_accuracy(predictions=predictions, responses=responses) + num_tokens_generated = sum(sum(mask) for mask in tokenizer(predictions)["attention_mask"]) + + toc_eval = time.perf_counter() + 
dur_eval = toc_eval - tic_eval + eval_time += toc_eval - tic_eval + elapsed = time.perf_counter() - tic_train + + metrics.append( + { + "step": step, + "valid accuracy": accuracy, + "train loss": loss_avg, + "train samples": total_samples, + "train time": dur_train, + "eval time": dur_eval, + "tokens / sec": tokens_per_sec, + "mem allocated avg": memory_allocated_avg, + "mem reserved avg": memory_reserved_avg, + "elapsed time": elapsed, + } + ) + + log_dict = { + "step": f"{step:5d}", + "samples": f"{total_samples:7d}", + "lr": f"{lr_scheduler.get_last_lr()[0]:.2e}", + "loss avg": f"{loss_avg:.4f}", + "valid acc": f"{accuracy:.3f}", + "gen valid tokens": num_tokens_generated, + "train time": f"{dur_train:.1f}s", + "eval time": f"{dur_eval:.1f}s", + "train tokens / sec": f"{tokens_per_sec:.0f}", + "mem allocated": f"{memory_allocated_avg:.0f}", + "mem reserved": f"{memory_reserved_avg:.0f}", + "elapsed time": f"{elapsed // 60:.0f}min {elapsed % 60:.0f}s", + } + print_verbose(json.dumps(log_dict)) + + # # TODO is this needed? 
+ torch.cuda.empty_cache() + gc.collect() + + print_verbose(f"Training finished after {max_steps} steps, evaluation on test set follows.") + # test set evaluation + model.eval() + predictions, responses = evaluate( + model=model, + tokenizer=tokenizer, + ds=ds_test, + batch_size=batch_size_eval, + generate_kwargs={**generation_kwargs, "pad_token_id": tokenizer.eos_token_id}, + use_tqdm=len(ds_test) > 100, + ) + accuracy = get_accuracy(predictions=predictions, responses=responses) + metrics.append( + { + "step": step, + "test accuracy": accuracy, + "train loss": sum(losses[-eval_steps:]) / eval_steps, + "train samples": total_samples, + "train total tokens": sum(total_tokens), + } + ) + print_verbose(f"Test accuracy: {accuracy:.3f}") + + except KeyboardInterrupt: + print_verbose("canceled training") + status = TrainStatus.CANCELED + error_msg = "manually canceled" + except torch.OutOfMemoryError as exc: + # ouch, still let's try to log some results + print_verbose("out of memory error encountered") + status = TrainStatus.CANCELED + error_msg = str(exc) + except Exception as exc: + print_verbose(f"encountered an error: {exc}") + status = TrainStatus.CANCELED + error_msg = str(exc) + + toc_train = time.perf_counter() + train_time = toc_train - tic_train - eval_time + + if status != TrainStatus.CANCELED: + status = TrainStatus.SUCCESS + train_result = TrainResult( + status=status, + train_time=train_time, + cuda_memory_reserved_log=cuda_memory_reserved_log, + losses=losses, + metrics=metrics, + error_msg=error_msg, + num_trainable_params=num_trainable_params, + num_total_params=num_params, + ) + return train_result + + +def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None: + tic_total = time.perf_counter() + start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat() + + peft_branch = get_peft_branch() + if peft_branch == "main": + print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch 
is 'main' ======") + else: + print_verbose( + f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======" + ) + + # load configs + peft_config: Optional[PeftConfig] = None + if os.path.exists(os.path.join(path_experiment, CONFIG_NAME)): + peft_config = PeftConfig.from_pretrained(path_experiment) + else: + print_verbose(f"Could not find PEFT config at {path_experiment}, performing FULL FINETUNING") + path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS) + train_config = get_train_config(path_train_config) + set_seed(train_config.seed) + + # initialize objects + cuda_memory_init = init_cuda() + tokenizer = get_tokenizer(model_id=train_config.model_id, max_seq_length=train_config.max_seq_length) + + model_info = get_base_model_info(train_config.model_id) + metamath_info = get_dataset_info("meta-math/MetaMathQA") + gsm8k_info = get_dataset_info("openai/gsm8k") + model = get_model( + model_id=train_config.model_id, + dtype=train_config.dtype, + compile=train_config.compile, + attn_implementation=train_config.attn_implementation, + peft_config=peft_config, + autocast_adapter_dtype=train_config.autocast_adapter_dtype, + ) + print_verbose(model) + + # train model + train_result = train( + model=model, + max_steps=train_config.max_steps, + batch_size=train_config.batch_size, + batch_size_eval=train_config.batch_size_eval, + tokenizer=tokenizer, + cuda_memory_init=cuda_memory_init, + eval_steps=train_config.eval_steps, + generation_kwargs=train_config.generation_kwargs, + grad_norm_clip=train_config.grad_norm_clip, + optimizer_type=train_config.optimizer_type, + optimizer_kwargs=train_config.optimizer_kwargs, + query_template=train_config.query_template, + lr_scheduler_arg=train_config.lr_scheduler, + use_amp=train_config.use_amp, + is_adalora=isinstance(peft_config, AdaLoraConfig), + ) + + if train_result.status == TrainStatus.FAILED: + print_verbose("Training failed, not logging results") + sys.exit(1) + + 
file_size = get_file_size( + model, + peft_config=peft_config, + clean=clean, + print_fn=print_verbose, + ) + + time_total = time.perf_counter() - tic_total + # log results: print and save to file + log_results( + experiment_name=experiment_name, + train_result=train_result, + cuda_memory_init=cuda_memory_init, + time_total=time_total, + file_size=file_size, + model_info=model_info, + datasets_info={"metamath": metamath_info, "gsm8k": gsm8k_info}, + start_date=start_date, + train_config=train_config, + peft_config=peft_config, + print_fn=print_verbose, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output") + parser.add_argument("path_experiment", type=str, help="Path to the experiment directory") + parser.add_argument( + "--clean", + action="store_true", + help="Delete training artifacts after run finishes (logs are still saved)", + ) + args = parser.parse_args() + + experiment_name = validate_experiment_path(args.path_experiment) + + if args.verbose: + + def print_verbose(*args, **kwargs) -> None: + kwargs["file"] = sys.stderr + print(*args, **kwargs) + else: + + def print_verbose(*args, **kwargs) -> None: + pass + + main( + path_experiment=args.path_experiment, + experiment_name=experiment_name, + clean=args.clean, + ) diff --git a/MetaMathQA/temporary_results/.gitkeep b/MetaMathQA/temporary_results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MetaMathQA/utils.py b/MetaMathQA/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..00b24c3d2aaa026589aa989a5a0a441a7d9c86e0 --- /dev/null +++ b/MetaMathQA/utils.py @@ -0,0 +1,702 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +All utilities not related to data handling. +""" + +import enum +import json +import os +import platform +import subprocess +import tempfile +import warnings +from dataclasses import asdict, dataclass +from decimal import Decimal, DivisionByZero, InvalidOperation +from typing import Any, Callable, Literal, Optional + +import bitsandbytes +import datasets +import huggingface_hub +import numpy as np +import torch +import transformers +from torch import nn +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + get_cosine_schedule_with_warmup, +) + +import peft +from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training +from peft.optimizers import create_lorafa_optimizer, create_loraplus_optimizer +from peft.utils import SAFETENSORS_WEIGHTS_NAME + + +if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available, currently only CUDA is supported") + +device = "cuda" +CUDA_MEMORY_INIT_THRESHOLD = 500 * 2**20 # 500MB +FILE_NAME_DEFAULT_TRAIN_PARAMS = os.path.join(os.path.dirname(__file__), "default_training_params.json") +FILE_NAME_TRAIN_PARAMS = "training_params.json" # specific params for this experiment +# main results +RESULT_PATH = os.path.join(os.path.dirname(__file__), "results") +# testing results +RESULT_PATH_TEST = os.path.join(os.path.dirname(__file__), "temporary_results") +# cancelled results +RESULT_PATH_CANCELLED = os.path.join(os.path.dirname(__file__), "cancelled_results") +hf_api = huggingface_hub.HfApi() +WARMUP_STEP_RATIO = 0.1 + + +@dataclass +class TrainConfig: + 
"""All configuration parameters associated with training the model + + Args: + model_id: The model identifier + dtype: The data type to use for the model + max_seq_length: The maximum sequence length + batch_size: The batch size for training + batch_size_eval: The batch size for eval/test, can be much higher than for training + max_steps: The maximum number of steps to train for + eval_steps: The number of steps between evaluations + compile: Whether to compile the model + query_template: The template for the query + seed: The random seed + grad_norm_clip: The gradient norm clipping value (set to 0 to skip) + optimizer_type: The name of a torch optimizer (e.g. AdamW) or a PEFT method ("lora+", "lora-fa") + optimizer_kwargs: The optimizer keyword arguments (lr etc.) + lr_scheduler: The learning rate scheduler (currently only None or 'cosine' are supported) + use_amp: Whether to use automatic mixed precision + autocast_adapter_dtype: Whether to cast adapter dtype to float32, same argument as in PEFT + generation_kwargs: Arguments passed to transformers GenerationConfig (used in evaluation) + attn_implementation: The attention implementation to use (if any), see transformers docs + """ + + model_id: str + dtype: Literal["float32", "float16", "bfloat16", "int8", "int4"] + max_seq_length: int + batch_size: int + batch_size_eval: int + max_steps: int + eval_steps: int + compile: bool + query_template: str + seed: int + grad_norm_clip: float # set to 0 to skip + optimizer_type: str + optimizer_kwargs: dict[str, Any] + lr_scheduler: Optional[Literal["cosine"]] + use_amp: bool + autocast_adapter_dtype: bool + generation_kwargs: dict[str, Any] + attn_implementation: Optional[str] + + def __post_init__(self) -> None: + if not isinstance(self.model_id, str): + raise ValueError(f"Invalid model_id: {self.model_id}") + if self.dtype not in ["float32", "float16", "bfloat16", "int8", "int4"]: + raise ValueError(f"Invalid dtype: {self.dtype}") + if self.max_seq_length < 0: + raise 
def validate_experiment_path(path: str) -> str:
    """Check that ``path`` points to an experiment directory and return the experiment name.

    The experiment path has to take the form ``experiments/<peft-method>/<experiment-name>``, e.g.
    ``experiments/lora/rank32``. Equivalent spellings such as ``./experiments/lora/rank32`` or
    ``experiments/lora/rank32/`` are accepted, because the path is normalized before it is checked.

    The directory is expected to hold the experiment's configuration files (``adapter_config.json`` and,
    optionally, ``training_params.json``); their presence is not verified here.

    Args:
        path: Path to the experiment directory, relative to the directory the script is run from.

    Returns:
        The experiment name, i.e. the last two path components joined with the OS path separator
        (``<peft-method>/<experiment-name>``).

    Raises:
        FileNotFoundError: If the default training params file or ``path`` itself does not exist.
        ValueError: If ``path`` does not consist of exactly ``experiments/<peft-method>/<experiment-name>``.
    """
    if not os.path.exists(FILE_NAME_DEFAULT_TRAIN_PARAMS):
        raise FileNotFoundError(
            f"Missing default training params file '{FILE_NAME_DEFAULT_TRAIN_PARAMS}' in the ./experiments directory"
        )
    if not os.path.exists(path):
        raise FileNotFoundError(f"Path {path} does not exist")

    # Normalize before splitting: normpath drops a leading "./", trailing separators and duplicate separators, so
    # that the documented form "./experiments/<peft-method>/<experiment-name>" is not rejected for having an extra
    # "." component.
    path_parts = os.path.normpath(path).split(os.path.sep)
    if (len(path_parts) != 3) or (path_parts[0] != "experiments"):
        raise ValueError(
            f"Path {path} does not have the correct structure, should be "
            "./experiments/<peft-method>/<experiment-name>"
        )

    return os.path.join(*path_parts[-2:])
+ ) + + torch.cuda.reset_peak_memory_stats() + cuda_memory_init = torch.cuda.max_memory_reserved() + return cuda_memory_init + + +def get_tokenizer(*, model_id: str, max_seq_length: int): + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.model_max_length = max_seq_length + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def get_base_model( + *, + model_id: str, + dtype: Literal["float32", "float16", "bfloat16", "int8", "int4"], + compile: bool, + attn_implementation: Optional[str], +) -> nn.Module: + kwargs: dict[str, Any] = { + "pretrained_model_name_or_path": model_id, + "device_map": device, + "attn_implementation": attn_implementation, + } + if dtype == "int4": + quant_config = BitsAndBytesConfig(load_in_4bit=True) + kwargs["quantization_config"] = quant_config + elif dtype == "int8": + quant_config = BitsAndBytesConfig(load_in_8bit=True) + kwargs["quantization_config"] = quant_config + elif dtype == "bfloat16": + kwargs["torch_dtype"] = torch.bfloat16 + elif dtype == "float16": + kwargs["torch_dtype"] = torch.float16 + elif dtype != "float32": + raise ValueError(f"Invalid dtype: {dtype}") + + model = AutoModelForCausalLM.from_pretrained(**kwargs) + + if dtype in ["int8", "int4"]: + model = prepare_model_for_kbit_training(model) + + if compile: + model = torch.compile(model) + + return model + + +def get_model( + *, + model_id: str, + dtype: Literal["float32", "float16", "bfloat16", "int8", "int4"], + compile: bool, + attn_implementation: Optional[str], + peft_config: Optional[PeftConfig], + autocast_adapter_dtype: bool, +) -> nn.Module: + base_model = get_base_model( + model_id=model_id, dtype=dtype, compile=compile, attn_implementation=attn_implementation + ) + if peft_config is None: + model = base_model + else: + model = get_peft_model(base_model, peft_config, autocast_adapter_dtype=autocast_adapter_dtype) + return model + + +class DummyScheduler: + # if no lr scheduler is being used + def 
class BucketIterator:
    """
    Iterator that yields batches of data from a torch Dataset, grouped in buckets by sequence length

    The iterator will yield batches of size `batch_size`, where the samples in each batch are sorted by sequence length.
    This is done to minimize the amount of padding required for each batch. To avoid sorting the entire dataset and thus
    introducing a bias, the dataset is first split into buckets of size `batch_size * bucket_factor`.

    Every sample of the dataset is yielded exactly once per iteration; the last bucket (and its last batch) may be
    smaller than the others if the dataset size is not a multiple of the bucket size.

    Args:
        ds: The torch Dataset to iterate over. It must support `len()` and slicing, where a slice returns a
            mapping from column name to a sequence of values and contains an "input_ids" column.
        batch_size: The batch size
        bucket_factor: The factor by which to multiply the batch size to determine the bucket size
        delete_cols: The columns to delete from the dataset before yielding a batch
    """

    def __init__(self, ds, *, batch_size: int, bucket_factor: int, delete_cols: list[str]) -> None:
        self.ds = ds
        self.batch_size = batch_size
        self.bucket_factor = bucket_factor
        self.delete_cols = set(delete_cols)

        assert self.bucket_factor > 0, "bucket_factor must be greater than 0"

    def _batch_iterator(self, bucket):
        """Yield the samples of a single bucket as batches, longest sequences first."""
        tokens_per_sample_bucket = torch.tensor([len(i) for i in bucket["input_ids"]])
        # sort long to short instead to encounter possible OOM errors as early as possible
        order = torch.argsort(tokens_per_sample_bucket, descending=True).tolist()
        cls = type(bucket)  # conserve the type returned by the ds
        bucket = {k: [v[i] for i in order] for k, v in bucket.items() if k not in self.delete_cols}
        num_samples = len(bucket["input_ids"])
        for j in range(0, num_samples, self.batch_size):
            batch = {k: v[j : j + self.batch_size] for k, v in bucket.items()}
            yield cls(batch)

    def __iter__(self):
        bucket_size = self.batch_size * self.bucket_factor
        # The last start index produced by range() already selects the trailing, possibly smaller bucket, because
        # slicing past the end of the dataset is truncated. No separate remainder handling is needed; yielding the
        # remainder a second time would duplicate those samples.
        for i in range(0, len(self.ds), bucket_size):
            bucket = self.ds[i : i + bucket_size]
            yield from self._batch_iterator(bucket)
##################
# ANSWER PARSING #
##################


def parse_answer(text: str) -> Optional[str]:
    r"""
    Extract the final answer from a label or a model prediction.

    A label/prediction can look like this:

    Question: If the magnitude of vector v is equal to 4, what is the dot product of vector v with itself?. Think step
    by step
    Answer: The dot product of a vector with itself is equal to the square of its magnitude. So, the dot product of
    vector v with itself is equal to $4^2 = \boxed{16}$.The answer is: 16

    We want to extract '16' from this string.

    Args:
        text: The full label or prediction text.

    Returns:
        The text following the last occurrence of the first matching delimiter, cut off at the first line break and
        stripped of surrounding whitespace and ``.!?$%`` characters, or None if no known delimiter is present.
    """
    # This implementation is based on sampling meta-llama/Llama-3.1-8B-Instruct. It may not work for other models.
    candidate_delimiters = [
        # MetaMath:
        "The answer is: ",
        "The answer is ",
        "The final answer is: ",
        "The final answer is ",
        # GSM8K:
        "#### ",
    ]
    text = text.strip()
    text = text.rstrip(".!?")
    for delimiter in candidate_delimiters:
        if delimiter in text:
            break
    else:  # no match
        return None

    text = text.rpartition(delimiter)[-1].strip()
    # if a new paragraph follows after the final answer, we want to remove it
    text = text.split("\n", 1)[0]
    # note: we can just remove % here since the GSM8K dataset just omits it, i.e. 50% -> 50, no need to divide by 100
    text = text.strip(" .!?$%")
    return text


def convert_to_decimal(s: Optional[str]) -> Optional[Decimal]:
    """
    Converts a string representing a number to a Decimal.

    The string may be:
        - A simple number (e.g., "13", "65.33")
        - A fraction (e.g., "20/14")

    Returns:
        The parsed value, or None if `s` is None, is not a number / simple fraction, or has a zero denominator.
    """
    if s is None:
        return None

    try:
        s = s.strip()
        # Check if the string represents a fraction.
        if "/" in s:
            parts = s.split("/")
            if len(parts) != 2:
                return None
            numerator = Decimal(parts[0].strip())
            denominator = Decimal(parts[1].strip())
            if denominator == 0:
                return None
            value = numerator / denominator
        else:
            # Parse as a regular decimal or integer string.
            value = Decimal(s)
        return value
    except (DivisionByZero, InvalidOperation, ValueError):
        return None


def _as_comparable(parsed: Optional[str]):
    """Turn a parsed answer into a float if it is numeric, otherwise return it unchanged (string or None)."""
    # we convert decimals to float so that stuff like this works:
    # float(convert_to_decimal('20/35')) == float(convert_to_decimal('0.5714285714285714'))
    decimal_value = convert_to_decimal(parsed)
    if decimal_value is not None:
        return float(decimal_value)
    return parsed


def get_accuracy(*, predictions: list[str], responses: list[str]) -> float:
    """
    Compute the fraction of predictions whose parsed answer matches the parsed reference answer.

    Numeric answers (including simple fractions) are compared as floats, all other answers as strings. A prediction
    from which no answer can be parsed counts as incorrect.

    Args:
        predictions: The generated texts.
        responses: The reference texts, in the same order as `predictions`.

    Returns:
        The accuracy as a value between 0 and 1.

    Raises:
        ValueError: If the inputs differ in length, are empty, or if no answer can be parsed from a reference.
    """
    if len(predictions) != len(responses):
        raise ValueError(f"Prediction length mismatch: {len(predictions)} != {len(responses)}")
    if not predictions:
        # accuracy is undefined without samples; fail with a clear message instead of a ZeroDivisionError
        raise ValueError("Cannot compute accuracy: no predictions were given")

    num_correct = 0
    for prediction, response in zip(predictions, responses):
        parsed_prediction = parse_answer(prediction)
        parsed_response = parse_answer(response)
        if parsed_response is None:
            raise ValueError(f"Error encountered while trying to parse response: {response}")

        y_pred = _as_comparable(parsed_prediction)
        # the reference is always parseable at this point, so it is either a float or a non-empty-checked string
        y_true = _as_comparable(parsed_response)
        if (y_pred is not None) and (y_true == y_pred):
            num_correct += 1

    return num_correct / len(predictions)
def log_to_console(log_data: dict[str, Any], print_fn: Callable[..., None]) -> None:
    """Print a short, human readable summary of a finished run.

    Args:
        log_data: The collected log data; the "train_info" entries "cuda_memory_max", "cuda_memory_reserved_avg",
            "cuda_memory_reserved_99th", "train_time" and "file_size" as well as the "run_info" entry "total_time"
            are read.
        print_fn: Called once per summary line with the formatted string.
    """
    # look up everything first so that a missing key raises before anything is printed
    mem_max = log_data["train_info"]["cuda_memory_max"]
    mem_avg = log_data["train_info"]["cuda_memory_reserved_avg"]
    mem_99th = log_data["train_info"]["cuda_memory_reserved_99th"]
    train_seconds = log_data["train_info"]["train_time"]
    total_seconds = log_data["run_info"]["total_time"]
    checkpoint_bytes = log_data["train_info"]["file_size"]

    mebibyte = 2**20
    summary = (
        f"cuda memory max: {mem_max // mebibyte}MB",
        f"cuda memory reserved avg: {mem_avg // mebibyte}MB",
        f"cuda memory reserved 99th percentile: {mem_99th // mebibyte}MB",
        f"train time: {train_seconds}s",
        f"total time: {total_seconds:.2f}s",
        f"file size of checkpoint: {checkpoint_bytes / mebibyte:.1f}MB",
    )
    for line in summary:
        print_fn(line)
def log_results(
    *,
    experiment_name: str,
    train_result: TrainResult,
    cuda_memory_init: int,
    time_total: float,
    file_size: int,
    model_info: Optional[huggingface_hub.ModelInfo],
    datasets_info: dict[str, Optional[huggingface_hub.DatasetInfo]],
    start_date: str,
    train_config: TrainConfig,
    peft_config: Optional[PeftConfig],
    print_fn: Callable[..., None],
) -> None:
    """Collect all information about a run, print a summary and save it as a JSON file.

    The directory the log is written to depends on the outcome of the run: canceled runs go to
    `RESULT_PATH_CANCELLED`, runs on a PEFT branch other than "main" go to `RESULT_PATH_TEST`, successful runs on
    "main" go to `RESULT_PATH`, and anything else is written to a fresh temporary directory.

    Args:
        experiment_name: Name of the experiment, used for the log file name.
        train_result: The result of the training run (status, metrics, memory log, ...).
        cuda_memory_init: CUDA memory reserved before training, subtracted from the final peak value.
        time_total: Total run time in seconds.
        file_size: Size of the saved checkpoint in bytes.
        model_info: Hub info of the base model, or None if it could not be retrieved.
        datasets_info: Hub info of the datasets by name; values may be None if retrieval failed.
        start_date: Start time of the run, also used as timestamp in the log file name.
        train_config: The training configuration.
        peft_config: The PEFT configuration, or None for full fine-tuning.
        print_fn: Function used for status messages.
    """

    def _sha_and_created_at(info) -> tuple[Optional[str], Optional[str]]:
        # Hub info can be missing entirely, and its creation date can be unset, so guard both
        if info is None:
            return None, None
        created_at = info.created_at
        return info.sha, (created_at.isoformat() if created_at is not None else None)

    # collect results
    cuda_memory_final = torch.cuda.max_memory_reserved()
    memory_log = train_result.cuda_memory_reserved_log
    if memory_log:
        cuda_memory_avg = int(sum(memory_log) / len(memory_log))
        cuda_memory_reserved_99th = int(np.percentile(memory_log, 99))
    else:
        # No memory measurement was recorded, e.g. because the run was canceled very early. Report 0 instead of
        # crashing, so that the canceled run is still logged.
        cuda_memory_avg = 0
        cuda_memory_reserved_99th = 0

    meta_info = get_meta_info()
    model_sha, model_created_at = _sha_and_created_at(model_info)

    dataset_info_log = {}
    for key, dataset_info in datasets_info.items():
        dataset_sha, dataset_created_at = _sha_and_created_at(dataset_info)
        dataset_info_log[key] = {"sha": dataset_sha, "created_at": dataset_created_at}

    peft_branch = get_peft_branch()

    if train_result.status == TrainStatus.CANCELED:
        save_dir = RESULT_PATH_CANCELLED
        print_fn("Experiment run was categorized as canceled")
    elif peft_branch != "main":
        save_dir = RESULT_PATH_TEST
        print_fn(f"Experiment run was categorized as a test run on branch {peft_branch}")
    elif train_result.status == TrainStatus.SUCCESS:
        save_dir = RESULT_PATH
        print_fn("Experiment run was categorized as successful run")
    else:
        save_dir = tempfile.mkdtemp()
        print_fn(f"Experiment could not be categorized, writing results to {save_dir}. Please open an issue on PEFT.")
    # make sure that a missing result directory does not cost us the log of a finished run
    os.makedirs(save_dir, exist_ok=True)

    if peft_config is None:
        peft_config_dict: Optional[dict[str, Any]] = None
    else:
        peft_config_dict = peft_config.to_dict()
        for key, value in peft_config_dict.items():
            # sets are not JSON serializable
            if isinstance(value, set):
                peft_config_dict[key] = list(value)

    log_data = {
        "run_info": {
            "created_at": start_date,
            "total_time": time_total,
            "experiment_name": experiment_name,
            "peft_branch": peft_branch,
            "train_config": asdict(train_config),
            "peft_config": peft_config_dict,
            "error_msg": train_result.error_msg,
        },
        "train_info": {
            "cuda_memory_reserved_avg": cuda_memory_avg,
            "cuda_memory_max": (cuda_memory_final - cuda_memory_init),
            "cuda_memory_reserved_99th": cuda_memory_reserved_99th,
            "train_time": train_result.train_time,
            "file_size": file_size,
            "num_trainable_params": train_result.num_trainable_params,
            "num_total_params": train_result.num_total_params,
            "status": train_result.status.value,
            "metrics": train_result.metrics,
        },
        "meta_info": {
            "model_info": {"sha": model_sha, "created_at": model_created_at},
            "dataset_info": dataset_info_log,
            **asdict(meta_info),
        },
    }

    log_to_console(log_data, print_fn=print)  # use normal print to be able to redirect if so desired
    log_to_file(
        log_data=log_data, save_dir=save_dir, experiment_name=experiment_name, timestamp=start_date, print_fn=print_fn
    )
This gives you more information to make an informed decision about which methods best fit your use case and what trade-offs to expect. + +## Community Contributions + +We envision the PEFT method comparison project as an ongoing endeavor with heavy involvement from the community. As maintainers, it is impossible for us to know all the perfect hyperparameters for each method or to predict all the use cases that PEFT users may have. As a consequence, community contributions are very welcome. + +Below, we outline all the ways you can contribute to this project. + +### Creating New Experiments + +Creating a new experiment requires setting up a new PEFT configuration for us to test. This will result in one more data point being added to the total comparison. + +Working on this is especially relevant if: + +1. You are the author of a paper whose method is introduced in PEFT, or worked on the PEFT integration, and know what hyperparameters work best. +2. You have experience with a specific method and want to share your knowledge with the community. + +Of course, you can contribute even without meeting these criteria. Please follow the instructions below. + +#### How to Add New Experiments + +Start by navigating to one of the existing experiment folders, e.g. `peft/method_comparison/MetaMathQA`, if your experiment involves using the [MetaMathQA dataset](https://huggingface.co/datasets/meta-math/MetaMathQA). There, create a new directory inside the `experiments/` folder using a descriptive name. For example, if you want to test LoRA with rank 123 using Llama-3.2 3B as the base model, you could name the folder `experiments/lora/llama-3.2-3B-rank123`. + +Inside this directory, you will find a default configuration file called `default_training_params.json`, which contains the default parameters used in the `run.py` training script. 
Create a new JSON file containing all the parameters you want to modify compared to the defaults, and save it as `training_params.json` in the newly created folder. If you are satisfied with all the default training parameters, you can skip this step. + +Finally, you need to create a PEFT configuration file for the PEFT method you want to add. This should be a JSON file called `adapter_config.json`, placed in the same directory. Below is an example of how this could look: + +```python +from peft import LoraConfig +config = LoraConfig(r=123) +config.save_pretrained("experiments/lora/llama-3.2-3B-rank123/") +``` + +Once you've created the configuration files for your experiment, please [create a PR on PEFT](https://github.com/huggingface/peft/pulls). After it is reviewed and merged, we will run it on our hardware to ensure that the results are comparable. Of course, it is best if you run the experiment at least once on your hardware to verify that the proposed settings work well. + +#### Considerations When Adding New Experiments + +When adding a new experiment, please consider the following points: + +1. Avoid changing too many training parameters at once, as this would make it difficult to compare results with existing ones. For example, if all existing results were created with 5000 training steps but your result uses 10000 steps, it would be unclear whether an improvement in the test score is due to the PEFT method itself or simply due to longer training. Similarly, using a completely different base model, especially if it is significantly more capable, does not contribute to a fair comparison. +2. Avoid suggesting configurations that are very close to existing ones. For example, if there is already an experiment with LoRA and rank 123, do not add an experiment with LoRA and rank 124. +3. Experiments for less-tested methods are more valuable than additional experiments for widely tested methods. +4. Do not edit existing experiments, always create new ones. +5. 
If you found hyper parameters that work especially well with a given method but are not trivial to find out, consider updating the PEFT documentation of that method so that other users can benefit from your findings. + +### Updating the Training Script + +We provide a training script that includes features typically useful for improving training outcomes, such as AMP support, a cosine learning rate schedule, etc. However, there is always room for improvement. For example, at the time of writing, the script does not support gradient accumulation. Therefore, PRs that extend the training script are welcome. + +#### How to Update the Training Script + +Follow the same process as when contributing to PEFT in general (see the [contribution guidelines](https://huggingface.co/docs/peft/developer_guides/contributing)). If the same training script is used across multiple datasets, please ensure that all relevant scripts are updated accordingly. + +#### Considerations When Updating the Training Script + +1. Updates should be backward-compatible. By default, any new features should be disabled to ensure that existing results remain valid. For example, if you add gradient accumulation, ensure it is disabled by default so that new experiments must opt in. +2. Before adding a bug fix that could invalidate existing results, consider whether the trade-off is worthwhile. If we already have many experimental results, rerunning all of them can be expensive. If the bug fix is not critical, it may not be worth invalidating previous results. However, if you discover a significant bug that could meaningfully impact outcomes, it should be addressed. +3. Avoid unnecessary complexity. While we could add support for DeepSpeed, FSDP, etc., doing so would add significant complexity, exclude users with limited hardware, and is unlikely to alter the relative performance of different PEFT methods. +4. Minimize reliance on specific training frameworks. 
For example, we deliberately avoid using the `Trainer` class from transformers or PyTorch Lightning. This ensures transparency, making it easier to understand the training process and replicate results over time. If a training framework were used, we would have to pin the version or risk future incompatibilities. + +### Adding a New Dataset + +Adding a new dataset increases the breadth and usefulness of the PEFT method comparison. The goal is not necessarily to outperform benchmarks or replicate paper results, but to fairly compare different PEFT methods in a way that is useful for PEFT users. If this involves replicating an experiment from a paper, that is great, but it is not a requirement. + +#### How to Add a New Dataset + +The easiest way to add support for a new dataset is to copy an existing setup, such as `method_comparison/MetaMathQA`, rename it, and modify `data.py`, as well as any other necessary parts of the code. Ideally, as much existing code as possible should be reused. The general folder structure and experiment logging format should remain consistent. + +After adding the dataset, ensure it functions correctly and produces meaningful results by running at least one experimental setup, such as using LoRA with default settings. + +#### Considerations When Adding a New Dataset + +1. Before beginning, it is best to open an [issue on PEFT](https://github.com/huggingface/peft/issues) to share your plans. This allows for early feedback and prevents wasted effort on impractical ideas. +2. The most valuable new datasets are those that test different capabilities than those already present. Bonus points if the task is similar to what users may face in the real world. Task ideas that would be great to add: + - A task involving both language and image modalities. 
+ - An image generation task (like Stable Diffusion) + - A task involving audio (like Whisper) + - A task that requires knowledge preservation (checked, for instance, via an auxiliary test set) + - Learning something completely new (e.g. a new language) + - A reinforcement learning task (e.g. using [trl](https://github.com/huggingface/trl)) +3. Training should be reasonably fast. Running dozens of experiments is impractical if each one takes multiple days and incurs high costs. Ideally, training should take a few hours at most on high-end consumer hardware. +4. The chosen base model should not be too large, to avoid VRAM constraints. Moreover, if the base model is too powerful, there is little room for improvement through further fine-tuning. +5. Test scores should be informative and have a broad range: + - Besides loss, there should ideally be at least one additional metric, such as accuracy. + - Comparisons are not meaningful if all methods score near 0% or near 100%. The dataset should yield a range of scores to facilitate meaningful differentiation between methods. +6. The dataset should be publicly available and have a track record as a useful dataset. The license should permit the intended usage. + +## Result dashboard + +For convenience, we included a [Gradio](https://www.gradio.app/) app that shows the results of the experiments. It allows you to filter down the task and base model and show the experiment results for this selection. Give it a try! + +This app requires additional packages to be installed; please install the packages listed in `requirements-app.txt`, e.g. 
via: + +```sh +python -m pip install -r requirements-app.txt +``` + +To launch the demo, run: + +```sh +python app.py +``` diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..6f7d927fbfd2db7253a7cde70786a050b63041a9 --- /dev/null +++ b/app.py @@ -0,0 +1,359 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gradio app to show the results""" + +import os +import tempfile + +import gradio as gr +import plotly.express as px +import plotly.graph_objects as go +from processing import load_df +from sanitizer import parse_and_filter + + +metric_preferences = { + "cuda_memory_reserved_avg": "lower", + "cuda_memory_max": "lower", + "cuda_memory_reserved_99th": "lower", + "total_time": "lower", + "train_time": "lower", + "file_size": "lower", + "test_accuracy": "higher", + "train_loss": "lower", +} + + +def get_model_ids(task_name, df): + filtered = df[df["task_name"] == task_name] + return sorted(filtered["model_id"].unique()) + + +def filter_data(task_name, model_id, df): + filtered = df[(df["task_name"] == task_name) & (df["model_id"] == model_id)] + return filtered + + +# Compute the Pareto frontier for two selected metrics. 
+def compute_pareto_frontier(df, metric_x, metric_y): + if df.empty: + return df + + df = df.copy() + points = df[[metric_x, metric_y]].values + selected_indices = [] + + def dominates(a, b, metric_x, metric_y): + # Check for each metric whether b is as good or better than a + if metric_preferences[metric_x] == "higher": + cond_x = b[0] >= a[0] + better_x = b[0] > a[0] + else: + cond_x = b[0] <= a[0] + better_x = b[0] < a[0] + if metric_preferences[metric_y] == "higher": + cond_y = b[1] >= a[1] + better_y = b[1] > a[1] + else: + cond_y = b[1] <= a[1] + better_y = b[1] < a[1] + return cond_x and cond_y and (better_x or better_y) + + for i, point in enumerate(points): + dominated = False + for j, other_point in enumerate(points): + if i == j: + continue + if dominates(point, other_point, metric_x, metric_y): + dominated = True + break + if not dominated: + selected_indices.append(i) + pareto_df = df.iloc[selected_indices] + return pareto_df + + +def generate_pareto_plot(df, metric_x, metric_y): + if df.empty: + return {} + + # Compute Pareto frontier and non-frontier points. + pareto_df = compute_pareto_frontier(df, metric_x, metric_y) + non_pareto_df = df.drop(pareto_df.index) + + # Create an empty figure. + fig = go.Figure() + + # Draw the line connecting Pareto frontier points. + if not pareto_df.empty: + # Sort the Pareto frontier points by metric_x for a meaningful connection. + pareto_sorted = pareto_df.sort_values(by=metric_x) + line_trace = go.Scatter( + x=pareto_sorted[metric_x], + y=pareto_sorted[metric_y], + mode="lines", + line={"color": "rgba(0,0,255,0.3)", "width": 4}, + name="Pareto Frontier", + ) + fig.add_trace(line_trace) + + # Add non-frontier points in gray with semi-transparency. 
+ if not non_pareto_df.empty: + non_frontier_trace = go.Scatter( + x=non_pareto_df[metric_x], + y=non_pareto_df[metric_y], + mode="markers", + marker={"color": "rgba(128,128,128,0.5)", "size": 12}, + hoverinfo="text", + text=non_pareto_df.apply( + lambda row: f"experiment_name: {row['experiment_name']}
" + f"peft_type: {row['peft_type']}
" + f"{metric_x}: {row[metric_x]}
" + f"{metric_y}: {row[metric_y]}", + axis=1, + ), + showlegend=False, + ) + fig.add_trace(non_frontier_trace) + + # Add Pareto frontier points with legend + if not pareto_df.empty: + pareto_scatter = px.scatter( + pareto_df, + x=metric_x, + y=metric_y, + color="experiment_name", + hover_data={"experiment_name": True, "peft_type": True, metric_x: True, metric_y: True}, + ) + for trace in pareto_scatter.data: + trace.marker = {"size": 12} + fig.add_trace(trace) + + # Update layout with axes labels. + fig.update_layout( + title=f"Pareto Frontier for {metric_x} vs {metric_y}", + template="seaborn", + height=700, + autosize=True, + xaxis_title=metric_x, + yaxis_title=metric_y, + ) + + return fig + + +def compute_pareto_summary(filtered, pareto_df, metric_x, metric_y): + if filtered.empty: + return "No data available." + + stats = filtered[[metric_x, metric_y]].agg(["min", "max", "mean"]).to_string() + total_points = len(filtered) + pareto_points = len(pareto_df) + excluded_points = total_points - pareto_points + summary_text = ( + f"{stats}\n\n" + f"Total points: {total_points}\n" + f"Pareto frontier points: {pareto_points}\n" + f"Excluded points: {excluded_points}" + ) + return summary_text + + +def export_csv(df): + if df.empty: + return None + csv_data = df.to_csv(index=False) + with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", encoding="utf-8") as tmp: + tmp.write(csv_data) + tmp_path = tmp.name + return tmp_path + + +def build_app(df): + with gr.Blocks(theme=gr.themes.Soft()) as demo: + gr.Markdown("# PEFT method comparison") + gr.Markdown( + "Find more information [on the PEFT GitHub repo](https://github.com/huggingface/peft/tree/main/method_comparison)" + ) + + # Hidden state to store the current filter query. 
+ filter_state = gr.State("") + + gr.Markdown("## Choose the task and base model") + with gr.Row(): + task_dropdown = gr.Dropdown( + label="Select Task", + choices=sorted(df["task_name"].unique()), + value=sorted(df["task_name"].unique())[0], + ) + model_dropdown = gr.Dropdown( + label="Select Model ID", choices=get_model_ids(sorted(df["task_name"].unique())[0], df) + ) + + data_table = gr.DataFrame(label="Results", value=df, interactive=False) + + with gr.Row(): + filter_textbox = gr.Textbox( + label="Filter DataFrame", + placeholder="Enter filter (e.g.: peft_type=='LORA')", + interactive=True, + ) + apply_filter_button = gr.Button("Apply Filter") + reset_filter_button = gr.Button("Reset Filter") + + gr.Markdown("## Pareto plot") + gr.Markdown( + "Select 2 criteria to plot the Pareto frontier. This will show the best PEFT methods along this axis and " + "the trade-offs with the other axis. The PEFT methods that Pareto-dominate are shown in colors. All other " + "methods are inferior with regard to these two metrics. Hover over a point to show details." 
+ ) + + with gr.Row(): + x_default = ( + "cuda_memory_max" if "cuda_memory_max" in metric_preferences else list(metric_preferences.keys())[0] + ) + y_default = ( + "test_accuracy" if "test_accuracy" in metric_preferences else list(metric_preferences.keys())[1] + ) + metric_x_dropdown = gr.Dropdown( + label="1st metric for Pareto plot", + choices=list(metric_preferences.keys()), + value=x_default, + ) + metric_y_dropdown = gr.Dropdown( + label="2nd metric for Pareto plot", + choices=list(metric_preferences.keys()), + value=y_default, + ) + + pareto_plot = gr.Plot(label="Pareto Frontier Plot") + summary_box = gr.Textbox(label="Summary Statistics", lines=6) + csv_output = gr.File(label="Export Filtered Data as CSV") + + def update_on_task(task_name, current_filter): + new_models = get_model_ids(task_name, df) + filtered = filter_data(task_name, new_models[0] if new_models else "", df) + if current_filter.strip(): + try: + mask = parse_and_filter(filtered, current_filter) + df_queried = filtered[mask] + if not df_queried.empty: + filtered = df_queried + except Exception: + # invalid filter query + pass + return gr.update(choices=new_models, value=new_models[0] if new_models else None), filtered + + task_dropdown.change( + fn=update_on_task, inputs=[task_dropdown, filter_state], outputs=[model_dropdown, data_table] + ) + + def update_on_model(task_name, model_id, current_filter): + filtered = filter_data(task_name, model_id, df) + if current_filter.strip(): + try: + mask = parse_and_filter(filtered, current_filter) + filtered = filtered[mask] + except Exception: + pass + return filtered + + model_dropdown.change( + fn=update_on_model, inputs=[task_dropdown, model_dropdown, filter_state], outputs=data_table + ) + + def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, current_filter): + filtered = filter_data(task_name, model_id, df) + if current_filter.strip(): + try: + mask = parse_and_filter(filtered, current_filter) + filtered = filtered[mask] + 
except Exception as e: + return generate_pareto_plot(filtered, metric_x, metric_y), f"Filter error: {e}" + + pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y) + fig = generate_pareto_plot(filtered, metric_x, metric_y) + summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y) + return fig, summary + + for comp in [model_dropdown, metric_x_dropdown, metric_y_dropdown]: + comp.change( + fn=update_pareto_plot_and_summary, + inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown, filter_state], + outputs=[pareto_plot, summary_box], + ) + + def apply_filter(filter_query, task_name, model_id, metric_x, metric_y): + filtered = filter_data(task_name, model_id, df) + if filter_query.strip(): + try: + mask = parse_and_filter(filtered, filter_query) + filtered = filtered[mask] + except Exception as e: + # Update the table, plot, and summary even if there is a filter error. + return ( + filter_query, + filtered, + generate_pareto_plot(filtered, metric_x, metric_y), + f"Filter error: {e}", + ) + + pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y) + fig = generate_pareto_plot(filtered, metric_x, metric_y) + summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y) + return filter_query, filtered, fig, summary + + apply_filter_button.click( + fn=apply_filter, + inputs=[filter_textbox, task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown], + outputs=[filter_state, data_table, pareto_plot, summary_box], + ) + + def reset_filter(task_name, model_id, metric_x, metric_y): + filtered = filter_data(task_name, model_id, df) + pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y) + fig = generate_pareto_plot(filtered, metric_x, metric_y) + summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y) + # Return empty strings to clear the filter state and textbox. 
+ return "", "", filtered, fig, summary + + reset_filter_button.click( + fn=reset_filter, + inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown], + outputs=[filter_state, filter_textbox, data_table, pareto_plot, summary_box], + ) + + gr.Markdown("## Export data") + # Export button for CSV download. + export_button = gr.Button("Export Filtered Data") + export_button.click( + fn=lambda task, model: export_csv(filter_data(task, model, df)), + inputs=[task_dropdown, model_dropdown], + outputs=csv_output, + ) + + demo.load( + fn=update_pareto_plot_and_summary, + inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown, filter_state], + outputs=[pareto_plot, summary_box], + ) + + return demo + + +path = os.path.join(os.path.dirname(__file__), "MetaMathQA", "results") +df = load_df(path, task_name="MetaMathQA") +demo = build_app(df) +demo.launch() diff --git a/processing.py b/processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a74f4c694e4138c5ba81c85c82483fc3ac2cd2 --- /dev/null +++ b/processing.py @@ -0,0 +1,145 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Data processing used for analyzing and presenting the results""" + +import json +import os + +import pandas as pd + + +def preprocess(rows, task_name: str, print_fn=print): + results = [] + skipped = 0 + for row in rows: + run_info = row["run_info"] + train_info = row["train_info"] + meta_info = row["meta_info"] + if run_info["peft_config"]: + peft_type = run_info["peft_config"]["peft_type"] + else: + peft_type = "full-finetuning" + if train_info["status"] != "success": + skipped += 1 + continue + + train_metrics = train_info["metrics"][-1] + + # extract the fields that make most sense + dct = { + "task_name": task_name, + "experiment_name": run_info["experiment_name"], + "model_id": run_info["train_config"]["model_id"], + "train_config": run_info["train_config"], + "peft_type": peft_type, + "peft_config": run_info["peft_config"], + "cuda_memory_reserved_avg": train_info["cuda_memory_reserved_avg"], + "cuda_memory_max": train_info["cuda_memory_max"], + "cuda_memory_reserved_99th": train_info["cuda_memory_reserved_99th"], + "total_time": run_info["total_time"], + "train_time": train_info["train_time"], + "file_size": train_info["file_size"], + "test_accuracy": train_metrics["test accuracy"], + "train_loss": train_metrics["train loss"], + "train_samples": train_metrics["train samples"], + "train_total_tokens": train_metrics["train total tokens"], + "peft_version": meta_info["package_info"]["peft-version"], + "peft_branch": run_info["peft_branch"], + "transformers_version": meta_info["package_info"]["transformers-version"], + "datasets_version": meta_info["package_info"]["datasets-version"], + "torch_version": meta_info["package_info"]["torch-version"], + "bitsandbytes_version": meta_info["package_info"]["bitsandbytes-version"], + "package_info": meta_info["package_info"], + "system_info": meta_info["system_info"], + "created_at": run_info["created_at"], + } + results.append(dct) + + if skipped: + print_fn(f"Skipped {skipped} of {len(rows)} entries because the 
train status != success") + + return results + + +def load_jsons(path): + results = [] + for fn in os.listdir(path): + if fn.endswith(".json"): + with open(os.path.join(path, fn)) as f: + row = json.load(f) + results.append(row) + return results + + +def load_df(path, task_name, print_fn=print): + jsons = load_jsons(path) + preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn) + dtype_dict = { + "task_name": "string", + "experiment_name": "string", + "model_id": "string", + "train_config": "string", + "peft_type": "string", + "peft_config": "string", + "cuda_memory_reserved_avg": int, + "cuda_memory_max": int, + "cuda_memory_reserved_99th": int, + "total_time": float, + "train_time": float, + "file_size": int, + "test_accuracy": float, + "train_loss": float, + "train_samples": int, + "train_total_tokens": int, + "peft_version": "string", + "peft_branch": "string", + "transformers_version": "string", + "datasets_version": "string", + "torch_version": "string", + "bitsandbytes_version": "string", + "package_info": "string", + "system_info": "string", + "created_at": "string", + } + df = pd.DataFrame(preprocessed) + df = df.astype(dtype_dict) + df["created_at"] = pd.to_datetime(df["created_at"]) + # round training time to nearest second + df["train_time"] = df["train_time"].round().astype(int) + df["total_time"] = df["total_time"].round().astype(int) + + # reorder columns for better viewing, pinned_columns arg in Gradio seems not to work correctly + important_columns = [ + "experiment_name", + "peft_type", + "total_time", + "train_time", + "test_accuracy", + "train_loss", + "cuda_memory_max", + "cuda_memory_reserved_99th", + "cuda_memory_reserved_avg", + "file_size", + "created_at", + "task_name", + ] + other_columns = [col for col in df if col not in important_columns] + df = df[important_columns + other_columns] + + size_before_drop_dups = len(df) + columns = ["experiment_name", "model_id", "peft_type", "created_at"] + # we want to keep only the 
most recent run for each experiment + df = df.sort_values("created_at").drop_duplicates(columns, keep="last") + return df diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..05c4d2caef0fa0171adaefc2ef73943e16b5d0b0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +dash +gradio>=5.21.0 +pandas diff --git a/sanitizer.py b/sanitizer.py new file mode 100644 index 0000000000000000000000000000000000000000..7659d650c0fb293806d314f7334950ebaffbda33 --- /dev/null +++ b/sanitizer.py @@ -0,0 +1,100 @@ +import ast + +import pandas as pd + + +def _evaluate_node(df, node): + """ + Recursively evaluates an AST node to generate a pandas boolean mask. + """ + # Base Case: A simple comparison like 'price > 100' + if isinstance(node, ast.Compare): + if not isinstance(node.left, ast.Name): + raise ValueError("Left side of comparison must be a column name.") + col = node.left.id + if col not in df.columns: + raise ValueError(f"Column '{col}' not found in DataFrame.") + + if len(node.ops) > 1: + raise ValueError("Chained comparisons like '10 < price < 100' are not supported.") + + op_node = node.ops[0] + val_node = node.comparators[0] + try: + value = ast.literal_eval(val_node) + except ValueError: + raise ValueError("Right side of comparison must be a literal (number, string, list).") + + operator_map = { + ast.Gt: lambda c, v: df[c] > v, + ast.GtE: lambda c, v: df[c] >= v, + ast.Lt: lambda c, v: df[c] < v, + ast.LtE: lambda c, v: df[c] <= v, + ast.Eq: lambda c, v: df[c] == v, + ast.NotEq: lambda c, v: df[c] != v, + ast.In: lambda c, v: df[c].isin(v), + ast.NotIn: lambda c, v: ~df[c].isin(v) + } + op_type = type(op_node) + if op_type not in operator_map: + raise ValueError(f"Unsupported operator '{op_type.__name__}'.") + return operator_map[op_type](col, value) + + # Recursive Step: "Bitwise" operation & and | (the same as boolean operations) + elif isinstance(node, ast.BinOp): + if isinstance(node.op, ast.BitOr): + 
return _evaluate_node(df, node.left) | _evaluate_node(df, node.right) + elif isinstance(node.op, ast.BitAnd): + return _evaluate_node(df, node.left) & _evaluate_node(df, node.right) + + # Recursive Step: A boolean operation like '... and ...' or '... or ...' + elif isinstance(node, ast.BoolOp): + op_type = type(node.op) + # Evaluate the first value in the boolean expression + result = _evaluate_node(df, node.values[0]) + # Combine it with the rest of the values based on the operator + for i in range(1, len(node.values)): + if op_type is ast.And or op_type is ast.BitAnd: + result &= _evaluate_node(df, node.values[i]) + elif op_type is ast.Or or op_type is ast.BitOr: + result |= _evaluate_node(df, node.values[i]) + return result + + elif isinstance(node, ast.UnaryOp): + if not isinstance(node.op, ast.Not): + raise ValueError("Only supported unary op is negation.") + return ~_evaluate_node(df, node.operand) + + # If the node is not a comparison or boolean op, it's an unsupported expression type + else: + raise ValueError(f"Unsupported expression type: {type(node).__name__}") + + +def parse_and_filter(df, filter_str): + """ + Filters a pandas DataFrame using a string expression parsed by AST. + This is done to avoid the security vulnerables that `DataFrame.query` + brings (arbitrary code execution). + + Args: + df (pd.DataFrame): The DataFrame to filter. + filter_str (str): A string representing a filter expression. + e.g., "price > 100 and stock < 50" + Supported operators: >, >=, <, <=, ==, !=, in, not in, and, or. + + Returns: + pd.Series: A boolean Series representing the filter mask. + """ + if not filter_str: + return pd.Series([True] * len(df), index=df.index) + + try: + # 'eval' mode ensures the source is a single expression. 
+ tree = ast.parse(filter_str, mode='eval') + expression_node = tree.body + except (SyntaxError, ValueError) as e: + raise ValueError(f"Invalid filter syntax: {e}") + + # The recursive evaluation starts here + mask = _evaluate_node(df, expression_node) + return mask diff --git a/test_sanitizer.py b/test_sanitizer.py new file mode 100644 index 0000000000000000000000000000000000000000..59c0dd191e887aaeebbfce9dff9e88e6be0e2152 --- /dev/null +++ b/test_sanitizer.py @@ -0,0 +1,38 @@ +import pandas as pd +import pytest + +from .sanitizer import parse_and_filter + + +@pytest.fixture +def df_products(): + data = { + 'product_id': [101, 102, 103, 104, 105, 106], + 'category': ['Electronics', 'Books', 'Electronics', 'Home Goods', 'Books', 'Electronics'], + 'price': [799.99, 19.99, 49.50, 120.00, 24.99, 150.00], + 'stock': [15, 300, 50, 25, 150, 0] + } + return pd.DataFrame(data) + + +def test_exploit_fails(df_products): + with pytest.raises(ValueError) as e: + mask1 = parse_and_filter(df_products, + """price < 50 and @os.system("/bin/echo password")""") + assert 'Invalid filter syntax' in str(e) + + +@pytest.mark.parametrize('expression,ids', [ + ("price < 50", [102, 103, 105]), + ("product_id in [101, 102]", [101, 102]), + ("price < 50 and category == 'Electronics'", [103]), + ("stock < 100 or category == 'Home Goods'", [101, 103, 104, 106]), + ("(price > 100 and stock < 20) or category == 'Books'", [101, 102, 105, 106]), + ("not (price > 50 or stock > 100)", [103]), + ("not price > 50", [102, 103, 105]), + ("(price < 50) & (category == 'Electronics')", [103]), + ("(stock < 100) | (category == 'Home Goods')", [101, 103, 104, 106]), +]) +def test_operations(df_products, expression, ids): + mask1 = parse_and_filter(df_products, expression) + assert sorted(df_products[mask1].product_id) == sorted(ids)