Egoli committed on Jul 14, 2024

Commit

3cd5588

verified ·

1 Parent(s): 30df527

Upload 54 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

adapter_config.json +34 -0
checkpoint-100/README.md +202 -0
checkpoint-100/adapter_config.json +34 -0
checkpoint-100/adapter_model.safetensors +3 -0
checkpoint-100/added_tokens.json +5 -0
checkpoint-100/merges.txt +0 -0
checkpoint-100/optimizer.pt +3 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/special_tokens_map.json +20 -0
checkpoint-100/tokenizer.json +0 -0
checkpoint-100/tokenizer_config.json +44 -0
checkpoint-100/trainer_state.json +193 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-100/vocab.json +0 -0
checkpoint-200/README.md +202 -0
checkpoint-200/adapter_config.json +34 -0
checkpoint-200/adapter_model.safetensors +3 -0
checkpoint-200/added_tokens.json +5 -0
checkpoint-200/merges.txt +0 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +20 -0
checkpoint-200/tokenizer.json +0 -0
checkpoint-200/tokenizer_config.json +44 -0
checkpoint-200/trainer_state.json +353 -0
checkpoint-200/training_args.bin +3 -0
checkpoint-200/vocab.json +0 -0
checkpoint-240/README.md +202 -0
checkpoint-240/adapter_config.json +34 -0
checkpoint-240/adapter_model.safetensors +3 -0
checkpoint-240/added_tokens.json +5 -0
checkpoint-240/merges.txt +0 -0
checkpoint-240/optimizer.pt +3 -0
checkpoint-240/rng_state.pth +3 -0
checkpoint-240/scheduler.pt +3 -0
checkpoint-240/special_tokens_map.json +20 -0
checkpoint-240/tokenizer.json +0 -0
checkpoint-240/tokenizer_config.json +44 -0
checkpoint-240/trainer_state.json +417 -0
checkpoint-240/training_args.bin +3 -0
checkpoint-240/vocab.json +0 -0
config.json +28 -0
generation_config.json +14 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": true,
+  "use_rslora": false
+}

checkpoint-100/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": true,
+  "use_rslora": false
+}

checkpoint-100/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af6eb31e3f5681f5ddcb0a97e13730261601fa70d74d493da03663b07d2243d0
+size 39588464

checkpoint-100/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-100/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ebcb99312f2db4690364c5b92a6f8a7594054a6239968d34260d41676b2fd1eb
+size 79513462

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff39c56748cc5e85691d62bfd6a79e60ab92c1f59b698543eba03446c493cd5c
+size 14244

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8210c3e296732d9d03b85e895008ef852933ca6cd6840952b6a81f4a6dc65e17
+size 1064

checkpoint-100/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-100/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-100/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,193 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 8.0,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.758865475654602,
+      "learning_rate": 4.994647308096509e-05,
+      "loss": 4.3569,
+      "num_input_tokens_seen": 5888,
+      "step": 5
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.4234931468963623,
+      "learning_rate": 4.9786121534345265e-05,
+      "loss": 4.043,
+      "num_input_tokens_seen": 11872,
+      "step": 10
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.5642849206924438,
+      "learning_rate": 4.951963201008076e-05,
+      "loss": 3.7996,
+      "num_input_tokens_seen": 17616,
+      "step": 15
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0812954902648926,
+      "learning_rate": 4.914814565722671e-05,
+      "loss": 3.7366,
+      "num_input_tokens_seen": 23584,
+      "step": 20
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.3543965816497803,
+      "learning_rate": 4.867325323737765e-05,
+      "loss": 3.6712,
+      "num_input_tokens_seen": 29600,
+      "step": 25
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.436577558517456,
+      "learning_rate": 4.8096988312782174e-05,
+      "loss": 3.6046,
+      "num_input_tokens_seen": 35184,
+      "step": 30
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.079750657081604,
+      "learning_rate": 4.742181853831721e-05,
+      "loss": 3.4873,
+      "num_input_tokens_seen": 41056,
+      "step": 35
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.5652551651000977,
+      "learning_rate": 4.665063509461097e-05,
+      "loss": 3.3525,
+      "num_input_tokens_seen": 47040,
+      "step": 40
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 1.6999948024749756,
+      "learning_rate": 4.5786740307563636e-05,
+      "loss": 3.2724,
+      "num_input_tokens_seen": 53072,
+      "step": 45
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.4872270822525024,
+      "learning_rate": 4.4833833507280884e-05,
+      "loss": 3.3493,
+      "num_input_tokens_seen": 59360,
+      "step": 50
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 1.450323462486267,
+      "learning_rate": 4.379599518697444e-05,
+      "loss": 3.2383,
+      "num_input_tokens_seen": 64928,
+      "step": 55
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 1.70930016040802,
+      "learning_rate": 4.267766952966369e-05,
+      "loss": 3.1941,
+      "num_input_tokens_seen": 70752,
+      "step": 60
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 1.9510873556137085,
+      "learning_rate": 4.148364537750172e-05,
+      "loss": 3.1134,
+      "num_input_tokens_seen": 76544,
+      "step": 65
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 1.781402826309204,
+      "learning_rate": 4.021903572521802e-05,
+      "loss": 3.0551,
+      "num_input_tokens_seen": 82736,
+      "step": 70
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 1.4807748794555664,
+      "learning_rate": 3.888925582549006e-05,
+      "loss": 3.0148,
+      "num_input_tokens_seen": 88464,
+      "step": 75
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 1.936543345451355,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 2.9215,
+      "num_input_tokens_seen": 94496,
+      "step": 80
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 2.542340040206909,
+      "learning_rate": 3.6057217255475034e-05,
+      "loss": 2.8317,
+      "num_input_tokens_seen": 100192,
+      "step": 85
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 2.17262864112854,
+      "learning_rate": 3.456708580912725e-05,
+      "loss": 2.8665,
+      "num_input_tokens_seen": 105904,
+      "step": 90
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 1.919408917427063,
+      "learning_rate": 3.303598663257904e-05,
+      "loss": 2.8028,
+      "num_input_tokens_seen": 111856,
+      "step": 95
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 2.5374443531036377,
+      "learning_rate": 3.147047612756302e-05,
+      "loss": 2.9059,
+      "num_input_tokens_seen": 118064,
+      "step": 100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 118064,
+  "num_train_epochs": 20,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 935221387051008.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88861ed737b91798f2b18542cf81036f47fb195cc2283a0f86fbef9f5c7d5c56
+size 5304

checkpoint-100/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": true,
+  "use_rslora": false
+}

checkpoint-200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a1ab856bf07f021b2dd2bdb096a19a1afd852d467dd5510be76e8768d15564
+size 39588464

checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f356a0679178142786859e1fa31514d14477e6ac628b118ef9663ac9907ab813
+size 79513462

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d44a53784733ae7dcb55555ed75e1fea306bd7427001380dc0f3854a19687855
+size 14244

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3133cb56fc46e3d814a06b3f520019ad72b705f2bc8fd0dd898390bb36dcd1da
+size 1064

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,353 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 16.0,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.758865475654602,
+      "learning_rate": 4.994647308096509e-05,
+      "loss": 4.3569,
+      "num_input_tokens_seen": 5888,
+      "step": 5
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.4234931468963623,
+      "learning_rate": 4.9786121534345265e-05,
+      "loss": 4.043,
+      "num_input_tokens_seen": 11872,
+      "step": 10
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.5642849206924438,
+      "learning_rate": 4.951963201008076e-05,
+      "loss": 3.7996,
+      "num_input_tokens_seen": 17616,
+      "step": 15
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0812954902648926,
+      "learning_rate": 4.914814565722671e-05,
+      "loss": 3.7366,
+      "num_input_tokens_seen": 23584,
+      "step": 20
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.3543965816497803,
+      "learning_rate": 4.867325323737765e-05,
+      "loss": 3.6712,
+      "num_input_tokens_seen": 29600,
+      "step": 25
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.436577558517456,
+      "learning_rate": 4.8096988312782174e-05,
+      "loss": 3.6046,
+      "num_input_tokens_seen": 35184,
+      "step": 30
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.079750657081604,
+      "learning_rate": 4.742181853831721e-05,
+      "loss": 3.4873,
+      "num_input_tokens_seen": 41056,
+      "step": 35
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.5652551651000977,
+      "learning_rate": 4.665063509461097e-05,
+      "loss": 3.3525,
+      "num_input_tokens_seen": 47040,
+      "step": 40
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 1.6999948024749756,
+      "learning_rate": 4.5786740307563636e-05,
+      "loss": 3.2724,
+      "num_input_tokens_seen": 53072,
+      "step": 45
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.4872270822525024,
+      "learning_rate": 4.4833833507280884e-05,
+      "loss": 3.3493,
+      "num_input_tokens_seen": 59360,
+      "step": 50
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 1.450323462486267,
+      "learning_rate": 4.379599518697444e-05,
+      "loss": 3.2383,
+      "num_input_tokens_seen": 64928,
+      "step": 55
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 1.70930016040802,
+      "learning_rate": 4.267766952966369e-05,
+      "loss": 3.1941,
+      "num_input_tokens_seen": 70752,
+      "step": 60
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 1.9510873556137085,
+      "learning_rate": 4.148364537750172e-05,
+      "loss": 3.1134,
+      "num_input_tokens_seen": 76544,
+      "step": 65
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 1.781402826309204,
+      "learning_rate": 4.021903572521802e-05,
+      "loss": 3.0551,
+      "num_input_tokens_seen": 82736,
+      "step": 70
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 1.4807748794555664,
+      "learning_rate": 3.888925582549006e-05,
+      "loss": 3.0148,
+      "num_input_tokens_seen": 88464,
+      "step": 75
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 1.936543345451355,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 2.9215,
+      "num_input_tokens_seen": 94496,
+      "step": 80
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 2.542340040206909,
+      "learning_rate": 3.6057217255475034e-05,
+      "loss": 2.8317,
+      "num_input_tokens_seen": 100192,
+      "step": 85
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 2.17262864112854,
+      "learning_rate": 3.456708580912725e-05,
+      "loss": 2.8665,
+      "num_input_tokens_seen": 105904,
+      "step": 90
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 1.919408917427063,
+      "learning_rate": 3.303598663257904e-05,
+      "loss": 2.8028,
+      "num_input_tokens_seen": 111856,
+      "step": 95
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 2.5374443531036377,
+      "learning_rate": 3.147047612756302e-05,
+      "loss": 2.9059,
+      "num_input_tokens_seen": 118064,
+      "step": 100
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 1.868935465812683,
+      "learning_rate": 2.9877258050403212e-05,
+      "loss": 2.6716,
+      "num_input_tokens_seen": 124192,
+      "step": 105
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 1.9682857990264893,
+      "learning_rate": 2.8263154805501297e-05,
+      "loss": 2.7336,
+      "num_input_tokens_seen": 129856,
+      "step": 110
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 2.1156065464019775,
+      "learning_rate": 2.663507823075358e-05,
+      "loss": 2.6835,
+      "num_input_tokens_seen": 135760,
+      "step": 115
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 2.3779890537261963,
+      "learning_rate": 2.5e-05,
+      "loss": 2.5898,
+      "num_input_tokens_seen": 141760,
+      "step": 120
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.671842336654663,
+      "learning_rate": 2.3364921769246423e-05,
+      "loss": 2.5017,
+      "num_input_tokens_seen": 147664,
+      "step": 125
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 2.809380292892456,
+      "learning_rate": 2.173684519449872e-05,
+      "loss": 2.4384,
+      "num_input_tokens_seen": 153312,
+      "step": 130
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 3.9723222255706787,
+      "learning_rate": 2.0122741949596797e-05,
+      "loss": 2.5169,
+      "num_input_tokens_seen": 159584,
+      "step": 135
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 2.3110992908477783,
+      "learning_rate": 1.852952387243698e-05,
+      "loss": 2.3124,
+      "num_input_tokens_seen": 165040,
+      "step": 140
+    },
+    {
+      "epoch": 11.6,
+      "grad_norm": 2.2998690605163574,
+      "learning_rate": 1.6964013367420966e-05,
+      "loss": 2.4734,
+      "num_input_tokens_seen": 170944,
+      "step": 145
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 3.0829477310180664,
+      "learning_rate": 1.5432914190872757e-05,
+      "loss": 2.3957,
+      "num_input_tokens_seen": 177168,
+      "step": 150
+    },
+    {
+      "epoch": 12.4,
+      "grad_norm": 2.7017433643341064,
+      "learning_rate": 1.3942782744524973e-05,
+      "loss": 2.5141,
+      "num_input_tokens_seen": 183088,
+      "step": 155
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 3.001948118209839,
+      "learning_rate": 1.2500000000000006e-05,
+      "loss": 2.2276,
+      "num_input_tokens_seen": 188736,
+      "step": 160
+    },
+    {
+      "epoch": 13.2,
+      "grad_norm": 3.0778212547302246,
+      "learning_rate": 1.1110744174509952e-05,
+      "loss": 2.1839,
+      "num_input_tokens_seen": 194592,
+      "step": 165
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 3.4998250007629395,
+      "learning_rate": 9.780964274781984e-06,
+      "loss": 2.3212,
+      "num_input_tokens_seen": 200528,
+      "step": 170
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 2.9788782596588135,
+      "learning_rate": 8.51635462249828e-06,
+      "loss": 2.2398,
+      "num_input_tokens_seen": 206256,
+      "step": 175
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 3.000025510787964,
+      "learning_rate": 7.3223304703363135e-06,
+      "loss": 2.2431,
+      "num_input_tokens_seen": 212480,
+      "step": 180
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 3.488719940185547,
+      "learning_rate": 6.204004813025568e-06,
+      "loss": 2.0743,
+      "num_input_tokens_seen": 218208,
+      "step": 185
+    },
+    {
+      "epoch": 15.2,
+      "grad_norm": 3.7175214290618896,
+      "learning_rate": 5.166166492719124e-06,
+      "loss": 2.1343,
+      "num_input_tokens_seen": 223632,
+      "step": 190
+    },
+    {
+      "epoch": 15.6,
+      "grad_norm": 3.5645270347595215,
+      "learning_rate": 4.213259692436367e-06,
+      "loss": 2.3751,
+      "num_input_tokens_seen": 229776,
+      "step": 195
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 3.673673629760742,
+      "learning_rate": 3.3493649053890326e-06,
+      "loss": 2.0981,
+      "num_input_tokens_seen": 235392,
+      "step": 200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 235392,
+  "num_train_epochs": 20,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1864612690919424.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88861ed737b91798f2b18542cf81036f47fb195cc2283a0f86fbef9f5c7d5c56
+size 5304

checkpoint-200/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-240/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2-1.5B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-240/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": true,
+  "use_rslora": false
+}

checkpoint-240/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fda7d6167c440da3e80fdb3580d334fd34dce3b2eea000bf7763b8b38bddf21
+size 39588464

checkpoint-240/added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

checkpoint-240/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-240/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6eb8127430096672be9d5f8d2aa0b72c052b68156233fd83e3ec31a6b219fba
+size 79513462

checkpoint-240/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1cb512edbf725e7c96c3109f7cd79eba5fb2a57850749b080cd81dd106ee0914
+size 14244

checkpoint-240/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:573e42e65bdbeb9991d17e7ed690aa2f143f2a22479ade9d84c12c5ef1c9d7ac
+size 1064

checkpoint-240/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-240/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-240/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-240/trainer_state.json ADDED Viewed

	@@ -0,0 +1,417 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 19.2,
+  "eval_steps": 500,
+  "global_step": 240,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.758865475654602,
+      "learning_rate": 4.994647308096509e-05,
+      "loss": 4.3569,
+      "num_input_tokens_seen": 5888,
+      "step": 5
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.4234931468963623,
+      "learning_rate": 4.9786121534345265e-05,
+      "loss": 4.043,
+      "num_input_tokens_seen": 11872,
+      "step": 10
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.5642849206924438,
+      "learning_rate": 4.951963201008076e-05,
+      "loss": 3.7996,
+      "num_input_tokens_seen": 17616,
+      "step": 15
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.0812954902648926,
+      "learning_rate": 4.914814565722671e-05,
+      "loss": 3.7366,
+      "num_input_tokens_seen": 23584,
+      "step": 20
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.3543965816497803,
+      "learning_rate": 4.867325323737765e-05,
+      "loss": 3.6712,
+      "num_input_tokens_seen": 29600,
+      "step": 25
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.436577558517456,
+      "learning_rate": 4.8096988312782174e-05,
+      "loss": 3.6046,
+      "num_input_tokens_seen": 35184,
+      "step": 30
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.079750657081604,
+      "learning_rate": 4.742181853831721e-05,
+      "loss": 3.4873,
+      "num_input_tokens_seen": 41056,
+      "step": 35
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.5652551651000977,
+      "learning_rate": 4.665063509461097e-05,
+      "loss": 3.3525,
+      "num_input_tokens_seen": 47040,
+      "step": 40
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 1.6999948024749756,
+      "learning_rate": 4.5786740307563636e-05,
+      "loss": 3.2724,
+      "num_input_tokens_seen": 53072,
+      "step": 45
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.4872270822525024,
+      "learning_rate": 4.4833833507280884e-05,
+      "loss": 3.3493,
+      "num_input_tokens_seen": 59360,
+      "step": 50
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 1.450323462486267,
+      "learning_rate": 4.379599518697444e-05,
+      "loss": 3.2383,
+      "num_input_tokens_seen": 64928,
+      "step": 55
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 1.70930016040802,
+      "learning_rate": 4.267766952966369e-05,
+      "loss": 3.1941,
+      "num_input_tokens_seen": 70752,
+      "step": 60
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 1.9510873556137085,
+      "learning_rate": 4.148364537750172e-05,
+      "loss": 3.1134,
+      "num_input_tokens_seen": 76544,
+      "step": 65
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 1.781402826309204,
+      "learning_rate": 4.021903572521802e-05,
+      "loss": 3.0551,
+      "num_input_tokens_seen": 82736,
+      "step": 70
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 1.4807748794555664,
+      "learning_rate": 3.888925582549006e-05,
+      "loss": 3.0148,
+      "num_input_tokens_seen": 88464,
+      "step": 75
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 1.936543345451355,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 2.9215,
+      "num_input_tokens_seen": 94496,
+      "step": 80
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 2.542340040206909,
+      "learning_rate": 3.6057217255475034e-05,
+      "loss": 2.8317,
+      "num_input_tokens_seen": 100192,
+      "step": 85
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 2.17262864112854,
+      "learning_rate": 3.456708580912725e-05,
+      "loss": 2.8665,
+      "num_input_tokens_seen": 105904,
+      "step": 90
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 1.919408917427063,
+      "learning_rate": 3.303598663257904e-05,
+      "loss": 2.8028,
+      "num_input_tokens_seen": 111856,
+      "step": 95
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 2.5374443531036377,
+      "learning_rate": 3.147047612756302e-05,
+      "loss": 2.9059,
+      "num_input_tokens_seen": 118064,
+      "step": 100
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 1.868935465812683,
+      "learning_rate": 2.9877258050403212e-05,
+      "loss": 2.6716,
+      "num_input_tokens_seen": 124192,
+      "step": 105
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 1.9682857990264893,
+      "learning_rate": 2.8263154805501297e-05,
+      "loss": 2.7336,
+      "num_input_tokens_seen": 129856,
+      "step": 110
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 2.1156065464019775,
+      "learning_rate": 2.663507823075358e-05,
+      "loss": 2.6835,
+      "num_input_tokens_seen": 135760,
+      "step": 115
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 2.3779890537261963,
+      "learning_rate": 2.5e-05,
+      "loss": 2.5898,
+      "num_input_tokens_seen": 141760,
+      "step": 120
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.671842336654663,
+      "learning_rate": 2.3364921769246423e-05,
+      "loss": 2.5017,
+      "num_input_tokens_seen": 147664,
+      "step": 125
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 2.809380292892456,
+      "learning_rate": 2.173684519449872e-05,
+      "loss": 2.4384,
+      "num_input_tokens_seen": 153312,
+      "step": 130
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 3.9723222255706787,
+      "learning_rate": 2.0122741949596797e-05,
+      "loss": 2.5169,
+      "num_input_tokens_seen": 159584,
+      "step": 135
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 2.3110992908477783,
+      "learning_rate": 1.852952387243698e-05,
+      "loss": 2.3124,
+      "num_input_tokens_seen": 165040,
+      "step": 140
+    },
+    {
+      "epoch": 11.6,
+      "grad_norm": 2.2998690605163574,
+      "learning_rate": 1.6964013367420966e-05,
+      "loss": 2.4734,
+      "num_input_tokens_seen": 170944,
+      "step": 145
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 3.0829477310180664,
+      "learning_rate": 1.5432914190872757e-05,
+      "loss": 2.3957,
+      "num_input_tokens_seen": 177168,
+      "step": 150
+    },
+    {
+      "epoch": 12.4,
+      "grad_norm": 2.7017433643341064,
+      "learning_rate": 1.3942782744524973e-05,
+      "loss": 2.5141,
+      "num_input_tokens_seen": 183088,
+      "step": 155
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 3.001948118209839,
+      "learning_rate": 1.2500000000000006e-05,
+      "loss": 2.2276,
+      "num_input_tokens_seen": 188736,
+      "step": 160
+    },
+    {
+      "epoch": 13.2,
+      "grad_norm": 3.0778212547302246,
+      "learning_rate": 1.1110744174509952e-05,
+      "loss": 2.1839,
+      "num_input_tokens_seen": 194592,
+      "step": 165
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 3.4998250007629395,
+      "learning_rate": 9.780964274781984e-06,
+      "loss": 2.3212,
+      "num_input_tokens_seen": 200528,
+      "step": 170
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 2.9788782596588135,
+      "learning_rate": 8.51635462249828e-06,
+      "loss": 2.2398,
+      "num_input_tokens_seen": 206256,
+      "step": 175
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 3.000025510787964,
+      "learning_rate": 7.3223304703363135e-06,
+      "loss": 2.2431,
+      "num_input_tokens_seen": 212480,
+      "step": 180
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 3.488719940185547,
+      "learning_rate": 6.204004813025568e-06,
+      "loss": 2.0743,
+      "num_input_tokens_seen": 218208,
+      "step": 185
+    },
+    {
+      "epoch": 15.2,
+      "grad_norm": 3.7175214290618896,
+      "learning_rate": 5.166166492719124e-06,
+      "loss": 2.1343,
+      "num_input_tokens_seen": 223632,
+      "step": 190
+    },
+    {
+      "epoch": 15.6,
+      "grad_norm": 3.5645270347595215,
+      "learning_rate": 4.213259692436367e-06,
+      "loss": 2.3751,
+      "num_input_tokens_seen": 229776,
+      "step": 195
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 3.673673629760742,
+      "learning_rate": 3.3493649053890326e-06,
+      "loss": 2.0981,
+      "num_input_tokens_seen": 235392,
+      "step": 200
+    },
+    {
+      "epoch": 16.4,
+      "grad_norm": 3.5783541202545166,
+      "learning_rate": 2.578181461682794e-06,
+      "loss": 2.151,
+      "num_input_tokens_seen": 241168,
+      "step": 205
+    },
+    {
+      "epoch": 16.8,
+      "grad_norm": 3.5505430698394775,
+      "learning_rate": 1.9030116872178316e-06,
+      "loss": 2.1801,
+      "num_input_tokens_seen": 247200,
+      "step": 210
+    },
+    {
+      "epoch": 17.2,
+      "grad_norm": 3.7030887603759766,
+      "learning_rate": 1.3267467626223606e-06,
+      "loss": 2.3428,
+      "num_input_tokens_seen": 253536,
+      "step": 215
+    },
+    {
+      "epoch": 17.6,
+      "grad_norm": 3.326767921447754,
+      "learning_rate": 8.51854342773295e-07,
+      "loss": 2.1907,
+      "num_input_tokens_seen": 259568,
+      "step": 220
+    },
+    {
+      "epoch": 18.0,
+      "grad_norm": 3.3685076236724854,
+      "learning_rate": 4.803679899192392e-07,
+      "loss": 2.0246,
+      "num_input_tokens_seen": 265056,
+      "step": 225
+    },
+    {
+      "epoch": 18.4,
+      "grad_norm": 3.4106569290161133,
+      "learning_rate": 2.1387846565474045e-07,
+      "loss": 2.1411,
+      "num_input_tokens_seen": 271312,
+      "step": 230
+    },
+    {
+      "epoch": 18.8,
+      "grad_norm": 3.188873529434204,
+      "learning_rate": 5.352691903491303e-08,
+      "loss": 2.0321,
+      "num_input_tokens_seen": 277008,
+      "step": 235
+    },
+    {
+      "epoch": 19.2,
+      "grad_norm": 3.023102045059204,
+      "learning_rate": 0.0,
+      "loss": 2.2785,
+      "num_input_tokens_seen": 282848,
+      "step": 240
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 282848,
+  "num_train_epochs": 20,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2240526315257856.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-240/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88861ed737b91798f2b18542cf81036f47fb195cc2283a0f86fbef9f5c7d5c56
+size 5304

checkpoint-240/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.42.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.42.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f99ed3114e8d6550597464904b2bc7db3fdf52bd0f2a0f9428b826e8b4207979
+size 973272632

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0edffcc6d7000d32fe3e61f6b2ebae6ce326866a22ad68a0e868c10bd36453b7
+size 997323016

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:729e998c8a54d367da5dd56df489fd603d9cdd64a0cb3cb7e7839af40abda79a
+size 995739192

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9952fd5b5b99009d05da9bfda19e410762fd03e489f8a041fdd282428a4f949e
+size 121131784