{ "cells": [ { "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "eb33b19f-1206-41ee-84e2-e6258a12eef7", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python 3.11.0rc1\r\n" ] } ], "source": [ "if 'dbutils' in locals():\n", "    dbutils.library.restartPython()\n", "\n", "!python --version" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "0ea8b46b-839b-445b-8043-ccdf4e920ace", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "6d394937-6c99-4a7c-9d32-7600a280032f", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Workspace/Users/donghao.huang@mastercard.com/llm-finetuning\n" ] } ], "source": [ "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "# Resolve the project root only once per kernel session: after the chdir\n", "# below, Path.cwd().parent would climb one more level on every re-run.\n", "if \"workding_dir\" not in globals():\n", "    workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "# Avoid stacking duplicate sys.path entries when this cell is re-run.\n", "if workding_dir not in sys.path:\n", "    sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8affcc96-edf6-4489-b656-afb475a038e6", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "need_to_setup_env = False\n", "need_to_setup_env" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "72f9cf79-7b0d-4d9e-90a0-1fa5251b947f", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "if need_to_setup_env:\n", "    %pip config set global.index-url https://artifacts.forge.mastercard.com/artifactory/api/pypi/python/simple\n", "    %pip install tf-keras\n", "    %pip install -q --upgrade accelerate einops xformers torchvision\n", "    %pip install -r requirements.txt\n", "    # The extras spec is quoted so the shell never treats [...] as a glob pattern.\n", "    !cd ../LLaMA-Factory && pip install -e \".[torch,bitsandbytes]\" && FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --upgrade flash-attn" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c06c61fd-4c6f-4099-bd3b-46188ab835d7", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /Workspace/Users/donghao.huang@mastercard.com/llm-finetuning\n" ] } ], "source": [ "os.chdir(workding_dir)\n", "# Avoid stacking duplicate sys.path entries when this cell is re-run.\n", "if workding_dir not in sys.path:\n", "    sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9f67ec60-2f24-411c-84eb-0dd664b44775", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading env vars from: /Workspace/Users/donghao.huang@mastercard.com/llm-finetuning/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = 
find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", "    found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "f1597656-8042-4878-9d3b-9ebfb8dd86dc", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "('unsloth/Qwen2-1.5B-Instruct',\n", " True,\n", " None,\n", " None,\n", " 2048,\n", " 10,\n", " None,\n", " 'datasets/mac/mac.tsv',\n", " 'results/mac-results_lf-r3.csv',\n", " 'TRUE',\n", " 'true')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "model_name = os.getenv(\"MODEL_NAME\")\n", "token = os.getenv(\"HF_TOKEN\") or None\n", "# Compare case-insensitively: env files for this project spell booleans both as\n", "# 'TRUE' and 'true', and an exact match would silently turn 4-bit loading off.\n", "load_in_4bit = (os.getenv(\"LOAD_IN_4BIT\") or \"\").strip().lower() == \"true\"\n", "local_model = os.getenv(\"LOCAL_MODEL\")\n", "hub_model = os.getenv(\"HUB_MODEL\")\n", "num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n", "data_path = os.getenv(\"DATA_PATH\")\n", "results_path = os.getenv(\"RESULTS_PATH\")\n", "\n", "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = (\n", "    None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", ")\n", "\n", "model_name, load_in_4bit, local_model, hub_model, max_seq_length, num_train_epochs, dtype, data_path, results_path, os.getenv(\"DISABLE_MLFLOW_INTEGRATION\"), os.getenv(\"WANDB_DISABLED\")" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e3ab54ba-7b6d-4817-bf2e-c5d711508b58", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sat Jul 6 05:25:48 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. 
|\r\n|=========================================+======================+======================|\r\n| 0 Tesla T4 Off | 00000001:00:00.0 Off | 0 |\r\n| N/A 63C P8 11W / 70W | 2MiB / 15360MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n" ] } ], "source": [ "!nvidia-smi" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b2a43943-9324-4839-9a47-cfa72de2244b", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python 3.11.0rc1\r\nName: flash-attn\nVersion: 2.5.9.post1\nSummary: Flash Attention: Fast and Memory-Efficient Exact Attention\nHome-page: https://github.com/Dao-AILab/flash-attention\nAuthor: Tri Dao\nAuthor-email: trid@cs.stanford.edu\nLicense: \nLocation: /local_disk0/.ephemeral_nfs/envs/pythonEnv-40f92d71-6c52-44a3-a1ef-62cdea633f68/lib/python3.11/site-packages\nRequires: einops, torch\nRequired-by: \nCPU times: user 10.1 ms, sys: 14.5 ms, total: 24.6 ms\nWall time: 4.25 s\n" ] } ], "source": [ "%%time\n", "!python --version\n", "!pip show flash-attn" ] },
{ "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "df316e8f-0710-445e-b6f9-fd67ebfeac5a", "showTitle": false, "title": "" } }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Current Directory:\r\n/Workspace/Users/donghao.huang@mastercard.com/llm-finetuning/llama-factory\r\nconfig/llama3_8b_lora_sft.yaml:\r\n {\r\n \"model_name_or_path\": \"gradientai/Llama-3-8B-Instruct-Gradient-1048k\",\r\n \"stage\": \"sft\",\r\n \"do_train\": true,\r\n \"finetuning_type\": \"lora\",\r\n \"lora_target\": \"all\",\r\n \"quantization_bit\": 4,\r\n \"loraplus_lr_ratio\": 16.0,\r\n \"dataset\": \"alpaca_mac\",\r\n \"template\": \"llama3\",\r\n \"cutoff_len\": 1024,\r\n \"max_samples\": 4528,\r\n \"overwrite_cache\": true,\r\n \"preprocessing_num_workers\": 16,\r\n \"output_dir\": \"/Workspace/Users/donghao.huang@mastercard.com/lf-saves/llama3-8b/lora/sft/\",\r\n \"logging_steps\": 10,\r\n \"save_steps\": 560,\r\n \"plot_loss\": true,\r\n \"overwrite_output_dir\": true,\r\n \"per_device_train_batch_size\": 1,\r\n \"gradient_accumulation_steps\": 8,\r\n \"learning_rate\": 0.0001,\r\n \"num_train_epochs\": 6.0,\r\n \"lr_scheduler_type\": \"cosine\",\r\n \"warmup_ratio\": 0.1,\r\n \"bf16\": true,\r\n \"ddp_timeout\": 180000000,\r\n \"val_size\": 0.01,\r\n \"per_device_eval_batch_size\": 1,\r\n \"eval_strategy\": \"steps\",\r\n \"eval_steps\": 560,\r\n \"report_to\": \"none\"\r\n}\r\n2024-07-06 05:25:57.983661: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\r\nTo enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\r\n[2024-07-06 05:26:07,672] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n07/06/2024 05:26:16 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training.\r\n07/06/2024 05:26:16 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: 
torch.bfloat16\r\n[INFO|tokenization_utils_base.py:2161] 2024-07-06 05:26:16,583 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/tokenizer.json\r\n[INFO|tokenization_utils_base.py:2161] 2024-07-06 05:26:16,583 >> loading file added_tokens.json from cache at None\r\n[INFO|tokenization_utils_base.py:2161] 2024-07-06 05:26:16,583 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/special_tokens_map.json\r\n[INFO|tokenization_utils_base.py:2161] 2024-07-06 05:26:16,583 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/tokenizer_config.json\r\n[WARNING|logging.py:313] 2024-07-06 05:26:16,885 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n07/06/2024 05:26:16 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>\r\n07/06/2024 05:26:16 - INFO - llamafactory.data.template - Add pad token: <|eot_id|>\r\n07/06/2024 05:26:17 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\r\n\rConverting format of dataset (num_proc=16): 0%| | 0/4528 [00:00<|start_header_id|>user<|end_header_id|>\r\n\r\nPlease translate the following Chinese text into English and provide only the translated content, nothing else.\r\n全仗着狐仙搭救。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\r\n\r\nBecause I was protected by a fox fairy.<|eot_id|>\r\nlabel_ids:\r\n[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18433, 358, 
574, 2682, 555, 264, 39935, 45586, 13, 128009]\r\nlabels:\r\nBecause I was protected by a fox fairy.<|eot_id|>\r\n[INFO|configuration_utils.py:733] 2024-07-06 05:26:21,331 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/config.json\r\n[INFO|configuration_utils.py:800] 2024-07-06 05:26:21,332 >> Model config LlamaConfig {\r\n \"_name_or_path\": \"gradientai/Llama-3-8B-Instruct-Gradient-1048k\",\r\n \"architectures\": [\r\n \"LlamaForCausalLM\"\r\n ],\r\n \"attention_bias\": false,\r\n \"attention_dropout\": 0.0,\r\n \"bos_token_id\": 128000,\r\n \"eos_token_id\": 128001,\r\n \"hidden_act\": \"silu\",\r\n \"hidden_size\": 4096,\r\n \"initializer_range\": 0.02,\r\n \"intermediate_size\": 14336,\r\n \"max_position_embeddings\": 1048576,\r\n \"mlp_bias\": false,\r\n \"model_type\": \"llama\",\r\n \"num_attention_heads\": 32,\r\n \"num_hidden_layers\": 32,\r\n \"num_key_value_heads\": 8,\r\n \"pretraining_tp\": 1,\r\n \"rms_norm_eps\": 1e-05,\r\n \"rope_scaling\": null,\r\n \"rope_theta\": 3580165449.0,\r\n \"tie_word_embeddings\": false,\r\n \"torch_dtype\": \"bfloat16\",\r\n \"transformers_version\": \"4.42.3\",\r\n \"use_cache\": true,\r\n \"vocab_size\": 128256\r\n}\r\n\r\n07/06/2024 05:26:21 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.\r\n[INFO|modeling_utils.py:3556] 2024-07-06 05:26:21,358 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/model.safetensors.index.json\r\n[INFO|modeling_utils.py:1531] 2024-07-06 05:26:21,360 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.\r\n[INFO|configuration_utils.py:1000] 2024-07-06 05:26:21,361 >> Generate config GenerationConfig {\r\n \"bos_token_id\": 128000,\r\n 
\"eos_token_id\": 128001\r\n}\r\n\r\n\rLoading checkpoint shards: 0%| | 0/4 [00:00> All model checkpoint weights were used when initializing LlamaForCausalLM.\r\n\r\n[INFO|modeling_utils.py:4372] 2024-07-06 05:26:25,528 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at gradientai/Llama-3-8B-Instruct-Gradient-1048k.\r\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.\r\n[INFO|configuration_utils.py:955] 2024-07-06 05:26:25,553 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--gradientai--Llama-3-8B-Instruct-Gradient-1048k/snapshots/8697fb25cb77c852311e03b4464b8467471d56a4/generation_config.json\r\n[INFO|configuration_utils.py:1000] 2024-07-06 05:26:25,553 >> Generate config GenerationConfig {\r\n \"bos_token_id\": 128000,\r\n \"do_sample\": true,\r\n \"eos_token_id\": [\r\n 128001,\r\n 128009\r\n ],\r\n \"max_length\": 4096,\r\n \"temperature\": 0.6,\r\n \"top_p\": 0.9\r\n}\r\n\r\n07/06/2024 05:26:25 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\r\n07/06/2024 05:26:25 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\r\n07/06/2024 05:26:25 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\r\n07/06/2024 05:26:25 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\r\n07/06/2024 05:26:25 - INFO - llamafactory.model.model_utils.misc - Found linear modules: gate_proj,v_proj,o_proj,k_proj,up_proj,down_proj,q_proj\r\n07/06/2024 05:26:26 - INFO - llamafactory.model.loader - trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605\r\n[INFO|trainer.py:642] 2024-07-06 05:26:26,145 >> Using auto half precision backend\r\n07/06/2024 05:26:26 - WARNING - llamafactory.train.callbacks - Previous trainer log in this folder will be 
deleted.\r\ntraining_args.resume_from_checkpoint: None\r\n07/06/2024 05:26:26 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.\r\n[INFO|trainer.py:2128] 2024-07-06 05:26:26,821 >> ***** Running training *****\r\n[INFO|trainer.py:2129] 2024-07-06 05:26:26,821 >> Num examples = 4,482\r\n[INFO|trainer.py:2130] 2024-07-06 05:26:26,821 >> Num Epochs = 6\r\n[INFO|trainer.py:2131] 2024-07-06 05:26:26,821 >> Instantaneous batch size per device = 1\r\n[INFO|trainer.py:2134] 2024-07-06 05:26:26,821 >> Total train batch size (w. parallel, distributed & accumulation) = 8\r\n[INFO|trainer.py:2135] 2024-07-06 05:26:26,821 >> Gradient Accumulation steps = 8\r\n[INFO|trainer.py:2136] 2024-07-06 05:26:26,821 >> Total optimization steps = 3,360\r\n[INFO|trainer.py:2137] 2024-07-06 05:26:26,825 >> Number of trainable parameters = 20,971,520\r\n\r 0%| | 0/3360 [00:00