{ "cells": [ { "cell_type": "markdown", "id": "9c3e4532", "metadata": { "papermill": { "duration": 1.349527, "end_time": "2023-10-20T04:29:10.661274", "exception": false, "start_time": "2023-10-20T04:29:09.311747", "status": "completed" }, "tags": [] }, "source": [ "# Train models using HuggingFace libraries\n", "\n", "This notebook takes parameters from a params.json file which is automatically\n", "created by Substratus K8s operator.\n", "\n", "The following parameters influence what happens in this notebook:\n", "- `dataset_urls`: A comma separated list of URLs. The URLs should point to\n", " json files that contain your training dataset. If unset a json or jsonl\n", " file should be present under the `/content/data/` directory.\n", "- `prompt_template`: The prompt template to use for training\n", "- `push_to_hub`: if this variable is set a repo id, then the trained\n", " model will get pushed to HuggingFace hub. For example,\n", " set it to \"substratusai/my-model\" to publish to substratusai HF org." ] }, { "cell_type": "code", "execution_count": 1, "id": "86ccd646", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:29:12.619462Z", "iopub.status.busy": "2023-10-20T04:29:12.619161Z", "iopub.status.idle": "2023-10-20T04:29:12.631792Z", "shell.execute_reply": "2023-10-20T04:29:12.631220Z" }, "papermill": { "duration": 0.998973, "end_time": "2023-10-20T04:29:12.633545", "exception": false, "start_time": "2023-10-20T04:29:11.634572", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',\n", " 'inference_prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n```graphql\\n',\n", " 'logging_steps': 50,\n", " 'modules_to_save': 'embed_tokens, lm_head',\n", " 'num_train_epochs': 3,\n", " 'per_device_eval_batch_size': 1,\n", " 'per_device_train_batch_size': 1,\n", " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. 
The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", " 'push_to_hub': 'substratusai/wgql-WithRetrieval-SchemaSplit-Train-80',\n", " 'save_steps': 50,\n", " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", " 'warmup_steps': 100}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "from pathlib import Path\n", "\n", "params = {}\n", "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", " params = json.load(params_file)\n", "\n", "\n", "params" ] }, { "cell_type": "code", "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:29:15.467402Z", "iopub.status.busy": "2023-10-20T04:29:15.466447Z", "iopub.status.idle": "2023-10-20T04:29:18.661586Z", "shell.execute_reply": "2023-10-20T04:29:18.660751Z" }, "papermill": { "duration": 5.013972, "end_time": "2023-10-20T04:29:18.663116", "exception": false, "start_time": "2023-10-20T04:29:13.649144", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-SchemaSplit-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "975d63e8725b4e61b10cc1257c68b465", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n" ] } ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "### Instruction:\n", "{prompt}\n", "### Response:\n", "{completion}\n", "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", "\n", "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", "if prompt[-len(eos_token):] != eos_token:\n", " prompt = prompt + eos_token\n", "\n", "print(prompt)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:32:11.202559Z", "iopub.status.busy": "2023-10-20T04:32:11.201752Z", "iopub.status.idle": "2023-10-20T04:32:12.068010Z", "shell.execute_reply": "2023-10-20T04:32:12.067140Z" }, "papermill": { "duration": 2.153714, "end_time": "2023-10-20T04:32:12.069912", "exception": false, "start_time": "2023-10-20T04:32:09.916198", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fri Oct 20 04:32:11 2023 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", "| | | MIG M. 
|\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\r\n", "| N/A 56C P0 30W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\r\n", "| N/A 56C P0 29W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\r\n", "| N/A 58C P0 30W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\r\n", "| N/A 58C P0 31W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", "+-----------------------------------------------------------------------------+\r\n", "| Processes: |\r\n", "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ "! nvidia-smi" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4d1e1795-c783-4ddf-999e-f1de19258928", "metadata": { "papermill": { "duration": 1.669218, "end_time": "2023-10-20T04:32:14.740838", "exception": false, "start_time": "2023-10-20T04:32:13.071620", "status": "completed" }, "tags": [] }, "source": [ "Prompt before fine tuning" ] }, { "cell_type": "code", "execution_count": 7, "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:32:19.219082Z", "iopub.status.busy": "2023-10-20T04:32:19.218724Z", "iopub.status.idle": "2023-10-20T04:32:19.240542Z", "shell.execute_reply": "2023-10-20T04:32:19.239838Z" }, "papermill": { "duration": 3.091602, "end_time": "2023-10-20T04:32:19.242046", "exception": false, "start_time": "2023-10-20T04:32:16.150444", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Dict\n", "# source: https://github.com/artidoro/qlora\n", "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", "\n", "def smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict: Dict,\n", " tokenizer: transformers.PreTrainedTokenizer,\n", " model: transformers.PreTrainedModel,\n", "):\n", " 
\"\"\"Resize tokenizer and embedding.\n", "\n", " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", " \"\"\"\n", " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", " model.resize_token_embeddings(len(tokenizer))\n", " if num_new_tokens > 0:\n", " input_embeddings_data = model.get_input_embeddings().weight.data\n", " output_embeddings_data = model.get_output_embeddings().weight.data\n", "\n", " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", "\n", " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", "\n", "if tokenizer._pad_token is None:\n", " smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", " tokenizer=tokenizer,\n", " model=model,\n", " )\n", "\n", "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", " # LLaMA tokenizer may not have correct special tokens set.\n", " # Check and add them if missing to prevent them from being parsed into different tokens.\n", " # Note that these are present in the vocabulary.\n", " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", " print('Adding special tokens.')\n", " tokenizer.add_special_tokens({\n", " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", " ),\n", " })\n", "\n", "tokenizer" ] }, { "cell_type": "code", "execution_count": 8, "id": "e78b510d", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:32:21.353541Z", "iopub.status.busy": "2023-10-20T04:32:21.352756Z", "iopub.status.idle": "2023-10-20T04:32:28.425095Z", "shell.execute_reply": "2023-10-20T04:32:28.424393Z" }, "papermill": { "duration": 8.18185, "end_time": "2023-10-20T04:32:28.426669", "exception": false, "start_time": "2023-10-20T04:32:20.244819", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fe3e712d3b954f6d9a6c5c0bac764ca5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/3190 [00:00, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config2 = LoraConfig(\n", " r=16,\n", " lora_alpha=16,\n", " 
lora_dropout=0.05,\n", " bias=\"none\",\n", " task_type=\"CAUSAL_LM\",\n", " target_modules=target_modules,\n", " modules_to_save = modules_to_save\n", ")\n", "print(lora_config2)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", "# add LoRA adaptor\n", "model = get_peft_model(model, lora_config2)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:34:07.472890Z", "iopub.status.busy": "2023-10-20T04:34:07.472055Z", "iopub.status.idle": "2023-10-20T04:34:07.493277Z", "shell.execute_reply": "2023-10-20T04:34:07.492455Z" }, "papermill": { "duration": 1.07078, "end_time": "2023-10-20T04:34:07.494832", "exception": false, "start_time": "2023-10-20T04:34:06.424052", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "TrainingArguments(\n", "_n_gpu=4,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_backend=None,\n", "ddp_broadcast_buffers=None,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "dispatch_batches=None,\n", "do_eval=False,\n", "do_predict=False,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=None,\n", "evaluation_strategy=no,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=4,\n", "gradient_checkpointing=False,\n", "greater_is_better=None,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_always_push=False,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "include_tokens_per_second=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=3e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=False,\n", "local_rank=0,\n", "log_level=passive,\n", "log_level_replica=warning,\n", "log_on_each_node=True,\n", "logging_dir=/content/artifacts/checkpoints/runs/Oct20_04-34-07_wgqlg-withretrieval-schemasplit-train-80-v4-modeller-hhc7z,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=50,\n", "logging_strategy=steps,\n", "lr_scheduler_type=cosine,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=3.0,\n", "optim=paged_adamw_32bit,\n", "optim_args=None,\n", "output_dir=/content/artifacts/checkpoints,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=1,\n", "per_device_train_batch_size=1,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", 
"run_name=/content/artifacts/checkpoints,\n", "save_on_each_node=False,\n", "save_safetensors=False,\n", "save_steps=50,\n", "save_strategy=steps,\n", "save_total_limit=None,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_cpu=False,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.02,\n", "warmup_steps=100,\n", "weight_decay=0.0,\n", ")" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import parse_training_args\n", "\n", "training_args = parse_training_args(params)\n", "training_args" ] }, { "cell_type": "code", "execution_count": 11, "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:34:09.671203Z", "iopub.status.busy": "2023-10-20T04:34:09.670480Z", "iopub.status.idle": "2023-10-20T04:34:09.673797Z", "shell.execute_reply": "2023-10-20T04:34:09.673184Z" }, "papermill": { "duration": 1.114255, "end_time": "2023-10-20T04:34:09.675322", "exception": false, "start_time": "2023-10-20T04:34:08.561067", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# data = data[\"train\"].train_test_split(test_size=0.1)\n", "# data\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5bc91439-6108-445c-8f85-e6558c9f0677", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:34:11.808129Z", "iopub.status.busy": "2023-10-20T04:34:11.807696Z", "iopub.status.idle": "2023-10-20T04:34:12.065605Z", "shell.execute_reply": "2023-10-20T04:34:12.064728Z" }, "papermill": { "duration": 1.353165, "end_time": "2023-10-20T04:34:12.067207", "exception": false, "start_time": "2023-10-20T04:34:10.714042", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! mkdir -p {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 13, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T04:34:14.147929Z", "iopub.status.busy": "2023-10-20T04:34:14.147060Z", "iopub.status.idle": "2023-10-20T08:45:44.939895Z", "shell.execute_reply": "2023-10-20T08:45:44.939204Z" }, "papermill": { "duration": 15092.697329, "end_time": "2023-10-20T08:45:45.801991", "exception": false, "start_time": "2023-10-20T04:34:13.104662", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [2391/2391 4:11:21, Epoch 2/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
501.069500
1000.515400
1500.274500
2000.173600
2500.118500
3000.084000
3500.065700
4000.054700
4500.048400
5000.044000
5500.039800
6000.039200
6500.038100
7000.034400
7500.034300
8000.032600
8500.027300
9000.026600
9500.027700
10000.026800
10500.026100
11000.026900
11500.026200
12000.025400
12500.023900
13000.025000
13500.024000
14000.025500
14500.024200
15000.023000
15500.024700
16000.023400
16500.019500
17000.019600
17500.020400
18000.019600
18500.019200
19000.019600
19500.018700
20000.019400
20500.020000
21000.020300
21500.019400
22000.019400
22500.019700
23000.019300
23500.019500

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=2391, training_loss=0.07075796158230988, metrics={'train_runtime': 15090.3717, 'train_samples_per_second': 0.634, 'train_steps_per_second': 0.158, 'total_flos': 3.0420974601928704e+17, 'train_loss': 0.07075796158230988, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", "\n", "checkpoint_path = Path(\"/content/artifacts/checkpoints\")\n", "\n", "# Only set resume_from_checkpoint True when directory exists and contains files\n", "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": 14, "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:45:47.918275Z", "iopub.status.busy": "2023-10-20T08:45:47.917652Z", "iopub.status.idle": "2023-10-20T08:46:06.598970Z", "shell.execute_reply": "2023-10-20T08:46:06.598308Z" }, "papermill": { "duration": 19.754456, "end_time": "2023-10-20T08:46:06.600631", "exception": false, "start_time": "2023-10-20T08:45:46.846175", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "PeftModelForCausalLM(\n", " (base_model): LoraModel(\n", " (model): LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): ModulesToSaveWrapper(\n", " (original_module): Embedding(32001, 4096)\n", " (modules_to_save): ModuleDict(\n", " (default): Embedding(32001, 4096)\n", " )\n", " )\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (k_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (v_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): 
ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (o_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (up_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (down_proj): Linear(\n", " in_features=11008, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=11008, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): ModulesToSaveWrapper(\n", " (original_module): Linear(in_features=4096, out_features=32001, bias=False)\n", " (modules_to_save): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=32001, bias=False)\n", " )\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path_lora)\n", "model" ] }, { "cell_type": "code", "execution_count": 15, "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:46:08.783717Z", "iopub.status.busy": "2023-10-20T08:46:08.782966Z", "iopub.status.idle": "2023-10-20T08:46:09.033621Z", "shell.execute_reply": "2023-10-20T08:46:09.032713Z" }, "papermill": { "duration": 1.261451, "end_time": "2023-10-20T08:46:09.035288", "exception": false, "start_time": "2023-10-20T08:46:07.773837", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.2G\r\n", " 512 -rw-r--r-- 1 root 3003 88 Oct 20 08:45 README.md\r\n", "1.0K -rw-r--r-- 1 root 3003 550 Oct 20 08:46 adapter_config.json\r\n", "1.2G -rw-r--r-- 1 root 3003 1.2G Oct 20 08:45 adapter_model.bin\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: 
The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 16, "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:46:11.071508Z", "iopub.status.busy": "2023-10-20T08:46:11.070823Z", "iopub.status.idle": "2023-10-20T08:47:03.388936Z", "shell.execute_reply": "2023-10-20T08:47:03.388264Z" }, "papermill": { "duration": 54.544709, "end_time": "2023-10-20T08:47:04.578229", "exception": false, "start_time": "2023-10-20T08:46:10.033520", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32001, 4096)\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=32001, bias=False)\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = model.merge_and_unload().half()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:47:06.891191Z", "iopub.status.busy": "2023-10-20T08:47:06.890837Z", "iopub.status.idle": "2023-10-20T08:47:07.137287Z", "shell.execute_reply": "2023-10-20T08:47:07.136391Z" }, "papermill": { "duration": 1.395578, "end_time": "2023-10-20T08:47:07.138853", "exception": false, "start_time": "2023-10-20T08:47:05.743275", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\r\n", "drwxr-xr-x 1 root 3003 0 Oct 20 04:34 checkpoints\r\n", "drwxr-xr-x 1 root 3003 0 Oct 20 04:34 lora\r\n", "drwxr-xr-x 1 root 3003 0 Oct 20 04:29 src\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
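# the merged model is saved in the next cell; so far this directory only holds checkpoints/, lora/ and src/\n", "! 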
ls -l {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 18, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:47:09.318793Z", "iopub.status.busy": "2023-10-20T08:47:09.317959Z", "iopub.status.idle": "2023-10-20T08:50:32.295853Z", "shell.execute_reply": "2023-10-20T08:50:32.295187Z" }, "papermill": { "duration": 205.2148, "end_time": "2023-10-20T08:50:33.467533", "exception": false, "start_time": "2023-10-20T08:47:08.252733", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "('/content/artifacts/tokenizer_config.json',\n", " '/content/artifacts/special_tokens_map.json',\n", " '/content/artifacts/tokenizer.model',\n", " '/content/artifacts/added_tokens.json',\n", " '/content/artifacts/tokenizer.json')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" ] }, { "cell_type": "code", "execution_count": 19, "id": "a575ff52-a6ff-4322-af8e-2629c0e110a0", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:50:35.318692Z", "iopub.status.busy": "2023-10-20T08:50:35.317875Z", "iopub.status.idle": "2023-10-20T08:50:35.798065Z", "shell.execute_reply": "2023-10-20T08:50:35.797213Z" }, "papermill": { "duration": 1.406588, "end_time": "2023-10-20T08:50:35.799961", "exception": false, "start_time": "2023-10-20T08:50:34.393373", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 20 08:50 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:34 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 20 08:47 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 20 08:47 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:34 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 20 08:47 pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 20 08:49 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 20 08:50 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 20 08:50 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:29 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 20 08:50 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 20 08:50 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 20 08:50 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", "2.0K -rw-r--r-- 1 root 3003 1.6K Oct 5 05:21 .gitattributes\r\n", "7.0K -rw-r--r-- 1 root 3003 6.9K Oct 5 05:21 LICENSE.txt\r\n", " 11K -rw-r--r-- 1 root 3003 11K Oct 5 05:21 README.md\r\n", "1.2M -rw-r--r-- 1 root 3003 1.2M Oct 5 05:21 Responsible-Use-Guide.pdf\r\n", "5.0K -rw-r--r-- 1 root 3003 4.7K Oct 5 05:21 USE_POLICY.md\r\n", "1.0K -rw-r--r-- 1 root 3003 609 Oct 5 05:21 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 188 Oct 5 05:21 generation_config.json\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 5 05:22 model-00001-of-00002.safetensors\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 5 05:22 model-00002-of-00002.safetensors\r\n", " 27K -rw-r--r-- 1 root 3003 27K Oct 5 05:21 model.safetensors.index.json\r\n", " 512 -rw-r--r-- 1 root 3003 414 Oct 5 05:21 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 08:50 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 5 05:21 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 5 05:21 tokenizer.model\r\n", "1.0K -rw-r--r-- 1 root 3003 776 Oct 5 05:21 tokenizer_config.json\r\n" ] } ], "source": [ "! ls -lash {trained_model_path}\n", "! ls -lash {model_path}" ] }, { "cell_type": "code", "execution_count": 20, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:50:37.854282Z", "iopub.status.busy": "2023-10-20T08:50:37.853439Z", "iopub.status.idle": "2023-10-20T08:50:38.103769Z", "shell.execute_reply": "2023-10-20T08:50:38.102831Z" }, "papermill": { "duration": 1.245211, "end_time": "2023-10-20T08:50:38.105374", "exception": false, "start_time": "2023-10-20T08:50:36.860163", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 20 08:50 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:34 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 20 08:47 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 20 08:47 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:34 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 20 08:47 pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 20 08:49 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 20 08:50 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 20 08:50 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 20 04:29 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 20 08:50 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 20 08:50 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 20 08:50 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
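# last look at the merged weights and tokenizer files before the upload cell below\n", "! 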
ls -lash {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 21, "id": "202a694a", "metadata": { "execution": { "iopub.execute_input": "2023-10-20T08:50:39.994565Z", "iopub.status.busy": "2023-10-20T08:50:39.993719Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-10-20T08:50:39.040596", "status": "running" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "79038243991f481395b9433f4dcf59b5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model-00001-of-00002.bin: 0%| | 0.00/9.98G [00:00