Spaces:

dsmueller
/

fine-tuning-playground

Runtime error

File size: 16,168 Bytes

feee6eb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from uuid import uuid4\n",
    "\n",
    "import evaluate\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import transformers\n",
    "from datasets import load_dataset\n",
    "from peft import LoraConfig, get_peft_model\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from trl import SFTTrainer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def max_token_len(dataset):\n",
    "    \"\"\"Return the largest tokenized length of the 'text' field over all rows.\n",
    "\n",
    "    Relies on the notebook-level `tokenizer`; an empty dataset yields 0.\n",
    "    \"\"\"\n",
    "    token_counts = (len(tokenizer(row['text'])['input_ids']) for row in dataset)\n",
    "    return max(token_counts, default=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Max Length: 1000000000000000019884624838656\n"
     ]
    }
   ],
   "source": [
    "# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'\n",
    "model_name = 'mistralai/Mistral-7B-v0.1'\n",
    "# model_name = 'distilbert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# A tokenizer whose config declares no limit reports a huge sentinel value\n",
    "# (int(1e30)) as model_max_length, which makes any later length check\n",
    "# meaningless. In that case fall back to the position-embedding limit from\n",
    "# the model config, when the config exposes one.\n",
    "model_max_length = tokenizer.model_max_length\n",
    "if model_max_length > 1_000_000:\n",
    "    model_config = transformers.AutoConfig.from_pretrained(model_name)\n",
    "    model_max_length = getattr(model_config, 'max_position_embeddings', model_max_length)\n",
    "print(\"Model Max Length:\", model_max_length)\n",
    "\n",
    "# dataset = load_dataset(\"imdb\", split=\"train\")\n",
    "dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'\n",
    "dataset = load_dataset(dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max token length train: 1121\n",
      "Max token length validation: 38\n",
      "Block size: 2242\n",
      "{'project_name': './llms/ams_data_train-100_4ba55532-e0b2-478b-9f5b-beb082e1b557', 'model_name': 'mistralai/Mistral-7B-v0.1', 'repo_id': 'ai-aerospace/ams-data-train-100-11b94ea4-2b2b-4db3-9e69-acb5a5d9f3e8', 'train_data': 'train_data', 'data_directory': './fine_tune_data/', 'block_size': 2242, 'model_max_length': 1121, 'logging_steps': -1, 'evaluation_strategy': 'epoch', 'save_total_limit': 1, 'save_strategy': 'epoch', 'mixed_precision': 'fp16', 'lr': 3e-05, 'epochs': 3, 'batch_size': 2, 'warmup_ratio': 0.1, 'gradient_accumulation': 1, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0, 'max_grad_norm': 1, 'seed': 42, 'quantization': 'int4', 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}\n"
     ]
    }
   ],
   "source": [
    "# Export each dataset split as a CSV file inside the data directory\n",
    "data_directory = './fine_tune_data/'\n",
    "\n",
    "# exist_ok keeps this cell re-runnable when the directory is already there\n",
    "os.makedirs(data_directory, exist_ok=True)\n",
    "\n",
    "# Training split: dump the text column, then measure its longest sample\n",
    "train_data = 'train_data'\n",
    "train_filename = os.path.join(data_directory, train_data)\n",
    "train_split = dataset['train']\n",
    "train_split.to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)\n",
    "max_token_length_train = max_token_len(train_split)\n",
    "print('Max token length train: ' + str(max_token_length_train))\n",
    "\n",
    "# Validation split: same treatment\n",
    "validation_data = 'validation_data'\n",
    "validation_filename = os.path.join(data_directory, validation_data)\n",
    "validation_split = dataset['validation']\n",
    "validation_split.to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)\n",
    "max_token_length_validation = max_token_len(validation_split)\n",
    "print('Max token length validation: ' + str(max_token_length_validation))\n",
    "\n",
    "# The longest sample of either split has to fit within the model limit\n",
    "max_token_length = max(max_token_length_train, max_token_length_validation)\n",
    "if model_max_length < max_token_length:\n",
    "    raise ValueError(\"Maximum token length exceeds model limits.\")\n",
    "block_size = max_token_length * 2\n",
    "print('Block size: ' + str(block_size))\n",
    "\n",
    "# Project identifiers; every uuid4() call produces a fresh random suffix\n",
    "username = 'ai-aerospace'\n",
    "project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())\n",
    "repo_name = 'ams-data-train-100-' + str(uuid4())\n",
    "\n",
    "# Settings for the fine-tuning run, collected in one place\n",
    "model_params = dict(\n",
    "    project_name=project_name,\n",
    "    model_name=model_name,\n",
    "    repo_id=username + '/' + repo_name,\n",
    "    train_data=train_data,\n",
    "    validation_data=validation_data,\n",
    "    data_directory=data_directory,\n",
    "    block_size=block_size,\n",
    "    model_max_length=max_token_length,\n",
    "    logging_steps=-1,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_total_limit=1,\n",
    "    save_strategy=\"epoch\",\n",
    "    mixed_precision=\"fp16\",\n",
    "    lr=0.00003,\n",
    "    epochs=3,\n",
    "    batch_size=2,\n",
    "    warmup_ratio=0.1,\n",
    "    gradient_accumulation=1,\n",
    "    optimizer=\"adamw_torch\",\n",
    "    scheduler=\"linear\",\n",
    "    weight_decay=0,\n",
    "    max_grad_norm=1,\n",
    "    seed=42,\n",
    "    quantization=\"int4\",\n",
    "    lora_r=16,\n",
    "    lora_alpha=32,\n",
    "    lora_dropout=0.05,\n",
    ")\n",
    "\n",
    "# Mirror every setting into the process environment (values as strings)\n",
    "os.environ.update({name: str(setting) for name, setting in model_params.items()})\n",
    "\n",
    "print(model_params)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LoRA adapter settings, read from model_params so they stay in sync with\n",
    "# the values exported for the training run.\n",
    "peft_config = LoraConfig(\n",
    "    r=model_params['lora_r'],\n",
    "    lora_alpha=model_params['lora_alpha'],\n",
    "    lora_dropout=model_params['lora_dropout'],\n",
    "    # The base model is loaded with AutoModelForCausalLM; declaring the task\n",
    "    # lets PEFT pick its causal-LM wrapper instead of the generic one.\n",
    "    task_type='CAUSAL_LM',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the base model with 4-bit quantized weights. The quantization\n",
    "# settings go through BitsAndBytesConfig because passing load_in_4bit\n",
    "# directly to from_pretrained is deprecated in transformers.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_name,\n",
    "    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the loaded base model with the LoRA adapters described by peft_config\n",
    "lora_model = get_peft_model(model, peft_config)\n",
    "\n",
    "# Print the trainable-parameter summary of the wrapped model\n",
    "lora_model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load_dataset() was called without a split, so `dataset` is a DatasetDict;\n",
    "# the trainer needs the individual splits, not the whole mapping.\n",
    "trainer = SFTTrainer(\n",
    "    model,\n",
    "    train_dataset=dataset['train'],\n",
    "    eval_dataset=dataset['validation'],\n",
    "    dataset_text_field=\"text\",\n",
    "    peft_config=peft_config,\n",
    "    max_seq_length=model_params['model_max_length']\n",
    ")\n",
    "\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4fbe714ca43d4e53aec27f4ce4fb4706",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "826f51589454434b891a94b0d5ef8a73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "81418551f332492293ee9795f98a62f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "367f897f76f845d782ebc3f9be4eec4d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "NameError",
     "evalue": "name 'lora_model' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[8], line 18\u001b[0m\n\u001b[1;32m     13\u001b[0m     results\u001b[38;5;241m.\u001b[39mupdate(precision_metric\u001b[38;5;241m.\u001b[39mcompute(predictions\u001b[38;5;241m=\u001b[39mpredictions, references \u001b[38;5;241m=\u001b[39m labels, average\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmacro\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m     15\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m     17\u001b[0m trainer \u001b[38;5;241m=\u001b[39m transformers\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[0;32m---> 18\u001b[0m     model\u001b[38;5;241m=\u001b[39m\u001b[43mlora_model\u001b[49m,\n\u001b[1;32m     19\u001b[0m     train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m     20\u001b[0m     eval_dataset\u001b[38;5;241m=\u001b[39mval_dataset,\n\u001b[1;32m     21\u001b[0m     compute_metrics\u001b[38;5;241m=\u001b[39mcompute_metrics,\n\u001b[1;32m     22\u001b[0m     args\u001b[38;5;241m=\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mTrainingArguments(\n\u001b[1;32m     23\u001b[0m         per_device_train_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m8\u001b[39m,\n\u001b[1;32m     24\u001b[0m         per_device_eval_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m32\u001b[39m,\n\u001b[1;32m     25\u001b[0m         gradient_accumulation_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m,\n\u001b[1;32m     26\u001b[0m         warmup_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m,\n\u001b[1;32m     27\u001b[0m         max_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m12276\u001b[39m,\n\u001b[1;32m     28\u001b[0m         learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2e-4\u001b[39m,\n\u001b[1;32m     29\u001b[0m         fp16\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m     30\u001b[0m         eval_steps\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m     
31\u001b[0m         logging_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m     32\u001b[0m         save_steps\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1000\u001b[39m,\n\u001b[1;32m     33\u001b[0m         evaluation_strategy\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msteps\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     34\u001b[0m         do_eval\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m     35\u001b[0m         load_best_model_at_end\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m     36\u001b[0m         metric_for_best_model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mf1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     37\u001b[0m         output_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_outputs\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m     38\u001b[0m         logging_dir\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel_outputs\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m     39\u001b[0m         remove_unused_columns \u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \n\u001b[1;32m     40\u001b[0m         report_to\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwandb\u001b[39m\u001b[38;5;124m'\u001b[39m  \u001b[38;5;66;03m# enable logging to W&B\u001b[39;00m\n\u001b[1;32m     41\u001b[0m     ),\n\u001b[1;32m     42\u001b[0m )\n\u001b[1;32m     43\u001b[0m trainer\u001b[38;5;241m.\u001b[39mtrain()\n",
      "\u001b[0;31mNameError\u001b[0m: name 'lora_model' is not defined"
     ]
    }
   ],
   "source": [
    "# Load the four classification metrics once, in a fixed order\n",
    "f1_metric, recall_metric, accuracy_metric, precision_metric = (\n",
    "    evaluate.load(metric_name)\n",
    "    for metric_name in (\"f1\", \"recall\", \"accuracy\", \"precision\")\n",
    ")\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    \"\"\"Compute macro F1, recall and precision plus accuracy for one evaluation pass.\n",
    "\n",
    "    Args:\n",
    "        eval_pred: ``(logits, labels)`` pair as handed over by the Trainer.\n",
    "            ``logits`` may also be a tuple whose first item holds the logits.\n",
    "\n",
    "    Returns:\n",
    "        dict merging the outputs of the f1, recall, accuracy and precision\n",
    "        metrics.\n",
    "    \"\"\"\n",
    "    logits, labels = eval_pred\n",
    "    if isinstance(logits, tuple):\n",
    "        logits = logits[0]\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    labels = np.asarray(labels)\n",
    "    if labels.ndim > 1:\n",
    "        # Token-level labels (causal LM): the logits at position i predict the\n",
    "        # token at position i + 1, so align both arrays before comparing.\n",
    "        predictions = predictions[..., :-1]\n",
    "        labels = labels[..., 1:]\n",
    "    # -100 marks positions that are ignored by the loss (e.g. padding). Drop\n",
    "    # them; boolean indexing also flattens to the 1-D shape the metrics expect.\n",
    "    keep = labels != -100\n",
    "    predictions = predictions[keep]\n",
    "    labels = labels[keep]\n",
    "\n",
    "    results = {}\n",
    "    results.update(f1_metric.compute(predictions=predictions, references=labels, average=\"macro\"))\n",
    "    results.update(recall_metric.compute(predictions=predictions, references=labels, average=\"macro\"))\n",
    "    results.update(accuracy_metric.compute(predictions=predictions, references=labels))\n",
    "    results.update(precision_metric.compute(predictions=predictions, references=labels, average=\"macro\"))\n",
    "    return results\n",
    "\n",
    "\n",
    "def tokenize_split(split):\n",
    "    \"\"\"Tokenize the 'text' column of a dataset split and drop its original columns.\"\"\"\n",
    "    return split.map(\n",
    "        lambda batch: tokenizer(\n",
    "            batch['text'],\n",
    "            truncation=True,\n",
    "            max_length=model_params['model_max_length'],\n",
    "        ),\n",
    "        batched=True,\n",
    "        remove_columns=split.column_names,\n",
    "    )\n",
    "\n",
    "\n",
    "# model_params['train_data'] / ['validation_data'] are only the CSV base names\n",
    "# (plain strings); the Trainer needs tokenized Dataset objects instead.\n",
    "train_dataset = tokenize_split(dataset['train'])\n",
    "val_dataset = tokenize_split(dataset['validation'])\n",
    "\n",
    "# The collator pads every batch, which requires the tokenizer to have a pad token.\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details\n",
    "trainer = transformers.Trainer(\n",
    "    model=lora_model,\n",
    "    train_dataset=train_dataset,\n",
    "    eval_dataset=val_dataset,\n",
    "    # mlm=False: labels are built from input_ids for causal language modeling\n",
    "    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),\n",
    "    compute_metrics=compute_metrics,\n",
    "    args=transformers.TrainingArguments(\n",
    "        per_device_train_batch_size=model_params['batch_size'],\n",
    "        per_device_eval_batch_size=model_params['batch_size'],\n",
    "        gradient_accumulation_steps=model_params['gradient_accumulation'],\n",
    "        # Schedule length and warmup come from model_params rather than\n",
    "        # fixed step counts, so they scale with the dataset in use.\n",
    "        warmup_ratio=model_params['warmup_ratio'],\n",
    "        num_train_epochs=model_params['epochs'],\n",
    "        learning_rate=model_params['lr'],\n",
    "        fp16=True,\n",
    "        logging_steps=1000,\n",
    "        evaluation_strategy=model_params['evaluation_strategy'],\n",
    "        # load_best_model_at_end requires the save and evaluation strategies to match\n",
    "        save_strategy=model_params['save_strategy'],\n",
    "        save_total_limit=model_params['save_total_limit'],\n",
    "        do_eval=True,\n",
    "        load_best_model_at_end=True,\n",
    "        metric_for_best_model=\"f1\",\n",
    "        output_dir='model_outputs',\n",
    "        logging_dir='model_outputs',\n",
    "        remove_unused_columns=False,\n",
    "        report_to='wandb'  # enable logging to W&B\n",
    "    ),\n",
    ")\n",
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}