File size: 8,003 Bytes

2cddd11

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "12315053-0630-4d3d-8028-02035c2dbf14",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Slide Speech Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8eed0bf-d822-4091-8762-df6582095ab4",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Dir Structure:\n",
    " - data\n",
    "   - info\n",
    "   - test\n",
    "   - val\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e999f437-d756-492d-b873-6ee656279b53",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Phi 3 Tinkering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f5f1033-9118-4106-a7fb-6c3b527fe075",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Prompt template for Phi-3\n",
    "<|system|>\n",
    "You are a python developer.<|end|>\n",
    "<|user|>\n",
    "Help me generate a bubble sort algorithm<|end|>\n",
    "<|assistant|>\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e9e81b0-ae3d-46f1-97ff-138984d07a28",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
    "\n",
    "# Checkpoint to load, the local download cache, and the target device / precision.\n",
    "model_id = \"microsoft/Phi-3-mini-4k-instruct\"\n",
    "cache_dir = \"./../cache\"\n",
    "device = \"cuda:0\"\n",
    "dtype = torch.float16\n",
    "\n",
    "# Tokenizer and model are loaded independently from the same checkpoint.\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_id,\n",
    "    cache_dir=cache_dir,\n",
    "    device_map=device,\n",
    "    torch_dtype=dtype,\n",
    "    # NOTE(review): presumably needs the flash-attn package installed -- confirm for this environment\n",
    "    attn_implementation=\"flash_attention_2\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "\n",
    "# Text-generation pipeline used by the next cell.\n",
    "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93b9fb26-4661-4a35-ad62-b87834f577bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chat history: a system persona followed by a single user request.\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a python developer\"},\n",
    "    {\"role\": \"user\", \"content\": \"Help me generate a bubble sort algorithm\"},\n",
    "]\n",
    "\n",
    "# Keyword arguments forwarded to the pipeline call: sampling at temperature 1.0,\n",
    "# at most 600 new tokens.\n",
    "generation_args = dict(\n",
    "    max_new_tokens=600,\n",
    "    return_full_text=False,\n",
    "    temperature=1.0,\n",
    "    do_sample=True,\n",
    ")\n",
    "\n",
    "output = pipe(messages, **generation_args)\n",
    "# The pipeline returns a list; show the generated text of its first entry.\n",
    "print(output[0][\"generated_text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62db724a-9e20-422d-a5b3-dd55cae55cc7",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Training Phi 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2036e6b5-c794-4668-9a79-8a53a2736cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
    "from huggingface_hub import ModelCard, ModelCardData, HfApi\n",
    "from datasets import load_dataset\n",
    "from jinja2 import Template\n",
    "from trl import SFTTrainer\n",
    "import yaml\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dcff92e-ab3e-407a-8595-31ffae5f7acd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Model ---\n",
    "MODEL_ID = \"microsoft/Phi-3-mini-4k-instruct\"\n",
    "NEW_MODEL_NAME = \"opus-phi-3-mini-4k-instruct\"\n",
    "CACHE_DIR = \"./../cache\"\n",
    "\n",
    "# --- Dataset ---\n",
    "DATASET_NAME = \"\"\n",
    "SPLIT = \"train\"\n",
    "\n",
    "# --- Training ---\n",
    "# Maximum length of the sequences that the model will handle.\n",
    "MAX_SEQ_LENGTH = 4096\n",
    "num_train_epochs = 1\n",
    "license = \"apache-2.0\"\n",
    "username = \"darshanmakwana412\"\n",
    "learning_rate = 1.41e-5\n",
    "per_device_train_batch_size = 4\n",
    "gradient_accumulation_steps = 1\n",
    "\n",
    "# Use bf16 when the GPU supports it, otherwise fall back to fp16.\n",
    "compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fbcf3c6-dd94-4133-a805-910d57c9f974",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the base checkpoint and its tokenizer from the shared cache directory.\n",
    "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
    "\n",
    "# TODO: DATASET_NAME is still empty, so the load stays disabled; the cells below need `dataset`.\n",
    "# dataset = load_dataset(DATASET_NAME, split=SPLIT)\n",
    "\n",
    "# End-of-sequence marker appended to every formatted training example.\n",
    "# This must be the token *string*: it is interpolated into text, where the integer\n",
    "# `eos_token_id` would be rendered as plain digits instead of the EOS token.\n",
    "EOS_TOKEN = tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31954ea-1838-4992-9132-32e59c42a128",
   "metadata": {},
   "outputs": [],
   "source": [
    "def formatting_prompts_func(examples):\n",
    "    \"\"\"Render a batch of conversations into Phi-3 chat-formatted training strings.\n",
    "\n",
    "    Args:\n",
    "        examples: Batch whose \"conversations\" entry is a list of conversations;\n",
    "            each conversation is a sequence of turns with \"from\" and \"value\" keys.\n",
    "\n",
    "    Returns:\n",
    "        dict with a single \"text\" key holding one formatted string per conversation.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a turn's \"from\" is not \"system\", \"human\" or \"gpt\".\n",
    "    \"\"\"\n",
    "    # Role tag that opens each turn, keyed by the dataset's \"from\" field.\n",
    "    role_tags = {\"system\": \"<|system|>\", \"human\": \"<|user|>\", \"gpt\": \"<|assistant|>\"}\n",
    "    texts = []\n",
    "    for convo in examples[\"conversations\"]:\n",
    "        # Every turn becomes \"<|role|>\\n{content}<|end|>\\n\", following the Phi-3 prompt\n",
    "        # template (role tag, newline, content, <|end|>).\n",
    "        turns = [f\"{role_tags[turn['from']]}\\n{turn['value']}<|end|>\\n\" for turn in convo]\n",
    "        # Close the whole conversation with the EOS marker.\n",
    "        texts.append(\"\".join(turns) + f\"{EOS_TOKEN}\")\n",
    "    return {\"text\": texts}\n",
    "\n",
    "# Adds the rendered \"text\" column that the trainer reads.\n",
    "dataset = dataset.map(formatting_prompts_func, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3086c2a4-7cca-461e-894c-376046089fab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "\n",
    "# NOTE(review): batch size, gradient accumulation, learning rate, epochs and max_seq_length\n",
    "# below differ from the values defined in the config cell -- confirm which set is intended.\n",
    "# No evaluation strategy is set: the trainer receives no eval_dataset, so evaluation\n",
    "# stays at its default (disabled) instead of being requested every N steps.\n",
    "args = TrainingArguments(\n",
    "    per_device_train_batch_size=7,\n",
    "    gradient_accumulation_steps=4,\n",
    "    gradient_checkpointing=True,\n",
    "    learning_rate=1e-4,\n",
    "    # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.\n",
    "    fp16=not torch.cuda.is_bf16_supported(),\n",
    "    bf16=torch.cuda.is_bf16_supported(),\n",
    "    max_steps=-1,\n",
    "    num_train_epochs=3,\n",
    "    save_strategy=\"epoch\",\n",
    "    logging_steps=10,\n",
    "    output_dir=NEW_MODEL_NAME,\n",
    "    optim=\"paged_adamw_32bit\",\n",
    "    lr_scheduler_type=\"linear\",\n",
    ")\n",
    "\n",
    "# The trainer reads the \"text\" column of `dataset` directly. formatting_prompts_func is\n",
    "# not passed as formatting_func: it returns a dict, whereas SFTTrainer expects a\n",
    "# formatting function to return a list of strings.\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=dataset,\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=128,\n",
    ")\n",
    "\n",
    "device = \"cuda:0\"\n",
    "\n",
    "# Release unreferenced Python objects and cached CUDA memory before training starts.\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}