{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "12315053-0630-4d3d-8028-02035c2dbf14",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Slide Speech Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8eed0bf-d822-4091-8762-df6582095ab4",
   "metadata": {},
   "source": [
    "Dir structure:\n",
    "\n",
    "- data\n",
    "- info\n",
    "- test\n",
    "- val"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e999f437-d756-492d-b873-6ee656279b53",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Phi 3 Tinkering"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f5f1033-9118-4106-a7fb-6c3b527fe075",
   "metadata": {},
   "source": [
    "Prompt template for Phi-3:\n",
    "\n",
    "```\n",
    "<|system|>\n",
    "You are a python developer.<|end|>\n",
    "<|user|>\n",
    "Help me generate a bubble sort algorithm<|end|>\n",
    "<|assistant|>\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e9e81b0-ae3d-46f1-97ff-138984d07a28",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
    "\n",
    "cache_dir = \"./../cache\"\n",
    "model_id = \"microsoft/Phi-3-mini-4k-instruct\"\n",
    "device = \"cuda:0\"\n",
    "dtype = torch.float16\n",
    "\n",
    "# Load the model in half precision straight onto the GPU.\n",
    "# NOTE: trust_remote_code=True runs Python code shipped with the model repo.\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_id,\n",
    "    device_map=device,\n",
    "    torch_dtype=dtype,\n",
    "    trust_remote_code=True,\n",
    "    cache_dir=cache_dir,\n",
    "    attn_implementation=\"flash_attention_2\",\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)\n",
    "\n",
    "pipe = pipeline(\n",
    "    \"text-generation\",\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93b9fb26-4661-4a35-ad62-b87834f577bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chat-style input; the pipeline is called with the message list directly.\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a python developer\"},\n",
    "    {\"role\": \"user\", \"content\": \"Help me generate a bubble sort algorithm\"},\n",
    "]\n",
    "\n",
    "generation_args = {\n",
    "    \"max_new_tokens\": 600,\n",
    "    \"return_full_text\": False,\n",
    "    \"temperature\": 1.0,\n",
    "    \"do_sample\": True,\n",
    "}\n",
    "\n",
    "output = pipe(messages, **generation_args)\n",
    "print(output[0][\"generated_text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62db724a-9e20-422d-a5b3-dd55cae55cc7",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Training Phi 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2036e6b5-c794-4668-9a79-8a53a2736cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "\n",
    "import torch\n",
    "import yaml\n",
    "from datasets import load_dataset\n",
    "from huggingface_hub import HfApi, ModelCard, ModelCardData\n",
    "from jinja2 import Template\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments\n",
    "from trl import SFTTrainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dcff92e-ab3e-407a-8595-31ffae5f7acd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model configs\n",
    "MODEL_ID = \"microsoft/Phi-3-mini-4k-instruct\"\n",
    "NEW_MODEL_NAME = \"opus-phi-3-mini-4k-instruct\"\n",
    "CACHE_DIR = \"./../cache\"\n",
    "\n",
    "# Dataset configs\n",
    "DATASET_NAME = \"\"  # TODO: set the dataset to fine-tune on; loading fails while this is empty\n",
    "SPLIT = \"train\"\n",
    "\n",
    "# The maximum length of the sequences that the model will handle\n",
    "MAX_SEQ_LENGTH = 4096\n",
    "num_train_epochs = 1\n",
    "license = \"apache-2.0\"\n",
    "username = \"darshanmakwana412\"\n",
    "learning_rate = 1.41e-5\n",
    "per_device_train_batch_size = 4\n",
    "gradient_accumulation_steps = 1\n",
    "\n",
    "# If bf16 is supported use bf16, otherwise use fp16\n",
    "compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fbcf3c6-dd94-4133-a805-910d57c9f974",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail early with a clear message (before the expensive model load) instead of\n",
    "# hitting a NameError on `dataset` further down.\n",
    "if not DATASET_NAME:\n",
    "    raise ValueError(\"DATASET_NAME is empty; set it in the config cell before loading the dataset.\")\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
    "dataset = load_dataset(DATASET_NAME, split=SPLIT)\n",
    "\n",
    "# EOS token *string* (not its integer id): it is concatenated onto the training text.\n",
    "EOS_TOKEN = tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31954ea-1838-4992-9132-32e59c42a128",
   "metadata": {},
   "outputs": [],
   "source": [
    "def formatting_prompts_func(examples):\n",
    "    \"\"\"Render batched conversations as Phi-3 chat text for supervised fine-tuning.\n",
    "\n",
    "    Args:\n",
    "        examples: A batch as passed by `Dataset.map(..., batched=True)`.\n",
    "            `examples[\"conversations\"]` is a list of conversations; each\n",
    "            conversation is a list of turns with a \"from\" key (\"system\",\n",
    "            \"human\" or \"gpt\") and a \"value\" key holding the message text.\n",
    "\n",
    "    Returns:\n",
    "        A dict with a single \"text\" key: one formatted string per conversation,\n",
    "        each terminated by EOS_TOKEN.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If a turn's \"from\" value is not one of the known roles.\n",
    "    \"\"\"\n",
    "    # Phi-3 role tags keyed by the dataset's \"from\" values; every turn is\n",
    "    # closed with <|end|>, matching the prompt template documented above.\n",
    "    role_tags = {\"system\": \"<|system|>\", \"human\": \"<|user|>\", \"gpt\": \"<|assistant|>\"}\n",
    "    end_tag = \"<|end|>\"\n",
    "\n",
    "    texts = []\n",
    "    for convo in examples[\"conversations\"]:\n",
    "        turns = \"\".join(\n",
    "            f\"{role_tags[turn['from']]}\\n{turn['value']}{end_tag}\\n\" for turn in convo\n",
    "        )\n",
    "        # Mark the end of the whole conversation with the EOS token.\n",
    "        texts.append(f\"{turns}{EOS_TOKEN}\")\n",
    "    return {\"text\": texts}\n",
    "\n",
    "\n",
    "# Adds (or recomputes) the \"text\" column consumed by the trainer.\n",
    "dataset = dataset.map(formatting_prompts_func, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3086c2a4-7cca-461e-894c-376046089fab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the literals below differ from the config-cell values\n",
    "# (learning_rate, per_device_train_batch_size, gradient_accumulation_steps,\n",
    "# num_train_epochs, MAX_SEQ_LENGTH) -- confirm which set is intended.\n",
    "args = TrainingArguments(\n",
    "    # Evaluation stays at its default (\"no\"): no eval_dataset is given to the trainer.\n",
    "    per_device_train_batch_size=7,\n",
    "    gradient_accumulation_steps=4,\n",
    "    gradient_checkpointing=True,\n",
    "    learning_rate=1e-4,\n",
    "    fp16=not torch.cuda.is_bf16_supported(),\n",
    "    bf16=torch.cuda.is_bf16_supported(),\n",
    "    max_steps=-1,\n",
    "    num_train_epochs=3,\n",
    "    save_strategy=\"epoch\",\n",
    "    logging_steps=10,\n",
    "    output_dir=NEW_MODEL_NAME,\n",
    "    optim=\"paged_adamw_32bit\",\n",
    "    lr_scheduler_type=\"linear\",\n",
    ")\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    args=args,\n",
    "    train_dataset=dataset,\n",
    "    # `dataset` already carries the formatted \"text\" column, so no\n",
    "    # formatting_func is passed (it would have to return a list of strings).\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=128,\n",
    ")\n",
    "\n",
    "# Collect unreferenced Python objects and release cached CUDA memory before training.\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}