{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5d69bd30-a4a5-47da-a1ce-b6f9f228b42c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install -q git+https://github.com/huggingface/transformers.git\n", "!pip install -q accelerate datasets peft bitsandbytes" ] }, { "cell_type": "code", "execution_count": 1, "id": "33d7d8f7-a2bd-4548-ac7f-45eba6ca1651", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from datasets import load_dataset, Dataset\n", "from transformers import AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, Trainer\n", "\n", "from 
peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PromptTuningConfig" ] }, { "cell_type": "code", "execution_count": 2, "id": "511a7b95-1089-4312-bc4a-40c843ea60f7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "86bfa1c49f8b4fb5900506cdc7968886", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:601: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:601: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0` -- this flag is only used in sample-based generation modes. 
You should set `do_sample=True` or unset `temperature`.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 81,920 || all params: 8,030,343,168 || trainable%: 0.0010\n" ] } ], "source": [
 "# Base checkpoint; the same name is handed to the prompt-tuning config below.\n",
 "model_name = \"defog/llama-3-sqlcoder-8b\"\n",
 "\n",
 "# Fast tokenizer, with the EOS token doubling as the padding token.\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)\n",
 "tokenizer.pad_token = tokenizer.eos_token\n",
 "\n",
 "# Prompt tuning for a causal-LM task (SQL generation) with 20 learnable virtual tokens.\n",
 "prompt_config = PromptTuningConfig(\n",
 "    task_type=\"CAUSAL_LM\",\n",
 "    num_virtual_tokens=20,\n",
 "    tokenizer_name_or_path=model_name,\n",
 ")\n",
 "\n",
 "# Load the weights in float16, move them to the GPU, then wrap them with the PEFT adapter.\n",
 "model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)\n",
 "model = model.to(\"cuda\")\n",
 "model = get_peft_model(model, prompt_config)\n",
 "model.print_trainable_parameters()"
] }, { "cell_type": "code", "execution_count": 3, "id": "7bfb864d-6ad5-49fb-9e18-6d6e6d90373a", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "26656ca795e24d8483092fdc3e3d8954", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/121 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['question', 'query', 'input_ids', 'attention_mask', 'labels'],\n", " num_rows: 121\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import json\n",
 "\n",
 "# \"syntheticTableData (1).json\" is the same data as the kristiannordby/text2sql121rows dataset on Hugging Face.\n",
 "with open(\"syntheticTableData (1).json\", \"r\", encoding=\"utf-8\") as f:\n",
 "    data = json.load(f)\n",
 "untokenized_dataset = Dataset.from_list(data)\n",
 "\n",
 "\n",
 "def preprocess_function(examples, max_length=512):\n",
 "    \"\"\"Turn a batch of question/query pairs into causal-LM training rows.\n",
 "\n",
 "    Each row is ``question tokens + query tokens + EOS``, truncated on the right\n",
 "    and then right-padded to ``max_length``. ``labels`` repeats ``input_ids`` on\n",
 "    the query/EOS positions and is -100 on the question and on the padding, so\n",
 "    only the SQL continuation contributes to the loss.\n",
 "\n",
 "    Args:\n",
 "        examples: batch dict with parallel \"question\" and \"query\" lists of\n",
 "            strings, as passed by ``Dataset.map(..., batched=True)``.\n",
 "        max_length: fixed length of every returned row.\n",
 "\n",
 "    Returns:\n",
 "        dict with \"input_ids\", \"attention_mask\" and \"labels\"; each value holds\n",
 "        one list of ``max_length`` ints per example.\n",
 "    \"\"\"\n",
 "    rows = {\"input_ids\": [], \"attention_mask\": [], \"labels\": []}\n",
 "    for question, query in zip(examples[\"question\"], examples[\"query\"]):\n",
 "        prompt_ids = tokenizer(question)[\"input_ids\"]\n",
 "        # No special tokens here: the query continues the prompt, it is not a new sequence.\n",
 "        answer_ids = tokenizer(query, add_special_tokens=False)[\"input_ids\"]\n",
 "        answer_ids.append(tokenizer.eos_token_id)\n",
 "\n",
 "        input_ids = (prompt_ids + answer_ids)[:max_length]\n",
 "        labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]\n",
 "        n_pad = max_length - len(input_ids)\n",
 "\n",
 "        # Mask padding by position rather than by token id: this notebook sets the pad\n",
 "        # token to EOS, and the closing EOS has to remain a training target.\n",
 "        rows[\"input_ids\"].append(input_ids + [tokenizer.pad_token_id] * n_pad)\n",
 "        rows[\"attention_mask\"].append([1] * len(input_ids) + [0] * n_pad)\n",
 "        rows[\"labels\"].append(labels + [-100] * n_pad)\n",
 "    return rows\n",
 "\n",
 "\n",
 "ds = untokenized_dataset.map(preprocess_function, batched=True)\n",
 "ds"
] }, { "cell_type": "code", "execution_count": 10, "id": "a0197d96", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generated SQL: Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?sonyoursite is there are you want to date:1.. Acura of which one! 
The answer will be a single line with three values separated by commas (e.g., \"Toyota Prius Hybrid\", \"$35k - \\$40K per year\").\" } { SELECT m.make AS Car_Model FROM cars c JOIN models ON CAST(c.model_id as integer) = id WHERE price > '30000' AND fuel_economy IS NOT NULL ORDER BY mileage DESC LIMIT 10;iвassistant\n", "\n", "I apologize for any confusion earlier.\n", "\n", "To clarify your question:\n", "\n", "You're asking me about what I can do if someone else's code or data causes an error in my own program?\n", "\n", "If that happens,\n", "\n", "* **Error Handling**: You should handle these errors properly using try-except blocks.\n", " * For example:\n", " ```\n", " import requests\n", " def get_data(url):\n", " response=requests.get('https://api.example.com/data')\n", " returnresponse.json()\n", " \n" ] } ], "source": [
 "import torch\n",
 "\n",
 "question = \"Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?\"\n",
 "# Reference answer, kept for comparing against the generated query by eye.\n",
 "expected_sql_query = \"\"\"\n",
 "SELECT make, model, mpg, totalMiles \n",
 "FROM cars \n",
 "WHERE modelYear = 2015 \n",
 "AND sellPrice > 30000 \n",
 "ORDER BY mpg DESC \n",
 "LIMIT 1;\n",
 "\"\"\"\n",
 "\n",
 "# A single prompt needs no padding; padding it to 512 only surrounds the question with pad tokens.\n",
 "# Tensors follow the model's device instead of a hard-coded \"cuda\".\n",
 "inputs = tokenizer(question, return_tensors=\"pt\", truncation=True, max_length=512).to(model.device)\n",
 "prompt_length = inputs[\"input_ids\"].shape[1]\n",
 "\n",
 "model.eval()\n",
 "\n",
 "with torch.no_grad():\n",
 "    generated_ids = model.generate(\n",
 "        input_ids=inputs[\"input_ids\"],\n",
 "        attention_mask=inputs[\"attention_mask\"],\n",
 "        max_new_tokens=200,  # tune so the model does not drift off track; the SQL could also be extracted afterwards\n",
 "        do_sample=False,  # greedy decoding for deterministic output\n",
 "        # NOTE(review): 2.0 is a strong penalty and SQL legitimately repeats tokens - consider lowering.\n",
 "        repetition_penalty=2.0,\n",
 "        eos_token_id=tokenizer.eos_token_id,\n",
 "        pad_token_id=tokenizer.pad_token_id,  # explicit, so generate() does not have to guess one\n",
 "    )\n",
 "\n",
 "# The returned sequence starts with the prompt tokens; decode only what was generated after them.\n",
 "generated_sql_query = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)\n",
 "print(f\"Generated SQL: {generated_sql_query}\")"
] }, { "cell_type": "code", "execution_count": 6, "id": 
"f76849ea-fac9-4ef3-a02b-b56414e25e61", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n" ] } ], "source": [
 "from transformers import Trainer, TrainingArguments\n",
 "\n",
 "# Hyperparameters for the prompt-tuning run, grouped by concern.\n",
 "training_args = TrainingArguments(\n",
 "    # Where checkpoints and logs are written.\n",
 "    output_dir=\"./results\",\n",
 "    logging_dir=\"./logs\",\n",
 "    # Optimisation: batches of 2 per device, gradients accumulated over 4 steps.\n",
 "    per_device_train_batch_size=2,\n",
 "    gradient_accumulation_steps=4,\n",
 "    learning_rate=5e-5,\n",
 "    weight_decay=0.01,\n",
 "    num_train_epochs=50,  # many epochs because the dataset is small\n",
 "    # Cadence: log every 10 steps, evaluate and checkpoint every 20, cap stored checkpoints at 1.\n",
 "    logging_steps=10,\n",
 "    eval_strategy=\"steps\",\n",
 "    eval_steps=20,\n",
 "    save_steps=20,\n",
 "    save_total_limit=1,\n",
 ")\n",
 "\n",
 "trainer = Trainer(\n",
 "    args=training_args,\n",
 "    model=model,\n",
 "    tokenizer=tokenizer,\n",
 "    train_dataset=ds,\n",
 "    # The dataset is too small to split, so evaluation reuses the training rows.\n",
 "    eval_dataset=ds,\n",
 ")"
] }, { "cell_type": "code", "execution_count": 5, "id": "20e1c0c7-4c92-46a6-8023-2bb2e9f70107", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
20 | \n", "18.860600 | \n", "18.779743 | \n", "
40 | \n", "18.631400 | \n", "18.560749 | \n", "
60 | \n", "18.458800 | \n", "18.344973 | \n", "
80 | \n", "18.136200 | \n", "18.131050 | \n", "
100 | \n", "17.972900 | \n", "17.917627 | \n", "
120 | \n", "17.726900 | \n", "17.709686 | \n", "
140 | \n", "17.605200 | \n", "17.505020 | \n", "
160 | \n", "17.337000 | \n", "17.299978 | \n", "
180 | \n", "17.144400 | \n", "17.099331 | \n", "
200 | \n", "16.930100 | \n", "16.904736 | \n", "
220 | \n", "16.744000 | \n", "16.711248 | \n", "
240 | \n", "16.582000 | \n", "16.522562 | \n", "
260 | \n", "16.443800 | \n", "16.339695 | \n", "
280 | \n", "16.220400 | \n", "16.161507 | \n", "
300 | \n", "16.026400 | \n", "15.991174 | \n", "
320 | \n", "15.869000 | \n", "15.825206 | \n", "
340 | \n", "15.746500 | \n", "15.668069 | \n", "
360 | \n", "15.574400 | \n", "15.521387 | \n", "
380 | \n", "15.420900 | \n", "15.380891 | \n", "
400 | \n", "15.288200 | \n", "15.247506 | \n", "
420 | \n", "15.143000 | \n", "15.120378 | \n", "
440 | \n", "15.019400 | \n", "15.004883 | \n", "
460 | \n", "14.919500 | \n", "14.896546 | \n", "
480 | \n", "14.791300 | \n", "14.795321 | \n", "
500 | \n", "14.687800 | \n", "14.703000 | \n", "
520 | \n", "14.666300 | \n", "14.616350 | \n", "
540 | \n", "14.550400 | \n", "14.541070 | \n", "
560 | \n", "14.505000 | \n", "14.471634 | \n", "
580 | \n", "14.479400 | \n", "14.409344 | \n", "
600 | \n", "14.341600 | \n", "14.354433 | \n", "
620 | \n", "14.339700 | \n", "14.307119 | \n", "
640 | \n", "14.292600 | \n", "14.265167 | \n", "
660 | \n", "14.252600 | \n", "14.229964 | \n", "
680 | \n", "14.240400 | \n", "14.202421 | \n", "
700 | \n", "14.183600 | \n", "14.182171 | \n", "
720 | \n", "14.182200 | \n", "14.169066 | \n", "
740 | \n", "14.153600 | \n", "14.162232 | \n", "
"
],
"text/plain": [
"