{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d69bd30-a4a5-47da-a1ce-b6f9f228b42c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic so packages are installed into the environment of the running kernel;\n",
    "# `!pip` runs whichever pip the shell finds first, which may belong to a different Python.\n",
    "# TODO: pin exact versions (and a transformers commit) once a known-good combination is recorded,\n",
    "# otherwise a re-run months later may pull incompatible releases.\n",
    "%pip install -q git+https://github.com/huggingface/transformers.git\n",
    "%pip install -q accelerate datasets peft bitsandbytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "33d7d8f7-a2bd-4548-ac7f-45eba6ca1651",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from datasets import load_dataset, Dataset\n",
    "from transformers import AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, Trainer\n",
    "\n",
    "from 
peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PromptTuningConfig" ] }, { "cell_type": "code", "execution_count": 2, "id": "511a7b95-1089-4312-bc4a-40c843ea60f7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "86bfa1c49f8b4fb5900506cdc7968886", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/4 [00:00 '30000' AND fuel_economy IS NOT NULL ORDER BY mileage DESC LIMIT 10;iвassistant\n", "\n", "I apologize for any confusion earlier.\n", "\n", "To clarify your question:\n", "\n", "You're asking me about what I can do if someone else's code or data causes an error in my own program?\n", "\n", "If that happens,\n", "\n", "* **Error Handling**: You should handle these errors properly using try-except blocks.\n", " * For example:\n", " ```\n", " import requests\n", " def get_data(url):\n", " response=requests.get('https://api.example.com/data')\n", " returnresponse.json()\n", " \n" ] } ], "source": [ "import torch\n", "\n", "question = \"Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?\"\n", "expected_sql_query = \"\"\"\n", "SELECT make, model, mpg, totalMiles \n", "FROM cars \n", "WHERE modelYear = 2015 \n", "AND sellPrice > 30000 \n", "ORDER BY mpg DESC \n", "LIMIT 1;\n", "\"\"\"\n", "\n", "inputs = tokenizer(question, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=512).to(\"cuda\")\n", "\n", "model.eval()\n", "\n", "with torch.no_grad():\n", " generated_ids = model.generate(\n", " input_ids=inputs[\"input_ids\"],\n", " attention_mask=inputs[\"attention_mask\"],\n", " max_new_tokens=200, # need to adjust so model does not get off track; or could pull sql from it later\n", " repetition_penalty=2.0,\n", " early_stopping=True,\n", " eos_token_id=tokenizer.eos_token_id, # Use greedy decoding for deterministic output\n", " )\n", "\n", "\n", 
"generated_sql_query = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", "print(f\"Generated SQL: {generated_sql_query}\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "f76849ea-fac9-4ef3-a02b-b56414e25e61", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n" ] } ],
   "source": [
    "# Trainer and TrainingArguments are imported once in the notebook's import cell,\n",
    "# so they are not re-imported here.\n",
    "\n",
    "# Each optimizer step consumes per_device_train_batch_size * gradient_accumulation_steps\n",
    "# = 2 * 4 = 8 examples per device.\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results\",\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=4,\n",
    "    num_train_epochs=50,  # More epochs for a small dataset\n",
    "    learning_rate=5e-5,\n",
    "    eval_strategy=\"steps\",\n",
    "    eval_steps=20,\n",
    "    save_steps=20,  # checkpoint on the same cadence as evaluation\n",
    "    logging_dir=\"./logs\",\n",
    "    logging_steps=10,\n",
    "    save_total_limit=1,  # cap the number of checkpoints kept in output_dir\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "# NOTE(review): in the run recorded in this notebook both losses only fall from about 18.8 to\n",
    "# about 14.2. If these are the usual mean per-token cross-entropies, a value near 14 is worse\n",
    "# than uniform guessing for any vocabulary below roughly 1.2M tokens (ln(1.2e6) is about 14),\n",
    "# so verify the label / padding setup and the learning rate before reusing these settings.\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=ds,\n",
    "    # The training set doubles as the eval set because the dataset is too small to hold out a\n",
    "    # split; the reported validation loss therefore measures fit, not generalisation.\n",
    "    eval_dataset=ds,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "20e1c0c7-4c92-46a6-8023-2bb2e9f70107",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "
\n", " \n", " \n", " [750/750 36:17, Epoch 49/50]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining LossValidation Loss
2018.86060018.779743
4018.63140018.560749
6018.45880018.344973
8018.13620018.131050
10017.97290017.917627
12017.72690017.709686
14017.60520017.505020
16017.33700017.299978
18017.14440017.099331
20016.93010016.904736
22016.74400016.711248
24016.58200016.522562
26016.44380016.339695
28016.22040016.161507
30016.02640015.991174
32015.86900015.825206
34015.74650015.668069
36015.57440015.521387
38015.42090015.380891
40015.28820015.247506
42015.14300015.120378
44015.01940015.004883
46014.91950014.896546
48014.79130014.795321
50014.68780014.703000
52014.66630014.616350
54014.55040014.541070
56014.50500014.471634
58014.47940014.409344
60014.34160014.354433
62014.33970014.307119
64014.29260014.265167
66014.25260014.229964
68014.24040014.202421
70014.18360014.182171
72014.18220014.169066
74014.15360014.162232

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=750, training_loss=15.830242533365885, metrics={'train_runtime': 2180.7907, 'train_samples_per_second': 2.774, 'train_steps_per_second': 0.344, 'total_flos': 1.3720107025327718e+17, 'train_loss': 15.830242533365885, 'epoch': 49.18032786885246})" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": 11, "id": "79786af2-4a19-464f-9f23-5bcfca6f3d16", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generated SQL: Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?sonyoursite is there are you want to date:1.. Acura of which one! 
The answer will be a single line with three values separated by commas (e.g., \"Toyota Prius Hybrid\", \"$35k - \\$40K per year\").\" } { SELECT m.make AS Car_Model FROM cars c JOIN models ON CAST(c.model_id as integer) = id WHERE price > '30000' AND fuel_economy IS NOT NULL ORDER BY mileage DESC LIMIT 10;iвassistant\n", "\n", "I apologize for any confusion earlier.\n", "\n", "To clarify your question:\n", "\n", "You're asking me about what I can do if someone else's code or data causes an error in my own program?\n", "\n", "If that happens,\n", "\n", "* **Error Handling**: You should handle these errors properly using try-except blocks.\n", " * For example:\n", " ```\n", " import requests\n", " def get_data(url):\n", " response=requests.get('https://api.example.com/data')\n", " returnresponse.json()\n", " \n" ] } ],
   "source": [
    "import torch\n",
    "\n",
    "question = \"Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?\"\n",
    "expected_sql_query = \"\"\"\n",
    "SELECT make, model, mpg, totalMiles \n",
    "FROM cars \n",
    "WHERE modelYear = 2015 \n",
    "AND sellPrice > 30000 \n",
    "ORDER BY mpg DESC \n",
    "LIMIT 1;\n",
    "\"\"\"\n",
    "\n",
    "# A single prompt needs no padding. Padding it to max_length=512 only adds pad tokens, and with\n",
    "# right-side padding those tokens sit between the question and the text the model generates.\n",
    "# Inputs are moved to the model's own device instead of a hard-coded \"cuda\".\n",
    "inputs = tokenizer(question, return_tensors=\"pt\", truncation=True, max_length=512).to(model.device)\n",
    "prompt_length = inputs[\"input_ids\"].shape[1]\n",
    "\n",
    "# Pass an explicit pad token to generate(); fall back to EOS if the tokenizer defines none.\n",
    "pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id\n",
    "\n",
    "model.eval()\n",
    "\n",
    "with torch.no_grad():\n",
    "    generated_ids = model.generate(\n",
    "        input_ids=inputs[\"input_ids\"],\n",
    "        attention_mask=inputs[\"attention_mask\"],\n",
    "        max_new_tokens=200,  # upper bound only; generation stops earlier when EOS is produced\n",
    "        do_sample=False,  # greedy decoding for deterministic output\n",
    "        # A mild penalty: SQL legitimately repeats column names, keywords and commas, and the\n",
    "        # penalty also applies to tokens of the prompt, so a large value garbles the query.\n",
    "        repetition_penalty=1.1,\n",
    "        eos_token_id=tokenizer.eos_token_id,\n",
    "        pad_token_id=pad_token_id,\n",
    "        # early_stopping is a beam-search option and is omitted: this call does not use beams.\n",
    "    )\n",
    "\n",
    "# generate() returns prompt + continuation; decode only the newly generated tokens so the\n",
    "# printed result is the model's answer rather than an echo of the question.\n",
    "new_token_ids = generated_ids[0][prompt_length:]\n",
    "generated_sql_query = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()\n",
    "print(f\"Generated SQL: {generated_sql_query}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f6ac37df-0d98-42db-82e4-31aeb1d57baa",
"metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "abaf926b5cb74411bcbce6570542dc13", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='