{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 27963, "status": "ok", "timestamp": 1731936147140, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "kH18jD5cR_Ks", "outputId": "2909fc12-c908-4670-d2c4-81a69281f9c6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.4/122.4 MB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.9/310.9 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0m" ] } ], "source": [ "# !pip install -q accelerate peft bitsandbytes transformers trl faiss-gpu langchain_community wandb flash-attn\n", "!pip install -q accelerate peft bitsandbytes transformers trl datasets\n", "\n", "# flash-attn" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "executionInfo": { "elapsed": 46336, "status": "ok", "timestamp": 1731936193472, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "cgVNTbBa-D3j" }, "outputs": [], "source": [ "# load the required packages.\n", "import torch\n", "from datasets import load_dataset, Dataset\n", "from peft import LoraConfig, AutoPeftModelForCausalLM, PeftModel, get_peft_model\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoConfig, set_seed\n", "from trl import SFTTrainer\n", "import bitsandbytes as bnb\n", "import transformers\n", "\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import sqlparse\n", "import re\n", "import json\n", "\n", "from huggingface_hub import hf_hub_download\n", "from huggingface_hub import HfFileSystem" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "basaX_55Yf_D" }, "outputs": [], "source": [ "#transformers.logging.set_verbosity_info()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "executionInfo": { "elapsed": 546, "status": "ok", "timestamp": 1731936211421, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "bkkjgGdlrNcq" }, "outputs": [], "source": [ "# `userdata` must be imported before first use; previously it was only imported\n", "# in the fine-tuning section below, so this cell failed on Restart & Run All.\n", "from google.colab import userdata\n", "\n", "WRITE_TOKEN = userdata.get('hf_write')\n", "READ_TOKEN = userdata.get('hf_read')\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "executionInfo": {
"elapsed": 392, "status": "ok", "timestamp": 1731936214421, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "7CKnwlRfZj4V" }, "outputs": [], "source": [ "model_name = \"meta-llama/Llama-3.2-1B-Instruct\"\n", "out_name = \"lleticiasilvaa/Llama-3.2-schemaLinking-v0\"\n", "prev_checkpoint = None #\"checkpoint-1000\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9zs7nCmt-pMC" }, "outputs": [], "source": [ "#!huggingface-cli login" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "executionInfo": { "elapsed": 411, "status": "ok", "timestamp": 1731936218771, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "PZdnxs8k-Cgl" }, "outputs": [], "source": [ "spider_id= \"NESPED-GEN/spider_selector_schemaReduzido\" #\"NESPED-GEN/spider_variacoes_formato_schema\"" ] }, { "cell_type": "markdown", "metadata": { "id": "xT2iRdCN_MFH" }, "source": [ "### Load Data\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 405, "referenced_widgets": [ "8545121d22d34d1ba75911f2cc39b3bb", "eafc6618ade14d68957144605968bb23", "456d26b40c5f4032873077e9be498a17", "cc1733274d52432780780e74dd8f56d5", "4ca9cd81c1474606a83d49b5414afc98", "7551e02a42a54f2b9e85cadd83c543d8", "248758dd3c9445c393c32bd2e98f374c", "08561481cfb74ffa9cd612326049c4e9", "9bf9f998edef4a92946aa0352d75affd", "fd384e8cb7b84f4f8f2d1b22c4752efc", "393db0d1eabd4d04b86cbe62e5b164b3", "dab7ee68aaf043de845cb14cfa2da05f", "fee67e3fa09144f6a50d14ce435794cb", "a12e11985b354406a9908fdf9d517b8d", "1c149d1fb61b4597817ae85537d2d852", "0469926dd4824e7da4aea1eaac9df5e8", "948e8d9afd47441f9256950647489ee0", "83f6b1b3edda48b7a03fe427cf40972f", "b8a8fd2411ce4460a9e30dd7bff6883a", "33e65fd6ac3447ad80cebf7f95acb682", "e81acf953a3d490783862fb9b71dcf09", "9992354938114d9a8efe32ba67935f0e", 
"fe1bdb4f03f649518f09ee2b588a4d02", "b1c24d6465e0473c8ae0eae6dc218b8c", "056bc2fe6d1c4f4283d76d8c04bccb5a", "c612714916684c3a9b720547a2a17a30", "bf761ff2a975403e8631d0a8b2a1e1fb", "f8f6f6e95769484d8feae33a28761769", "adf6de0cb9dc47ab8e0caef99bdc1cb5", "77a3c5aac5674e7abac452cb1164a508", "7a816d9c420348f4ab2ad6eb7834266f", "0b9d8dbd5edb43d9a7678e1939b02c1a", "4b7b2688aa82489a9459fb692a99bb24", "dae66e24e77f4746985acf61cd3635ac", "f2f82ce4e0854a2796ac6ff608f87264", "0e733769c0cc4e1485efd37f15668ead", "9a3cdac86c8f4526b367404c54de31b1", "1cc39eeb7e9047baa1fce35929b4575f", "9a83de57cca14879a0d686e31291ef77", "27cfba42522d4181bc00d9b30f106ff1", "3b23958f163e4a368d89676c871a9eab", "a14405280a054c58af43b9af4962079c", "d92bf726a7ef41159bba60f1b3477947", "f35373c71b2b4b969fb21e823f1031c6", "817b76c0ac454dbe8725bce7579d832f", "4813b8f96ddb4f5981cf20f9f72d5e29", "79fa1519e4a048c680aae88eda778a40", "351136e2314b448bb063522e5fff4f0c", "ca54087e57254ef1936af0b107782891", "460da755dad542abbd2323a12ae3154a", "74230c6457e5415ea5c21f672cfcd715", "e75caca50a7549eea9bb7c2d626e4131", "4d0062b671a24eddbc06c83748eea66c", "75100a7aebab4e218603368f3235580f", "8423377a97a9420986a3c3bef6f7351d" ] }, "executionInfo": { "elapsed": 5777, "status": "ok", "timestamp": 1731936228847, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "lLKgY40efdJo", "outputId": "1ac23817-d38d-4e12-ccbd-080e23848bca" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still 
optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8545121d22d34d1ba75911f2cc39b3bb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/885 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dab7ee68aaf043de845cb14cfa2da05f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "dev-00000-of-00001.parquet: 0%| | 0.00/369k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fe1bdb4f03f649518f09ee2b588a4d02", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train-00000-of-00001.parquet: 0%| | 0.00/2.70M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dae66e24e77f4746985acf61cd3635ac", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating dev split: 0%| | 0/1034 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "817b76c0ac454dbe8725bce7579d832f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/8656 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['db_id', 'question_en', 'query', 'answer', 'hardness', 'query_llm', 'schema_SQLDatabase', 'schema_our', 'schema_dict', 'selector', 'selector_correct', 'schema_SQLDatabase_reduzido', 'schema_SQLDatabase_reduzido_tabelas'],\n", " num_rows: 8656\n", "})" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# bird = load_dataset(bird_id, split=\"train\").to_pandas()\n", "spider = load_dataset(spider_id, split=\"train\")\n", "spider" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "executionInfo": { "elapsed": 537, "status": "ok", "timestamp": 1731936253440, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "po1gNdLjFKdZ" }, "outputs": [], "source": [ "df = spider.to_pandas()" ] }, { "cell_type": "markdown", "metadata": { "id": "ON-hPsEYM1Bu" }, "source": [ "# Load Base Model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "executionInfo": { "elapsed": 537, "status": "ok", "timestamp": 1731936272658, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "yEAZpfzlNOHW" }, "outputs": [], "source": [ "def download_checkpoint(adapter_model_id, checkpoint):\n", " fs = HfFileSystem()\n", " for file in fs.ls(f'{adapter_model_id}/{checkpoint}', detail=False):\n", " file_name = file.split(checkpoint)[-1]\n", "\n", " hf_hub_download(repo_id=adapter_model_id, filename=(f'{checkpoint}{file_name}'), local_dir='out')\n", "\n", " for file in fs.ls(f'{adapter_model_id}/logs', detail=False):\n", " file_name = file.split(checkpoint)[-1]\n", "\n", " hf_hub_download(repo_id=adapter_model_id, filename=(f'logs/{file_name.split(\"/\")[-1]}'), local_dir='out')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "enUxjGXEqHxg" }, "outputs": [], "source": [ "# 
download_checkpoint(out_name, prev_checkpoint)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 209, "referenced_widgets": [ "5bc55abd849b4a11852d14476acc068a", "042c2b3a658545758c66b37faaa28f1a", "8e03b6e83a4741ab993e988c1f7f5756", "1abd604b1efd464eb7b25efb6af1ccf7", "f35fb497a3ec4d528da124083c9e6311", "37e07f486a174112a2697b7cbfc457e8", "083e76dc7c714978b0d7c2b8f662c54f", "96d0a4057c4f46d68ac6da5534f0f236", "5d4057d1ee0842ecbe46bd7311347977", "89df1bee2fb44e20b1ce2c1ddf65e999", "b6630c925bbe4ca4a2222e51a89476ee", "d30c123ea8494e51a39faa87cfd71a48", "ced2e61d7f1e421b8fbc1a4a3f8dbf71", "aefa3b589fb849a6b064e408a9d5232f", "9ec9e78665634ad7b8a635f5d67c2792", "deda638b08f846cead83605dd229bf97", "39c8a3e2a2574b16b20975f09277feeb", "5b3e48a392b141dd90ca5cb3a79262a4", "89fb0f03d7ff4fd985bab417f0e8c09a", "d3407ea946834589808ae9271f0c281d", "d2ab6fad873e45619d5022a9ebf34af6", "f2baf82f0f3e4cff9648d9a82347de68", "b05ce72db5074d44a91b7305ed846e61", "ea6a12a145a8474eb3ae4f5104a3ac6c", "4f67463e26d54d9fb3eedcfb4f11ac20", "fffb33ddf3a24f79ae3d06442677419b", "1da3cfa6c2c44c5f880ba4f107b1d34b", "de454663a3d9448dba272ea5d8d828b7", "d76943322dfc4c8f8c9ca114ac06e5dc", "89a6b8614b7f4b6b9249634181ff0a01", "44f358cce58c4fceb7dc7f7f43796468", "ca800f0e7c3a4f24832e158885ba11df", "7273633153a9400c9a377c94b473934e", "6eacc7e8c8794caf91b850b5bbcef4e9", "2f5e050ee4534935bc89af6578212c36", "2e6d1305e1e64e36970c73101531328d", "96057c92e2bd4c59a96c62914d5a0a99", "e96ec7c34fdf4603b0709ede0542ea31", "8f246f9d38864a5fa6a0a7f0875ead4c", "0333a30a66cf4b29ba3a9da9f749d60f", "1355308c69ea4d2aab8de9748c391a6f", "1f25b80e179a4d1aaed2ecbb991334ff", "4e98655673b94e29ad2b393e1843a07e", "c90d45314d414e44938bde81fad0fac1", "2a5957ccf3144a3ebe50358a2ebbdbda", "77dfed6c4d2f40c8bcbfe6260f6fac05", "2a033ecdc65a4a09afb8d7881c6fc4ec", "6dd88d9842db4b0088c87db89329bafc", "40f3a592fe74446385531e1f501a839a", 
"9a03bc79b1ed465bba30aeb76229e6e7", "a7ded74614d84ec68d332209a6495ce9", "c3dbd4d217f547fc9e268af4e6fbb533", "36edef43f5794092b828b18eb4785b50", "617c9a26b1294edf99465c117929f8ff", "9e794848b2434f53a3f7adb8cae42c16", "3ca77aa81f9b4607b6cbcf793db13916", "c1b22f75ed494d95a2916883eb69e5f0", "223540f2b4e14e70816b44460df1e02a", "114cb69f840b4935914326e3f5df38c0", "52b88832ecbe4f6fb9db752bf1eaa292", "b970201d85e64c098541ab8a0426c099", "84eabbe3fc5449e1bb7f117f0519ffed", "196779bdf7af44878699a8a828cef3a3", "f441af495eee4f1a989f8443585122c0", "89a8b694d1d643259fb63bbccbc8dc78", "603944cee179476b9dc8b15d38a4dcb6" ] }, "executionInfo": { "elapsed": 64277, "status": "ok", "timestamp": 1731936340118, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "M7DoqQMlM_nW", "outputId": "4ae87bb1-9a84-4ea5-f7e6-6538c6690494" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5bc55abd849b4a11852d14476acc068a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/54.5k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d30c123ea8494e51a39faa87cfd71a48", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/9.09M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b05ce72db5074d44a91b7305ed846e61", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/296 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6eacc7e8c8794caf91b850b5bbcef4e9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/877 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { 
"application/vnd.jupyter.widget-view+json": { "model_id": "2a5957ccf3144a3ebe50358a2ebbdbda", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/2.47G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3ca77aa81f9b4607b6cbcf793db13916", "version_major": 2, "version_minor": 0 }, "text/plain": [ "generation_config.json: 0%| | 0.00/189 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "seed=14\n", "\n", "\n", "if (prev_checkpoint != None):\n", " try:\n", " download_checkpoint(out_name, prev_checkpoint)\n", " except:\n", " pass\n", "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=READ_TOKEN, map_device=\"auto\", add_eos_token=True, use_fast=True)\n", "\n", "new_tokens = {'additional_special_tokens': ['[SQL]','[/SQL]', '[QUESTION]','[/QUESTION]']}\n", "#adicionar tokens especiais:\n", "# if (prev_checkpoint == None):\n", "# tokenizer.add_special_tokens(new_tokens)\n", "\n", "\n", "if torch.cuda.is_bf16_supported():\n", " compute_dtype = torch.bfloat16\n", " attn_implementation = 'flash_attention_2'\n", "else:\n", " compute_dtype = torch.float16\n", " attn_implementation = 'sdpa'\n", "\n", "tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = \"right\"\n", "\n", "\n", "bnb_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_quant_type=\"nf4\",\n", " bnb_4bit_compute_dtype=compute_dtype,\n", " bnb_4bit_use_double_quant=False,\n", ")\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", " torch_dtype=compute_dtype,\n", " device_map=\"auto\",\n", " quantization_config=bnb_config,\n", "\n", " trust_remote_code=True,\n", " token=READ_TOKEN,\n", " # attn_implementation=attn_implementation\n", ")\n", "\n", "# se adicionar special_tokens tem que fazer resize do tokenizer:\n", "# 
model.resize_token_embeddings(len(tokenizer))\n", "\n", "## model.resize_token_embeddings(max(len(tokenizer), model.config.vocab_size))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 8479, "status": "ok", "timestamp": 1730570376902, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "fLuqzhSJBvi8", "outputId": "65293049-fcc8-491b-aaca-579bca0686c7" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`\n", "The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. 
To disable this, use `mean_resizing=False`\n" ] }, { "data": { "text/plain": [ "Embedding(32004, 2048)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# se adicionar special_tokens tem que fazer resize do tokenizer:\n", "#model.resize_token_embeddings(len(tokenizer))\n", "\n", "#model.resize_token_embeddings(max(len(tokenizer), model.config.vocab_size))" ] }, { "cell_type": "markdown", "metadata": { "id": "_I7-bFfm5gqS" }, "source": [ "#### Chat Template - Gerar SQL" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cYVA3Q7ZCzHi" }, "outputs": [], "source": [ "# tokenizer.chat_template = \"\"\"\n", "# {% if messages[0]['role'] == 'system' %}\n", "# {% set loop_messages = messages[1:] %}\n", "# {% set system_message = messages[0]['content'] %}\n", "# {% else %}\n", "# {% set loop_messages = messages %}\n", "# {% set system_message = 'Given a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.' 
%}\n", "# {% endif %}\n", "# {{ '# <|system|>/n/' + system_message + '/n//n/' }}\n", "# {% if messages|selectattr(\"role\", \"equalto\", \"example\")|list %}\n", "# Below are some examples of question and their corresponding SQL queries:/n//n/\n", "# {% else %}\n", "# /n/\n", "# {% endif %}\n", "# {% for message in loop_messages %}\n", "# {% if message['role'] == 'example' %}\n", "# {{ message['content'] }}/n//n/\n", "# {% elif message['role'] == 'schema' %}\n", "# # <|schema|>/n/The query will run on a database with the following schema:/n/{{ message['content'] }}/n//n/\n", "# {% elif message['role'] == 'user' %}\n", "# # <|user|>/n/[QUESTION]{{ message['content'] }}[/QUESTION]/n//n/\n", "# {% elif message['role'] == 'assistant' %}\n", "# # <|assistant|>/n/[SQL]{{ message['content'] }}[/SQL]\n", "# {% endif %}\n", "# {% endfor %}\n", "# {% if add_generation_prompt %}\n", "# # <|assistant|>/n/[SQL]\n", "# {% endif %}\n", "# \"\"\".replace(\"\\n\",\"\").replace(\" \", \"\").replace(\"/n/\", \"\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NDD-URc1cbFx" }, "outputs": [], "source": [ "import re\n", "\n", "def replace_alias_with_table(query):\n", " # Expressão regular para encontrar tabelas com alias, capturando o nome da tabela e o alias\n", " alias_pattern = re.compile(r'(\\bFROM\\b|\\bJOIN\\b)\\s+(\\w+)\\s+AS\\s+(\\w+)', re.IGNORECASE)\n", "\n", " # Substituições de aliases encontrados no padrão\n", " aliases = {match.group(3): match.group(2) for match in alias_pattern.finditer(query)}\n", "\n", " # Substituir cada alias pelo nome da tabela correspondente\n", " for alias, table in aliases.items():\n", " query = re.sub(r'\\b' + alias + r'\\b', table, query)\n", "\n", " # Remover 'AS' e alias das cláusulas 'FROM' e 'JOIN'\n", " query = re.sub(r'\\bAS\\s+\\w+', '', query, flags=re.IGNORECASE)\n", " return query" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "n8lBUu3acgfg" }, "outputs": [], "source": [ "count 
= 0\n", "for i in spider:\n", " if 'AS ' in i['query_llm']:\n", " count += 1\n", " print(count)\n", " print('--------------------------')\n", " print('query = ', i['query'])\n", " print('--------------------------')\n", " print('query_llm = ', i['query_llm'])\n", " print('--------------------------')\n", " print(replace_alias_with_table(i['query']))\n", " print('--------------------------')\n", "print(count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6RDDdVgP5gqT" }, "outputs": [], "source": [ "def to_sql(query):\n", " return sqlparse.format(replace_alias_with_table(query), reindent=True, keyword_case='upper')\n", "\n", "def apply_template(row, tokenizer=tokenizer, n_examplea=0):\n", " question = row['question_en']\n", " schema = row['schema_SQLDatabase_reduzido_tabelas']\n", " sql = to_sql(row['query'])\n", "\n", " system = \"Given a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.\"\n", "\n", " chat = [\n", " {'role': 'system', 'content': system},\n", " {'role': 'user', 'content': f\"# Schema:\\n```sql\\n{schema}\\n```\\n\\n# Question: {question}\"},\n", " {'role': 'assistant', 'content': f\"```sql\\n{sql}\\n```\\n\"}\n", " ]\n", "\n", " row['text'] = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)\n", "\n", " return row" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2trHEegL5gqU" }, "outputs": [], "source": [ "# spider_chain = json.load(open(\"/content/drive/Shareddrives/LLMs/Datasets/spider/spider_chain_of_thought.json\", \"r\"))\n", "# bird_chain = json.load(open(\"/content/drive/Shareddrives/LLMs/Datasets/bird/bird_chain_of_thought.json\", \"r\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N4jWrC7s5gqU" }, "outputs": [], "source": [ "# df['CoT'] = spider_chain + bird_chain" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{ "id": "bTF0pBsw5gqU" }, "outputs": [], "source": [ "df = df.apply(apply_template, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "H5azRSqHPoXp" }, "outputs": [], "source": [ "pd.set_option('display.max_colwidth', None) #definir a opção para mostrar todo o conteúdo das células\n", "pd.set_option('display.max_rows', None) #definir a opção para mostrar todas as linhas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "L4tjUv7o5gqV" }, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DfJvLaGR5gqV" }, "outputs": [], "source": [ "# df['n_tokens'] = df['text'].apply(lambda x: len(tokenizer.encode(x)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vJseOHIu5gqW" }, "outputs": [], "source": [ "# import seaborn as sns\n", "# sns.histplot(df['n_tokens'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 12, "status": "ok", "timestamp": 1731587180051, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "PIvSnr6Y5gqW", "outputId": "dc8e1eb8-d666-4828-b348-2e20986f50dd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", "\n", "Cutting Knowledge Date: December 2023\n", "Today Date: 14 Nov 2024\n", "\n", "Given a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "# Schema:\n", "```sql\n", "CREATE TABLE Courses (\n", " course_id VARCHAR(100),\n", " course_name VARCHAR(120),\n", " course_description VARCHAR(255),\n", " other_details VARCHAR(255),\n", " PRIMARY KEY (course_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Attendance (\n", " 
student_id INTEGER,\n", " course_id INTEGER,\n", " date_of_attendance DATETIME,\n", " PRIMARY KEY (student_id)\n", ");\n", "```\n", "\n", "# Question: How many students are attending English courses?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "```sql\n", "SELECT count(*)\n", "FROM courses\n", "JOIN student_course_attendance ON courses.course_id = student_course_attendance.course_id\n", "WHERE courses.course_name = \"English\"\n", "```<|eot_id|>\n" ] } ], "source": [ "print(df['text'][df.index[70]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "roZzKNOj5gqW" }, "outputs": [], "source": [ "_df = pd.DataFrame(columns=['text'])\n", "_df['text'] = df.sample(frac=1, random_state=14).reset_index(drop=True)['text']\n", "_df = Dataset.from_pandas(_df)\n", "_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)\n", "train_dataset, valid_dataset = _df[\"train\"], _df[\"test\"]" ] }, { "cell_type": "markdown", "metadata": { "id": "leGEqxURAL4r" }, "source": [ "#### Chat Template - Schema Linking" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "executionInfo": { "elapsed": 379, "status": "ok", "timestamp": 1731936398766, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "aHGTJNq2AmUs" }, "outputs": [], "source": [ "def apply_template(row, tokenizer=tokenizer, n_examplea=0):\n", " question = row['question_en']\n", " schema = row['schema_SQLDatabase']\n", " schema_linking = row['selector_correct']\n", "\n", " system = \"Given a user question and the schema of a database, your task is to generate an JSON with the the names of tables and columns of the schema that the question is referring to.\"\n", "\n", " chat = [{'role': 'user', 'content': f\"# System:\\n{system}\\n\\n# Schema:\\n```sql\\n{schema}\\n```\\n\\n# Question: {question}\"},\n", " {'role': 'assistant', 'content': f\"```json\\n{schema_linking}\\n```\"}\n", " ]\n", "\n", " 
row['text'] = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)\n", "\n", " return row" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "executionInfo": { "elapsed": 14436, "status": "ok", "timestamp": 1731936414427, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "yvcMZAL3E3TB" }, "outputs": [], "source": [ "df = df.apply(apply_template, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-8OByl30hBp7" }, "outputs": [], "source": [ "# df['n_tokens'] = df['text'].apply(lambda x: len(tokenizer.encode(x)))\n", "# import seaborn as sns\n", "# sns.histplot(df['n_tokens'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 536, "status": "ok", "timestamp": 1731936466271, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "TxrNQ8OGvToc", "outputId": "a65d3db9-f7be-4ed1-c5bf-93ab506f8a97" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", "\n", "Cutting Knowledge Date: December 2023\n", "Today Date: 18 Nov 2024\n", "\n", "<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "# System:\n", "Given a user question and the schema of a database, your task is to generate an JSON with the the names of tables and columns of the schema that the question is referring to.\n", "\n", "# Schema:\n", "```sql\n", "CREATE TABLE Addresses (\n", " address_id INTEGER,\n", " line_1 VARCHAR(80),\n", " line_2 VARCHAR(80),\n", " city VARCHAR(50),\n", " zip_postcode CHAR(20),\n", " state_province_county VARCHAR(50),\n", " country VARCHAR(50),\n", " PRIMARY KEY (address_id)\n", ");\n", "\n", "CREATE TABLE People (\n", " person_id INTEGER,\n", " first_name VARCHAR(255),\n", " middle_name VARCHAR(255),\n", " 
last_name VARCHAR(255),\n", " cell_mobile_number VARCHAR(40),\n", " email_address VARCHAR(40),\n", " login_name VARCHAR(40),\n", " password VARCHAR(40),\n", " PRIMARY KEY (person_id)\n", ");\n", "\n", "CREATE TABLE Students (\n", " student_id INTEGER,\n", " student_details VARCHAR(255),\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (student_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Courses (\n", " course_id VARCHAR(100),\n", " course_name VARCHAR(120),\n", " course_description VARCHAR(255),\n", " other_details VARCHAR(255),\n", " PRIMARY KEY (course_id)\n", ");\n", "\n", "CREATE TABLE People_Addresses (\n", " person_address_id INTEGER,\n", " person_id INTEGER,\n", " address_id INTEGER,\n", " date_from DATETIME,\n", " date_to DATETIME,\n", " PRIMARY KEY (person_address_id),\n", " FOREIGN KEY (address_id) REFERENCES Addresses(address_id),\n", " FOREIGN KEY (person_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Registrations (\n", " student_id INTEGER,\n", " course_id INTEGER,\n", " registration_date DATETIME,\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (course_id) REFERENCES Courses(course_id),\n", " FOREIGN KEY (student_id) REFERENCES Students(student_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Attendance (\n", " student_id INTEGER,\n", " course_id INTEGER,\n", " date_of_attendance DATETIME,\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (student_id) REFERENCES Student_Course_Registrations(student_id),\n", " FOREIGN KEY (course_id) REFERENCES Student_Course_Registrations(course_id)\n", ");\n", "\n", "CREATE TABLE Candidates (\n", " candidate_id INTEGER,\n", " candidate_details VARCHAR(255),\n", " PRIMARY KEY (candidate_id),\n", " FOREIGN KEY (candidate_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Candidate_Assessments (\n", " candidate_id INTEGER,\n", " qualification CHAR(15),\n", " assessment_date DATETIME,\n", " asessment_outcome_code CHAR(15),\n", " PRIMARY KEY 
(candidate_id),\n", " FOREIGN KEY (candidate_id) REFERENCES Candidates(candidate_id)\n", ");\n", "```\n", "\n", "# Question: How many students are attending English courses?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "```json\n", "{\n", " 'Courses': ['course_id', 'course_name'],\n", " 'Student_Course_Attendance': ['student_id', 'course_id']\n", "}\n", "```<|eot_id|>\n" ] } ], "source": [ "print(df['text'][df.index[70]])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "executionInfo": { "elapsed": 386, "status": "ok", "timestamp": 1731936460602, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "jgNv1q3IA4J2" }, "outputs": [], "source": [ "_df = pd.DataFrame(columns=['text'])\n", "_df['text'] = df.sample(frac=1, random_state=14).reset_index(drop=True)['text']\n", "_df = Dataset.from_pandas(_df)\n", "_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)\n", "train_dataset, valid_dataset = _df[\"train\"], _df[\"test\"]" ] }, { "cell_type": "markdown", "metadata": { "id": "DWpXeuO_KlLS" }, "source": [ "### Finetuning" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "executionInfo": { "elapsed": 558, "status": "ok", "timestamp": 1731936487795, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "0oVpZDj1AXY9" }, "outputs": [], "source": [ "from huggingface_hub import login, create_repo\n", "from google.colab import userdata\n", "import wandb\n", "import os\n", "\n", "#token = userdata.get('hf_write')\n", "token = WRITE_TOKEN\n", "login(token=token)\n", "set_seed(1234)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "executionInfo": { "elapsed": 539, "status": "ok", "timestamp": 1731936491409, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "KRhO7UJ-Q4Y8" }, "outputs": [], "source": [ "def 
find_all_linear_names(model, new_tokens=False):\n", " lora_module_names = set()\n", " for name, module in model.named_modules():\n", " if isinstance(module, bnb.nn.Linear4bit) or isinstance(module, bnb.nn.Linear8bitLt):\n", " names = name.split(\".\")\n", " lora_module_names.add(names[0] if len(names) == 1 else names[-1])\n", " if(new_tokens):\n", " lora_module_names.add(\"lm_head\")\n", " return list(lora_module_names)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 541, "status": "ok", "timestamp": 1731936494446, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "L0qqP5Y9PtRh", "outputId": "4034bd48-a65e-42e2-8cb7-4467aa03edb0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 7 modules to quantize: ['up_proj', 'o_proj', 'q_proj', 'v_proj', 'down_proj', 'k_proj', 'gate_proj']\n" ] } ], "source": [ "modules = find_all_linear_names(model)\n", "print(f\"Found {len(modules)} modules to quantize: {modules}\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "executionInfo": { "elapsed": 367, "status": "ok", "timestamp": 1731936496858, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "uFUnJrbjPwAT" }, "outputs": [], "source": [ "peft_config = LoraConfig(\n", " lora_alpha=128, # first version used 16\n", " lora_dropout=0.1,\n", " r=64,\n", " # bias=\"none\",\n", " # task_type=\"CAUSAL_LM\",\n", " target_modules=modules,\n", " # modules_to_save=[\"embed_tokens\"], # when adding special tokens\n", ")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "executionInfo": { "elapsed": 376, "status": "ok", "timestamp": 1731936499232, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz":
180 }, "id": "yA_4Bz3jpCCn", "outputId": "0c15897b-31af-44dd-d4a5-59ec04b96299" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "'lleticiasilvaa/Llama-3.2-schemaLinking-v0'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out_name" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "executionInfo": { "elapsed": 363, "status": "ok", "timestamp": 1731936502759, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "buh0o2P2jwbx" }, "outputs": [], "source": [ "torch.cuda.empty_cache()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 754, "referenced_widgets": [ "caccac2a2d5e4445ad1c42f548ac76ff", "433c3a339e004da5b9ffe0d6ad425633", "c51a291ede9f4473b7f3e7dc9e904fa1", "0670f1f9662b48ab9f10fcc6e3a64078", "c5e60fbf6ba840a8a5abef9e3c1e610f", "c16988e98aa64dc0837b8eddb6fb5af0", "275f7188aa2942a18109c41992d9e8ab", "7fcc38066a594c4c92163037724baf19", "576cecd2fbd3410cb8923a513a09f94b", "764123d56cc24dc8b47f78213837a793", "b169080680c74fe694980345d7b39173", "fb91c87599e84f4a84889fc4e4c7dd8c", "f4d1bb5aaaaf4b17b785b6ed324d7e15", "840ec873419d45b6b558bea842cf0e4d", "f79f690a99384752a3e5438ad578f99e", "3260ea2d0a974213942307ab8bd09829", "cf71ccaf09dd443186bda3748a07d1c4", "0432ec190416496aa7eaa6762c56902a", "27585098c5824a7ba24b542674c71521", "5c6c17d7026241519c4f59657d96ea40", "c29b77dce92147dca76136bd5fb8419e", "70f3d5e80b6543b892f87a6497b182b6" ] }, "executionInfo": { "elapsed": 5278419, "status": "ok", "timestamp": 1731941791695, "user": { "displayName": "Leticia Oliveira Silva", "userId": "01512049874517593223" }, "user_tz": 180 }, "id": "9bD7ea0F-GQn", "outputId": "5b2b2bd8-8d3c-4674-c7a2-daa29e455021" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length. Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "caccac2a2d5e4445ad1c42f548ac76ff", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/8569 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fb91c87599e84f4a84889fc4e4c7dd8c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/87 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.\n", "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. 
Refer to docs for more details on the differences between the two variants.\n", " return fn(*args, **kwargs)\n" ] }, { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
250 | \n", "0.433300 | \n", "0.237069 | \n", "
500 | \n", "0.145700 | \n", "0.127559 | \n", "
750 | \n", "0.100500 | \n", "0.103754 | \n", "
1000 | \n", "0.086400 | \n", "0.098527 | \n", "
"
],
"text/plain": [
"