Prepare fine-tune for colab
finetune_model.ipynb  +86 -219  CHANGED
@@ -7,6 +7,73 @@
    "# Finetune DeepSeek Coder 1.3B for NBA Kaggle Database SQLite Generation"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "/opt/anaconda3/envs/CSCI544/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+     " from .autonotebook import tqdm as notebook_tqdm\n"
+    ]
+   }
+  ],
+  "source": [
+   "import pandas as pd\n",
+   "import torch\n",
+   "from datasets import Dataset\n",
+   "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback, PreTrainedTokenizer\n",
+   "from torch.utils.data import DataLoader\n",
+   "import sys\n",
+   "from peft import LoraConfig, get_peft_model, TaskType\n",
+   "from huggingface_hub import snapshot_download\n",
+   "import os\n",
+   "import re\n",
+   "import numpy as np"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "is_google_colab = False\n",
+   "use_bnb = True"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "current_read_path = \"./\"\n",
+   "current_write_path = \"./\"\n",
+   "\n",
+   "def read_path(rel_path):\n",
+   "    return os.path.join(current_read_path, rel_path)\n",
+   "\n",
+   "def write_path(rel_path):\n",
+   "    return os.path.join(current_write_path, rel_path)\n",
+   "\n",
+   "if is_google_colab:\n",
+   "    from google.colab import drive\n",
+   "    drive.mount('/content/drive')\n",
+   "    current_write_path = \"/content/drive/MyDrive/sql_gen\"\n",
+   "\n",
+   "    hugging_face_path = snapshot_download(\n",
+   "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
+   "        repo_type=\"model\",\n",
+   "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"],\n",
+   "    )\n",
+   "    sys.path.append(hugging_face_path)\n",
+   "    current_path = hugging_face_path"
+  ]
+ },
  {
   "cell_type": "markdown",
   "metadata": {},
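The new setup cells above let the same notebook run locally or on Colab: two flags (`is_google_colab`, `use_bnb`), a pair of `read_path`/`write_path` helpers, and, on Colab, a Drive mount for outputs plus a `snapshot_download` of the repo's `src/`, `train-data/`, `nba-data/`, and base-model folders. Below is a minimal sketch of that path-resolution flow under those assumptions; note the diff assigns `current_path` at the end, whereas routing reads through `current_read_path` (as in the sketch) is what would make `read_path` actually point at the downloaded snapshot.

```python
import os
import sys

from huggingface_hub import snapshot_download

is_google_colab = False    # flip to True on a Colab runtime
current_read_path = "./"   # where inputs (training data, base model) are read from
current_write_path = "./"  # where fine-tuned artifacts are written

def read_path(rel_path: str) -> str:
    return os.path.join(current_read_path, rel_path)

def write_path(rel_path: str) -> str:
    return os.path.join(current_write_path, rel_path)

if is_google_colab:
    from google.colab import drive
    drive.mount("/content/drive")                          # persist outputs on Drive
    current_write_path = "/content/drive/MyDrive/sql_gen"

    # Pull only the folders the notebook needs from the Hub repo.
    hugging_face_path = snapshot_download(
        repo_id="USC-Applied-NLP-Group/SQL-Generation",
        repo_type="model",
        allow_patterns=["src/*", "train-data/*", "deepseek-coder-1.3b-instruct/*", "nba-data/*"],
    )
    sys.path.append(hugging_face_path)    # makes `from src.prompts.prompt import ...` importable
    # Assumption: reads should come from the snapshot; the diff sets `current_path` instead.
    current_read_path = hugging_face_path
```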
@@ -28,185 +95,7 @@
    }
   ],
   "source": [
-    "
-    "Database Schema and Explanations\n",
-    "\n",
-    "team Table\n",
-    "Stores information about NBA teams.\n",
-    "CREATE TABLE IF NOT EXISTS \"team\" (\n",
-    " \"id\" TEXT PRIMARY KEY, -- Unique identifier for the team\n",
-    " \"full_name\" TEXT, -- Full official name of the team (e.g., \"Los Angeles Lakers\")\n",
-    " \"abbreviation\" TEXT, -- Shortened team name (e.g., \"LAL\")\n",
-    " \"nickname\" TEXT, -- Commonly used nickname for the team (e.g., \"Lakers\")\n",
-    " \"city\" TEXT, -- City where the team is based\n",
-    " \"state\" TEXT, -- State where the team is located\n",
-    " \"year_founded\" REAL -- Year the team was established\n",
-    ");\n",
-    "\n",
-    "game Table\n",
-    "Contains detailed statistics for each NBA game, including home and away team performance.\n",
-    "CREATE TABLE IF NOT EXISTS \"game\" (\n",
-    " \"season_id\" TEXT, -- Season identifier, formatted as \"2YYYY\" (e.g., \"21970\" for the 1970 season)\n",
-    " \"team_id_home\" TEXT, -- ID of the home team (matches \"id\" in team table)\n",
-    " \"team_abbreviation_home\" TEXT, -- Abbreviation of the home team\n",
-    " \"team_name_home\" TEXT, -- Full name of the home team\n",
-    " \"game_id\" TEXT PRIMARY KEY, -- Unique identifier for the game\n",
-    " \"game_date\" TIMESTAMP, -- Date the game was played (YYYY-MM-DD format)\n",
-    " \"matchup_home\" TEXT, -- Matchup details including opponent (e.g., \"LAL vs. BOS\")\n",
-    " \"wl_home\" TEXT, -- \"W\" if the home team won, \"L\" if they lost\n",
-    " \"min\" INTEGER, -- Total minutes played in the game\n",
-    " \"fgm_home\" REAL, -- Field goals made by the home team\n",
-    " \"fga_home\" REAL, -- Field goals attempted by the home team\n",
-    " \"fg_pct_home\" REAL, -- Field goal percentage of the home team\n",
-    " \"fg3m_home\" REAL, -- Three-point field goals made by the home team\n",
-    " \"fg3a_home\" REAL, -- Three-point attempts by the home team\n",
-    " \"fg3_pct_home\" REAL, -- Three-point field goal percentage of the home team\n",
-    " \"ftm_home\" REAL, -- Free throws made by the home team\n",
-    " \"fta_home\" REAL, -- Free throws attempted by the home team\n",
-    " \"ft_pct_home\" REAL, -- Free throw percentage of the home team\n",
-    " \"oreb_home\" REAL, -- Offensive rebounds by the home team\n",
-    " \"dreb_home\" REAL, -- Defensive rebounds by the home team\n",
-    " \"reb_home\" REAL, -- Total rebounds by the home team\n",
-    " \"ast_home\" REAL, -- Assists by the home team\n",
-    " \"stl_home\" REAL, -- Steals by the home team\n",
-    " \"blk_home\" REAL, -- Blocks by the home team\n",
-    " \"tov_home\" REAL, -- Turnovers by the home team\n",
-    " \"pf_home\" REAL, -- Personal fouls by the home team\n",
-    " \"pts_home\" REAL, -- Total points scored by the home team\n",
-    " \"plus_minus_home\" INTEGER, -- Plus/minus rating for the home team\n",
-    " \"video_available_home\" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)\n",
-    " \"team_id_away\" TEXT, -- ID of the away team\n",
-    " \"team_abbreviation_away\" TEXT, -- Abbreviation of the away team\n",
-    " \"team_name_away\" TEXT, -- Full name of the away team\n",
-    " \"matchup_away\" TEXT, -- Matchup details from the away team's perspective\n",
-    " \"wl_away\" TEXT, -- \"W\" if the away team won, \"L\" if they lost\n",
-    " \"fgm_away\" REAL, -- Field goals made by the away team\n",
-    " \"fga_away\" REAL, -- Field goals attempted by the away team\n",
-    " \"fg_pct_away\" REAL, -- Field goal percentage of the away team\n",
-    " \"fg3m_away\" REAL, -- Three-point field goals made by the away team\n",
-    " \"fg3a_away\" REAL, -- Three-point attempts by the away team\n",
-    " \"fg3_pct_away\" REAL, -- Three-point field goal percentage of the away team\n",
-    " \"ftm_away\" REAL, -- Free throws made by the away team\n",
-    " \"fta_away\" REAL, -- Free throws attempted by the away team\n",
-    " \"ft_pct_away\" REAL, -- Free throw percentage of the away team\n",
-    " \"oreb_away\" REAL, -- Offensive rebounds by the away team\n",
-    " \"dreb_away\" REAL, -- Defensive rebounds by the away team\n",
-    " \"reb_away\" REAL, -- Total rebounds by the away team\n",
-    " \"ast_away\" REAL, -- Assists by the away team\n",
-    " \"stl_away\" REAL, -- Steals by the away team\n",
-    " \"blk_away\" REAL, -- Blocks by the away team\n",
-    " \"tov_away\" REAL, -- Turnovers by the away team\n",
-    " \"pf_away\" REAL, -- Personal fouls by the away team\n",
-    " \"pts_away\" REAL, -- Total points scored by the away team\n",
-    " \"plus_minus_away\" INTEGER, -- Plus/minus rating for the away team\n",
-    " \"video_available_away\" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)\n",
-    " \"season_type\" TEXT -- Regular season or playoffs\n",
-    ");\n",
-    "\n",
-    "other_stats Table\n",
-    "Stores additional statistics, linked to the game table via game_id.\n",
-    "CREATE TABLE IF NOT EXISTS \"other_stats\" (\n",
-    " \"game_id\" TEXT, -- Unique game identifier, matches id column from game table\n",
-    " \"league_id\" TEXT, -- League identifier\n",
-    " \"team_id_home\" TEXT, -- Home team identifier\n",
-    " \"team_abbreviation_home\" TEXT, -- Home team abbreviation\n",
-    " \"team_city_home\" TEXT, -- Home team city\n",
-    " \"pts_paint_home\" INTEGER, -- Points in the paint by the home team\n",
-    " \"pts_2nd_chance_home\" INTEGER, -- Second chance points by the home team\n",
-    " \"pts_fb_home\" INTEGER, -- Fast break points by the home team\n",
-    " \"largest_lead_home\" INTEGER,-- Largest lead by the home team\n",
-    " \"lead_changes\" INTEGER, -- Number of lead changes \n",
-    " \"times_tied\" INTEGER, -- Number of times the score was tied\n",
-    " \"team_turnovers_home\" INTEGER, -- Home team turnovers\n",
-    " \"total_turnovers_home\" INTEGER, -- Total turnovers by the home team\n",
-    " \"team_rebounds_home\" INTEGER, -- Home team rebounds\n",
-    " \"pts_off_to_home\" INTEGER, -- Points off turnovers by the home team\n",
-    " \"team_id_away\" TEXT, -- Away team identifier\n",
-    " \"team_abbreviation_away\" TEXT, -- Away team abbreviation\n",
-    " \"pts_paint_away\" INTEGER, -- Points in the paint by the away team\n",
-    " \"pts_2nd_chance_away\" INTEGER, -- Second chance points by the away team\n",
-    " \"pts_fb_away\" INTEGER, -- Fast break points by the away team\n",
-    " \"largest_lead_away\" INTEGER,-- Largest lead by the away team\n",
-    " \"team_turnovers_away\" INTEGER, -- Away team turnovers\n",
-    " \"total_turnovers_away\" INTEGER, -- Total turnovers by the away team\n",
-    " \"team_rebounds_away\" INTEGER, -- Away team rebounds\n",
-    " \"pts_off_to_away\" INTEGER -- Points off turnovers by the away team\n",
-    ");\n",
-    "\n",
-    "\n",
-    "Team Name Information\n",
-    "In the plaintext user questions, only the full team names will be used, but in the queries you may use the full team names or the abbreviations. \n",
-    "The full team names can be used with the game table, while the abbreviations should be used with the other_stats table.\n",
-    "Notice they are separated by the | character in the following list:\n",
-    "\n",
-    "Atlanta Hawks|ATL\n",
-    "Boston Celtics|BOS\n",
-    "Cleveland Cavaliers|CLE\n",
-    "New Orleans Pelicans|NOP\n",
-    "Chicago Bulls|CHI\n",
-    "Dallas Mavericks|DAL\n",
-    "Denver Nuggets|DEN\n",
-    "Golden State Warriors|GSW\n",
-    "Houston Rockets|HOU\n",
-    "Los Angeles Clippers|LAC\n",
-    "Los Angeles Lakers|LAL\n",
-    "Miami Heat|MIA\n",
-    "Milwaukee Bucks|MIL\n",
-    "Minnesota Timberwolves|MIN\n",
-    "Brooklyn Nets|BKN\n",
-    "New York Knicks|NYK\n",
-    "Orlando Magic|ORL\n",
-    "Indiana Pacers|IND\n",
-    "Philadelphia 76ers|PHI\n",
-    "Phoenix Suns|PHX\n",
-    "Portland Trail Blazers|POR\n",
-    "Sacramento Kings|SAC\n",
-    "San Antonio Spurs|SAS\n",
-    "Oklahoma City Thunder|OKC\n",
-    "Toronto Raptors|TOR\n",
-    "Utah Jazz|UTA\n",
-    "Memphis Grizzlies|MEM\n",
-    "Washington Wizards|WAS\n",
-    "Detroit Pistons|DET\n",
-    "Charlotte Hornets|CHA\n",
-    "\n",
-    "Query Guidelines\n",
-    "Use team_name_home and team_name_away to match teams to the game table. Use team_abbreviation_home and team_abbreviation away to match teams to the other_stats table.\n",
-    "\n",
-    "To filter by season, use season_id = '2YYYY'.\n",
-    "\n",
-    "Example: To get statistics from 2005, use a statement like: season_id = '22005'. To get statistics from 1972, use a statement like: season_id = \"21972\". To get statistics from 2015, use a statement like: season_id = \"22015\".\n",
-    "\n",
-    "Ensure queries return relevant columns and avoid unnecessary joins.\n",
-    "\n",
-    "Example User Requests and SQLite Queries\n",
-    "Request:\n",
-    "\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
-    "SQLite:\n",
-    "SELECT MAX(pts_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
-    "\n",
-    "Request:\n",
-    "\"Which teams are located in the state of California?\"\n",
-    "SQLite:\n",
-    "SELECT full_name FROM team WHERE state = 'California';\n",
-    "\n",
-    "Request:\n",
-    "\"Which team had the highest number of team turnovers in an away game?\"\n",
-    "SQLite:\n",
-    "SELECT team_abbreviation_away FROM other_stats ORDER BY team_turnovers_away DESC LIMIT 1;\n",
-    "\n",
-    "Request:\n",
-    "\"Which teams were founded before 1979?\"\n",
-    "SQLite:\n",
-    "SELECT full_name FROM team WHERE year_founded < 1979;\n",
-    "\n",
-    "Request:\n",
-    "\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
-    "SQLite:\n",
-    "SELECT MAX(pts_home - pts_away) AS biggest_win FROM game WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
-    "\n",
-    "Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
-    "\"\"\"\n",
+    "from src.prompts.prompt import input_text as input_prompt\n",
    "\n",
    "print(len(input_prompt))"
   ]
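This hunk moves the long schema-and-guidelines prompt out of the notebook and replaces it with a single import, `from src.prompts.prompt import input_text as input_prompt`. The diff does not show the contents of that module, so the following is only a sketch of the shape such a file would plausibly have, using the text removed above:

```python
# src/prompts/prompt.py -- hypothetical sketch; the real file ships in the repo's src/ folder.
# It exposes the full prompt (schema, team-name list, query guidelines, few-shot examples,
# and the final instruction) as one string the notebook can import.
input_text = """Database Schema and Explanations

team Table
Stores information about NBA teams.
CREATE TABLE IF NOT EXISTS "team" (
    "id" TEXT PRIMARY KEY, -- Unique identifier for the team
    ...
);

... (game and other_stats tables, the Team Name Information list,
Query Guidelines, and the example Request/SQLite pairs) ...

Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:
"""
```

Keeping the prompt in one module means the notebook, training scripts, and evaluation code can all import the same `input_text` instead of carrying divergent copies.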
@@ -220,30 +109,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "
-      " from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:From c:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Dean\\AppData\\Local\\Temp\\ipykernel_6496\\2921743792.py:18: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
+      "/var/folders/g0/47tr69v179dg7w6zyphp9b280000gn/T/ipykernel_35112/48906000.py:8: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
      " df = df.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n"
     ]
    },
@@ -274,7 +147,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Map: 100%|██████████| 1044/1044 [00:
+      "Map: 100%|██████████| 1044/1044 [00:17<00:00, 59.19 examples/s]"
     ]
    },
    {
@@ -295,22 +168,14 @@
    }
   ],
   "source": [
-    "
-    "import torch\n",
-    "from datasets import Dataset\n",
-    "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback, PreTrainedTokenizer\n",
-    "from torch.utils.data import DataLoader\n",
-    "from peft import LoraConfig, get_peft_model, TaskType\n",
-    "import os\n",
-    "import re\n",
-    "import numpy as np\n",
+    "\n",
    "\n",
    "# Model output directories\n",
-    "MODEL_DIR = \"
-    "VAL_OUTPUT = \"val-16.hf\"\n",
+    "MODEL_DIR = write_path(\"fine-tuned-model-16-test\")\n",
+    "VAL_OUTPUT = write_path(\"val-16.hf\")\n",
    "\n",
    "# Load dataset\n",
-    "df = pd.read_csv(\"
+    "df = pd.read_csv(read_path(\"train-data/sql_train.tsv\"), sep='\\t')\n",
    "\n",
    "df = df.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n",
    "\n",
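The dataset cell still normalizes whitespace with `DataFrame.applymap`, which is what the captured `FutureWarning` is about. A small sketch of the same step using the `DataFrame.map` replacement available in pandas 2.1+ (the literal TSV path stands in for the notebook's `read_path(...)` call):

```python
import re
import pandas as pd

# Load the natural-language / SQL training pairs.
df = pd.read_csv("train-data/sql_train.tsv", sep="\t")  # read_path("train-data/sql_train.tsv") in the notebook

# Collapse runs of whitespace inside every string cell.
# DataFrame.map (pandas >= 2.1) is the element-wise replacement for the deprecated applymap.
df = df.map(lambda x: re.sub(r"\s+", " ", x) if isinstance(x, str) else x)

print(df.head())
```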
@@ -319,14 +184,16 @@
    "print(df.head())\n",
    "\n",
    "# Load tokenizer\n",
-    "model_name = \"
+    "model_name = read_path(\"deepseek-coder-1.3b-instruct\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# Enable 8-bit quantization for lower memory usage\n",
-    "bnb_config =
-    "
-    "
-    "
+    "bnb_config = None\n",
+    "if use_bnb:\n",
+    "    bnb_config = BitsAndBytesConfig(\n",
+    "        load_in_8bit=True,\n",
+    "        bnb_8bit_compute_dtype=torch.float16\n",
+    "    )\n",
    "\n",
    "# Load model with quantization\n",
    "#device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
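The quantization config is now built only when `use_bnb` is set, so the notebook can fall back to a plain load on machines without bitsandbytes support. A minimal sketch of how such a gated config is typically passed into model loading (the notebook's own loading cell is not shown in this hunk; `device_map="auto"` assumes `accelerate` is installed, and the literal model path stands in for `read_path(...)`):

```python
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

use_bnb = True                                  # mirrors the notebook flag
model_name = "deepseek-coder-1.3b-instruct"     # read_path("deepseek-coder-1.3b-instruct") in the notebook

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the 8-bit config only when requested; 8-bit weights roughly halve GPU memory versus fp16.
bnb_config = BitsAndBytesConfig(load_in_8bit=True) if use_bnb else None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # None means a normal (unquantized) load
    device_map="auto",               # let accelerate place layers on the available device(s)
)
```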
@@ -870,7 +737,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@@ -893,7 +760,7 @@
    "print(prompt_length)\n",
    "\n",
    "# Create connection to sqlite3 database\n",
-    "connection = sql.connect('
+    "connection = sql.connect(read_path('nba-data/nba.sqlite'))\n",
    "cursor = connection.cursor()\n",
    "\n",
    "for v in val_dataset:\n",
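The evaluation cell now opens `nba-data/nba.sqlite` through `read_path`, so the same code works against the downloaded snapshot on Colab. A small sketch of running a generated statement against that database; the `sql` alias mirrors the cell's `sql.connect(...)` call (presumably `import sqlite3 as sql`), and the commented usage with `val_dataset` field names is a hypothetical illustration, not the notebook's exact loop:

```python
import sqlite3 as sql  # assumed alias, matching the cell's sql.connect(...) call

connection = sql.connect("nba-data/nba.sqlite")  # read_path('nba-data/nba.sqlite') in the notebook
cursor = connection.cursor()

def run_query(query: str):
    """Execute one SQLite statement and return its rows, or the error text if it fails."""
    try:
        cursor.execute(query)
        return cursor.fetchall()
    except sql.Error as err:
        return f"SQLite error: {err}"

# Hypothetical usage: compare the model's query against the reference query per example.
# for v in val_dataset:
#     generated = generate_sql(v["natural_query"])   # model inference, defined elsewhere
#     print(run_query(generated), run_query(v["sql_query"]))
```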
@@ -4248,7 +4115,7 @@
  ],
  "metadata": {
   "kernelspec": {
-    "display_name": "
+    "display_name": "CSCI544",
    "language": "python",
    "name": "python3"
   },
@@ -4262,7 +4129,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.
+   "version": "3.11.11"
  }
 },
 "nbformat": 4,