Add second RAG notebook

Browse files

Files changed (5) hide show

src/rag/table_documents.py +104 -0
src/rag/table_retriever.py +131 -0
src/rag/team_documents.py +13 -0
test_rag_2.ipynb +489 -0
train-data/expanded_sql_train.tsv +0 -0

src/rag/table_documents.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Schema document for the "team" table: a title line, a one-sentence
+# description, and the CREATE TABLE statement with a `--` note per column.
+# The text is stored verbatim, so the inline notes are part of the document.
+team_table_document = '''team Table
+Stores information about NBA teams.
+CREATE TABLE IF NOT EXISTS "team" (
+  "id" TEXT PRIMARY KEY,      -- Unique identifier for the team
+  "full_name" TEXT,           -- Full official name of the team (e.g., "Los Angeles Lakers")
+  "abbreviation" TEXT,        -- Shortened team name (e.g., "LAL")
+  "nickname" TEXT,            -- Commonly used nickname for the team (e.g., "Lakers")
+  "city" TEXT,                -- City where the team is based
+  "state" TEXT,               -- State where the team is located
+  "year_founded" REAL         -- Year the team was established
+);'''
+# Schema document for the "game" table: one row per game, with parallel
+# *_home / *_away columns for each team statistic. The text is stored
+# verbatim, so the inline `--` notes are part of the document.
+game_table_document = '''game Table
+Contains detailed statistics for each NBA game, including home and away team performance.
+CREATE TABLE IF NOT EXISTS "game" (
+  "season_id" TEXT,            -- Season identifier, formatted as "2YYYY" (e.g., "21970" for the 1970 season)
+  "team_id_home" TEXT,         -- ID of the home team (matches "id" in team table)
+  "team_abbreviation_home" TEXT, -- Abbreviation of the home team
+  "team_name_home" TEXT,       -- Full name of the home team
+  "game_id" TEXT PRIMARY KEY,  -- Unique identifier for the game
+  "game_date" TIMESTAMP,       -- Date the game was played (YYYY-MM-DD format)
+  "matchup_home" TEXT,         -- Matchup details including opponent (e.g., "LAL vs. BOS")
+  "wl_home" TEXT,              -- "W" if the home team won, "L" if they lost
+  "min" INTEGER,               -- Total minutes played in the game
+  "fgm_home" REAL,             -- Field goals made by the home team
+  "fga_home" REAL,             -- Field goals attempted by the home team
+  "fg_pct_home" REAL,          -- Field goal percentage of the home team
+  "fg3m_home" REAL,            -- Three-point field goals made by the home team
+  "fg3a_home" REAL,            -- Three-point attempts by the home team
+  "fg3_pct_home" REAL,         -- Three-point field goal percentage of the home team
+  "ftm_home" REAL,             -- Free throws made by the home team
+  "fta_home" REAL,             -- Free throws attempted by the home team
+  "ft_pct_home" REAL,          -- Free throw percentage of the home team
+  "oreb_home" REAL,            -- Offensive rebounds by the home team
+  "dreb_home" REAL,            -- Defensive rebounds by the home team
+  "reb_home" REAL,             -- Total rebounds by the home team
+  "ast_home" REAL,             -- Assists by the home team
+  "stl_home" REAL,             -- Steals by the home team
+  "blk_home" REAL,             -- Blocks by the home team
+  "tov_home" REAL,             -- Turnovers by the home team
+  "pf_home" REAL,              -- Personal fouls by the home team
+  "pts_home" REAL,             -- Total points scored by the home team
+  "plus_minus_home" INTEGER,   -- Plus/minus rating for the home team
+  "video_available_home" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)
+  "team_id_away" TEXT,         -- ID of the away team
+  "team_abbreviation_away" TEXT, -- Abbreviation of the away team
+  "team_name_away" TEXT,       -- Full name of the away team
+  "matchup_away" TEXT,         -- Matchup details from the away team’s perspective
+  "wl_away" TEXT,              -- "W" if the away team won, "L" if they lost
+  "fgm_away" REAL,             -- Field goals made by the away team
+  "fga_away" REAL,             -- Field goals attempted by the away team
+  "fg_pct_away" REAL,          -- Field goal percentage of the away team
+  "fg3m_away" REAL,            -- Three-point field goals made by the away team
+  "fg3a_away" REAL,            -- Three-point attempts by the away team
+  "fg3_pct_away" REAL,         -- Three-point field goal percentage of the away team
+  "ftm_away" REAL,             -- Free throws made by the away team
+  "fta_away" REAL,             -- Free throws attempted by the away team
+  "ft_pct_away" REAL,          -- Free throw percentage of the away team
+  "oreb_away" REAL,            -- Offensive rebounds by the away team
+  "dreb_away" REAL,            -- Defensive rebounds by the away team
+  "reb_away" REAL,             -- Total rebounds by the away team
+  "ast_away" REAL,             -- Assists by the away team
+  "stl_away" REAL,             -- Steals by the away team
+  "blk_away" REAL,             -- Blocks by the away team
+  "tov_away" REAL,             -- Turnovers by the away team
+  "pf_away" REAL,              -- Personal fouls by the away team
+  "pts_away" REAL,             -- Total points scored by the away team
+  "plus_minus_away" INTEGER,   -- Plus/minus rating for the away team
+  "video_available_away" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)
+  "season_type" TEXT           -- Regular season or playoffs
+);
+'''
+other_stats_table_document = '''other_stats Table
+Stores additional statistics, linked to the game table via game_id.
+CREATE TABLE IF NOT EXISTS "other_stats" (
+  "game_id" TEXT,             -- Unique game identifier, matches id column from game table
+  "league_id" TEXT,           -- League identifier
+  "team_id_home" TEXT,        -- Home team identifier
+  "team_abbreviation_home" TEXT, -- Home team abbreviation
+  "team_city_home" TEXT,      -- Home team city
+  "pts_paint_home" INTEGER,   -- Points in the paint by the home team
+  "pts_2nd_chance_home" INTEGER, -- Second chance points by the home team
+  "pts_fb_home" INTEGER,      -- Fast break points by the home team
+  "largest_lead_home" INTEGER,-- Largest lead by the home team
+  "lead_changes" INTEGER,     -- Number of lead changes
+  "times_tied" INTEGER,       -- Number of times the score was tied
+  "team_turnovers_home" INTEGER, -- Home team turnovers
+  "total_turnovers_home" INTEGER, -- Total turnovers by the home team
+  "team_rebounds_home" INTEGER, -- Home team rebounds
+  "pts_off_to_home" INTEGER,  -- Points off turnovers by the home team
+  "team_id_away" TEXT,        -- Away team identifier
+  "team_abbreviation_away" TEXT,  -- Away team abbreviation
+  "pts_paint_away" INTEGER,   -- Points in the paint by the away team
+  "pts_2nd_chance_away" INTEGER, -- Second chance points by the away team
+  "pts_fb_away" INTEGER,      -- Fast break points by the away team
+  "largest_lead_away" INTEGER,-- Largest lead by the away team
+  "team_turnovers_away" INTEGER, -- Away team turnovers
+  "total_turnovers_away" INTEGER, -- Total turnovers by the away team
+  "team_rebounds_away" INTEGER, -- Away team rebounds
+  "pts_off_to_away" INTEGER   -- Points off turnovers by the away team
+);
+'''

src/rag/table_retriever.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# Schema document for the "team" table; retrieve_doc in this module returns
+# it when has_team_schema is truthy. The text is stored verbatim.
+# NOTE(review): identical copy of team_table_document in
+# src/rag/table_documents.py - consider importing it instead of duplicating.
+team_table_document = '''team Table
+Stores information about NBA teams.
+CREATE TABLE IF NOT EXISTS "team" (
+  "id" TEXT PRIMARY KEY,      -- Unique identifier for the team
+  "full_name" TEXT,           -- Full official name of the team (e.g., "Los Angeles Lakers")
+  "abbreviation" TEXT,        -- Shortened team name (e.g., "LAL")
+  "nickname" TEXT,            -- Commonly used nickname for the team (e.g., "Lakers")
+  "city" TEXT,                -- City where the team is based
+  "state" TEXT,               -- State where the team is located
+  "year_founded" REAL         -- Year the team was established
+);'''
+# Schema document for the "game" table; retrieve_doc in this module returns
+# it when has_game_schema is truthy. The text is stored verbatim.
+# NOTE(review): identical copy of game_table_document in
+# src/rag/table_documents.py - consider importing it instead of duplicating.
+game_table_document = '''game Table
+Contains detailed statistics for each NBA game, including home and away team performance.
+CREATE TABLE IF NOT EXISTS "game" (
+  "season_id" TEXT,            -- Season identifier, formatted as "2YYYY" (e.g., "21970" for the 1970 season)
+  "team_id_home" TEXT,         -- ID of the home team (matches "id" in team table)
+  "team_abbreviation_home" TEXT, -- Abbreviation of the home team
+  "team_name_home" TEXT,       -- Full name of the home team
+  "game_id" TEXT PRIMARY KEY,  -- Unique identifier for the game
+  "game_date" TIMESTAMP,       -- Date the game was played (YYYY-MM-DD format)
+  "matchup_home" TEXT,         -- Matchup details including opponent (e.g., "LAL vs. BOS")
+  "wl_home" TEXT,              -- "W" if the home team won, "L" if they lost
+  "min" INTEGER,               -- Total minutes played in the game
+  "fgm_home" REAL,             -- Field goals made by the home team
+  "fga_home" REAL,             -- Field goals attempted by the home team
+  "fg_pct_home" REAL,          -- Field goal percentage of the home team
+  "fg3m_home" REAL,            -- Three-point field goals made by the home team
+  "fg3a_home" REAL,            -- Three-point attempts by the home team
+  "fg3_pct_home" REAL,         -- Three-point field goal percentage of the home team
+  "ftm_home" REAL,             -- Free throws made by the home team
+  "fta_home" REAL,             -- Free throws attempted by the home team
+  "ft_pct_home" REAL,          -- Free throw percentage of the home team
+  "oreb_home" REAL,            -- Offensive rebounds by the home team
+  "dreb_home" REAL,            -- Defensive rebounds by the home team
+  "reb_home" REAL,             -- Total rebounds by the home team
+  "ast_home" REAL,             -- Assists by the home team
+  "stl_home" REAL,             -- Steals by the home team
+  "blk_home" REAL,             -- Blocks by the home team
+  "tov_home" REAL,             -- Turnovers by the home team
+  "pf_home" REAL,              -- Personal fouls by the home team
+  "pts_home" REAL,             -- Total points scored by the home team
+  "plus_minus_home" INTEGER,   -- Plus/minus rating for the home team
+  "video_available_home" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)
+  "team_id_away" TEXT,         -- ID of the away team
+  "team_abbreviation_away" TEXT, -- Abbreviation of the away team
+  "team_name_away" TEXT,       -- Full name of the away team
+  "matchup_away" TEXT,         -- Matchup details from the away team’s perspective
+  "wl_away" TEXT,              -- "W" if the away team won, "L" if they lost
+  "fgm_away" REAL,             -- Field goals made by the away team
+  "fga_away" REAL,             -- Field goals attempted by the away team
+  "fg_pct_away" REAL,          -- Field goal percentage of the away team
+  "fg3m_away" REAL,            -- Three-point field goals made by the away team
+  "fg3a_away" REAL,            -- Three-point attempts by the away team
+  "fg3_pct_away" REAL,         -- Three-point field goal percentage of the away team
+  "ftm_away" REAL,             -- Free throws made by the away team
+  "fta_away" REAL,             -- Free throws attempted by the away team
+  "ft_pct_away" REAL,          -- Free throw percentage of the away team
+  "oreb_away" REAL,            -- Offensive rebounds by the away team
+  "dreb_away" REAL,            -- Defensive rebounds by the away team
+  "reb_away" REAL,             -- Total rebounds by the away team
+  "ast_away" REAL,             -- Assists by the away team
+  "stl_away" REAL,             -- Steals by the away team
+  "blk_away" REAL,             -- Blocks by the away team
+  "tov_away" REAL,             -- Turnovers by the away team
+  "pf_away" REAL,              -- Personal fouls by the away team
+  "pts_away" REAL,             -- Total points scored by the away team
+  "plus_minus_away" INTEGER,   -- Plus/minus rating for the away team
+  "video_available_away" INTEGER, -- Indicates whether video is available (1 = Yes, 0 = No)
+  "season_type" TEXT           -- Regular season or playoffs
+);
+'''
+other_stats_table_document = '''other_stats Table
+Stores additional statistics, linked to the game table via game_id.
+CREATE TABLE IF NOT EXISTS "other_stats" (
+  "game_id" TEXT,             -- Unique game identifier, matches id column from game table
+  "league_id" TEXT,           -- League identifier
+  "team_id_home" TEXT,        -- Home team identifier
+  "team_abbreviation_home" TEXT, -- Home team abbreviation
+  "team_city_home" TEXT,      -- Home team city
+  "pts_paint_home" INTEGER,   -- Points in the paint by the home team
+  "pts_2nd_chance_home" INTEGER, -- Second chance points by the home team
+  "pts_fb_home" INTEGER,      -- Fast break points by the home team
+  "largest_lead_home" INTEGER,-- Largest lead by the home team
+  "lead_changes" INTEGER,     -- Number of lead changes
+  "times_tied" INTEGER,       -- Number of times the score was tied
+  "team_turnovers_home" INTEGER, -- Home team turnovers
+  "total_turnovers_home" INTEGER, -- Total turnovers by the home team
+  "team_rebounds_home" INTEGER, -- Home team rebounds
+  "pts_off_to_home" INTEGER,  -- Points off turnovers by the home team
+  "team_id_away" TEXT,        -- Away team identifier
+  "team_abbreviation_away" TEXT,  -- Away team abbreviation
+  "pts_paint_away" INTEGER,   -- Points in the paint by the away team
+  "pts_2nd_chance_away" INTEGER, -- Second chance points by the away team
+  "pts_fb_away" INTEGER,      -- Fast break points by the away team
+  "largest_lead_away" INTEGER,-- Largest lead by the away team
+  "team_turnovers_away" INTEGER, -- Away team turnovers
+  "total_turnovers_away" INTEGER, -- Total turnovers by the away team
+  "team_rebounds_away" INTEGER, -- Away team rebounds
+  "pts_off_to_away" INTEGER   -- Points off turnovers by the away team
+);
+'''
+# Team-name document: usage guidance plus "Full Name|ABBR" pairs for the
+# listed teams; retrieve_doc in this module returns it when has_team_names
+# is truthy. The text is stored verbatim.
+# NOTE(review): identical copy of team_name_document in
+# src/rag/team_documents.py - consider importing it instead of duplicating.
+team_name_document = '''Team Name Information
+In plaintext user questions, only the full team names will be used, but in the queries you may use either full names or abbreviations.
+Full names are used with the game table, while abbreviations should be used with the other_stats table.
+Team names and abbreviations (separated by |):
+Atlanta Hawks|ATL, Boston Celtics|BOS, Cleveland Cavaliers|CLE, New Orleans Pelicans|NOP,
+Chicago Bulls|CHI, Dallas Mavericks|DAL, Denver Nuggets|DEN, Golden State Warriors|GSW,
+Houston Rockets|HOU, Los Angeles Clippers|LAC, Los Angeles Lakers|LAL, Miami Heat|MIA,
+Milwaukee Bucks|MIL, Minnesota Timberwolves|MIN, Brooklyn Nets|BKN, New York Knicks|NYK,
+Orlando Magic|ORL, Indiana Pacers|IND, Philadelphia 76ers|PHI, Phoenix Suns|PHX,
+Portland Trail Blazers|POR, Sacramento Kings|SAC, San Antonio Spurs|SAS,
+Oklahoma City Thunder|OKC, Toronto Raptors|TOR, Utah Jazz|UTA, Memphis Grizzlies|MEM,
+Washington Wizards|WAS, Detroit Pistons|DET, Charlotte Hornets|CHA
+'''
+def retrieve_doc(has_team_schema, has_game_schema, has_other_stats_schema, has_team_names = True):
+        documents = []
+        # Now scores should be a 1D tensor with length equal to available_docs
+        if has_team_schema:
+            documents.append(team_table_document)
+        if has_game_schema:
+            documents.append(game_table_document)
+        if has_other_stats_schema:
+            documents.append(other_stats_table_document)
+        if has_team_names:
+            documents.append(team_name_document)
+        return documents

src/rag/team_documents.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Team-name document: usage guidance plus "Full Name|ABBR" pairs for the
+# listed teams. The text is stored verbatim.
+# NOTE(review): src/rag/table_retriever.py carries an identical copy of this
+# constant - consider keeping a single definition.
+team_name_document = '''Team Name Information
+In plaintext user questions, only the full team names will be used, but in the queries you may use either full names or abbreviations.
+Full names are used with the game table, while abbreviations should be used with the other_stats table.
+Team names and abbreviations (separated by |):
+Atlanta Hawks|ATL, Boston Celtics|BOS, Cleveland Cavaliers|CLE, New Orleans Pelicans|NOP,
+Chicago Bulls|CHI, Dallas Mavericks|DAL, Denver Nuggets|DEN, Golden State Warriors|GSW,
+Houston Rockets|HOU, Los Angeles Clippers|LAC, Los Angeles Lakers|LAL, Miami Heat|MIA,
+Milwaukee Bucks|MIL, Minnesota Timberwolves|MIN, Brooklyn Nets|BKN, New York Knicks|NYK,
+Orlando Magic|ORL, Indiana Pacers|IND, Philadelphia 76ers|PHI, Phoenix Suns|PHX,
+Portland Trail Blazers|POR, Sacramento Kings|SAC, San Antonio Spurs|SAS,
+Oklahoma City Thunder|OKC, Toronto Raptors|TOR, Utah Jazz|UTA, Memphis Grizzlies|MEM,
+Washington Wizards|WAS, Detroit Pistons|DET, Charlotte Hornets|CHA
+'''

test_rag_2.ipynb ADDED Viewed

	@@ -0,0 +1,489 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9ba5b9ac",
+   "metadata": {},
+   "source": [
+    "# Notebook to evaluate RAG performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afeb236f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import warnings\n",
+    "import torch\n",
+    "import time\n",
+    "import math\n",
+    "import sqlite3 as sql\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "from huggingface_hub import snapshot_download\n",
+    "import sys\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7c75665",
+   "metadata": {},
+   "source": [
+    "## Create RAG document store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e202df8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_google_colab=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc6c4ccd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_path = \"./\"\n",
+    "\n",
+    "def get_path(rel_path):\n",
+    "    return os.path.join(current_path, rel_path)\n",
+    "\n",
+    "if is_google_colab:\n",
+    "    hugging_face_path = snapshot_download(\n",
+    "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
+    "        repo_type=\"model\",  \n",
+    "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"], \n",
+    "    )\n",
+    "    sys.path.append(hugging_face_path)\n",
+    "    current_path = hugging_face_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d589714b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/CSCI544/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total dataset examples: 1044\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "# Establish a database connection once (adjust the DB path as needed)\n",
+    "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
+    "cursor = connection.cursor()\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load dataset and print summary\n",
+    "# ------------------------------\n",
+    "df = pd.read_csv(get_path(\"train-data/expanded_sql_train.tsv\"), sep='\\t')\n",
+    "print(\"Total dataset examples: \" + str(len(df)))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load tokenizer and model\n",
+    "# ------------------------------\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(get_path(\"deepseek-coder-1.3b-instruct\"))\n",
+    "model = AutoModelForCausalLM.from_pretrained(get_path(\n",
+    "    \"deepseek-coder-1.3b-instruct\"),\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    device_map=device\n",
+    ")\n",
+    "model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "499d2745",
+   "metadata": {},
+   "source": [
+    "## Define compare result function for evaluation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "268561cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.evaluation.compare_result import compare_result\n",
+    "from src.rag.table_retriever import retrieve_doc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7393ccb",
+   "metadata": {},
+   "source": [
+    "## Create evaluation loop for RAG model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "500f003b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ------------------------------\n",
+    "# Function to evaluate the model on a given dataset\n",
+    "# ------------------------------\n",
+    "def run_evaluation(nba_df, title):\n",
+    "    counter = 0\n",
+    "    num_valid = 0\n",
+    "    num_sql_matched = 0\n",
+    "    num_result_matched = 0\n",
+    "    for index, row in nba_df.iterrows():\n",
+    "        # Retrieve relevant schema chunks via RAG\n",
+    "        relevant_schemas = retrieve_doc(row['team_flag'], row['game_flag'], row['other_stats_flag'], False)\n",
+    "        schema_block = \"\\n\\n\".join(relevant_schemas)\n",
+    "        \n",
+    "        #print(row[\"natural_query\"])\n",
+    "        #print(row[\"sql_query\"])\n",
+    "        #print(schema_block)\n",
+    "        #return\n",
+    "        # Build the prompt with instructions, schema, examples, and current request.\n",
+    "        input_text = f\"\"\"\n",
+    "You are an AI assistant that generates SQLite queries for an NBA database based on user questions.\n",
+    "\n",
+    "### Relevant Schema:\n",
+    "{schema_block}\n",
+    "\n",
+    "### Instructions:\n",
+    "- Generate a valid SQLite query to retrieve relevant data from the database.\n",
+    "- Use column names correctly based on the provided schema.\n",
+    "- Output only the SQLite query as plain text.\n",
+    "\n",
+    "### Team Name Information:\n",
+    "In the plaintext user questions, only the full team names will be used, but in the queries you may use the full team names or the abbreviations. \n",
+    "The full team names can be used with the game table, while the abbreviations should be used with the other_stats table.\n",
+    "Notice they are separated by the | character in the following list:\n",
+    "\n",
+    "Atlanta Hawks|ATL\n",
+    "Boston Celtics|BOS\n",
+    "Cleveland Cavaliers|CLE\n",
+    "New Orleans Pelicans|NOP\n",
+    "Chicago Bulls|CHI\n",
+    "Dallas Mavericks|DAL\n",
+    "Denver Nuggets|DEN\n",
+    "Golden State Warriors|GSW\n",
+    "Houston Rockets|HOU\n",
+    "Los Angeles Clippers|LAC\n",
+    "Los Angeles Lakers|LAL\n",
+    "Miami Heat|MIA\n",
+    "Milwaukee Bucks|MIL\n",
+    "Minnesota Timberwolves|MIN\n",
+    "Brooklyn Nets|BKN\n",
+    "New York Knicks|NYK\n",
+    "Orlando Magic|ORL\n",
+    "Indiana Pacers|IND\n",
+    "Philadelphia 76ers|PHI\n",
+    "Phoenix Suns|PHX\n",
+    "Portland Trail Blazers|POR\n",
+    "Sacramento Kings|SAC\n",
+    "San Antonio Spurs|SAS\n",
+    "Oklahoma City Thunder|OKC\n",
+    "Toronto Raptors|TOR\n",
+    "Utah Jazz|UTA\n",
+    "Memphis Grizzlies|MEM\n",
+    "Washington Wizards|WAS\n",
+    "Detroit Pistons|DET\n",
+    "Charlotte Hornets|CHA\n",
+    "\n",
+    "### Query Guidelines:\n",
+    "Use team_name_home and team_name_away to match teams to the game table. Use team_abbreviation_home and team_abbreviation_away to match teams to the other_stats table.\n",
+    "\n",
+    "To filter by season, use season_id = '2YYYY'.\n",
+    "\n",
+    "Example: To get statistics from 2005, use a statement like: season_id = '22005'. To get statistics from 1972, use a statement like: season_id = \"21972\". To get statistics from 2015, use a statement like: season_id = \"22015\".\n",
+    "\n",
+    "Ensure queries return relevant columns and avoid unnecessary joins.\n",
+    "\n",
+    "### Example User Requests and SQLite Queries\n",
+    "Request:\n",
+    "\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
+    "SQLite:\n",
+    "SELECT MAX(pts_home)\n",
+    "FROM game\n",
+    "WHERE team_name_home = 'Los Angeles Lakers';\n",
+    "\n",
+    "Request:\n",
+    "\"Which teams are located in the state of California?\"\n",
+    "SQLite:\n",
+    "SELECT full_name FROM team WHERE state = 'California';\n",
+    "\n",
+    "Request:\n",
+    "\"Which team had the highest number of team turnovers in an away game?\"\n",
+    "SQLite:\n",
+    "SELECT team_abbreviation_away FROM other_stats ORDER BY team_turnovers_away DESC LIMIT 1;\n",
+    "\n",
+    "Request:\n",
+    "\"Which teams were founded before 1979?\"\n",
+    "SQLite:\n",
+    "SELECT full_name FROM team WHERE year_founded < 1979;\n",
+    "\n",
+    "Request:\n",
+    "\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
+    "SQLite:\n",
+    "SELECT MAX(pts_home - pts_away) AS biggest_win\n",
+    "FROM game\n",
+    "WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
+    "\n",
+    "Generate only the SQLite query prefaced by SQLite: and no other text. Now generate an SQLite query for the following user request.\n",
+    "Request: {row[\"natural_query\"]}\n",
+    "\"\"\"\n",
+    "        messages = [{'role': 'user', 'content': input_text}]\n",
+    "        prompt_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n",
+    "        inputs = tokenizer(prompt_text, return_tensors=\"pt\", padding=True).to(model.device)\n",
+    "        \n",
+    "        outputs = model.generate(\n",
+    "            **inputs,\n",
+    "            max_new_tokens=512,\n",
+    "            do_sample=False,\n",
+    "            top_k=50,\n",
+    "            top_p=0.95,\n",
+    "            num_return_sequences=1,\n",
+    "            eos_token_id=tokenizer.eos_token_id,\n",
+    "            pad_token_id=tokenizer.eos_token_id\n",
+    "        )\n",
+    "        \n",
+    "        # Decode the model output.\n",
+    "        generated_query = tokenizer.decode(outputs[0][len(inputs[\"input_ids\"][0]):], skip_special_tokens=True)\n",
+    "        \n",
+    "        # Clean generated query: remove any prefix and truncate after first semicolon.\n",
+    "        if generated_query.startswith(\"SQLite:\"):\n",
+    "            clean_query = generated_query[len(\"SQLite:\"):].strip()\n",
+    "        elif generated_query.startswith(\"SQL:\"):\n",
+    "            clean_query = generated_query[len(\"SQL:\"):].strip()\n",
+    "        else:\n",
+    "            clean_query = generated_query.strip()\n",
+    "        \n",
+    "        semicolon_idx = clean_query.find(\";\")\n",
+    "        if semicolon_idx != -1:\n",
+    "            clean_query = clean_query[:semicolon_idx+1]\n",
+    "        \n",
+    "        # Execute the cleaned query on the SQLite DB to obtain the actual result.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            cursor.execute(clean_query)\n",
+    "            rows = cursor.fetchall()\n",
+    "            if rows and isinstance(rows[0], (tuple, list)) and len(rows[0]) > 0:\n",
+    "                actual_result = rows[0][0]\n",
+    "            elif rows:\n",
+    "                actual_result = rows[0]\n",
+    "            else:\n",
+    "                actual_result = \"\"\n",
+    "        except Exception as e:\n",
+    "            actual_result = \"Error executing query: \" + str(e)\n",
+    "        \"\"\"\n",
+    "        \n",
+    "        # Compare the ground truth query and expected result to the generated query and actual result.\n",
+    "        valid, sql_matched, result_matched = compare_result(cursor, row[\"sql_query\"], row[\"result\"], generated_query)\n",
+    "        \"\"\"\n",
+    "        print(\"=============================================\")\n",
+    "        print(f\"Overall Valid: {valid}\")\n",
+    "        print(f\"SQL Query Matched: {sql_matched}\")\n",
+    "        print(f\"Result Matched: {result_matched}\")\n",
+    "        print(\"=============================================\\n\")\n",
+    "        \n",
+    "        # Print debug output.\n",
+    "        print(\"----- Ground Truth SQL Query -----\")\n",
+    "        print(row[\"sql_query\"])\n",
+    "        print(\"------------------------------------\\n\")\n",
+    "        print(\"----- Model Generated SQL Query -----\")\n",
+    "        print(generated_query)\n",
+    "        print(\"---------------------------------------\\n\")\n",
+    "        \n",
+    "        print(\"----- Expected Result -----\")\n",
+    "        print(row[\"result\"])\n",
+    "        print(\"----- Actual DB Result -----\")\n",
+    "        print(actual_result)\n",
+    "        print(\"-------------------------------------------------\\n\")\n",
+    "        \"\"\"\n",
+    "        if valid:\n",
+    "            num_valid += 1\n",
+    "        if sql_matched:\n",
+    "            num_sql_matched += 1\n",
+    "        if result_matched:\n",
+    "            num_result_matched += 1\n",
+    "        \n",
+    "        counter += 1\n",
+    "\n",
+    "      # CONTROL ITERS\n",
+    "      #   if counter == 2:\n",
+    "      #       break\n",
+    "        \n",
+    "        if counter % 50 == 0:\n",
+    "            print(\"Completed \" + str(counter))\n",
+    "    \n",
+    "    print(\"\\n\" + title + \" results:\")\n",
+    "    print(\"Percent valid: \" + str(num_valid / len(nba_df)))\n",
+    "    print(\"Percent SQLite matched: \" + str(num_sql_matched / len(nba_df)))\n",
+    "    print(\"Percent result matched: \" + str(num_result_matched / len(nba_df)))\n",
+    "    print(\"Dataset length: \" + str(len(nba_df)))\n",
+    "    print(\"-------------------\")\n",
+    "    print(\"Num queries tested: \", counter)\n",
+    "    print(\"Num correct queries: \", num_result_matched)\n",
+    "    print(\"Acc: \", (num_result_matched / counter)*100)\n",
+    "    print(\"-------------------\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9c23d082",
+   "metadata": {},
+   "source": [
+    "## Run evaluation using RAG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6eb6a1c1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "Completed 250\n",
+      "Completed 300\n",
+      "Completed 350\n",
+      "Completed 400\n",
+      "Completed 450\n",
+      "Completed 500\n",
+      "Completed 550\n",
+      "Completed 600\n",
+      "Completed 650\n",
+      "Completed 700\n",
+      "Completed 750\n",
+      "Completed 800\n",
+      "Completed 850\n",
+      "Completed 900\n",
+      "Completed 950\n",
+      "Completed 1000\n",
+      "\n",
+      "All training data results:\n",
+      "Percent valid: 0.7988505747126436\n",
+      "Percent SQLite matched: 0.13409961685823754\n",
+      "Percent result matched: 0.3850574712643678\n",
+      "Dataset length: 1044\n",
+      "-------------------\n",
+      "Num queries tested:  1044\n",
+      "Num correct queries:  402\n",
+      "Acc:  38.50574712643678\n",
+      "-------------------\n",
+      "Dataset length: 1044\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ------------------------------\n",
+    "# Run evaluation on the full training dataset\n",
+    "# ------------------------------\n",
+    "run_evaluation(df, \"All training data\")\n",
+    "print(\"Dataset length: \" + str(len(df)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f298cfa1",
+   "metadata": {},
+   "source": [
+    "## Run RAG evaluation on small query dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "121855db",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "\n",
+      "Less than 90 results:\n",
+      "Percent valid: 0.8979591836734694\n",
+      "Percent SQLite matched: 0.37551020408163266\n",
+      "Percent result matched: 0.7061224489795919\n",
+      "Dataset length: 245\n",
+      "-------------------\n",
+      "Num queries tested:  245\n",
+      "Num correct queries:  173\n",
+      "Acc:  70.61224489795919\n",
+      "-------------------\n",
+      "Dataset length: 245\n"
+     ]
+    }
+   ],
+   "source": [
+    "less_than_90_df = pd.read_csv(get_path(\"train-data/less_than_90.tsv\"), sep='\\t')\n",
+    "run_evaluation(less_than_90_df, \"Less than 90\")\n",
+    "print(\"Dataset length: \" + str(len(less_than_90_df)))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CSCI544",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

train-data/expanded_sql_train.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff