{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run the pre-trained DeepSeek Coder 1.3B model on a ChatGPT-4o-generated dataset\n",
    "\n",
    "Loads the natural-language-to-SQL examples, generates a SQLite query for each question with the pre-trained model, and scores the generated queries against the NBA SQLite database with `compare_result`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sqlite3 as sql\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "# Suppress all Python warnings; set before the third-party imports so import-time warnings are hidden too\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "import torch\n",
    "from huggingface_hub import snapshot_download\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set to True when running in Google Colab: project files are then downloaded from the Hugging Face Hub\n",
    "is_google_colab = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "current_path = \"./\"\n",
    "\n",
    "def get_path(rel_path):\n",
    "    \"\"\"Join `rel_path` onto `current_path` (looked up at call time, so the Colab override below applies).\"\"\"\n",
    "    return os.path.join(current_path, rel_path)\n",
    "\n",
    "if is_google_colab:\n",
    "    # Download only the folders this notebook needs and make `src` importable from the snapshot\n",
    "    hugging_face_path = snapshot_download(\n",
    "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
    "        repo_type=\"model\",\n",
    "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"],\n",
    "    )\n",
    "    sys.path.append(hugging_face_path)\n",
    "    current_path = hugging_face_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project-local imports; kept below the path setup because on Colab `src` is only importable after sys.path is extended\n",
    "from src.prompts.prompt import input_text\n",
    "from src.evaluation.compare_result import compare_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First load dataset into pandas dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total dataset examples: 1044\n",
      "\n",
      "\n",
      "How many times were games tied when the Indiana Pacers played at home?\n",
      "SELECT SUM(times_tied) as total_times_tied FROM other_stats WHERE team_abbreviation_home = 'IND';\n",
      "4805.0\n"
     ]
    }
   ],
   "source": [
    "# Load dataset and check length\n",
    "df = pd.read_csv(get_path(\"train-data/sql_train.tsv\"), sep=\"\\t\")\n",
    "print(\"Total dataset examples: \" + str(len(df)))\n",
    "print(\"\\n\")\n",
    "\n",
    "# Test sampling\n",
    "sample = df.sample(n=1)\n",
    "print(sample[\"natural_query\"].values[0])\n",
    "print(sample[\"sql_query\"].values[0])\n",
    "print(sample[\"result\"].values[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load pre-trained DeepSeek model using transformers and pytorch packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set device to cuda if available, otherwise CPU\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# Load model and tokenizer; get_path resolves to the local checkout or to the Colab snapshot\n",
    "model_path = get_path(\"deepseek-coder-1.3b-instruct\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device)\n",
    "model.generation_config.pad_token_id = tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define a helper that generates SQL for one question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_sql(natural_query, max_new_tokens=512):\n",
    "    \"\"\"Generate the model's response for one natural-language question.\n",
    "\n",
    "    Args:\n",
    "        natural_query: Question text; it is appended to the shared `input_text` prompt.\n",
    "        max_new_tokens: Upper bound on the number of newly generated tokens.\n",
    "\n",
    "    Returns:\n",
    "        The decoded response with the prompt tokens and special tokens removed.\n",
    "    \"\"\"\n",
    "    message = [{'role': 'user', 'content': input_text + natural_query}]\n",
    "    inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
    "    # Greedy decoding: with do_sample=False the sampling-only options (top_k, top_p) have no effect, so they are not passed\n",
    "    outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
    "    # Keep only the tokens generated after the prompt\n",
    "    return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test model performance on a single example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SQLite:\n",
      "SELECT COUNT(*) FROM game WHERE team_name_home = 'Indiana Pacers' AND wl_home = 'T';\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Run the model on the sampled question and print its raw output\n",
    "query_output = generate_sql(sample[\"natural_query\"].values[0])\n",
    "print(query_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test sample output on sqlite3 database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cleaned\n",
      "(0,)\n"
     ]
    }
   ],
   "source": [
    "# Create connection to sqlite3 database (the cursor is reused by the evaluation cells below)\n",
    "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
    "cursor = connection.cursor()\n",
    "\n",
    "# Strip the \"SQLite:\" / \"SQL:\" label the model may emit before the statement\n",
    "if query_output.startswith(\"SQLite:\"):\n",
    "    print(\"cleaned\")\n",
    "    query = query_output[len(\"SQLite:\"):]\n",
    "elif query_output.startswith(\"SQL:\"):\n",
    "    query = query_output[len(\"SQL:\"):]\n",
    "else:\n",
    "    query = query_output\n",
    "\n",
    "# Execute query from model output and print result\n",
    "try:\n",
    "    cursor.execute(query)\n",
    "    for row in cursor.fetchall():\n",
    "        print(row)\n",
    "except (sql.Error, sql.Warning) as err:\n",
    "    # Generated SQL may be invalid; report the failure instead of hiding it\n",
    "    print(\"Query failed: \" + str(err))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare model output to the ground truth result for a sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "What is the year the Milwaukee team was founded?\n",
      "SELECT year_founded FROM team WHERE city = 'Milwaukee';\n",
      "1968.0\n",
      "SQLite:\n",
      "SELECT year_founded FROM team WHERE full_name = 'Milwaukee Bucks';\n",
      "\n",
      "Statement valid? True\n",
      "SQLite matched? False\n",
      "Result matched? True\n"
     ]
    }
   ],
   "source": [
    "# Obtain sample\n",
    "sample = df.sample(n=1)\n",
    "\n",
    "print(sample[\"natural_query\"].values[0])\n",
    "print(sample[\"sql_query\"].values[0])\n",
    "print(sample[\"result\"].values[0])\n",
    "\n",
    "# Run the model on the sampled question and print its raw output\n",
    "query_output = generate_sql(sample[\"natural_query\"].values[0])\n",
    "print(query_output)\n",
    "\n",
    "# Score the generated query against the ground truth\n",
    "valid, sql_matched, result_matched = compare_result(cursor, sample[\"sql_query\"].values[0], sample[\"result\"].values[0], query_output)\n",
    "print(\"Statement valid? \" + str(valid))\n",
    "print(\"SQLite matched? \" + str(sql_matched))\n",
    "print(\"Result matched? \" + str(result_matched))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create function to evaluate pretrained model on full datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_evaluation(nba_df, title):\n",
    "    \"\"\"Run the model on every row of `nba_df` and print aggregate metrics.\n",
    "\n",
    "    Args:\n",
    "        nba_df: DataFrame with `natural_query`, `sql_query` and `result` columns.\n",
    "        title: Label printed above the results.\n",
    "\n",
    "    Prints the fraction of rows for which `compare_result` reported a valid\n",
    "    statement, a SQL match and a result match. Uses the notebook-level `cursor`.\n",
    "    \"\"\"\n",
    "    num_valid = 0\n",
    "    num_sql_matched = 0\n",
    "    num_result_matched = 0\n",
    "    for counter, (_, row) in enumerate(nba_df.iterrows(), start=1):\n",
    "        # Generate a query for this question\n",
    "        query_output = generate_sql(row[\"natural_query\"])\n",
    "\n",
    "        # Evaluate model result\n",
    "        valid, sql_matched, result_matched = compare_result(cursor, row[\"sql_query\"], row[\"result\"], query_output)\n",
    "        if valid:\n",
    "            num_valid += 1\n",
    "        if sql_matched:\n",
    "            num_sql_matched += 1\n",
    "        if result_matched:\n",
    "            num_result_matched += 1\n",
    "\n",
    "        # Report progress every 50 examples\n",
    "        if counter % 50 == 0:\n",
    "            print(\"Completed \" + str(counter))\n",
    "\n",
    "    # Print evaluation results\n",
    "    total = len(nba_df)\n",
    "    print(\"\\n\" + title + \" results:\")\n",
    "    if total == 0:\n",
    "        # Nothing to average over; avoid ZeroDivisionError on an empty dataset\n",
    "        print(\"No examples to evaluate\")\n",
    "        return\n",
    "    print(\"Percent valid: \" + str(num_valid / total))\n",
    "    print(\"Percent SQLite matched: \" + str(num_sql_matched / total))\n",
    "    print(\"Percent result matched: \" + str(num_result_matched / total))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on less than 90 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed 50\n",
      "Completed 100\n",
      "Completed 150\n",
      "Completed 200\n",
      "\n",
      "Less than 90 results:\n",
      "Percent valid: 0.8734693877551021\n",
      "Percent SQLite matched: 0.4448979591836735\n",
      "Percent result matched: 0.6979591836734694\n",
      "Dataset length: 245\n"
     ]
    }
   ],
   "source": [
    "less_than_90_df = pd.read_csv(get_path(\"train-data/less_than_90.tsv\"), sep='\\t')\n",
    "run_evaluation(less_than_90_df, \"Less than 90\")\n",
    "print(\"Dataset length: \" + str(len(less_than_90_df)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on game table queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "game_queries = pd.read_csv(get_path(\"train-data/queries_from_game.tsv\"), sep='\\t')\n",
    "run_evaluation(game_queries, \"Queries from game\")\n",
    "print(\"Dataset length: \" + str(len(game_queries)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on other stats queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "other_stats_queries = pd.read_csv(get_path(\"train-data/queries_from_other_stats.tsv\"), sep='\\t')\n",
    "run_evaluation(other_stats_queries, \"Queries from other stats\")\n",
    "print(\"Dataset length: \" + str(len(other_stats_queries)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on team queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "team_queries = pd.read_csv(get_path(\"train-data/queries_from_team.tsv\"), sep='\\t')\n",
    "run_evaluation(team_queries, \"Queries from team\")\n",
    "print(\"Dataset length: \" + str(len(team_queries)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on queries requiring join statements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join_queries = pd.read_csv(get_path(\"train-data/with_join.tsv\"), sep='\\t')\n",
    "run_evaluation(join_queries, \"Queries with join\")\n",
    "print(\"Dataset length: \" + str(len(join_queries)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on queries not requiring join statements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "no_join_queries = pd.read_csv(get_path(\"train-data/without_join.tsv\"), sep='\\t')\n",
    "run_evaluation(no_join_queries, \"Queries without join\")\n",
    "print(\"Dataset length: \" + str(len(no_join_queries)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on full training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed 50\n",
      "Completed 100\n",
      "Completed 150\n",
      "Completed 200\n",
      "Completed 250\n",
      "Completed 300\n",
      "Completed 350\n",
      "Completed 400\n",
      "Completed 450\n",
      "Completed 500\n",
      "Completed 550\n",
      "Completed 600\n",
      "Completed 650\n",
      "Completed 700\n",
      "Completed 750\n",
      "Completed 800\n",
      "Completed 850\n",
      "Completed 900\n",
      "Completed 950\n",
      "Completed 1000\n",
      "\n",
      "All training data results:\n",
      "Percent valid: 0.7097701149425287\n",
      "Percent SQLite matched: 0.14367816091954022\n",
      "Percent result matched: 0.3668582375478927\n",
      "Dataset length: 1044\n"
     ]
    }
   ],
   "source": [
    "# Run evaluation on all training data\n",
    "run_evaluation(df, \"All training data\")\n",
    "print(\"Dataset length: \" + str(len(df)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}