{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run pre-trained DeepSeek Coder 1.3B Model on Chat-GPT 4o generated dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "import torch\n",
    "import sys\n",
    "import os\n",
    "import sqlite3 as sql\n",
    "from sql_metadata import Parser\n",
    "from huggingface_hub import snapshot_download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toggle for running on Google Colab (downloads the repo snapshot) vs. locally.\n",
    "is_google_colab = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "current_path = \"./\"\n",
    "\n",
    "def get_path(rel_path):\n",
    "    \"\"\"Resolve a path relative to the project root (local dir or HF snapshot).\"\"\"\n",
    "    return os.path.join(current_path, rel_path)\n",
    "\n",
    "if is_google_colab:\n",
    "    hugging_face_path = snapshot_download(\n",
    "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
    "        repo_type=\"model\",\n",
    "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"],\n",
    "    )\n",
    "    sys.path.append(hugging_face_path)\n",
    "    current_path = hugging_face_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here (not in the top import cell) because sys.path may only be\n",
    "# extended by the Colab setup cell above.\n",
    "from src.prompts.pre_rag_prompt import input_text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First load dataset into pandas dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total dataset examples: 1044\n",
      "\n",
      "\n",
      "Which game had the lowest combined score when the Philadelphia 76ers played in the 2019 season?\n",
      "SELECT game_id, (pts_home + pts_away) AS total_points FROM game WHERE season_id = '22019' AND (team_abbreviation_home = 'PHI' OR team_abbreviation_away = 'PHI') ORDER BY total_points ASC LIMIT 1;\n",
      "0021900630 | 177.0\n"
     ]
    }
   ],
   "source": [
    "# Load dataset and check length\n",
    "df = pd.read_csv(get_path(\"train-data/sql_train.tsv\"), sep=\"\\t\")\n",
    "print(\"Total dataset examples: \" + str(len(df)))\n",
    "print(\"\\n\")\n",
    "\n",
    "# Test sampling\n",
    "sample = df.sample(n=1)\n",
    "print(sample[\"natural_query\"].values[0])\n",
    "print(sample[\"sql_query\"].values[0])\n",
    "print(sample[\"result\"].values[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load pre-trained DeepSeek model using transformers and pytorch packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set device to cuda if available, otherwise CPU\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# Load model and tokenizer. get_path() already resolves correctly both locally\n",
    "# (current_path == \"./\") and on Colab, so a single code path replaces the\n",
    "# previous duplicated if/else whose branches were identical.\n",
    "model_path = get_path(\"deepseek-coder-1.3b-instruct\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device)\n",
    "model.generation_config.pad_token_id = tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test model performance on a single example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Response:\n",
      "game\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Create message with sample query and run model.\n",
    "# do_sample=False means greedy decoding; top_k/top_p are ignored in that mode,\n",
    "# so they are omitted to avoid misleading settings and generation warnings.\n",
    "message = [{'role': 'user', 'content': input_text + sample[\"natural_query\"].values[0]}]\n",
    "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
    "outputs = model.generate(inputs, max_new_tokens=512, do_sample=False,\n",
    "                         num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
    "\n",
    "# Decode only the newly generated tokens (skip the prompt)\n",
    "query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
    "print(query_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test sample output on sqlite3 database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create connection to sqlite3 database\n",
    "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
    "cursor = connection.cursor()\n",
    "\n",
    "# Strip the optional \"SQLite:\" / \"SQL:\" prefix the model sometimes emits\n",
    "if query_output.startswith(\"SQLite:\"):\n",
    "    print(\"cleaned\")\n",
    "    query = query_output[len(\"SQLite:\"):]\n",
    "elif query_output.startswith(\"SQL:\"):\n",
    "    query = query_output[len(\"SQL:\"):]\n",
    "else:\n",
    "    query = query_output\n",
    "\n",
    "# Execute query from model output and print result.\n",
    "# Report failures instead of silently swallowing them (was a bare `except: pass`),\n",
    "# and always release the connection.\n",
    "try:\n",
    "    cursor.execute(query)\n",
    "    for row in cursor.fetchall():\n",
    "        print(row)\n",
    "except sql.Error as e:\n",
    "    print(f\"Query failed: {e}\")\n",
    "finally:\n",
    "    connection.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create function to compare output to ground truth result from examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Which team abbreviation belongs to the team based in Phoenix?\n",
      "SELECT abbreviation FROM team WHERE city = 'Phoenix';\n",
      "PHX\n",
      "\"team\"\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Obtain sample\n",
    "sample = df.sample(n=1)\n",
    "\n",
    "print(sample[\"natural_query\"].values[0])\n",
    "print(sample[\"sql_query\"].values[0])\n",
    "print(sample[\"result\"].values[0])\n",
    "\n",
    "# Create message with sample query and run model (greedy decoding, see above)\n",
    "message = [{'role': 'user', 'content': input_text + sample[\"natural_query\"].values[0]}]\n",
    "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
    "outputs = model.generate(inputs, max_new_tokens=512, do_sample=False,\n",
    "                         num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)\n",
    "\n",
    "# Print output\n",
    "query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
    "print(query_output)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create function to evaluate pretrained model on full datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_evaluation(nba_df):\n",
    "    \"\"\"Tag each example with the tables its ground-truth SQL references.\n",
    "\n",
    "    Parses `sql_query` of every row with sql_metadata.Parser, adds boolean\n",
    "    `team_flag`, `game_flag` and `other_stats_flag` columns to `nba_df`\n",
    "    (mutated in place), and writes the expanded frame to a TSV file.\n",
    "    \"\"\"\n",
    "    team_flags = []\n",
    "    game_flags = []\n",
    "    other_stats_flags = []\n",
    "    for _, row in nba_df.iterrows():\n",
    "        parser = Parser(row['sql_query'])\n",
    "        team_flags.append(\"team\" in parser.tables)\n",
    "        game_flags.append(\"game\" in parser.tables)\n",
    "        other_stats_flags.append(\"other_stats\" in parser.tables)\n",
    "    nba_df['team_flag'] = team_flags\n",
    "    nba_df['game_flag'] = game_flags\n",
    "    nba_df['other_stats_flag'] = other_stats_flags\n",
    "    # Fixed filename typo: \"paraser\" -> \"parser\"\n",
    "    nba_df.to_csv(get_path(\"expanded_data_parser.tsv\"), sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_evaluation(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "CSCI544",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}