Notebooks for GPT evaluation

Browse files

Files changed (5) hide show

__pycache__/rag_metadata.cpython-311.pyc +0 -0
chat_gpt_3.5.ipynb +424 -0
chat_gpt_4.ipynb +435 -0
src/evaluation/__pycache__/compare_result.cpython-311.pyc +0 -0
src/rag/__pycache__/table_retriever.cpython-311.pyc +0 -0

__pycache__/rag_metadata.cpython-311.pyc ADDED Viewed

Binary file (3.73 kB). View file

chat_gpt_3.5.ipynb ADDED Viewed

	@@ -0,0 +1,424 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cf4403ec",
+   "metadata": {},
+   "source": [
+    "# Notebook to evaluate ChatGPT Performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f708eaa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import warnings\n",
+    "import sqlite3 as sql\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "from huggingface_hub import snapshot_download\n",
+    "import sys\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83a1bd00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# Never store a real key in the notebook: reuse the key already exported in the\n",
+    "# environment and only prompt (without echo) when it is missing.\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3a647bf",
+   "metadata": {},
+   "source": [
+    "## Set up path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "996e282d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_google_colab=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5d96087b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_path = \"./\"\n",
+    "\n",
+    "def get_path(rel_path):\n",
+    "    return os.path.join(current_path, rel_path)\n",
+    "\n",
+    "if is_google_colab:\n",
+    "    hugging_face_path = snapshot_download(\n",
+    "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
+    "        repo_type=\"model\",  \n",
+    "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"], \n",
+    "    )\n",
+    "    sys.path.append(hugging_face_path)\n",
+    "    current_path = hugging_face_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "483da9f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'./nba-data/nba.sqlite'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_path('nba-data/nba.sqlite')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "5cc9f19f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total dataset examples: 1044\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "# Establish a database connection once (adjust the DB path as needed)\n",
+    "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
+    "cursor = connection.cursor()\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load dataset and print summary\n",
+    "# ------------------------------\n",
+    "df = pd.read_csv(get_path(\"train-data/expanded_sql_train.tsv\"), sep='\\t')\n",
+    "print(\"Total dataset examples: \" + str(len(df)))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load tokenizer and model\n",
+    "# ------------------------------\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2d859d8",
+   "metadata": {},
+   "source": [
+    "## Import the compare_result function used by the evaluation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "a5295234",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.evaluation.compare_result import compare_result\n",
+    "from src.rag.table_retriever import retrieve_doc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a89a468",
+   "metadata": {},
+   "source": [
+    "## Create evaluation loop for ChatGPT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "e580dda8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "69707ee7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ------------------------------\n",
+    "# Function to evaluate the model on a given dataset\n",
+    "# ------------------------------\n",
+    "\n",
+    "from src.prompts.prompt import input_text\n",
+    "def run_evaluation(nba_df, title, model=\"gpt-3.5-turbo\"):\n",
+    "    \"\"\"Query an OpenAI chat model once per dataset row and print accuracy metrics.\n",
+    "\n",
+    "    Each row's question is appended to the shared prompt (`input_text`), sent to\n",
+    "    `model`, and the raw reply is scored by `compare_result` against the row's\n",
+    "    ground-truth query and expected result.\n",
+    "\n",
+    "    Args:\n",
+    "        nba_df: DataFrame with `natural_query`, `sql_query` and `result` columns.\n",
+    "        title: Label printed above the summary.\n",
+    "        model: OpenAI chat model name; defaults to the model this notebook evaluates.\n",
+    "\n",
+    "    Returns:\n",
+    "        None. Progress (every 50 rows) and the summary are printed.\n",
+    "    \"\"\"\n",
+    "    total = len(nba_df)\n",
+    "    if total == 0:\n",
+    "        # Nothing to score; the ratios below would divide by zero.\n",
+    "        print(\"\\n\" + title + \" results: dataset is empty, nothing to evaluate\")\n",
+    "        return\n",
+    "\n",
+    "    counter = 0\n",
+    "    num_valid = 0\n",
+    "    num_sql_matched = 0\n",
+    "    num_result_matched = 0\n",
+    "    for _, row in nba_df.iterrows():\n",
+    "        # No retrieval step: the fixed prompt plus the question goes straight to the API.\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=model,\n",
+    "            messages=[\n",
+    "                {\"role\": \"user\", \"content\": input_text + row[\"natural_query\"]}\n",
+    "            ],\n",
+    "        )\n",
+    "\n",
+    "        # `content` may be None (e.g. a refusal); score that as an empty answer\n",
+    "        # instead of aborting the whole run.\n",
+    "        generated_query = response.choices[0].message.content or \"\"\n",
+    "\n",
+    "        # compare_result receives the raw reply, as it always did; the local\n",
+    "        # prefix/semicolon clean-up that used to precede this call never fed into it.\n",
+    "        valid, sql_matched, result_matched = compare_result(cursor, row[\"sql_query\"], row[\"result\"], generated_query)\n",
+    "\n",
+    "        if valid:\n",
+    "            num_valid += 1\n",
+    "        if sql_matched:\n",
+    "            num_sql_matched += 1\n",
+    "        if result_matched:\n",
+    "            num_result_matched += 1\n",
+    "\n",
+    "        counter += 1\n",
+    "        if counter % 50 == 0:\n",
+    "            print(\"Completed \" + str(counter))\n",
+    "\n",
+    "    # The first three figures are fractions of the whole dataset (0-1), not percentages.\n",
+    "    print(\"\\n\" + title + \" results:\")\n",
+    "    print(\"Percent valid: \" + str(num_valid / total))\n",
+    "    print(\"Percent SQLite matched: \" + str(num_sql_matched / total))\n",
+    "    print(\"Percent result matched: \" + str(num_result_matched / total))\n",
+    "    print(\"Dataset length: \" + str(total))\n",
+    "    print(\"-------------------\")\n",
+    "    print(\"Num queries tested: \", counter)\n",
+    "    print(\"Num correct queries: \", num_result_matched)\n",
+    "    print(\"Acc: \", (num_result_matched / counter)*100)\n",
+    "    print(\"-------------------\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0c3fdc3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run(nba_df, title):\n",
+    "    \"\"\"Debug helper: print the `natural_query` value of every row in `nba_df`.\n",
+    "\n",
+    "    `title` is unused; it is kept so existing calls keep working.\n",
+    "    \"\"\"\n",
+    "    for _, row in nba_df.iterrows():\n",
+    "        print(row['natural_query'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bff68e0",
+   "metadata": {},
+   "source": [
+    "## Run ChatGPT evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "ce291e30",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "Completed 250\n",
+      "Completed 300\n",
+      "Completed 350\n",
+      "Completed 400\n",
+      "Completed 450\n",
+      "Completed 500\n",
+      "Completed 550\n",
+      "Completed 600\n",
+      "Completed 650\n",
+      "Completed 700\n",
+      "Completed 750\n",
+      "Completed 800\n",
+      "Completed 850\n",
+      "Completed 900\n",
+      "Completed 950\n",
+      "Completed 1000\n",
+      "\n",
+      "All training data results:\n",
+      "Percent valid: 0.8630268199233716\n",
+      "Percent SQLite matched: 0.20114942528735633\n",
+      "Percent result matched: 0.6293103448275862\n",
+      "Dataset length: 1044\n",
+      "-------------------\n",
+      "Num queries tested:  1044\n",
+      "Num correct queries:  657\n",
+      "Acc:  62.93103448275862\n",
+      "-------------------\n",
+      "Dataset length: 1044\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ------------------------------\n",
+    "# Run evaluation on the full training dataset\n",
+    "# ------------------------------\n",
+    "run_evaluation(df, \"All training data\")\n",
+    "print(\"Dataset length: \" + str(len(df)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b21994fa",
+   "metadata": {},
+   "source": [
+    "## Run ChatGPT evaluation on small query dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2d12248",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "\n",
+      "Less than 90 results:\n",
+      "Percent valid: 0.8979591836734694\n",
+      "Percent SQLite matched: 0.37551020408163266\n",
+      "Percent result matched: 0.7061224489795919\n",
+      "Dataset length: 245\n",
+      "-------------------\n",
+      "Num queries tested:  245\n",
+      "Num correct queries:  173\n",
+      "Acc:  70.61224489795919\n",
+      "-------------------\n",
+      "Dataset length: 245\n"
+     ]
+    }
+   ],
+   "source": [
+    "less_than_90_df = pd.read_csv(get_path(\"train-data/less_than_90.tsv\"), sep='\\t')\n",
+    "run_evaluation(less_than_90_df, \"Less than 90\")\n",
+    "print(\"Dataset length: \" + str(len(less_than_90_df)))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CSCI544",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

chat_gpt_4.ipynb ADDED Viewed

	@@ -0,0 +1,435 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cf4403ec",
+   "metadata": {},
+   "source": [
+    "# Notebook to evaluate ChatGPT Performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f708eaa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/CSCI544/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import warnings\n",
+    "import sqlite3 as sql\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "from huggingface_hub import snapshot_download\n",
+    "import sys\n",
+    "import os\n",
+    "import openai\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83a1bd00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# Never store a real key in the notebook: reuse the key already exported in the\n",
+    "# environment and only prompt (without echo) when it is missing.\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3a647bf",
+   "metadata": {},
+   "source": [
+    "## Set up path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "996e282d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_google_colab=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5d96087b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_path = \"./\"\n",
+    "\n",
+    "def get_path(rel_path):\n",
+    "    return os.path.join(current_path, rel_path)\n",
+    "\n",
+    "if is_google_colab:\n",
+    "    hugging_face_path = snapshot_download(\n",
+    "        repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n",
+    "        repo_type=\"model\",  \n",
+    "        allow_patterns=[\"src/*\", \"train-data/*\", \"deepseek-coder-1.3b-instruct/*\", \"nba-data/*\"], \n",
+    "    )\n",
+    "    sys.path.append(hugging_face_path)\n",
+    "    current_path = hugging_face_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "483da9f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'./nba-data/nba.sqlite'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_path('nba-data/nba.sqlite')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5cc9f19f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total dataset examples: 1044\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "# Establish a database connection once (adjust the DB path as needed)\n",
+    "connection = sql.connect(get_path('nba-data/nba.sqlite'))\n",
+    "cursor = connection.cursor()\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load dataset and print summary\n",
+    "# ------------------------------\n",
+    "df = pd.read_csv(get_path(\"train-data/expanded_sql_train.tsv\"), sep='\\t')\n",
+    "print(\"Total dataset examples: \" + str(len(df)))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# ------------------------------\n",
+    "# Load tokenizer and model\n",
+    "# ------------------------------\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2d859d8",
+   "metadata": {},
+   "source": [
+    "## Import the compare_result function used by the evaluation process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a5295234",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.evaluation.compare_result import compare_result\n",
+    "from src.rag.table_retriever import retrieve_doc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a89a468",
+   "metadata": {},
+   "source": [
+    "## Create evaluation loop for ChatGPT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e580dda8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "69707ee7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ------------------------------\n",
+    "# Function to evaluate the model on a given dataset\n",
+    "# ------------------------------\n",
+    "\n",
+    "from src.prompts.prompt import input_text\n",
+    "def run_evaluation(nba_df, title, model=\"gpt-4-turbo\"):\n",
+    "    \"\"\"Query an OpenAI chat model once per dataset row and print accuracy metrics.\n",
+    "\n",
+    "    Each row's question is appended to the shared prompt (`input_text`), sent to\n",
+    "    `model`, and the raw reply is scored by `compare_result` against the row's\n",
+    "    ground-truth query and expected result.\n",
+    "\n",
+    "    Args:\n",
+    "        nba_df: DataFrame with `natural_query`, `sql_query` and `result` columns.\n",
+    "        title: Label printed above the summary.\n",
+    "        model: OpenAI chat model name; defaults to the model this notebook evaluates.\n",
+    "\n",
+    "    Returns:\n",
+    "        None. Progress (every 50 rows) and the summary are printed.\n",
+    "    \"\"\"\n",
+    "    total = len(nba_df)\n",
+    "    if total == 0:\n",
+    "        # Nothing to score; the ratios below would divide by zero.\n",
+    "        print(\"\\n\" + title + \" results: dataset is empty, nothing to evaluate\")\n",
+    "        return\n",
+    "\n",
+    "    counter = 0\n",
+    "    num_valid = 0\n",
+    "    num_sql_matched = 0\n",
+    "    num_result_matched = 0\n",
+    "    for _, row in nba_df.iterrows():\n",
+    "        # No retrieval step: the fixed prompt plus the question goes straight to the API.\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=model,\n",
+    "            messages=[\n",
+    "                {\"role\": \"user\", \"content\": input_text + row[\"natural_query\"]}\n",
+    "            ],\n",
+    "        )\n",
+    "\n",
+    "        # `content` may be None (e.g. a refusal); score that as an empty answer\n",
+    "        # instead of aborting the whole run.\n",
+    "        generated_query = response.choices[0].message.content or \"\"\n",
+    "\n",
+    "        # compare_result receives the raw reply, as it always did; the local\n",
+    "        # prefix/semicolon clean-up that used to precede this call never fed into it.\n",
+    "        valid, sql_matched, result_matched = compare_result(cursor, row[\"sql_query\"], row[\"result\"], generated_query)\n",
+    "\n",
+    "        if valid:\n",
+    "            num_valid += 1\n",
+    "        if sql_matched:\n",
+    "            num_sql_matched += 1\n",
+    "        if result_matched:\n",
+    "            num_result_matched += 1\n",
+    "\n",
+    "        counter += 1\n",
+    "        if counter % 50 == 0:\n",
+    "            print(\"Completed \" + str(counter))\n",
+    "\n",
+    "    # The first three figures are fractions of the whole dataset (0-1), not percentages.\n",
+    "    print(\"\\n\" + title + \" results:\")\n",
+    "    print(\"Percent valid: \" + str(num_valid / total))\n",
+    "    print(\"Percent SQLite matched: \" + str(num_sql_matched / total))\n",
+    "    print(\"Percent result matched: \" + str(num_result_matched / total))\n",
+    "    print(\"Dataset length: \" + str(total))\n",
+    "    print(\"-------------------\")\n",
+    "    print(\"Num queries tested: \", counter)\n",
+    "    print(\"Num correct queries: \", num_result_matched)\n",
+    "    print(\"Acc: \", (num_result_matched / counter)*100)\n",
+    "    print(\"-------------------\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0c3fdc3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run(nba_df, title):\n",
+    "    \"\"\"Debug helper: print the `natural_query` value of every row in `nba_df`.\n",
+    "\n",
+    "    `title` is unused; it is kept so existing calls keep working.\n",
+    "    \"\"\"\n",
+    "    for _, row in nba_df.iterrows():\n",
+    "        print(row['natural_query'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bff68e0",
+   "metadata": {},
+   "source": [
+    "## Run ChatGPT evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ce291e30",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "Completed 250\n",
+      "Completed 300\n",
+      "Completed 350\n",
+      "Completed 400\n",
+      "Completed 450\n",
+      "Completed 500\n",
+      "Completed 550\n",
+      "Completed 600\n",
+      "Completed 650\n",
+      "Completed 700\n",
+      "Completed 750\n",
+      "Completed 800\n",
+      "Completed 850\n",
+      "Completed 900\n",
+      "Completed 950\n",
+      "Completed 1000\n",
+      "\n",
+      "All training data results:\n",
+      "Percent valid: 0.9521072796934866\n",
+      "Percent SQLite matched: 0.2260536398467433\n",
+      "Percent result matched: 0.7758620689655172\n",
+      "Dataset length: 1044\n",
+      "-------------------\n",
+      "Num queries tested:  1044\n",
+      "Num correct queries:  810\n",
+      "Acc:  77.58620689655173\n",
+      "-------------------\n",
+      "Dataset length: 1044\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ------------------------------\n",
+    "# Run evaluation on the full training dataset\n",
+    "# ------------------------------\n",
+    "run_evaluation(df, \"All training data\")\n",
+    "print(\"Dataset length: \" + str(len(df)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b21994fa",
+   "metadata": {},
+   "source": [
+    "## Run ChatGPT evaluation on small query dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2d12248",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Completed 50\n",
+      "Completed 100\n",
+      "Completed 150\n",
+      "Completed 200\n",
+      "\n",
+      "Less than 90 results:\n",
+      "Percent valid: 0.8979591836734694\n",
+      "Percent SQLite matched: 0.37551020408163266\n",
+      "Percent result matched: 0.7061224489795919\n",
+      "Dataset length: 245\n",
+      "-------------------\n",
+      "Num queries tested:  245\n",
+      "Num correct queries:  173\n",
+      "Acc:  70.61224489795919\n",
+      "-------------------\n",
+      "Dataset length: 245\n"
+     ]
+    }
+   ],
+   "source": [
+    "less_than_90_df = pd.read_csv(get_path(\"train-data/less_than_90.tsv\"), sep='\\t')\n",
+    "run_evaluation(less_than_90_df, \"Less than 90\")\n",
+    "print(\"Dataset length: \" + str(len(less_than_90_df)))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CSCI544",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

src/evaluation/__pycache__/compare_result.cpython-311.pyc CHANGED Viewed

Binary files a/src/evaluation/__pycache__/compare_result.cpython-311.pyc and b/src/evaluation/__pycache__/compare_result.cpython-311.pyc differ

src/rag/__pycache__/table_retriever.cpython-311.pyc ADDED Viewed

Binary file (8.28 kB). View file