# Notebook to evaluate ChatGPT Performance

In [None]:
import pandas as pd
import warnings
import sqlite3 as sql
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import snapshot_download
import sys
import os
import openai


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

# Keep a key that is already exported in the environment; only fall back to the
# "<key>" placeholder when none is set. Prefer exporting OPENAI_API_KEY outside
# the notebook over pasting a real key here, so the key is never saved with the
# notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "<key>"

## Set up path

In [2]:
# Toggle for the runtime environment: when True, a later cell downloads the
# project assets from the Hugging Face Hub and re-roots `current_path` there;
# when False, paths are resolved relative to the local working directory.
is_google_colab=False

In [3]:
# Root directory that every relative asset path is resolved against.
current_path = "./"


def get_path(rel_path):
    """Return ``rel_path`` joined onto the module-level ``current_path``.

    ``current_path`` is looked up at call time, so reassigning it later
    changes what subsequent calls return.
    """
    resolved = os.path.join(current_path, rel_path)
    return resolved

# On Colab, fetch the needed folders of the model repository from the
# Hugging Face Hub, then make the downloaded snapshot both the root used by
# get_path() and an import location for the project's `src` package.
if is_google_colab:
    hugging_face_path = snapshot_download(
        allow_patterns=[
            "src/*",
            "train-data/*",
            "deepseek-coder-1.3b-instruct/*",
            "nba-data/*",
        ],
        repo_type="model",
        repo_id="USC-Applied-NLP-Group/SQL-Generation",
    )
    current_path = hugging_face_path
    sys.path.append(hugging_face_path)

In [4]:
# Sanity check: display the path the SQLite database will be opened from.
get_path('nba-data/nba.sqlite')

'./nba-data/nba.sqlite'

In [5]:


# Silence all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Open the NBA SQLite database once; `cursor` is reused by the evaluation loop
# (adjust the DB path as needed).
connection = sql.connect(get_path("nba-data/nba.sqlite"))
cursor = connection.cursor()

# Read the tab-separated training examples and report how many were loaded.
df = pd.read_csv(get_path("train-data/expanded_sql_train.tsv"), sep="\t")
print(f"Total dataset examples: {len(df)}")
print("\n")

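# ------------------------------
# No local tokenizer or model is loaded in this notebook:
# queries are generated through the OpenAI API client created further down.
# ------------------------------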



Total dataset examples: 1044




## Import compare result function for evaluation process

In [6]:
from src.evaluation.compare_result import compare_result
from src.rag.table_retriever import retrieve_doc

## Create evaluation loop for ChatGPT

In [8]:
from openai import OpenAI
# Shared API client used by run_evaluation. It is built with no arguments, so it
# presumably reads the key from the OPENAI_API_KEY environment variable set in an
# earlier cell -- confirm against the OpenAI client documentation.
client = OpenAI()

In [9]:
# ------------------------------
# Function to evaluate the model on a given dataset
# ------------------------------

from src.prompts.prompt import input_text
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def run_evaluation(nba_df, title, model="gpt-4-turbo", max_examples=None):
    """Generate a query for every row with a chat model and score it.

    For each row, the prompt ``input_text + row["natural_query"]`` is sent as a
    single user message to ``client.chat.completions.create``. The raw reply is
    handed to ``compare_result`` together with the module-level ``cursor``,
    ``row["sql_query"]`` and ``row["result"]``. The three flags it returns
    (valid, SQL matched, result matched) are tallied and a summary is printed.

    Args:
        nba_df: DataFrame with ``natural_query``, ``sql_query`` and ``result``
            columns.
        title: Label printed in front of the summary.
        model: Chat model name passed to the API. Defaults to the value that
            was previously hard-coded.
        max_examples: Optional cap on the number of rows evaluated; ``None``
            (the default) evaluates every row.

    Returns:
        dict with the raw counters (``num_tested``, ``num_valid``,
        ``num_sql_matched``, ``num_result_matched``) and ``accuracy``
        (result matches as a percentage of the rows actually tested).
    """
    counter = 0
    num_valid = 0
    num_sql_matched = 0
    num_result_matched = 0
    total = len(nba_df)

    for _, row in nba_df.iterrows():
        # Optional early stop, useful for cheap smoke runs.
        if max_examples is not None and counter >= max_examples:
            break

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": input_text + row["natural_query"]}
            ],
        )

        # The reply's content can be None (see the OpenAI API reference);
        # treat that as an empty generation instead of crashing mid-run.
        generated_query = response.choices[0].message.content or ""

        # The raw model output is passed through unchanged, as before: the old
        # prefix/semicolon clean-up produced a value that was never used.
        valid, sql_matched, result_matched = compare_result(
            cursor, row["sql_query"], row["result"], generated_query
        )

        if valid:
            num_valid += 1
        if sql_matched:
            num_sql_matched += 1
        if result_matched:
            num_result_matched += 1

        counter += 1

        if counter % 50 == 0:
            print("Completed " + str(counter))

    # The "Percent ..." lines are fractions of the whole dataset, while "Acc" is
    # a percentage of the rows actually tested; both are 0.0 for an empty run
    # instead of raising ZeroDivisionError.
    accuracy = _safe_ratio(num_result_matched, counter) * 100

    print("\n" + title + " results:")
    print("Percent valid: " + str(_safe_ratio(num_valid, total)))
    print("Percent SQLite matched: " + str(_safe_ratio(num_sql_matched, total)))
    print("Percent result matched: " + str(_safe_ratio(num_result_matched, total)))
    print("Dataset length: " + str(total))
    print("-------------------")
    print("Num queries tested: ", counter)
    print("Num correct queries: ", num_result_matched)
    print("Acc: ", accuracy)
    print("-------------------")

    return {
        "num_tested": counter,
        "num_valid": num_valid,
        "num_sql_matched": num_sql_matched,
        "num_result_matched": num_result_matched,
        "accuracy": accuracy,
    }
    

In [17]:
def run(nba_df, title):
    """Print the ``natural_query`` value of every row in ``nba_df``.

    Args:
        nba_df: DataFrame whose rows have a ``natural_query`` entry.
        title: Unused; kept so the signature stays compatible with existing
            callers.
    """
    for _, row in nba_df.iterrows():
        print(row['natural_query'])

## Run ChatGPT evaluation

In [10]:
# Score the model on every example of the training set, then echo the
# dataset size once more after the summary.
run_evaluation(df, "All training data")
print(f"Dataset length: {len(df)}")

Completed 50
Completed 100
Completed 150
Completed 200
Completed 250
Completed 300
Completed 350
Completed 400
Completed 450
Completed 500
Completed 550
Completed 600
Completed 650
Completed 700
Completed 750
Completed 800
Completed 850
Completed 900
Completed 950
Completed 1000

All training data results:
Percent valid: 0.9521072796934866
Percent SQLite matched: 0.2260536398467433
Percent result matched: 0.7758620689655172
Dataset length: 1044
-------------------
Num queries tested:  1044
Num correct queries:  810
Acc:  77.58620689655173
-------------------
Dataset length: 1044


## Run ChatGPT evaluation on small query dataset

In [None]:
# Load the "less than 90" subset (tab-separated) and evaluate it the same way
# as the full training set.
less_than_90_df = pd.read_csv(
    get_path("train-data/less_than_90.tsv"),
    sep="\t",
)
run_evaluation(less_than_90_df, "Less than 90")
print(f"Dataset length: {len(less_than_90_df)}")

Completed 50
Completed 100
Completed 150
Completed 200

Less than 90 results:
Percent valid: 0.8979591836734694
Percent SQLite matched: 0.37551020408163266
Percent result matched: 0.7061224489795919
Dataset length: 245
-------------------
Num queries tested:  245
Num correct queries:  173
Acc:  70.61224489795919
-------------------
Dataset length: 245
