# Run pre-trained DeepSeek Coder 1.3B model on a ChatGPT-4o generated dataset

In [11]:
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import sys
import os
import sqlite3 as sql
from sql_metadata import Parser
from huggingface_hub import snapshot_download

In [4]:
# Environment switch: set to True when running on Google Colab.
is_google_colab = False

In [5]:
# Base directory that relative project paths are resolved against.
current_path = "./"


def get_path(rel_path):
    """Return ``rel_path`` joined onto the module-level ``current_path``.

    ``current_path`` is read when the function is called, so reassigning it
    later changes what subsequent calls return.
    """
    base_dir = current_path
    return os.path.join(base_dir, rel_path)

# When running on Colab, download the listed folders from the Hugging Face Hub
# and resolve every project path against that snapshot directory.
if is_google_colab:
    hugging_face_path = snapshot_download(
        repo_type="model",
        repo_id="USC-Applied-NLP-Group/SQL-Generation",
        allow_patterns=[
            "src/*",
            "train-data/*",
            "deepseek-coder-1.3b-instruct/*",
            "nba-data/*",
        ],
    )
    # get_path() now resolves inside the snapshot, and the snapshot root is
    # added to the import search path.
    current_path = hugging_face_path
    sys.path.append(hugging_face_path)

In [6]:
from src.prompts.pre_rag_prompt import input_text

## First load dataset into pandas dataframe

In [7]:
# Read the tab-separated training set and report how many examples it holds
df = pd.read_csv(get_path("train-data/sql_train.tsv"), sep="\t")
print("Total dataset examples: ", len(df), sep="")
print("\n")

# Draw one random example and show its question, reference SQL and stored result
sample = df.sample(n=1)
for column in ("natural_query", "sql_query", "result"):
    print(sample[column].values[0])

Total dataset examples: 1044


Which game had the lowest combined score when the Philadelphia 76ers played in the 2019 season?
SELECT game_id, (pts_home + pts_away) AS total_points FROM game WHERE season_id = '22019' AND (team_abbreviation_home = 'PHI' OR team_abbreviation_away = 'PHI') ORDER BY total_points ASC LIMIT 1;
0021900630 | 177.0


## Load pre-trained DeepSeek model using transformers and pytorch packages

In [8]:
# Set device to cuda if available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Resolve the checkpoint directory through get_path in every environment.
# Locally current_path is "./", so this is "./deepseek-coder-1.3b-instruct";
# on Colab current_path points at the downloaded snapshot instead.
model_path = get_path("deepseek-coder-1.3b-instruct")

# Load model and tokenizer (bfloat16 weights placed directly on `device`)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device)

# Make generation use the tokenizer's pad token id
model.generation_config.pad_token_id = tokenizer.pad_token_id

## Test model performance on a single example

In [9]:
# Create message with sample query and run model
question = sample["natural_query"].values[0]
message = [{'role': 'user', 'content': input_text + question}]
inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(model.device)

# Greedy decoding (do_sample=False). top_k / top_p only affect sampling-based
# generation, so they are not passed here.
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

# Print output: decode only the tokens generated after the prompt
prompt_length = len(inputs[0])
query_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(query_output)

Response:
game



## Test sample output on sqlite3 database

In [None]:
# Create connection to sqlite3 database
connection = sql.connect(get_path('nba-data/nba.sqlite'))
cursor = connection.cursor()

# Strip a leading "SQLite:" / "SQL:" label from the model output, if present
if query_output.startswith("SQLite:"):
    print("cleaned")
    query = query_output[len("SQLite:"):]
elif query_output.startswith("SQL:"):
    query = query_output[len("SQL:"):]
else:
    query = query_output

# Execute query from model output and print result
try:
    cursor.execute(query)
    for row in cursor.fetchall():
        print(row)
except (sql.Error, sql.Warning) as error:
    # The generated text may not be valid SQL. Report the failure instead of
    # silently ignoring it, but keep the notebook running. sql.Warning is
    # caught as well because it is not a subclass of sql.Error.
    print(f"Query failed: {error}")

## Create function to compare output to ground truth result from examples

In [None]:
# Obtain sample
sample = df.sample(n=1)

# Show the question, the reference SQL and the stored ground-truth result
for column in ("natural_query", "sql_query", "result"):
    print(sample[column].values[0])

# Create message with sample query and run model
question = sample["natural_query"].values[0]
message = [{'role': 'user', 'content': input_text + question}]
inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to(model.device)

# Greedy decoding (do_sample=False). top_k / top_p only affect sampling-based
# generation, so they are not passed here.
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

# Print output: decode only the tokens generated after the prompt
prompt_length = len(inputs[0])
query_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(query_output)


Which team abbreviation belongs to the team based in Phoenix?
SELECT abbreviation FROM team WHERE city = 'Phoenix';
PHX
"team"



## Create function to evaluate pretrained model on full datasets

In [None]:
def run_evaluation(nba_df):
    """Flag which tables each reference SQL query uses and save the result.

    Every entry of ``nba_df['sql_query']`` is parsed with ``Parser`` and, for
    each of the tables ``team``, ``game`` and ``other_stats``, a boolean is
    recorded saying whether that name appears in the parser's ``tables``.
    The flags are stored on ``nba_df`` itself as the columns ``team_flag``,
    ``game_flag`` and ``other_stats_flag``; the frame is then written as
    tab-separated text (without the index) to ``expanded_data_paraser.tsv``,
    resolved through ``get_path``.

    NOTE: despite the name, no model inference is performed here.

    Args:
        nba_df: DataFrame with a ``sql_query`` column; modified in place.

    Returns:
        None.
    """
    tracked_tables = ("team", "game", "other_stats")
    flags = {table: [] for table in tracked_tables}

    for sql_text in nba_df['sql_query']:
        # Parse once per query and reuse the table list for all three checks.
        referenced_tables = Parser(sql_text).tables
        for table in tracked_tables:
            flags[table].append(table in referenced_tables)

    nba_df['team_flag'] = flags["team"]
    nba_df['game_flag'] = flags["game"]
    nba_df['other_stats_flag'] = flags["other_stats"]
    nba_df.to_csv(get_path("expanded_data_paraser.tsv"), sep="\t", index=False)
 


In [18]:
# Add the table-usage flag columns to df (in place) and write the expanded TSV
run_evaluation(df)