In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q accelerate datasets peft bitsandbytes

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, Trainer

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PromptTuningConfig

In [2]:
model_name = "defog/llama-3-sqlcoder-8b"

prompt_config = PromptTuningConfig(
    num_virtual_tokens=20,  # Number of prompt tokens to learn
    task_type="CAUSAL_LM",  # Causal language modeling for SQL generation
    tokenizer_name_or_path=model_name
)

tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")
model = get_peft_model(model, prompt_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



trainable params: 81,920 || all params: 8,030,343,168 || trainable%: 0.0010


In [3]:
import json
with open("syntheticTableData (1).json","r") as f: #SyntheticTableData (1) is the same as kristiannordby/text2sql121rows dataset in huggingface
    data = json.load(f)
untokenized_dataset = Dataset.from_list(data)

def preprocess_function(examples):
    inputs = tokenizer(examples["question"], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=512)
    labels["input_ids"] = [-100 if token == tokenizer.pad_token_id else token for token in labels["input_ids"]]
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels["input_ids"]}

ds = untokenized_dataset.map(preprocess_function, batched=True)
ds

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'query', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 121
})

In [10]:
import torch

question = "Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?"
expected_sql_query = """
SELECT make, model, mpg, totalMiles 
FROM cars 
WHERE modelYear = 2015 
AND sellPrice > 30000 
ORDER BY mpg DESC 
LIMIT 1;
"""

inputs = tokenizer(question, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to("cuda")

model.eval()

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,  # need to adjust so model does not get off track; or could pull sql from it later
        repetition_penalty=2.0,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,  # Use greedy decoding for deterministic output
    )


generated_sql_query = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Generated SQL: {generated_sql_query}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated SQL: Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?sonyoursite is there are you want to date:1.. Acura of which one! The answer will be a single line with three values separated by commas (e.g., "Toyota Prius Hybrid", "$35k - \$40K per year")." } { SELECT m.make AS Car_Model FROM cars c JOIN models ON CAST(c.model_id as integer) = id WHERE price > '30000' AND fuel_economy IS NOT NULL ORDER BY mileage DESC LIMIT 10;iвassistant

I apologize for any confusion earlier.

To clarify your question:

You're asking me about what I can do if someone else's code or data causes an error in my own program?

If that happens,

*   **Error Handling**: You should handle these errors properly using try-except blocks.
    * For example:
        ```
            import requests
                def get_data(url):
                    response=requests.get('https://api.example.com/data')
                        returnresponse

In [6]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  
    gradient_accumulation_steps=4, 
    num_train_epochs=50,  # More epochs for a small dataset
    learning_rate=5e-5, 
    eval_strategy="steps",
    eval_steps=20,
    save_steps=20,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    eval_dataset = ds, #use training dataset as eval dataset because of the small size of data
    tokenizer=tokenizer
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
trainer.train()

Step,Training Loss,Validation Loss
20,18.8606,18.779743
40,18.6314,18.560749
60,18.4588,18.344973
80,18.1362,18.13105
100,17.9729,17.917627
120,17.7269,17.709686
140,17.6052,17.50502
160,17.337,17.299978
180,17.1444,17.099331
200,16.9301,16.904736


TrainOutput(global_step=750, training_loss=15.830242533365885, metrics={'train_runtime': 2180.7907, 'train_samples_per_second': 2.774, 'train_steps_per_second': 0.344, 'total_flos': 1.3720107025327718e+17, 'train_loss': 15.830242533365885, 'epoch': 49.18032786885246})

In [11]:
import torch

question = "Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?"
expected_sql_query = """
SELECT make, model, mpg, totalMiles 
FROM cars 
WHERE modelYear = 2015 
AND sellPrice > 30000 
ORDER BY mpg DESC 
LIMIT 1;
"""

inputs = tokenizer(question, return_tensors="pt", padding="max_length", truncation=True, max_length=512).to("cuda")

model.eval()

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,  # Allow for sufficient token generation
        repetition_penalty=2.0,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,  # Use greedy decoding for deterministic output
    )


generated_sql_query = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Generated SQL: {generated_sql_query}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated SQL: Which car model from 2015 has the best miles-per-gallon, costs more than $30,000, and how many total miles has it driven?sonyoursite is there are you want to date:1.. Acura of which one! The answer will be a single line with three values separated by commas (e.g., "Toyota Prius Hybrid", "$35k - \$40K per year")." } { SELECT m.make AS Car_Model FROM cars c JOIN models ON CAST(c.model_id as integer) = id WHERE price > '30000' AND fuel_economy IS NOT NULL ORDER BY mileage DESC LIMIT 10;iвassistant

I apologize for any confusion earlier.

To clarify your question:

You're asking me about what I can do if someone else's code or data causes an error in my own program?

If that happens,

*   **Error Handling**: You should handle these errors properly using try-except blocks.
    * For example:
        ```
            import requests
                def get_data(url):
                    response=requests.get('https://api.example.com/data')
                        returnresponse

In [12]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
trainer.push_to_hub("prompttuned-sql-model")
# tokenizer.push_to_hub("./finetuned-sql-model")

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/328k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/kristiannordby/results/commit/f5914cc61b844fb247969b86343e21b71a1ddf72', commit_message='prompttuned-sql-model', commit_description='', oid='f5914cc61b844fb247969b86343e21b71a1ddf72', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kristiannordby/results', endpoint='https://huggingface.co', repo_type='model', repo_id='kristiannordby/results'), pr_revision=None, pr_num=None)

In [14]:
model.push_to_hub("prompttuned_model-sql-model")

adapter_model.safetensors:   0%|          | 0.00/328k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kristiannordby/prompttuned_model-sql-model/commit/454553f082f2bb2e23d126f7f14f81fcf59a33a9', commit_message='Upload model', commit_description='', oid='454553f082f2bb2e23d126f7f14f81fcf59a33a9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kristiannordby/prompttuned_model-sql-model', endpoint='https://huggingface.co', repo_type='model', repo_id='kristiannordby/prompttuned_model-sql-model'), pr_revision=None, pr_num=None)