Increasing LoRA rank to 32 and updating loss function
Browse files
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/README.md +0 -0
- fine-tuned-model/{checkpoint-354 → checkpoint-1121}/adapter_config.json +0 -0
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/adapter_model.safetensors +1 -1
- fine-tuned-model/{checkpoint-354 → checkpoint-1121}/optimizer.pt +1 -1
- fine-tuned-model/{checkpoint-354 → checkpoint-1121}/rng_state.pth +1 -1
- fine-tuned-model/{checkpoint-354 → checkpoint-1121}/scaler.pt +1 -1
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/scheduler.pt +1 -1
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/special_tokens_map.json +0 -0
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/tokenizer.json +0 -0
- fine-tuned-model/{checkpoint-295 → checkpoint-1121}/tokenizer_config.json +0 -0
- fine-tuned-model/checkpoint-1121/trainer_state.json +349 -0
- fine-tuned-model/{checkpoint-354 → checkpoint-1121}/training_args.bin +1 -1
- fine-tuned-model/checkpoint-295/trainer_state.json +0 -109
- fine-tuned-model/checkpoint-354/trainer_state.json +0 -107
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/README.md +0 -0
- fine-tuned-model/{checkpoint-295 → checkpoint-590}/adapter_config.json +2 -2
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/adapter_model.safetensors +2 -2
- fine-tuned-model/{checkpoint-295 → checkpoint-590}/optimizer.pt +2 -2
- fine-tuned-model/{checkpoint-295 → checkpoint-590}/rng_state.pth +1 -1
- fine-tuned-model/{checkpoint-295 → checkpoint-590}/scaler.pt +1 -1
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/scheduler.pt +1 -1
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/special_tokens_map.json +0 -0
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/tokenizer.json +0 -0
- fine-tuned-model/{checkpoint-354 → checkpoint-590}/tokenizer_config.json +0 -0
- fine-tuned-model/checkpoint-590/trainer_state.json +200 -0
- fine-tuned-model/{checkpoint-295 → checkpoint-590}/training_args.bin +1 -1
- fine-tuned-model/model.safetensors +1 -1
- fine-tuned-model/runs/Apr03_21-04-00_DESKTOP-SMJC97K/events.out.tfevents.1743739440.DESKTOP-SMJC97K.13648.0 +3 -0
- fine-tuned-model/runs/Apr04_09-11-28_DESKTOP-SMJC97K/events.out.tfevents.1743783088.DESKTOP-SMJC97K.12624.0 +3 -0
- fine-tuned-model/runs/Apr06_12-22-00_DESKTOP-SMJC97K/events.out.tfevents.1743967320.DESKTOP-SMJC97K.20424.0 +3 -0
- finetune_model.ipynb +99 -47
- test_finetuned.ipynb +39 -77
fine-tuned-model/{checkpoint-295 → checkpoint-1121}/README.md
RENAMED
File without changes

fine-tuned-model/{checkpoint-354 → checkpoint-1121}/adapter_config.json
RENAMED
File without changes

fine-tuned-model/{checkpoint-295 → checkpoint-1121}/adapter_model.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b1f1fd45ed5bb6da6a7e92eeac091e7318b04c5ab44a21746698e6055407db26
 size 25191536

fine-tuned-model/{checkpoint-354 → checkpoint-1121}/optimizer.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a590d95a55c582b9e2e80da5b7f846b4bc5b100bf9d4776020022d35932666e3
 size 50492858

fine-tuned-model/{checkpoint-354 → checkpoint-1121}/rng_state.pth
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:28366cd32aadd3d8ca0d87af02b96e313773bfd18ed31387c0328dae31820b84
 size 14244

fine-tuned-model/{checkpoint-354 → checkpoint-1121}/scaler.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b2b0170234c1d1dfefc47a409256c774d9bf2fd95dc87f6cf439883a650de5bd
 size 988

fine-tuned-model/{checkpoint-295 → checkpoint-1121}/scheduler.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:144d877c8e0681417ebedd2a8d6df1d85acea4b51efcda810a2976cf0dd26747
 size 1064

fine-tuned-model/{checkpoint-295 → checkpoint-1121}/special_tokens_map.json
RENAMED
File without changes

fine-tuned-model/{checkpoint-295 → checkpoint-1121}/tokenizer.json
RENAMED
File without changes

fine-tuned-model/{checkpoint-295 → checkpoint-1121}/tokenizer_config.json
RENAMED
File without changes
fine-tuned-model/checkpoint-1121/trainer_state.json
ADDED
@@ -0,0 +1,349 @@
{
  "best_global_step": 885,
  "best_metric": 1.009942650794983,
  "best_model_checkpoint": "./fine-tuned-model\\checkpoint-885",
  "epoch": 19.0,
  "eval_steps": 500,
  "global_step": 1121,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.847457627118644, "grad_norm": 0.939425528049469, "learning_rate": 5.8983050847457634e-05, "loss": 7.3825, "step": 50},
    {"epoch": 1.0, "eval_loss": 1.3358267545700073, "eval_runtime": 7.2445, "eval_samples_per_second": 14.494, "eval_steps_per_second": 0.966, "step": 59},
    {"epoch": 1.694915254237288, "grad_norm": 2.5237913131713867, "learning_rate": 5.796610169491525e-05, "loss": 1.6245, "step": 100},
    {"epoch": 2.0, "eval_loss": 1.1412484645843506, "eval_runtime": 7.1915, "eval_samples_per_second": 14.601, "eval_steps_per_second": 0.973, "step": 118},
    {"epoch": 2.542372881355932, "grad_norm": 7.8459625244140625, "learning_rate": 5.6949152542372884e-05, "loss": 1.4469, "step": 150},
    {"epoch": 3.0, "eval_loss": 1.0957719087600708, "eval_runtime": 7.3705, "eval_samples_per_second": 14.246, "eval_steps_per_second": 0.95, "step": 177},
    {"epoch": 3.389830508474576, "grad_norm": 1.5296450853347778, "learning_rate": 5.593220338983051e-05, "loss": 1.3912, "step": 200},
    {"epoch": 4.0, "eval_loss": 1.072253704071045, "eval_runtime": 7.3695, "eval_samples_per_second": 14.248, "eval_steps_per_second": 0.95, "step": 236},
    {"epoch": 4.237288135593221, "grad_norm": 1.9591976404190063, "learning_rate": 5.4915254237288135e-05, "loss": 1.2829, "step": 250},
    {"epoch": 5.0, "eval_loss": 1.1338605880737305, "eval_runtime": 7.1975, "eval_samples_per_second": 14.588, "eval_steps_per_second": 0.973, "step": 295},
    {"epoch": 5.084745762711864, "grad_norm": 1.4545025825500488, "learning_rate": 5.389830508474577e-05, "loss": 1.3583, "step": 300},
    {"epoch": 5.932203389830509, "grad_norm": 1.4759844541549683, "learning_rate": 5.288135593220339e-05, "loss": 1.295, "step": 350},
    {"epoch": 6.0, "eval_loss": 1.043823480606079, "eval_runtime": 7.226, "eval_samples_per_second": 14.531, "eval_steps_per_second": 0.969, "step": 354},
    {"epoch": 6.779661016949152, "grad_norm": 1.5640958547592163, "learning_rate": 5.186440677966102e-05, "loss": 1.2581, "step": 400},
    {"epoch": 7.0, "eval_loss": 1.0363339185714722, "eval_runtime": 7.3055, "eval_samples_per_second": 14.373, "eval_steps_per_second": 0.958, "step": 413},
    {"epoch": 7.627118644067797, "grad_norm": 2.0497965812683105, "learning_rate": 5.0847457627118643e-05, "loss": 1.2544, "step": 450},
    {"epoch": 8.0, "eval_loss": 1.067766785621643, "eval_runtime": 7.3717, "eval_samples_per_second": 14.244, "eval_steps_per_second": 0.95, "step": 472},
    {"epoch": 8.474576271186441, "grad_norm": 1.2606173753738403, "learning_rate": 4.9830508474576276e-05, "loss": 1.3097, "step": 500},
    {"epoch": 9.0, "eval_loss": 1.02413809299469, "eval_runtime": 7.1818, "eval_samples_per_second": 14.62, "eval_steps_per_second": 0.975, "step": 531},
    {"epoch": 9.322033898305085, "grad_norm": 2.3118815422058105, "learning_rate": 4.88135593220339e-05, "loss": 1.2787, "step": 550},
    {"epoch": 10.0, "eval_loss": 1.013655424118042, "eval_runtime": 7.1577, "eval_samples_per_second": 14.67, "eval_steps_per_second": 0.978, "step": 590},
    {"epoch": 10.169491525423728, "grad_norm": 1.2859658002853394, "learning_rate": 4.7796610169491526e-05, "loss": 1.2354, "step": 600},
    {"epoch": 11.0, "eval_loss": 1.0547661781311035, "eval_runtime": 7.1926, "eval_samples_per_second": 14.598, "eval_steps_per_second": 0.973, "step": 649},
    {"epoch": 11.016949152542374, "grad_norm": 2.121445417404175, "learning_rate": 4.677966101694916e-05, "loss": 1.2596, "step": 650},
    {"epoch": 11.864406779661017, "grad_norm": 3.0464370250701904, "learning_rate": 4.576271186440678e-05, "loss": 1.2646, "step": 700},
    {"epoch": 12.0, "eval_loss": 1.0133599042892456, "eval_runtime": 10.6666, "eval_samples_per_second": 9.844, "eval_steps_per_second": 0.656, "step": 708},
    {"epoch": 12.711864406779661, "grad_norm": 1.1342540979385376, "learning_rate": 4.474576271186441e-05, "loss": 1.2068, "step": 750},
    {"epoch": 13.0, "eval_loss": 1.0467838048934937, "eval_runtime": 11.4351, "eval_samples_per_second": 9.182, "eval_steps_per_second": 0.612, "step": 767},
    {"epoch": 13.559322033898304, "grad_norm": 2.094381093978882, "learning_rate": 4.3728813559322035e-05, "loss": 1.2955, "step": 800},
    {"epoch": 14.0, "eval_loss": 1.0249124765396118, "eval_runtime": 13.2701, "eval_samples_per_second": 7.913, "eval_steps_per_second": 0.528, "step": 826},
    {"epoch": 14.40677966101695, "grad_norm": 1.0174381732940674, "learning_rate": 4.271186440677966e-05, "loss": 1.2215, "step": 850},
    {"epoch": 15.0, "eval_loss": 1.009942650794983, "eval_runtime": 10.124, "eval_samples_per_second": 10.371, "eval_steps_per_second": 0.691, "step": 885},
    {"epoch": 15.254237288135593, "grad_norm": 1.1202493906021118, "learning_rate": 4.169491525423729e-05, "loss": 1.2365, "step": 900},
    {"epoch": 16.0, "eval_loss": 1.0121246576309204, "eval_runtime": 9.974, "eval_samples_per_second": 10.527, "eval_steps_per_second": 0.702, "step": 944},
    {"epoch": 16.10169491525424, "grad_norm": 1.1021959781646729, "learning_rate": 4.067796610169492e-05, "loss": 1.2412, "step": 950},
    {"epoch": 16.949152542372882, "grad_norm": 0.9624550938606262, "learning_rate": 3.966101694915254e-05, "loss": 1.2348, "step": 1000},
    {"epoch": 17.0, "eval_loss": 1.0155479907989502, "eval_runtime": 8.9635, "eval_samples_per_second": 11.714, "eval_steps_per_second": 0.781, "step": 1003},
    {"epoch": 17.796610169491526, "grad_norm": 0.9586867094039917, "learning_rate": 3.864406779661017e-05, "loss": 1.2455, "step": 1050},
    {"epoch": 18.0, "eval_loss": 1.0335369110107422, "eval_runtime": 9.0555, "eval_samples_per_second": 11.595, "eval_steps_per_second": 0.773, "step": 1062},
    {"epoch": 18.64406779661017, "grad_norm": 1.7303390502929688, "learning_rate": 3.76271186440678e-05, "loss": 1.2238, "step": 1100},
    {"epoch": 19.0, "eval_loss": 1.020735263824463, "eval_runtime": 9.6479, "eval_samples_per_second": 10.883, "eval_steps_per_second": 0.726, "step": 1121}
  ],
  "logging_steps": 50,
  "max_steps": 2950,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {"early_stopping_patience": 4, "early_stopping_threshold": 0.0},
      "attributes": {"early_stopping_patience_counter": 4}
    },
    "TrainerControl": {
      "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true},
      "attributes": {}
    }
  },
  "total_flos": 3.5260523640520704e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
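The trainer state above records which checkpoint was selected as best (step 885, eval_loss ≈ 1.0099) together with the full per-epoch eval-loss curve. A minimal inspection sketch, not part of this commit, that reads those fields back out of the file added here:

    import json

    # Hypothetical helper script; the path is the file added in this commit.
    with open("fine-tuned-model/checkpoint-1121/trainer_state.json") as f:
        state = json.load(f)

    # Checkpoint chosen by load_best_model_at_end / EarlyStoppingCallback.
    print(state["best_model_checkpoint"], state["best_metric"])

    # Per-epoch eval losses logged by the Trainer.
    for entry in state["log_history"]:
        if "eval_loss" in entry:
            print(f"epoch {entry['epoch']:>4.0f}  eval_loss {entry['eval_loss']:.4f}")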
fine-tuned-model/{checkpoint-354 → checkpoint-1121}/training_args.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:eb126e0e8412d31a50774b555d6da314217e8af8f8f466b0f189bdae98050751
 size 5368
fine-tuned-model/checkpoint-295/trainer_state.json
DELETED
@@ -1,109 +0,0 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 295,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.847457627118644, "grad_norm": 3.6365857124328613, "learning_rate": 4.152542372881356e-05, "loss": 8.5103, "step": 50},
    {"epoch": 1.0, "eval_loss": 1.4866021871566772, "eval_runtime": 5.4305, "eval_samples_per_second": 19.335, "eval_steps_per_second": 1.289, "step": 59},
    {"epoch": 1.694915254237288, "grad_norm": 3.137465715408325, "learning_rate": 3.305084745762712e-05, "loss": 1.7098, "step": 100},
    {"epoch": 2.0, "eval_loss": 1.2273037433624268, "eval_runtime": 5.362, "eval_samples_per_second": 19.582, "eval_steps_per_second": 1.305, "step": 118},
    {"epoch": 2.542372881355932, "grad_norm": 1.6243258714675903, "learning_rate": 2.457627118644068e-05, "loss": 1.5421, "step": 150},
    {"epoch": 3.0, "eval_loss": 1.1611202955245972, "eval_runtime": 5.348, "eval_samples_per_second": 19.634, "eval_steps_per_second": 1.309, "step": 177},
    {"epoch": 3.389830508474576, "grad_norm": 1.7812302112579346, "learning_rate": 1.6101694915254237e-05, "loss": 1.4875, "step": 200},
    {"epoch": 4.0, "eval_loss": 1.153254508972168, "eval_runtime": 5.347, "eval_samples_per_second": 19.637, "eval_steps_per_second": 1.309, "step": 236},
    {"epoch": 4.237288135593221, "grad_norm": 2.1582489013671875, "learning_rate": 7.627118644067798e-06, "loss": 1.3883, "step": 250},
    {"epoch": 5.0, "eval_loss": 1.1216797828674316, "eval_runtime": 5.3095, "eval_samples_per_second": 19.776, "eval_steps_per_second": 1.318, "step": 295}
  ],
  "logging_steps": 50,
  "max_steps": 295,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true},
      "attributes": {}
    }
  },
  "total_flos": 9279085168558080.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
fine-tuned-model/checkpoint-354/trainer_state.json
DELETED
@@ -1,107 +0,0 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 354,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.423728813559322, "grad_norm": 1.3252296447753906, "learning_rate": 0.0004307909604519774, "loss": 3.0243, "step": 50},
    {"epoch": 0.847457627118644, "grad_norm": 1.302614450454712, "learning_rate": 0.00036016949152542374, "loss": 1.2356, "step": 100},
    {"epoch": 1.0, "eval_loss": 0.9709166288375854, "eval_runtime": 5.194, "eval_samples_per_second": 20.216, "eval_steps_per_second": 2.695, "step": 118},
    {"epoch": 1.271186440677966, "grad_norm": 0.6076271533966064, "learning_rate": 0.0002895480225988701, "loss": 1.2005, "step": 150},
    {"epoch": 1.694915254237288, "grad_norm": 1.1516226530075073, "learning_rate": 0.0002189265536723164, "loss": 1.2331, "step": 200},
    {"epoch": 2.0, "eval_loss": 0.9370157718658447, "eval_runtime": 5.1425, "eval_samples_per_second": 20.418, "eval_steps_per_second": 2.722, "step": 236},
    {"epoch": 2.1186440677966103, "grad_norm": 0.5812012553215027, "learning_rate": 0.0001483050847457627, "loss": 1.1483, "step": 250},
    {"epoch": 2.542372881355932, "grad_norm": 2.018043279647827, "learning_rate": 7.768361581920904e-05, "loss": 1.1873, "step": 300},
    {"epoch": 2.9661016949152543, "grad_norm": 0.5886570811271667, "learning_rate": 7.062146892655367e-06, "loss": 1.1576, "step": 350},
    {"epoch": 3.0, "eval_loss": 0.9401432871818542, "eval_runtime": 5.1425, "eval_samples_per_second": 20.418, "eval_steps_per_second": 2.722, "step": 354}
  ],
  "logging_steps": 50,
  "max_steps": 354,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true},
      "attributes": {}
    }
  },
  "total_flos": 5567451101134848.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
fine-tuned-model/{checkpoint-354 → checkpoint-590}/README.md
RENAMED
File without changes

fine-tuned-model/{checkpoint-295 → checkpoint-590}/adapter_config.json
RENAMED
@@ -20,12 +20,12 @@
 "megatron_core": "megatron.core",
 "modules_to_save": null,
 "peft_type": "LORA",
-"r":
+"r": 32,
 "rank_pattern": {},
 "revision": null,
 "target_modules": [
-"k_proj",
 "q_proj",
+"k_proj",
 "o_proj",
 "v_proj"
 ],

fine-tuned-model/{checkpoint-354 → checkpoint-590}/adapter_model.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6bd78c8222b91c5a43eef81fb14dfb052607a1e986c931f28dbf005c1c762963
+size 50357440

fine-tuned-model/{checkpoint-295 → checkpoint-590}/optimizer.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:378f24a6f93dde2886215460fba55025509bed73deec840e856c2fc0ae7f20dd
+size 100825274

fine-tuned-model/{checkpoint-295 → checkpoint-590}/rng_state.pth
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5e1486d84435f60895ed0edc99f537f9a3ec350361fbd8798e501803966814c2
 size 14244

fine-tuned-model/{checkpoint-295 → checkpoint-590}/scaler.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ef038376ed0ccb6d992bae6f264fc61e513f2c11e7ddccb8f3b500fe3976c969
 size 988

fine-tuned-model/{checkpoint-354 → checkpoint-590}/scheduler.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6a773639ece71e52af3d21940488177b6e201c3d8fb49851b0d6319abe8767df
 size 1064

fine-tuned-model/{checkpoint-354 → checkpoint-590}/special_tokens_map.json
RENAMED
File without changes

fine-tuned-model/{checkpoint-354 → checkpoint-590}/tokenizer.json
RENAMED
File without changes

fine-tuned-model/{checkpoint-354 → checkpoint-590}/tokenizer_config.json
RENAMED
File without changes
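Each checkpoint directory above carries both the LoRA adapter (`adapter_config.json` with `r=32` and the `adapter_model.safetensors` weights) and the tokenizer files, so a checkpoint can be reloaded on its own. A minimal loading sketch; the base model identifier is an assumption, since it is not named anywhere in this diff:

    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    BASE_MODEL = "<base-model-id>"  # assumption: the base model the notebook fine-tunes
    ADAPTER_DIR = "fine-tuned-model/checkpoint-590"

    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)  # tokenizer files are saved per checkpoint
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16)
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)    # applies the r=32 LoRA weights
    model.eval()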
fine-tuned-model/checkpoint-590/trainer_state.json
ADDED
@@ -0,0 +1,200 @@
{
  "best_global_step": 590,
  "best_metric": 0.9585100412368774,
  "best_model_checkpoint": "./fine-tuned-model\\checkpoint-590",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 590,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.847457627118644, "grad_norm": 2.3572731018066406, "learning_rate": 3.932203389830509e-05, "loss": 9.5716, "step": 50},
    {"epoch": 1.0, "eval_loss": 1.5832620859146118, "eval_runtime": 5.4855, "eval_samples_per_second": 19.141, "eval_steps_per_second": 1.276, "step": 59},
    {"epoch": 1.694915254237288, "grad_norm": 3.8656413555145264, "learning_rate": 3.8644067796610175e-05, "loss": 1.7466, "step": 100},
    {"epoch": 2.0, "eval_loss": 1.1679714918136597, "eval_runtime": 5.364, "eval_samples_per_second": 19.575, "eval_steps_per_second": 1.305, "step": 118},
    {"epoch": 2.542372881355932, "grad_norm": 2.582818031311035, "learning_rate": 3.796610169491526e-05, "loss": 1.5173, "step": 150},
    {"epoch": 3.0, "eval_loss": 1.0937272310256958, "eval_runtime": 5.4735, "eval_samples_per_second": 19.183, "eval_steps_per_second": 1.279, "step": 177},
    {"epoch": 3.389830508474576, "grad_norm": 0.9783422350883484, "learning_rate": 3.728813559322034e-05, "loss": 1.4233, "step": 200},
    {"epoch": 4.0, "eval_loss": 1.0387905836105347, "eval_runtime": 5.354, "eval_samples_per_second": 19.612, "eval_steps_per_second": 1.307, "step": 236},
    {"epoch": 4.237288135593221, "grad_norm": 0.6458675265312195, "learning_rate": 3.6610169491525426e-05, "loss": 1.3044, "step": 250},
    {"epoch": 5.0, "eval_loss": 1.0661542415618896, "eval_runtime": 5.4235, "eval_samples_per_second": 19.36, "eval_steps_per_second": 1.291, "step": 295},
    {"epoch": 5.084745762711864, "grad_norm": 1.3425668478012085, "learning_rate": 3.593220338983051e-05, "loss": 1.3608, "step": 300},
    {"epoch": 5.932203389830509, "grad_norm": 1.331030011177063, "learning_rate": 3.52542372881356e-05, "loss": 1.2839, "step": 350},
    {"epoch": 6.0, "eval_loss": 0.9894506335258484, "eval_runtime": 5.406, "eval_samples_per_second": 19.423, "eval_steps_per_second": 1.295, "step": 354},
    {"epoch": 6.779661016949152, "grad_norm": 1.02914297580719, "learning_rate": 3.457627118644068e-05, "loss": 1.2485, "step": 400},
    {"epoch": 7.0, "eval_loss": 0.9816469550132751, "eval_runtime": 5.4105, "eval_samples_per_second": 19.407, "eval_steps_per_second": 1.294, "step": 413},
    {"epoch": 7.627118644067797, "grad_norm": 1.8862000703811646, "learning_rate": 3.389830508474576e-05, "loss": 1.2426, "step": 450},
    {"epoch": 8.0, "eval_loss": 1.0074799060821533, "eval_runtime": 5.423, "eval_samples_per_second": 19.362, "eval_steps_per_second": 1.291, "step": 472},
    {"epoch": 8.474576271186441, "grad_norm": 0.9509351849555969, "learning_rate": 3.322033898305085e-05, "loss": 1.2903, "step": 500},
    {"epoch": 9.0, "eval_loss": 0.9700178503990173, "eval_runtime": 5.388, "eval_samples_per_second": 19.488, "eval_steps_per_second": 1.299, "step": 531},
    {"epoch": 9.322033898305085, "grad_norm": 1.861725926399231, "learning_rate": 3.2542372881355934e-05, "loss": 1.2588, "step": 550},
    {"epoch": 10.0, "eval_loss": 0.9585100412368774, "eval_runtime": 5.409, "eval_samples_per_second": 19.412, "eval_steps_per_second": 1.294, "step": 590}
  ],
  "logging_steps": 50,
  "max_steps": 2950,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {"early_stopping_patience": 2, "early_stopping_threshold": 0.0},
      "attributes": {"early_stopping_patience_counter": 0}
    },
    "TrainerControl": {
      "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false},
      "attributes": {}
    }
  },
  "total_flos": 1.86489122586624e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
fine-tuned-model/{checkpoint-295 → checkpoint-590}/training_args.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:dbc31a968883ba1c37ac7256a236ef457c3a308b208d06d810a9cc1c7385f86a
 size 5368

fine-tuned-model/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:95cc3d2d2f4148921f13714c2b852562d2616722a7178361623f64e4339f0051
 size 1480793144

fine-tuned-model/runs/Apr03_21-04-00_DESKTOP-SMJC97K/events.out.tfevents.1743739440.DESKTOP-SMJC97K.13648.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1ac21d76b4b1a98d5bd7cd3227cbd39d36f6159fbfd9788d75abf80fc773513
+size 12206

fine-tuned-model/runs/Apr04_09-11-28_DESKTOP-SMJC97K/events.out.tfevents.1743783088.DESKTOP-SMJC97K.12624.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b3ce33677563c06e296678e5116701cd3be77c97d996cf838591c38fe6185d
+size 15791

fine-tuned-model/runs/Apr06_12-22-00_DESKTOP-SMJC97K/events.out.tfevents.1743967320.DESKTOP-SMJC97K.20424.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05abf9b879ca9f8506f60328b453852dc688840e099650a234aeef0e0080671d
+size 12206
finetune_model.ipynb
CHANGED
@@ -175,9 +175,7 @@
 "Request:\n",
 "\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home) \n",
-"FROM game \n",
-"WHERE team_name_home = 'Los Angeles Lakers';\n",
+"SELECT MAX(pts_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
 "\n",
 "Request:\n",
 "\"Which teams are located in the state of California?\"\n",
@@ -197,9 +195,7 @@
 "Request:\n",
 "\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home - pts_away) AS biggest_win \n",
-"FROM game\n",
-"WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
+"SELECT MAX(pts_home - pts_away) AS biggest_win FROM game WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
 "\n",
 "Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
 "\"\"\""
@@ -230,7 +226,21 @@
 "output_type": "stream",
 "text": [
 "WARNING:tensorflow:From c:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
-"\n"
+"\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"C:\\Users\\Dean\\AppData\\Local\\Temp\\ipykernel_20424\\3615904657.py:13: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
+" df = df.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
 "Total dataset examples: 1044\n",
 " natural_query \\\n",
 "0 Which NBA teams were established after the yea... \n",
@@ -241,10 +251,10 @@
 "\n",
 " sql_query result \n",
 "0 SELECT full_name FROM team WHERE year_founded ... New Orleans Pelicans \n",
-"1 SELECT MAX(pts_home) FROM game
+"1 SELECT MAX(pts_home) FROM game WHERE team_name... 162 \n",
 "2 SELECT pts_home FROM game WHERE team_name_home... 156 \n",
-"3 SELECT COUNT(*)
-"4 SELECT AVG(ast_home)
+"3 SELECT COUNT(*) FROM game WHERE team_abbreviat... 29 \n",
+"4 SELECT AVG(ast_home) FROM game WHERE team_abbr... 26.51355662 \n"
 ]
 },
 {
@@ -413,9 +423,7 @@
 "Request:\n",
 "\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home) \n",
-"FROM game \n",
-"WHERE team_name_home = 'Los Angeles Lakers';\n",
+"SELECT MAX(pts_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
 "\n",
 "Request:\n",
 "\"Which teams are located in the state of California?\"\n",
@@ -435,9 +443,7 @@
 "Request:\n",
 "\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home - pts_away) AS biggest_win \n",
-"FROM game\n",
-"WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
+"SELECT MAX(pts_home - pts_away) AS biggest_win FROM game WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
 "\n",
 "Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
 "Which NBA teams were established after the year 2000? List their names and founding years, sorted from newest to oldest\n",
@@ -449,7 +455,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Map: 100%|██████████| 1044/1044 [00:01<00:00,
+"Map: 100%|██████████| 1044/1044 [00:01<00:00, 546.45 examples/s]"
 ]
 },
 {
@@ -611,9 +617,7 @@
 "Request:\n",
 "\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home) \n",
-"FROM game \n",
-"WHERE team_name_home = 'Los Angeles Lakers';\n",
+"SELECT MAX(pts_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
 "\n",
 "Request:\n",
 "\"Which teams are located in the state of California?\"\n",
@@ -633,14 +637,12 @@
 "Request:\n",
 "\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
 "SQLite:\n",
-"SELECT MAX(pts_home - pts_away) AS biggest_win \n",
-"FROM game\n",
-"WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
+"SELECT MAX(pts_home - pts_away) AS biggest_win FROM game WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
 "\n",
 "Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
 "How many points did the Golden State Warriors score in their first game of the 2005 season?\n",
 "SQLite: \n",
-"SELECT pts_home
+"SELECT pts_home FROM game WHERE team_abbreviation_home = 'GSW' AND season_id = '22005' ORDER BY game_date ASC LIMIT 1;\n",
 "939\n",
 "105\n"
 ]
@@ -657,14 +659,17 @@
 "import pandas as pd\n",
 "import torch\n",
 "from datasets import Dataset\n",
-"from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig\n",
+"from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback\n",
 "from torch.utils.data import DataLoader\n",
 "from peft import LoraConfig, get_peft_model, TaskType\n",
 "import os\n",
+"import re\n",
 "\n",
 "# Load dataset\n",
 "df = pd.read_csv(\"./train-data/sql_train.tsv\", sep='\\t')\n",
 "\n",
+"df = df.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n",
+"\n",
 "# Display dataset info\n",
 "print(f\"Total dataset examples: {len(df)}\")\n",
 "print(df.head())\n",
@@ -721,7 +726,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"trainable params:
+"trainable params: 12,582,912 || all params: 1,359,054,848 || trainable%: 0.9259\n"
 ]
 }
 ],
@@ -745,7 +750,7 @@
 "\n",
 "# Define LoRA configuration\n",
 "lora_config = LoraConfig(\n",
-" r=
+" r=32, # Rank of LoRA matrices (adjust for memory vs. accuracy)\n",
 " lora_alpha=32, # Scaling factor\n",
 " lora_dropout=0.1, # Dropout for regularization\n",
 " bias=\"none\",\n",
@@ -782,7 +787,7 @@
 "text": [
 "c:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
 " warnings.warn(\n",
-"C:\\Users\\Dean\\AppData\\Local\\Temp\\
+"C:\\Users\\Dean\\AppData\\Local\\Temp\\ipykernel_20424\\92099500.py:20: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
 " trainer = Trainer(\n",
 "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
 ]
@@ -795,13 +800,16 @@
 " save_strategy=\"epoch\", # Save model every epoch\n",
 " per_device_train_batch_size=16, # LoRA allows higher batch size\n",
 " per_device_eval_batch_size=16,\n",
-" num_train_epochs=
-" learning_rate=
+" num_train_epochs=50, # Increase if needed\n",
+" learning_rate=4e-5, # Higher LR since we're only training LoRA layers\n",
 " weight_decay=0.01,\n",
 " logging_steps=50, # Print loss every 50 steps\n",
-" save_total_limit=2, # Keep last
+" save_total_limit=2, # Keep last 4 checkpoints\n",
 " fp16=True if torch.cuda.is_available() else False,\n",
-" push_to_hub=False
+" push_to_hub=False,\n",
+" load_best_model_at_end=True,\n",
+" metric_for_best_model=\"eval_loss\",\n",
+" greater_is_better=False\n",
 ")\n",
 "\n",
 "# Trainer setup\n",
@@ -810,7 +818,8 @@
 " args=training_args,\n",
 " train_dataset=train_dataset,\n",
 " eval_dataset=val_dataset,\n",
-" tokenizer=tokenizer
+" tokenizer=tokenizer,\n",
+" callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]\n",
 ")"
 ]
 },
@@ -840,8 +849,8 @@
 "\n",
 " <div>\n",
 " \n",
-" <progress value='
-" [
+" <progress value='708' max='2950' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+" [ 708/2950 2:25:12 < 7:41:08, 0.08 it/s, Epoch 12/50]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -854,28 +863,63 @@
 " <tbody>\n",
 " <tr>\n",
 " <td>1</td>\n",
-" <td>
-" <td>1.
+" <td>9.571600</td>\n",
+" <td>1.583262</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>2</td>\n",
-" <td>1.
-" <td>1.
+" <td>1.746600</td>\n",
+" <td>1.167971</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>3</td>\n",
-" <td>1.
-" <td>1.
+" <td>1.517300</td>\n",
+" <td>1.093727</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>4</td>\n",
-" <td>1.
-" <td>1.
+" <td>1.423300</td>\n",
+" <td>1.038791</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>5</td>\n",
-" <td>1.
-" <td>1.
+" <td>1.304400</td>\n",
+" <td>1.066154</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>6</td>\n",
+" <td>1.283900</td>\n",
+" <td>0.989451</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>7</td>\n",
+" <td>1.248500</td>\n",
+" <td>0.981647</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>8</td>\n",
+" <td>1.242600</td>\n",
+" <td>1.007480</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>9</td>\n",
+" <td>1.290300</td>\n",
+" <td>0.970018</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>10</td>\n",
+" <td>1.258800</td>\n",
+" <td>0.958510</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>11</td>\n",
+" <td>1.217200</td>\n",
+" <td>1.017668</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <td>12</td>\n",
+" <td>1.242000</td>\n",
+" <td>0.961481</td>\n",
 " </tr>\n",
 " </tbody>\n",
 "</table><p>"
@@ -927,9 +971,17 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 6,
 "metadata": {},
 "outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"c:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\autograd\\_functions.py:315: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
+]
+},
 {
 "name": "stdout",
 "output_type": "stream",
@@ -949,7 +1001,7 @@
 "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
 "\n",
 "# Generate SQL query\n",
-"outputs = model.generate(inputs, max_new_tokens=
+"outputs = model.generate(inputs, max_new_tokens=256, do_sample=False)\n",
 "query_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
 "\n",
 "print(\"Generated SQL:\", query_output)"
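For reference, the training setup that the scattered notebook hunks above add up to can be sketched in one place. This is a reconstruction from the diff, not a copy of a single notebook cell; `model` (the PEFT-wrapped base model), `tokenizer`, `train_dataset`, and `val_dataset` are assumed to be prepared as in the earlier, unchanged notebook cells, and the eval strategy is inferred from the per-epoch eval logs.

    import re
    import pandas as pd
    from peft import LoraConfig, TaskType
    from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

    # Collapse multi-line SQL targets to single lines, as the updated notebook does.
    df = pd.read_csv("./train-data/sql_train.tsv", sep="\t")
    df = df.applymap(lambda x: re.sub(r"\s+", " ", x) if isinstance(x, str) else x)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=32,                  # rank raised to 32 in this commit
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=["q_proj", "k_proj", "o_proj", "v_proj"],
    )

    training_args = TrainingArguments(
        output_dir="./fine-tuned-model",
        evaluation_strategy="epoch",       # implied by the per-epoch eval_loss logs
        save_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=50,
        learning_rate=4e-5,
        weight_decay=0.01,
        logging_steps=50,
        save_total_limit=2,
        fp16=True,
        push_to_hub=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    # model, tokenizer, train_dataset, val_dataset come from the earlier notebook cells.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

With `load_best_model_at_end` plus the early-stopping callback, training stops once eval loss fails to improve for the configured patience, which is exactly what the checkpoint-590 and checkpoint-1121 trainer states above record.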
test_finetuned.ipynb
CHANGED
@@ -16,7 +16,7 @@
|
|
16 |
},
|
17 |
{
|
18 |
"cell_type": "code",
|
19 |
-
"execution_count":
|
20 |
"metadata": {},
|
21 |
"outputs": [
|
22 |
{
|
@@ -26,9 +26,9 @@
|
|
26 |
"Total dataset examples: 1044\n",
|
27 |
"\n",
|
28 |
"\n",
|
29 |
-
"
|
30 |
-
"SELECT
|
31 |
-
"
|
32 |
]
|
33 |
}
|
34 |
],
|
@@ -58,7 +58,7 @@
|
|
58 |
},
|
59 |
{
|
60 |
"cell_type": "code",
|
61 |
-
"execution_count":
|
62 |
"metadata": {},
|
63 |
"outputs": [
|
64 |
{
|
@@ -92,11 +92,11 @@
|
|
92 |
},
|
93 |
{
|
94 |
"cell_type": "code",
|
95 |
-
"execution_count":
|
96 |
"metadata": {},
|
97 |
"outputs": [],
|
98 |
"source": [
|
99 |
-
"input_text = \"\"\"You are an AI assistant that converts natural language queries into valid SQLite queries.\n",
|
100 |
"Database Schema and Explanations\n",
|
101 |
"\n",
|
102 |
"team Table\n",
|
@@ -251,9 +251,7 @@
|
|
251 |
"Request:\n",
|
252 |
"\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
|
253 |
"SQLite:\n",
|
254 |
-
"SELECT MAX(pts_home)
|
255 |
-
"FROM game \n",
|
256 |
-
"WHERE team_name_home = 'Los Angeles Lakers';\n",
|
257 |
"\n",
|
258 |
"Request:\n",
|
259 |
"\"Which teams are located in the state of California?\"\n",
|
@@ -273,9 +271,7 @@
|
|
273 |
"Request:\n",
|
274 |
"\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
|
275 |
"SQLite:\n",
|
276 |
-
"SELECT MAX(pts_home - pts_away) AS biggest_win
|
277 |
-
"FROM game\n",
|
278 |
-
"WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
|
279 |
"\n",
|
280 |
"Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
|
281 |
"\"\"\""
|
@@ -290,14 +286,14 @@
|
|
290 |
},
|
291 |
{
|
292 |
"cell_type": "code",
|
293 |
-
"execution_count":
|
294 |
"metadata": {},
|
295 |
"outputs": [
|
296 |
{
|
297 |
"name": "stdout",
|
298 |
"output_type": "stream",
|
299 |
"text": [
|
300 |
-
"SQLite: SELECT
|
301 |
"\n"
|
302 |
]
|
303 |
}
|
@@ -322,15 +318,14 @@
|
|
322 |
},
|
323 |
{
|
324 |
"cell_type": "code",
|
325 |
-
"execution_count":
|
326 |
"metadata": {},
|
327 |
"outputs": [
|
328 |
{
|
329 |
"name": "stdout",
|
330 |
"output_type": "stream",
|
331 |
"text": [
|
332 |
-
"SELECT
|
333 |
-
"('12022',)\n"
|
334 |
]
|
335 |
}
|
336 |
],
|
@@ -374,21 +369,21 @@
|
|
374 |
},
|
375 |
{
|
376 |
"cell_type": "code",
|
377 |
-
"execution_count":
|
378 |
"metadata": {},
|
379 |
"outputs": [
|
380 |
{
|
381 |
"name": "stdout",
|
382 |
"output_type": "stream",
|
383 |
"text": [
|
384 |
-
"
|
385 |
-
"SELECT
|
386 |
-
"
|
387 |
-
"SQLite:
|
388 |
"\n",
|
389 |
-
"Statement valid?
|
390 |
"SQLite matched? False\n",
|
391 |
-
"Result matched?
|
392 |
]
|
393 |
}
|
394 |
],
|
@@ -397,10 +392,18 @@
|
|
397 |
"\n",
|
398 |
"def compare_result(sample_query, sample_result, query_output):\n",
|
399 |
" # Clean model output to only have the query output\n",
|
400 |
-
" if query_output[0:8] == \"SQLite
|
401 |
" query = query_output[8:]\n",
|
402 |
" elif query_output[0:5] == \"SQL: \":\n",
|
403 |
" query = query_output[5:]\n",
|
404 |
" else:\n",
|
405 |
" query = query_output\n",
|
406 |
"\n",
|
@@ -448,7 +451,7 @@
|
|
448 |
" if math.isclose(float(r), float(res), abs_tol=0.5):\n",
|
449 |
" return True, query_match, True\n",
|
450 |
" except:\n",
|
451 |
-
" if r in res or res in r:\n",
|
452 |
" return True, query_match, True\n",
|
453 |
" \n",
|
454 |
" # Check if the model returned a sum of examples as opposed to the whole thing\n",
|
@@ -494,7 +497,8 @@
|
|
494 |
" return False, False, False\n",
|
495 |
"\n",
|
496 |
"# Obtain sample\n",
|
497 |
-
"
|
|
|
498 |
"print(sample[\"natural_query\"].values[0])\n",
|
499 |
"print(sample[\"sql_query\"].values[0])\n",
|
500 |
"print(sample[\"result\"].values[0])\n",
|
@@ -523,7 +527,7 @@
|
|
523 |
},
|
524 |
{
|
525 |
"cell_type": "code",
|
526 |
-
"execution_count":
|
527 |
"metadata": {},
|
528 |
"outputs": [],
|
529 |
"source": [
|
@@ -571,7 +575,7 @@
|
|
571 |
},
|
572 |
{
|
573 |
"cell_type": "code",
|
574 |
-
"execution_count":
|
575 |
"metadata": {},
|
576 |
"outputs": [
|
577 |
{
|
@@ -584,9 +588,9 @@
|
|
584 |
"Completed 200\n",
|
585 |
"\n",
|
586 |
"Less than 90 results:\n",
|
587 |
-
"Percent valid: 0.
|
588 |
-
"Percent SQLite matched: 0.
|
589 |
-
"Percent result matched: 0.
|
590 |
"Dataset length: 245\n"
|
591 |
]
|
592 |
}
|
@@ -606,51 +610,9 @@
|
|
606 |
},
|
607 |
{
|
608 |
"cell_type": "code",
|
609 |
-
"execution_count":
|
610 |
"metadata": {},
|
611 |
-
"outputs": [
|
612 |
-
{
|
613 |
-
"ename": "KeyboardInterrupt",
|
614 |
-
"evalue": "",
|
615 |
-
"output_type": "error",
|
616 |
-
"traceback": [
|
617 |
-
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
618 |
-
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
619 |
-
"Cell \u001b[1;32mIn[9], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m game_queries \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./train-data/queries_from_game.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m \u001b[43mrun_evaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgame_queries\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mQueries from game\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset length: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mlen\u001b[39m(game_queries)))\n",
|
620 |
-
"Cell \u001b[1;32mIn[7], line 10\u001b[0m, in \u001b[0;36mrun_evaluation\u001b[1;34m(nba_df, title)\u001b[0m\n\u001b[0;32m 8\u001b[0m message\u001b[38;5;241m=\u001b[39m[{ \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m'\u001b[39m: input_text \u001b[38;5;241m+\u001b[39m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnatural_query\u001b[39m\u001b[38;5;124m\"\u001b[39m]}]\n\u001b[0;32m 9\u001b[0m inputs \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mapply_chat_template(message, add_generation_prompt\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mto(model\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m---> 10\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m128\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdo_sample\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_k\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.95\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_return_sequences\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meos_token_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43meos_token_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# Obtain output\u001b[39;00m\n\u001b[0;32m 13\u001b[0m query_output \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mdecode(outputs[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;28mlen\u001b[39m(inputs[\u001b[38;5;241m0\u001b[39m]):], skip_special_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
|
621 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\utils\\_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[1;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
622 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\generation\\utils.py:2326\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[1;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)\u001b[0m\n\u001b[0;32m 2318\u001b[0m input_ids, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_expand_inputs_for_generation(\n\u001b[0;32m 2319\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[0;32m 2320\u001b[0m expand_size\u001b[38;5;241m=\u001b[39mgeneration_config\u001b[38;5;241m.\u001b[39mnum_return_sequences,\n\u001b[0;32m 2321\u001b[0m is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[0;32m 2322\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[0;32m 2323\u001b[0m )\n\u001b[0;32m 2325\u001b[0m \u001b[38;5;66;03m# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[1;32m-> 2326\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2327\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2328\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2329\u001b[0m \u001b[43m \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2330\u001b[0m \u001b[43m \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2331\u001b[0m \u001b[43m \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2332\u001b[0m \u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2333\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2334\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2336\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SAMPLE, GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SEARCH):\n\u001b[0;32m 2337\u001b[0m \u001b[38;5;66;03m# 11. 
interleave input_ids with `num_beams` additional sequences per batch\u001b[39;00m\n\u001b[0;32m 2338\u001b[0m input_ids, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_expand_inputs_for_generation(\n\u001b[0;32m 2339\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[0;32m 2340\u001b[0m expand_size\u001b[38;5;241m=\u001b[39mgeneration_config\u001b[38;5;241m.\u001b[39mnum_beams,\n\u001b[0;32m 2341\u001b[0m is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[0;32m 2342\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[0;32m 2343\u001b[0m )\n",
|
623 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\generation\\utils.py:3289\u001b[0m, in \u001b[0;36mGenerationMixin._sample\u001b[1;34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)\u001b[0m\n\u001b[0;32m 3287\u001b[0m is_prefill \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 3288\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 3289\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 3291\u001b[0m \u001b[38;5;66;03m# synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping\u001b[39;00m\n\u001b[0;32m 3292\u001b[0m model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_model_kwargs_for_generation(\n\u001b[0;32m 3293\u001b[0m outputs,\n\u001b[0;32m 3294\u001b[0m model_kwargs,\n\u001b[0;32m 3295\u001b[0m is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[0;32m 3296\u001b[0m )\n",
|
624 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
625 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
626 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\accelerate\\hooks.py:170\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[1;34m(module, *args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 170\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
|
627 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\utils\\deprecation.py:172\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>.wrapper.<locals>.wrapped_func\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m minimum_action \u001b[38;5;129;01min\u001b[39;00m (Action\u001b[38;5;241m.\u001b[39mNOTIFY, Action\u001b[38;5;241m.\u001b[39mNOTIFY_ALWAYS) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torchdynamo_compiling():\n\u001b[0;32m 169\u001b[0m \u001b[38;5;66;03m# DeprecationWarning is ignored by default, so we use FutureWarning instead\u001b[39;00m\n\u001b[0;32m 170\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m--> 172\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
628 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\models\\llama\\modeling_llama.py:853\u001b[0m, in \u001b[0;36mLlamaForCausalLM.forward\u001b[1;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[0;32m 850\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m 852\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[1;32m--> 853\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 854\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 855\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 856\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 857\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 858\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 859\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 865\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 867\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 868\u001b[0m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n",
|
629 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
630 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
631 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\accelerate\\hooks.py:170\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[1;34m(module, *args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 170\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
|
632 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\models\\llama\\modeling_llama.py:601\u001b[0m, in \u001b[0;36mLlamaModel.forward\u001b[1;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **flash_attn_kwargs)\u001b[0m\n\u001b[0;32m 589\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[0;32m 590\u001b[0m decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[0;32m 591\u001b[0m hidden_states,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 598\u001b[0m position_embeddings,\n\u001b[0;32m 599\u001b[0m )\n\u001b[0;32m 600\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 601\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 602\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 603\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcausal_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 604\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 605\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 606\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 607\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 608\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 609\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 610\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mflash_attn_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 611\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 613\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m 615\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
|
633 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
634 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
635 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\accelerate\\hooks.py:170\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[1;34m(module, *args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 170\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
|
636 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\models\\llama\\modeling_llama.py:343\u001b[0m, in \u001b[0;36mLlamaDecoderLayer.forward\u001b[1;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)\u001b[0m\n\u001b[0;32m 340\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minput_layernorm(hidden_states)\n\u001b[0;32m 342\u001b[0m \u001b[38;5;66;03m# Self Attention\u001b[39;00m\n\u001b[1;32m--> 343\u001b[0m hidden_states, self_attn_weights \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 344\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 345\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 346\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 353\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 354\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m residual \u001b[38;5;241m+\u001b[39m hidden_states\n\u001b[0;32m 356\u001b[0m \u001b[38;5;66;03m# Fully Connected\u001b[39;00m\n",
|
637 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
638 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
639 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\accelerate\\hooks.py:170\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[1;34m(module, *args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 170\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
|
640 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\models\\llama\\modeling_llama.py:277\u001b[0m, in \u001b[0;36mLlamaAttention.forward\u001b[1;34m(self, hidden_states, position_embeddings, attention_mask, past_key_value, cache_position, **kwargs)\u001b[0m\n\u001b[0;32m 274\u001b[0m input_shape \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mshape[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 275\u001b[0m hidden_shape \u001b[38;5;241m=\u001b[39m (\u001b[38;5;241m*\u001b[39minput_shape, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhead_dim)\n\u001b[1;32m--> 277\u001b[0m query_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mq_proj\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mview(hidden_shape)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m 278\u001b[0m key_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mk_proj(hidden_states)\u001b[38;5;241m.\u001b[39mview(hidden_shape)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m 279\u001b[0m value_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mv_proj(hidden_states)\u001b[38;5;241m.\u001b[39mview(hidden_shape)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n",
|
641 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
642 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
643 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\accelerate\\hooks.py:170\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[1;34m(module, *args, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 170\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
|
644 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\nn\\modules.py:990\u001b[0m, in \u001b[0;36mLinear8bitLt.forward\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 987\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m!=\u001b[39m x\u001b[38;5;241m.\u001b[39mdtype:\n\u001b[0;32m 988\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mto(x\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[1;32m--> 990\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mbnb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatmul\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 992\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mhas_fp16_weights \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mCB \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 993\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mweight\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mCB\n",
|
645 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\autograd\\_functions.py:509\u001b[0m, in \u001b[0;36mmatmul\u001b[1;34m(A, B, out, state, threshold, bias)\u001b[0m\n\u001b[0;32m 507\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m threshold \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0.0\u001b[39m:\n\u001b[0;32m 508\u001b[0m state\u001b[38;5;241m.\u001b[39mthreshold \u001b[38;5;241m=\u001b[39m threshold\n\u001b[1;32m--> 509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mMatMul8bitLt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mA\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mB\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n",
|
646 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\torch\\autograd\\function.py:574\u001b[0m, in \u001b[0;36mFunction.apply\u001b[1;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_C\u001b[38;5;241m.\u001b[39m_are_functorch_transforms_active():\n\u001b[0;32m 572\u001b[0m \u001b[38;5;66;03m# See NOTE: [functorch vjp and autograd interaction]\u001b[39;00m\n\u001b[0;32m 573\u001b[0m args \u001b[38;5;241m=\u001b[39m _functorch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39munwrap_dead_wrappers(args)\n\u001b[1;32m--> 574\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 576\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_setup_ctx_defined:\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[0;32m 578\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIn order to use an autograd.Function with functorch transforms \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(vmap, grad, jvp, jacrev, ...), it must override the setup_context \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 580\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstaticmethod. For more details, please see \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 581\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://pytorch.org/docs/main/notes/extending.func.html\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 582\u001b[0m )\n",
|
647 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\autograd\\_functions.py:326\u001b[0m, in \u001b[0;36mMatMul8bitLt.forward\u001b[1;34m(ctx, A, B, out, bias, state)\u001b[0m\n\u001b[0;32m 323\u001b[0m CA, CAt, SCA, SCAt, outlier_cols \u001b[38;5;241m=\u001b[39m F\u001b[38;5;241m.\u001b[39mint8_double_quant(A\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mfloat16), threshold\u001b[38;5;241m=\u001b[39mstate\u001b[38;5;241m.\u001b[39mthreshold)\n\u001b[0;32m 324\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 325\u001b[0m \u001b[38;5;66;03m# Fast path\u001b[39;00m\n\u001b[1;32m--> 326\u001b[0m CA, SCA, outlier_cols \u001b[38;5;241m=\u001b[39m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint8_vectorwise_quant\u001b[49m\u001b[43m(\u001b[49m\u001b[43mA\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat16\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mthreshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstate\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthreshold\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 327\u001b[0m CAt \u001b[38;5;241m=\u001b[39m SCAt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 329\u001b[0m has_grad \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
648 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\functional.py:2789\u001b[0m, in \u001b[0;36mint8_vectorwise_quant\u001b[1;34m(A, threshold)\u001b[0m\n\u001b[0;32m 2786\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m outliers\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 2787\u001b[0m outlier_cols \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39margwhere(outliers\u001b[38;5;241m.\u001b[39many(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m))\u001b[38;5;241m.\u001b[39mview(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m-> 2789\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43m_cuda_device_of\u001b[49m\u001b[43m(\u001b[49m\u001b[43mA\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[0;32m 2790\u001b[0m lib\u001b[38;5;241m.\u001b[39mcint8_vector_quant(\n\u001b[0;32m 2791\u001b[0m get_ptr(A),\n\u001b[0;32m 2792\u001b[0m get_ptr(out_row),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2797\u001b[0m _get_tensor_stream(A),\n\u001b[0;32m 2798\u001b[0m )\n\u001b[0;32m 2800\u001b[0m \u001b[38;5;66;03m# Zero out values from outlier columns across all rows.\u001b[39;00m\n\u001b[0;32m 2801\u001b[0m \u001b[38;5;66;03m# The kernel will handle this for outliers themselves, so we can optimize for rows=1.\u001b[39;00m\n",
|
649 |
-
"File \u001b[1;32mc:\\Users\\Dean\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\bitsandbytes\\functional.py:205\u001b[0m, in \u001b[0;36m_cuda_device_of\u001b[1;34m(a)\u001b[0m\n\u001b[0;32m 202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 203\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcontextlib\u001b[39;00m\n\u001b[1;32m--> 205\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_cuda_device_of\u001b[39m(a: torch\u001b[38;5;241m.\u001b[39mTensor):\n\u001b[0;32m 206\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m contextlib\u001b[38;5;241m.\u001b[39mnullcontext()\n\u001b[0;32m 209\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_paged\u001b[39m(\u001b[38;5;241m*\u001b[39mshape, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32, device\u001b[38;5;241m=\u001b[39mFIRST_CUDA_DEVICE):\n",
|
650 |
-
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
651 |
-
]
|
652 |
-
}
|
653 |
-
],
|
654 |
"source": [
|
655 |
"game_queries = pd.read_csv(\"./train-data/queries_from_game.tsv\", sep='\\t')\n",
|
656 |
"run_evaluation(game_queries, \"Queries from game\")\n",
|
|
|
16 |
},
|
17 |
{
|
18 |
"cell_type": "code",
|
19 |
+
"execution_count": 10,
|
20 |
"metadata": {},
|
21 |
"outputs": [
|
22 |
{
|
|
|
26 |
"Total dataset examples: 1044\n",
|
27 |
"\n",
|
28 |
"\n",
|
29 |
+
"What is the average number of points in the paint allowed by the Chicago Bulls when playing at home in the 2001 season in games with more than 15 lead changes?\n",
|
30 |
+
"SELECT AVG(o.pts_paint_away) FROM game g JOIN other_stats o ON g.game_id = o.game_id WHERE g.team_abbreviation_home = 'CHI' AND g.season_id = '22001' AND o.lead_changes > 15;\n",
|
31 |
+
"31.333333333333332\n"
|
32 |
]
|
33 |
}
|
34 |
],
|
|
|
58 |
},
|
59 |
{
|
60 |
"cell_type": "code",
|
61 |
+
"execution_count": 11,
|
62 |
"metadata": {},
|
63 |
"outputs": [
|
64 |
{
|
|
|
92 |
},
|
93 |
{
|
94 |
"cell_type": "code",
|
95 |
+
"execution_count": 12,
|
96 |
"metadata": {},
|
97 |
"outputs": [],
|
98 |
"source": [
|
99 |
+
"input_text = input_prompt = \"\"\"You are an AI assistant that converts natural language queries into valid SQLite queries.\n",
|
100 |
"Database Schema and Explanations\n",
|
101 |
"\n",
|
102 |
"team Table\n",
|
|
|
251 |
"Request:\n",
|
252 |
"\"What is the most points the Los Angeles Lakers have ever scored at home?\"\n",
|
253 |
"SQLite:\n",
|
254 |
+
"SELECT MAX(pts_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
|
|
|
|
|
255 |
"\n",
|
256 |
"Request:\n",
|
257 |
"\"Which teams are located in the state of California?\"\n",
|
|
|
271 |
"Request:\n",
|
272 |
"\"Find the Boston Celtics largest home victory margin in the 2008 season.\"\n",
|
273 |
"SQLite:\n",
|
274 |
+
"SELECT MAX(pts_home - pts_away) AS biggest_win FROM game WHERE team_name_home = 'Boston Celtics' AND season_id = '22008';\n",
|
|
|
|
|
275 |
"\n",
|
276 |
"Generate only the SQLite query prefaced by SQLite: and no other text, do not output an explanation of the query. Now generate an SQLite query for the following user request. Request:\n",
|
277 |
"\"\"\""
|
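In the evaluation loop (visible in the removed traceback earlier in this diff), this prompt string is concatenated with each natural-language request and wrapped in a single chat message. A sketch of that step, written as a helper with the model, tokenizer, and prompt passed in as assumptions:

    def generate_sql(model, tokenizer, input_text, natural_query, max_new_tokens=128):
        # Combine the schema prompt with one request and greedily decode the answer.
        message = [{"role": "user", "content": input_text + natural_query}]
        inputs = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
        return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)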
|
|
286 |
},
|
287 |
{
|
288 |
"cell_type": "code",
|
289 |
+
"execution_count": 13,
|
290 |
"metadata": {},
|
291 |
"outputs": [
|
292 |
{
|
293 |
"name": "stdout",
|
294 |
"output_type": "stream",
|
295 |
"text": [
|
296 |
+
"SQLite: SELECT AVG(pts_paint_home) FROM other_stats WHERE team_name_home = 'Chicago Bulls' AND season_id = '22001' AND lead_changes > 15;\n",
|
297 |
"\n"
|
298 |
]
|
299 |
}
|
|
|
318 |
},
|
319 |
{
|
320 |
"cell_type": "code",
|
321 |
+
"execution_count": 14,
|
322 |
"metadata": {},
|
323 |
"outputs": [
|
324 |
{
|
325 |
"name": "stdout",
|
326 |
"output_type": "stream",
|
327 |
"text": [
|
328 |
+
"SELECT AVG(pts_paint_home) FROM other_stats WHERE team_name_home = 'Chicago Bulls' AND season_id = '22001' AND lead_changes > 15;\n"
|
|
|
329 |
]
|
330 |
}
|
331 |
],
|
|
|
369 |
},
|
370 |
{
|
371 |
"cell_type": "code",
|
372 |
+
"execution_count": 15,
|
373 |
"metadata": {},
|
374 |
"outputs": [
|
375 |
{
|
376 |
"name": "stdout",
|
377 |
"output_type": "stream",
|
378 |
"text": [
|
379 |
+
"What is the average number of fg_pct in home games by the Los Angeles Lakers?\n",
|
380 |
+
"SELECT AVG(fg_pct_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
|
381 |
+
"0.4782432016418667\n",
|
382 |
+
"SQLite: AVG(fg_pct_home) FROM game WHERE team_name_home = 'Los Angeles Lakers';\n",
|
383 |
"\n",
|
384 |
+
"Statement valid? False\n",
|
385 |
"SQLite matched? False\n",
|
386 |
+
"Result matched? False\n"
|
387 |
]
|
388 |
}
|
389 |
],
|
|
|
392 |
"\n",
|
393 |
"def compare_result(sample_query, sample_result, query_output):\n",
|
394 |
" # Clean model output to only have the query output\n",
|
395 |
+
" if query_output[0:8] == \"SQLite:\\n\":\n",
|
396 |
" query = query_output[8:]\n",
|
397 |
+
" elif query_output[0:8] == \"SQLite: \":\n",
|
398 |
+
" query = query_output[8:]\n",
|
399 |
+
" elif query_output[0:7] == \"SQLite:\":\n",
|
400 |
+
" query = query_output[7:]\n",
|
401 |
+
" elif query_output[0:5] == \"SQL:\\n\":\n",
|
402 |
+
" query = query_output[5:]\n",
|
403 |
" elif query_output[0:5] == \"SQL: \":\n",
|
404 |
" query = query_output[5:]\n",
|
405 |
+
" elif query_output[0:4] == \"SQL:\":\n",
|
406 |
+
" query = query_output[4:]\n",
|
407 |
" else:\n",
|
408 |
" query = query_output\n",
|
409 |
"\n",
|
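The expanded elif chain above strips a leading "SQLite:" or "SQL:" label whether it is followed by a newline, a space, or nothing. An equivalent, more compact formulation (illustration only, not part of the commit):

    import re

    def strip_sql_label(query_output: str) -> str:
        # Drop a leading "SQLite:" or "SQL:" label plus one optional
        # trailing space or newline, mirroring the elif chain above.
        return re.sub(r"^(SQLite:|SQL:)[ \n]?", "", query_output)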
|
|
451 |
" if math.isclose(float(r), float(res), abs_tol=0.5):\n",
|
452 |
" return True, query_match, True\n",
|
453 |
" except:\n",
|
454 |
+
" if str(r) in res or res in str(r):\n",
|
455 |
" return True, query_match, True\n",
|
456 |
" \n",
|
457 |
" # Check if the model returned a sum of examples as opposed to the whole thing\n",
|
|
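The updated comparison first tries a tolerant numeric match and, when the values cannot be parsed as floats, falls back to a substring check on the string forms. A self-contained sketch of that logic (the real compare_result function also handles several other cases):

    import math

    def values_match(r, res) -> bool:
        # Tolerant numeric comparison first, substring fallback second.
        try:
            return math.isclose(float(r), float(res), abs_tol=0.5)
        except (TypeError, ValueError):
            return str(r) in str(res) or str(res) in str(r)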
|
497 |
" return False, False, False\n",
|
498 |
"\n",
|
499 |
"# Obtain sample\n",
|
500 |
+
"less_than_90_df = pd.read_csv(\"./train-data/less_than_90.tsv\", sep='\\t')\n",
|
501 |
+
"sample = less_than_90_df.sample(n=1)\n",
|
502 |
"print(sample[\"natural_query\"].values[0])\n",
|
503 |
"print(sample[\"sql_query\"].values[0])\n",
|
504 |
"print(sample[\"result\"].values[0])\n",
|
|
|
527 |
},
|
528 |
{
|
529 |
"cell_type": "code",
|
530 |
+
"execution_count": 16,
|
531 |
"metadata": {},
|
532 |
"outputs": [],
|
533 |
"source": [
|
|
|
575 |
},
|
576 |
{
|
577 |
"cell_type": "code",
|
578 |
+
"execution_count": 17,
|
579 |
"metadata": {},
|
580 |
"outputs": [
|
581 |
{
|
|
|
588 |
"Completed 200\n",
|
589 |
"\n",
|
590 |
"Less than 90 results:\n",
|
591 |
+
"Percent valid: 0.49795918367346936\n",
|
592 |
+
"Percent SQLite matched: 0.27346938775510204\n",
|
593 |
+
"Percent result matched: 0.4122448979591837\n",
|
594 |
"Dataset length: 245\n"
|
595 |
]
|
596 |
}
|
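The percentages printed in this hunk are plain ratios over the evaluated rows. Below is a sketch of how such a summary could be tallied; the actual run_evaluation function is defined earlier in the notebook and may differ in detail.

    def summarize(results, title):
        # results: one (valid, query_match, result_match) tuple per evaluated row.
        n = len(results)
        print(title + " results:")
        print("Percent valid:", sum(v for v, _, _ in results) / n)
        print("Percent SQLite matched:", sum(q for _, q, _ in results) / n)
        print("Percent result matched:", sum(m for _, _, m in results) / n)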
|
|
610 |
},
|
611 |
{
|
612 |
"cell_type": "code",
|
613 |
+
"execution_count": null,
|
614 |
"metadata": {},
|
615 |
+
"outputs": [],
|
616 |
"source": [
|
617 |
"game_queries = pd.read_csv(\"./train-data/queries_from_game.tsv\", sep='\\t')\n",
|
618 |
"run_evaluation(game_queries, \"Queries from game\")\n",
|