aarohanverma
/

text2sql-flan-t5-base-qlora-finetuned

@@ -3,34 +3,6 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "cadbd30d-57ce-4ef2-889f-24bd0ff06b89",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/workspace\n"
-     ]
-    }
-   ],
-   "source": [
-    "!echo $PWD"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "12d99875-d86b-4442-8682-b9751118d90e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#!pip3 install evaluate datasets bitsandbytes transformers peft rapidfuzz absl-py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
    "id": "5f167a6f-5139-46e6-afb2-a1fa4d12f3fd",
    "metadata": {},
    "outputs": [],
@@ -60,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "53684b5e-c27e-4eb9-815e-583aa194e096",
    "metadata": {},
    "outputs": [
@@ -83,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "a47bf3cd-752d-4d1c-9697-70098d6204fa",
    "metadata": {},
    "outputs": [],
@@ -97,7 +69,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "f16df21e-9797-4f78-83a1-a2943759ba55",
    "metadata": {},
    "outputs": [],
@@ -109,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "196e83da-6c8c-4cd7-bd70-2598a5e2a16a",
    "metadata": {},
    "outputs": [],
@@ -123,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "id": "cea22b9f-f309-4151-81ac-37547c8feeb0",
    "metadata": {},
    "outputs": [],
@@ -155,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "id": "d4eb82ce-1713-40b6-981d-43ce35aaa6f6",
    "metadata": {},
    "outputs": [
@@ -163,9 +135,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:06:42,785 - INFO - Loading raw datasets from various sources...\n",
-      "2025-03-17 17:07:15,400 - INFO - Total rows before dropping duplicates: 490241\n",
-      "2025-03-17 17:07:16,852 - INFO - Total rows after dropping duplicates: 440785\n"
      ]
     }
    ],
@@ -198,7 +170,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "id": "8446814e-5a2c-48a4-8c01-059afcf1d3c1",
    "metadata": {},
    "outputs": [
@@ -207,7 +179,7 @@
      "output_type": "stream",
      "text": [
       "Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors\n",
-      "2025-03-17 17:10:43,961 - INFO - Total rows after filtering by token length (prompt <= 500 and response <= 250 tokens): 398481\n"
      ]
     }
    ],
@@ -238,7 +210,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "id": "177e1e6d-9fbc-442d-9774-5a3e5234329f",
    "metadata": {},
    "outputs": [
@@ -246,7 +218,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:43,968 - INFO - Sample from filtered final_df:\n",
       "                                               query  \\\n",
       "0           Name the home team for carlton away team   \n",
       "1  what will the population of Asia be when Latin...   \n",
@@ -271,7 +243,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "id": "0b639efe-ebeb-4b34-bc3f-accf776ba0da",
    "metadata": {},
    "outputs": [
@@ -279,13 +251,13 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:44,311 - INFO - Final split sizes: Train: 338708, Test: 39848, Validation: 19925\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "11dc405cf3d54b6abc81b8eaf6742bea",
        "version_major": 2,
        "version_minor": 0
       },
@@ -299,7 +271,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "868d3a0d08874c448faac4b50dbb3685",
        "version_major": 2,
        "version_minor": 0
       },
@@ -313,7 +285,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0370d0dd07514d5cae499ab93ca47ee8",
        "version_major": 2,
        "version_minor": 0
       },
@@ -328,8 +300,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:45,869 - INFO - Merged and Saved Dataset Successfully!\n",
-      "2025-03-17 17:10:45,870 - INFO - Dataset summary: DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['query', 'context', 'response'],\n",
       "        num_rows: 338708\n",
@@ -378,7 +350,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "id": "9f6e1095-d72d-4e22-b20d-683f1f84544c",
    "metadata": {},
    "outputs": [
@@ -386,11 +358,11 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:46,218 - INFO - Reloaded dataset from disk. Example from test split:\n",
       "{'query': \"Show the name and type of military cyber commands in the 'Military_Cyber_Commands' table.\", 'context': \"CREATE SCHEMA IF NOT EXISTS defense_security;CREATE TABLE IF NOT EXISTS defense_security.Military_Cyber_Commands (id INT PRIMARY KEY, command_name VARCHAR(255), type VARCHAR(255));INSERT INTO defense_security.Military_Cyber_Commands (id, command_name, type) VALUES (1, 'USCYBERCOM', 'Defensive Cyber Operations'), (2, 'JTF-CND', 'Offensive Cyber Operations'), (3, '10th Fleet', 'Network Warfare');\", 'response': 'SELECT command_name, type FROM defense_security.Military_Cyber_Commands;'}\n",
-      "2025-03-17 17:10:46,475 - INFO - Loaded Tokenized Dataset from disk.\n",
-      "2025-03-17 17:10:46,477 - INFO - Final tokenized dataset splits: dict_keys(['train', 'test', 'validation'])\n",
-      "2025-03-17 17:10:46,483 - INFO - Sample tokenized record from train split:\n",
       "{'input_ids': tensor([ 1193,  6327,    10,   205,  4386,  6048,   332, 17098,   953,   834,\n",
       "         4350,   834,  4013,    41,   234,   834, 11650,   584,  4280, 28027,\n",
       "            6,   550,   834, 11650,   584,  4280, 28027,     3,    61,     3,\n",
@@ -564,7 +536,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "id": "7f004e55-181c-47aa-9f3e-c7c1ceae780c",
    "metadata": {},
    "outputs": [
@@ -631,7 +603,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "id": "f50e56c7-98b3-42bc-9129-89f3eff802e7",
    "metadata": {},
    "outputs": [
@@ -639,8 +611,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:50,413 - INFO - Attempting to load the fine-tuned model...\n",
-      "2025-03-17 17:10:51,949 - INFO - Fine-tuned model loaded successfully.\n"
      ]
     }
    ],
@@ -743,7 +715,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
    "id": "f364eb6b-56cb-4533-8ef6-b5e7f56895aa",
    "metadata": {},
    "outputs": [
@@ -751,7 +723,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:10:51,987 - INFO - Running inference on 5 examples (displaying real responses).\n"
      ]
     },
     {
@@ -777,7 +751,7 @@
       "SELECT command_name, type FROM defense_security.Military_Cyber_Commands;\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
-      "USCYBERCOM, JTF-CND, Offensive Cyber Operations, 10th Fleet, Network Warfare\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT command_name, type FROM defense_security.Military_Cyber_Commands;\n",
@@ -800,7 +774,7 @@
       "SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
-      "5000\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);\n",
@@ -846,7 +820,7 @@
       "SELECT COUNT(posts.id) FROM posts INNER JOIN users ON posts.user_id = users.id WHERE users.location = 'Australia' AND posts.created_at >= DATE_SUB(NOW(), INTERVAL 1 MONTH);\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
-      "INT users created a total of 50 posts in Australia in the last month.\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT COUNT(*) FROM posts p JOIN users u ON p.user_id = u.id WHERE u.location = 'Australia' AND p.created_at >= DATE_SUB(CURRENT_DATE, INTERVAL 1 MONTH);\n",
@@ -858,7 +832,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 17:11:00,034 - INFO - Starting evaluation on the full test set using batching.\n"
      ]
     },
     {
@@ -882,7 +856,7 @@
       "SELECT Country, SUM(Capacity) as TotalCapacity FROM WindFarms GROUP BY Country;\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
-      "1, 150, USA, (2, 200, Canada, 3), 120, Mexico\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT Country, SUM(Capacity) FROM WindFarms GROUP BY Country;\n",
@@ -890,51 +864,10 @@
       "\n"
      ]
     },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-03-17 18:28:59,727 - INFO - Full test set comparison (first 5 rows):\n",
-      "                                      Human Response  \\\n",
-      "0  SELECT command_name, type FROM defense_securit...   \n",
-      "1  SELECT SUM(cost) FROM incidents WHERE cause = ...   \n",
-      "2  SELECT state, (libraries / population) AS libr...   \n",
-      "3  SELECT COUNT(posts.id) FROM posts INNER JOIN u...   \n",
-      "4  SELECT Country, SUM(Capacity) as TotalCapacity...   \n",
-      "\n",
-      "                               Original Model Output  \\\n",
-      "0  USCYBERCOM, JTF-CND, offensive Cyber operation...   \n",
-      "1                                             t = t.   \n",
-      "2                                         California   \n",
-      "3      The total number of users in Australia is 50.   \n",
-      "4                                                  a   \n",
-      "\n",
-      "                             Fine-Tuned Model Output  \n",
-      "0  SELECT command_name, type FROM military_cyber_...  \n",
-      "1  SELECT SUM(cost) FROM incidents WHERE cause = ...  \n",
-      "2  SELECT state, t.population, t.tut FROM librari...  \n",
-      "3  SELECT COUNT(*) FROM posts WHERE CUTS(CUTS.id,...  \n",
-      "4  SELECT Country, SUM(Capacity) FROM WindFarms G...  \n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Full Test Set Comparison (First 5 Rows):\n",
-      "                                                                                                                                                              Human Response                                                        Original Model Output                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Fine-Tuned Model Output\n",
-      "                                                                                                    SELECT command_name, type FROM defense_security.Military_Cyber_Commands; USCYBERCOM, JTF-CND, offensive Cyber operations, 10th Fleet, Network Warfare                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      SELECT command_name, type FROM military_cyber_Commands;\n",
-      "                                                        SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);                                                                       t = t.                                                                                                                                                                                                                                                                                                                                                                                                                         SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);\n",
-      "                                                   SELECT state, (libraries / population) AS libraries_per_capita FROM libraries ORDER BY libraries_per_capita DESC LIMIT 3;                                                                   California                                                                                                                                                                                                                                                                                                                                                                                                                   SELECT state, t.population, t.tut FROM libraries t JOIN t ON t.state = t.state GROUP BY state ORDER BY t.tut DESC LIMIT 3;\n",
-      "SELECT COUNT(posts.id) FROM posts INNER JOIN users ON posts.user_id = users.id WHERE users.location = 'Australia' AND posts.created_at >= DATE_SUB(NOW(), INTERVAL 1 MONTH);                                The total number of users in Australia is 50. SELECT COUNT(*) FROM posts WHERE CUTS(CUTS.id, CUTS.created_at) = CUTS.id AND CUTS.id = CUTS.id WHERE CUTS.location = 'Australia' AND CUTS.created_at >= DATE_SUB(CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CUTS.CU\n",
-      "                                                                                             SELECT Country, SUM(Capacity) as TotalCapacity FROM WindFarms GROUP BY Country;                                                                            a                                                                                                                                                                                                                                                                                                                                                                                                                                                                               SELECT Country, SUM(Capacity) FROM WindFarms GROUP BY Country;\n"
-     ]
-    },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fb9a4b84525845e78668fbb5472ac4c8",
        "version_major": 2,
        "version_minor": 0
       },
@@ -948,7 +881,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5a92eb8c1607450d8babbce26891eb97",
        "version_major": 2,
        "version_minor": 0
       },
@@ -962,7 +895,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e5b5b1034f354abfbdfc46f0ff2b9349",
        "version_major": 2,
        "version_minor": 0
       },
@@ -977,8 +910,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-17 18:29:02,580 - INFO - Using default tokenizer.\n",
-      "2025-03-17 18:30:27,253 - INFO - Using default tokenizer.\n"
      ]
     },
     {
@@ -990,27 +923,49 @@
       "Evaluation Metrics:\n",
       "====================================================================================================\n",
       "ORIGINAL MODEL:\n",
-      "  ROUGE: {'rouge1': np.float64(0.033688028857640176), 'rouge2': np.float64(0.008171862977966522), 'rougeL': np.float64(0.030557406905046474), 'rougeLsum': np.float64(0.030592110084298876)}\n",
-      "  BLEU: {'bleu': 0.0036692781190090368, 'precisions': [0.02284408025462027, 0.004200643881640979, 0.002134841269783046, 0.0008848453895992066], 'brevity_penalty': 1.0, 'length_ratio': 1.1809102409373358, 'translation_length': 1421725, 'reference_length': 1203923}\n",
-      "  Fuzzy Match Score: 11.31%\n",
       "  Exact Match Accuracy: 0.00%\n",
       "\n",
       "FINE-TUNED MODEL:\n",
-      "  ROUGE: {'rouge1': np.float64(0.6914345907518044), 'rouge2': np.float64(0.5453255406268581), 'rougeL': np.float64(0.6642891642898592), 'rougeLsum': np.float64(0.6642865716725223)}\n",
-      "  BLEU: {'bleu': 0.31698443630421885, 'precisions': [0.46303833317311294, 0.34558772459086096, 0.2792686360724928, 0.2259198229483191], 'brevity_penalty': 1.0, 'length_ratio': 1.4083799379196178, 'translation_length': 1695581, 'reference_length': 1203923}\n",
-      "  Fuzzy Match Score: 81.98%\n",
-      "  Exact Match Accuracy: 16.39%\n",
       "====================================================================================================\n"
      ]
     }
    ],
    "source": [
-    "from rapidfuzz import fuzz\n",
-    "import pandas as pd\n",
     "import re\n",
     "import evaluate\n",
     "\n",
-    "# --- Helper Functions for SQL Normalization and Exact Match ---\n",
     "def normalize_sql(sql):\n",
     "    \"\"\"Normalize SQL by stripping whitespace and lowercasing.\"\"\"\n",
     "    return \" \".join(sql.strip().lower().split())\n",
@@ -1026,7 +981,16 @@
     "    scores = [fuzz.token_set_ratio(pred, ref) for pred, ref in zip(predictions, references)]\n",
     "    return sum(scores) / len(scores) if scores else 0\n",
     "\n",
-    "# --- Part A: Inference on 5 Examples with Real Responses (unchanged) ---\n",
     "logger.info(\"Running inference on 5 examples (displaying real responses).\")\n",
     "\n",
     "num_examples = 5\n",
@@ -1034,7 +998,7 @@
     "sample_contexts = dataset[\"test\"][:num_examples][\"context\"]\n",
     "sample_human_responses = dataset[\"test\"][:num_examples][\"response\"]\n",
     "\n",
-    "print(\"\\n\" + \"=\"*100)\n",
     "for idx in range(num_examples):\n",
     "    prompt = f\"\"\"Context:\n",
     "{sample_contexts[idx]}\n",
@@ -1044,14 +1008,12 @@
     "\n",
     "Response:\n",
     "\"\"\"\n",
-    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
-    "    \n",
-    "    # Generate outputs with both models using keyword arguments\n",
-    "    orig_out_ids = original_model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=200)\n",
-    "    finetuned_out_ids = finetuned_model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=200)\n",
     "    \n",
-    "    orig_text = tokenizer.decode(orig_out_ids[0], skip_special_tokens=True)\n",
-    "    finetuned_text = tokenizer.decode(finetuned_out_ids[0], skip_special_tokens=True)\n",
     "    \n",
     "    print(\"-\" * 100)\n",
     "    print(f\"Example {idx+1}\")\n",
@@ -1063,10 +1025,10 @@
     "    print(sample_human_responses[idx])\n",
     "    print(\"-\" * 100)\n",
     "    print(\"ORIGINAL MODEL OUTPUT:\")\n",
-    "    print(orig_text)\n",
     "    print(\"-\" * 100)\n",
     "    print(\"FINE-TUNED MODEL OUTPUT:\")\n",
-    "    print(finetuned_text)\n",
     "    print(\"=\" * 100 + \"\\n\")\n",
     "    clear_memory()\n",
     "\n",
@@ -1077,32 +1039,46 @@
     "all_original_responses = []\n",
     "all_finetuned_responses = []\n",
     "\n",
-    "batch_size = 128  # Adjust batch size based on your GPU memory\n",
     "test_dataset = dataset[\"test\"]\n",
     "\n",
     "for i in range(0, len(test_dataset), batch_size):\n",
     "    # Slicing the dataset returns a dict of lists\n",
-    "    batch = test_dataset[i:i+batch_size]\n",
     "    \n",
-    "    # Construct prompts for each example in the batch by iterating over indices\n",
     "    prompts = [\n",
     "        f\"Context:\\n{batch['context'][j]}\\n\\nQuery:\\n{batch['query'][j]}\\n\\nResponse:\"\n",
     "        for j in range(len(batch[\"context\"]))\n",
     "    ]\n",
     "    \n",
-    "    # Extend human responses for each example\n",
     "    all_human_responses.extend(batch[\"response\"])\n",
     "    \n",
-    "    # Tokenize the batch of prompts\n",
-    "    inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True).to(device)\n",
     "    \n",
-    "    # Generate outputs with both models for the batch\n",
-    "    orig_ids = original_model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=300)\n",
-    "    finetuned_ids = finetuned_model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=300)\n",
     "    \n",
-    "    # Decode each sample in the batch\n",
     "    orig_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in orig_ids]\n",
-    "    finetuned_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in finetuned_ids]\n",
     "    \n",
     "    all_original_responses.extend(orig_texts)\n",
     "    all_finetuned_responses.extend(finetuned_texts)\n",
@@ -1111,13 +1087,10 @@
     "# Create a DataFrame for a quick comparison of results\n",
     "zipped_all = list(zip(all_human_responses, all_original_responses, all_finetuned_responses))\n",
     "df_full = pd.DataFrame(zipped_all, columns=[\"Human Response\", \"Original Model Output\", \"Fine-Tuned Model Output\"])\n",
-    "logger.info(\"Full test set comparison (first 5 rows):\\n%s\", df_full.head())\n",
-    "print(\"\\nFull Test Set Comparison (First 5 Rows):\")\n",
-    "print(df_full.head().to_string(index=False))\n",
     "clear_memory()\n",
     "\n",
     "# --- Compute Evaluation Metrics ---\n",
-    "# Load evaluation libraries\n",
     "rouge = evaluate.load(\"rouge\")\n",
     "bleu = evaluate.load(\"bleu\")\n",
     "\n",
@@ -1149,9 +1122,9 @@
     "finetuned_fuzzy = compute_fuzzy_match(all_finetuned_responses, all_human_responses)\n",
     "finetuned_exact = compute_exact_match(all_finetuned_responses, all_human_responses)\n",
     "\n",
-    "print(\"\\n\" + \"=\"*100)\n",
     "print(\"Evaluation Metrics:\")\n",
-    "print(\"=\"*100)\n",
     "print(\"ORIGINAL MODEL:\")\n",
     "print(f\"  ROUGE: {orig_rouge}\")\n",
     "print(f\"  BLEU: {orig_bleu}\")\n",
@@ -1162,13 +1135,13 @@
     "print(f\"  BLEU: {finetuned_bleu}\")\n",
     "print(f\"  Fuzzy Match Score: {finetuned_fuzzy:.2f}%\")\n",
     "print(f\"  Exact Match Accuracy: {finetuned_exact:.2f}%\")\n",
-    "print(\"=\"*100)\n",
-    "clear_memory()\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
    "id": "462546a7-6928-4723-b00e-23c3a4091d99",
    "metadata": {},
    "outputs": [
@@ -1176,7 +1149,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-18 16:55:06,158 - INFO - Running inference with deterministic decoding and beam search.\n"
      ]
     },
     {
@@ -1191,10 +1164,7 @@
       "Retrieve the total order amount for each customer, showing only customers from the USA, and sort the result by total order amount in descending order.\n",
       "\n",
       "Response:\n",
-      "SELECT customers.name, SUM(orders.total_amount) as total_amount FROM customers INNER JOIN orders ON customers.id = orders.customer_id WHERE customers.country = 'USA' GROUP BY customers.name ORDER BY total_amount DESC;\n",
-      "\n",
-      "EXPECTED RESPONSE:\n",
-      "SELECT c.name, SUM(o.total_amount) as total_order_amount FROM customers c JOIN orders o ON c.id = o.customer_id WHERE c.country = 'USA' GROUP BY c.name ORDER BY total_order_amount DESC;\n"
      ]
     }
    ],
@@ -1214,7 +1184,7 @@
     "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
     "\n",
     "# Load the fine-tuned model and tokenizer\n",
-    "model_name = \"text2sql_flant5base_finetuned\"  # Directory of your fine-tuned model\n",
     "finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-base\")\n",
     "finetuned_model.to(device)\n",
@@ -1227,12 +1197,17 @@
     "    inputs = tokenizer(prompt_text, return_tensors=\"pt\").to(device)\n",
     "    generated_ids = finetuned_model.generate(\n",
     "        input_ids=inputs[\"input_ids\"],\n",
-    "        max_new_tokens=250,   # Adjust based on query complexity\n",
-    "        temperature=0.0,      # Deterministic output\n",
-    "        num_beams=3,          # Beam search for better output quality\n",
     "        early_stopping=True,  # Stop early if possible\n",
     "    )\n",
-    "    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
     "\n",
     "# Sample context and query (example)\n",
     "context = (\n",
@@ -1264,16 +1239,6 @@
     "logger.info(\"Running inference with deterministic decoding and beam search.\")\n",
     "generated_sql = run_inference(sample_prompt)\n",
     "\n",
-    "# Define the expected response (this is a placeholder - update as necessary)\n",
-    "expected_response = (\n",
-    "    \"SELECT c.name, SUM(o.total_amount) as total_order_amount \"\n",
-    "    \"FROM customers c \"\n",
-    "    \"JOIN orders o ON c.id = o.customer_id \"\n",
-    "    \"WHERE c.country = 'USA' \"\n",
-    "    \"GROUP BY c.name \"\n",
-    "    \"ORDER BY total_order_amount DESC;\"\n",
-    ")\n",
-    "\n",
     "# Print output in the given format\n",
     "print(\"Prompt:\")\n",
     "print(\"Context:\")\n",
@@ -1281,14 +1246,12 @@
     "print(\"\\nQuery:\")\n",
     "print(query)\n",
     "print(\"\\nResponse:\")\n",
-    "print(generated_sql)\n",
-    "print(\"\\nEXPECTED RESPONSE:\")\n",
-    "print(expected_response)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "id": "a69f268e-bc69-4633-9c15-4e118c20178e",
    "metadata": {},
    "outputs": [
@@ -1319,22 +1282,22 @@
     "# Load fine-tuned LoRA adapter model\n",
     "lora_model = PeftModel.from_pretrained(base_model, lora_model_path)\n",
     "\n",
-    "# Save the LoRA adapter separately (for users who want lightweight adapters)\n",
     "lora_model.save_pretrained(lora_model_path)\n",
     "tokenizer.save_pretrained(lora_model_path)\n",
     "\n",
-    "# Merge LoRA into the base model to create a fully fine-tuned model\n",
     "merged_model = lora_model.merge_and_unload()\n",
     "\n",
-    "# Save the full fine-tuned model\n",
     "merged_model.save_pretrained(full_model_output_path)\n",
     "tokenizer.save_pretrained(full_model_output_path)\n",
     "\n",
-    "# Save generation config (optional but recommended for inference settings)\n",
     "generation_config = {\n",
-    "    \"max_new_tokens\": 250,\n",
-    "    \"temperature\": 0.0,\n",
-    "    \"num_beams\": 3,\n",
     "    \"early_stopping\": True\n",
     "}\n",
     "with open(f\"{full_model_output_path}/generation_config.json\", \"w\") as f:\n",
@@ -1346,7 +1309,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
    "id": "f1c95dfc-6662-44d8-8ecc-bff414fecee5",
    "metadata": {},
    "outputs": [
@@ -1354,22 +1317,11 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2025-03-18 16:55:46,428 - INFO - Running inference with beam search decoding.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Prompt:\n",
-      "Context:\n",
-      "CREATE TABLE employees (id INT PRIMARY KEY, name VARCHAR(100), department VARCHAR(50), salary INT); CREATE TABLE projects (project_id INT PRIMARY KEY, project_name VARCHAR(100), budget INT); CREATE TABLE employee_projects (employee_id INT, project_id INT, role VARCHAR(50), FOREIGN KEY (employee_id) REFERENCES employees(id), FOREIGN KEY (project_id) REFERENCES projects(project_id)); INSERT INTO employees (id, name, department, salary) VALUES (1, 'Alice', 'Engineering', 90000), (2, 'Bob', 'Marketing', 70000), (3, 'Charlie', 'Engineering', 95000), (4, 'David', 'HR', 60000), (5, 'Eve', 'Engineering', 110000); INSERT INTO projects (project_id, project_name, budget) VALUES (101, 'AI Research', 500000), (102, 'Marketing Campaign', 200000), (103, 'Cloud Migration', 300000); INSERT INTO employee_projects (employee_id, project_id, role) VALUES (1, 101, 'Lead Engineer'), (2, 102, 'Marketing Specialist'), (3, 101, 'Engineer'), (4, 103, 'HR Coordinator'), (5, 101, 'AI Scientist');\n",
-      "\n",
-      "Query:\n",
-      "Find the names of employees who are working on the 'AI Research' project along with their roles.\n",
-      "\n",
-      "Response:\n",
-      "SELECT employees.name, employee_projects.role FROM employees INNER JOIN employee_projects ON employees.id = employee_projects.employee_id INNER JOIN projects ON employee_projects.project_id = projects.project_id WHERE projects.project_name = 'AI Research';\n"
      ]
     }
    ],
@@ -1462,7 +1414,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "562458ed-53f4-44af-a7a3-e42a175c7245",
    "metadata": {},
    "outputs": [],
    "source": []

   {
    "cell_type": "code",
    "execution_count": 1,
    "id": "5f167a6f-5139-46e6-afb2-a1fa4d12f3fd",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "53684b5e-c27e-4eb9-815e-583aa194e096",
    "metadata": {},
    "outputs": [
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "id": "a47bf3cd-752d-4d1c-9697-70098d6204fa",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "f16df21e-9797-4f78-83a1-a2943759ba55",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "196e83da-6c8c-4cd7-bd70-2598a5e2a16a",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "cea22b9f-f309-4151-81ac-37547c8feeb0",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "d4eb82ce-1713-40b6-981d-43ce35aaa6f6",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 14:56:53,295 - INFO - Loading raw datasets from various sources...\n",
+      "2025-03-19 14:57:25,655 - INFO - Total rows before dropping duplicates: 490241\n",
+      "2025-03-19 14:57:27,208 - INFO - Total rows after dropping duplicates: 440785\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "id": "8446814e-5a2c-48a4-8c01-059afcf1d3c1",
    "metadata": {},
    "outputs": [
      "output_type": "stream",
      "text": [
       "Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors\n",
+      "2025-03-19 15:01:13,787 - INFO - Total rows after filtering by token length (prompt <= 500 and response <= 250 tokens): 398481\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "id": "177e1e6d-9fbc-442d-9774-5a3e5234329f",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:13,794 - INFO - Sample from filtered final_df:\n",
       "                                               query  \\\n",
       "0           Name the home team for carlton away team   \n",
       "1  what will the population of Asia be when Latin...   \n",
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "id": "0b639efe-ebeb-4b34-bc3f-accf776ba0da",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:14,006 - INFO - Final split sizes: Train: 338708, Test: 39848, Validation: 19925\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "81e753f720e44f40b5f0dfa5263e2bf5",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "59b1ce0d9ee548668dbc87b99d6e0951",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4a378405a0a24c13a81fc853550d01d6",
        "version_major": 2,
        "version_minor": 0
       },
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:15,490 - INFO - Merged and Saved Dataset Successfully!\n",
+      "2025-03-19 15:01:15,497 - INFO - Dataset summary: DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['query', 'context', 'response'],\n",
       "        num_rows: 338708\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
    "id": "9f6e1095-d72d-4e22-b20d-683f1f84544c",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:15,843 - INFO - Reloaded dataset from disk. Example from test split:\n",
       "{'query': \"Show the name and type of military cyber commands in the 'Military_Cyber_Commands' table.\", 'context': \"CREATE SCHEMA IF NOT EXISTS defense_security;CREATE TABLE IF NOT EXISTS defense_security.Military_Cyber_Commands (id INT PRIMARY KEY, command_name VARCHAR(255), type VARCHAR(255));INSERT INTO defense_security.Military_Cyber_Commands (id, command_name, type) VALUES (1, 'USCYBERCOM', 'Defensive Cyber Operations'), (2, 'JTF-CND', 'Offensive Cyber Operations'), (3, '10th Fleet', 'Network Warfare');\", 'response': 'SELECT command_name, type FROM defense_security.Military_Cyber_Commands;'}\n",
+      "2025-03-19 15:01:16,155 - INFO - Loaded Tokenized Dataset from disk.\n",
+      "2025-03-19 15:01:16,159 - INFO - Final tokenized dataset splits: dict_keys(['train', 'test', 'validation'])\n",
+      "2025-03-19 15:01:16,167 - INFO - Sample tokenized record from train split:\n",
       "{'input_ids': tensor([ 1193,  6327,    10,   205,  4386,  6048,   332, 17098,   953,   834,\n",
       "         4350,   834,  4013,    41,   234,   834, 11650,   584,  4280, 28027,\n",
       "            6,   550,   834, 11650,   584,  4280, 28027,     3,    61,     3,\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "id": "7f004e55-181c-47aa-9f3e-c7c1ceae780c",
    "metadata": {},
    "outputs": [
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "id": "f50e56c7-98b3-42bc-9129-89f3eff802e7",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:30,827 - INFO - Attempting to load the fine-tuned model...\n",
+      "2025-03-19 15:01:32,195 - INFO - Fine-tuned model loaded successfully.\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "id": "f364eb6b-56cb-4533-8ef6-b5e7f56895aa",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:32,235 - INFO - Running inference on 5 examples (displaying real responses).\n",
+      "/venv/main/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:629: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
+      "  warnings.warn(\n"
      ]
     },
     {
       "SELECT command_name, type FROM defense_security.Military_Cyber_Commands;\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
+      "USCYBERCOM, JTF-CND, Offensive Cyber Operations\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT command_name, type FROM defense_security.Military_Cyber_Commands;\n",
       "SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
+      "10000, 2022-01-01\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT SUM(cost) FROM incidents WHERE cause = 'insider threat' AND date >= DATE_SUB(CURRENT_DATE, INTERVAL 6 MONTH);\n",
       "SELECT COUNT(posts.id) FROM posts INNER JOIN users ON posts.user_id = users.id WHERE users.location = 'Australia' AND posts.created_at >= DATE_SUB(NOW(), INTERVAL 1 MONTH);\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
+      "The total number of posts made by users located in Australia is 50.\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT COUNT(*) FROM posts p JOIN users u ON p.user_id = u.id WHERE u.location = 'Australia' AND p.created_at >= DATE_SUB(CURRENT_DATE, INTERVAL 1 MONTH);\n",
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 15:01:40,448 - INFO - Starting evaluation on the full test set using batching.\n"
      ]
     },
     {
       "SELECT Country, SUM(Capacity) as TotalCapacity FROM WindFarms GROUP BY Country;\n",
       "----------------------------------------------------------------------------------------------------\n",
       "ORIGINAL MODEL OUTPUT:\n",
+      "1, 150, USA, 2, 200, Canada, 3, 120, Mexico\n",
       "----------------------------------------------------------------------------------------------------\n",
       "FINE-TUNED MODEL OUTPUT:\n",
       "SELECT Country, SUM(Capacity) FROM WindFarms GROUP BY Country;\n",
       "\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7beecee09a34f9790be1e4538a87442",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "763373c451c94f5e92bc6a6253109275",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "afdce82cb8964da788756d783539ee8d",
        "version_major": 2,
        "version_minor": 0
       },
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 16:47:58,173 - INFO - Using default tokenizer.\n",
+      "2025-03-19 16:49:07,668 - INFO - Using default tokenizer.\n"
      ]
     },
     {
       "Evaluation Metrics:\n",
       "====================================================================================================\n",
       "ORIGINAL MODEL:\n",
+      "  ROUGE: {'rouge1': np.float64(0.05646642898660111), 'rouge2': np.float64(0.01562815013068162), 'rougeL': np.float64(0.05031267225420556), 'rougeLsum': np.float64(0.05036072587316542)}\n",
+      "  BLEU: {'bleu': 0.003142147128241449, 'precisions': [0.12293406776920406, 0.03289697910893642, 0.018512080104175887, 0.008342750223825794], 'brevity_penalty': 0.11177079327444009, 'length_ratio': 0.3133514352662089, 'translation_length': 377251, 'reference_length': 1203923}\n",
+      "  Fuzzy Match Score: 13.98%\n",
       "  Exact Match Accuracy: 0.00%\n",
       "\n",
       "FINE-TUNED MODEL:\n",
+      "  ROUGE: {'rouge1': np.float64(0.7538800834024002), 'rouge2': np.float64(0.6103863808522726), 'rougeL': np.float64(0.7262841884754194), 'rougeLsum': np.float64(0.7261852209847466)}\n",
+      "  BLEU: {'bleu': 0.4719774431701209, 'precisions': [0.7603153442288385, 0.598309257795389, 0.5021259810303533, 0.42128998564638875], 'brevity_penalty': 0.8474086962179814, 'length_ratio': 0.8579477258927689, 'translation_length': 1032903, 'reference_length': 1203923}\n",
+      "  Fuzzy Match Score: 85.62%\n",
+      "  Exact Match Accuracy: 18.29%\n",
       "====================================================================================================\n"
      ]
     }
    ],
    "source": [
+    "import logging\n",
     "import re\n",
+    "import pandas as pd\n",
+    "from rapidfuzz import fuzz\n",
     "import evaluate\n",
     "\n",
+    "# Assuming tokenizer, device, original_model, finetuned_model, and dataset are already defined.\n",
+    "# Define a helper function for output post-processing.\n",
+    "def post_process_output(output_text: str) -> str:\n",
+    "    \"\"\"Post-process the generated output to remove repeated text.\"\"\"\n",
+    "    # Keep only the first statement: the text up to and including the first semicolon (validity is not checked)\n",
+    "    return output_text.split(\";\")[0] + \";\" if \";\" in output_text else output_text\n",
+    "\n",
+    "# Define a helper function for generating outputs with the given generation parameters.\n",
+    "def generate_with_params(model, input_ids):\n",
+    "    generated_ids = model.generate(\n",
+    "        input_ids=input_ids,\n",
+    "        max_new_tokens=100, \n",
+    "        num_beams=5,\n",
+    "        repetition_penalty=1.2,\n",
+    "        temperature=0.1,\n",
+    "        early_stopping=True\n",
+    "    )\n",
+    "    # Decode only the first returned sequence; any post-processing is left to the caller\n",
+    "    output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "    return output_text\n",
+    "\n",
+    "# Helper functions for SQL normalization and evaluation metrics\n",
     "def normalize_sql(sql):\n",
     "    \"\"\"Normalize SQL by stripping whitespace and lowercasing.\"\"\"\n",
     "    return \" \".join(sql.strip().lower().split())\n",
     "    scores = [fuzz.token_set_ratio(pred, ref) for pred, ref in zip(predictions, references)]\n",
     "    return sum(scores) / len(scores) if scores else 0\n",
     "\n",
+    "# Placeholder for memory cleanup; currently a no-op.\n",
+    "def clear_memory():\n",
+    "    # If using torch.cuda, you can clear cache:\n",
+    "    # torch.cuda.empty_cache()\n",
+    "    pass\n",
+    "\n",
+    "logger = logging.getLogger(__name__)\n",
+    "logger.setLevel(logging.INFO)\n",
+    "\n",
+    "# --- Part A: Inference on 5 Examples with Real Responses ---\n",
     "logger.info(\"Running inference on 5 examples (displaying real responses).\")\n",
     "\n",
     "num_examples = 5\n",
     "sample_contexts = dataset[\"test\"][:num_examples][\"context\"]\n",
     "sample_human_responses = dataset[\"test\"][:num_examples][\"response\"]\n",
     "\n",
+    "print(\"\\n\" + \"=\" * 100)\n",
     "for idx in range(num_examples):\n",
     "    prompt = f\"\"\"Context:\n",
     "{sample_contexts[idx]}\n",
     "\n",
     "Response:\n",
     "\"\"\"\n",
+    "    # Tokenize the prompt and move to device\n",
+    "    inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=512).to(device)\n",
     "    \n",
+    "    # Generate outputs using the modified generation parameters\n",
+    "    orig_out = generate_with_params(original_model, inputs[\"input_ids\"])\n",
+    "    finetuned_out = post_process_output(generate_with_params(finetuned_model, inputs[\"input_ids\"]))\n",
     "    \n",
     "    print(\"-\" * 100)\n",
     "    print(f\"Example {idx+1}\")\n",
     "    print(sample_human_responses[idx])\n",
     "    print(\"-\" * 100)\n",
     "    print(\"ORIGINAL MODEL OUTPUT:\")\n",
+    "    print(orig_out)\n",
     "    print(\"-\" * 100)\n",
     "    print(\"FINE-TUNED MODEL OUTPUT:\")\n",
+    "    print(finetuned_out)\n",
     "    print(\"=\" * 100 + \"\\n\")\n",
     "    clear_memory()\n",
     "\n",
     "all_original_responses = []\n",
     "all_finetuned_responses = []\n",
     "\n",
+    "batch_size = 128  # Adjust based on GPU memory\n",
     "test_dataset = dataset[\"test\"]\n",
     "\n",
     "for i in range(0, len(test_dataset), batch_size):\n",
     "    # Slicing the dataset returns a dict of lists\n",
+    "    batch = test_dataset[i:i + batch_size]\n",
     "    \n",
+    "    # Construct prompts for each example in the batch\n",
     "    prompts = [\n",
     "        f\"Context:\\n{batch['context'][j]}\\n\\nQuery:\\n{batch['query'][j]}\\n\\nResponse:\"\n",
     "        for j in range(len(batch[\"context\"]))\n",
     "    ]\n",
     "    \n",
+    "    # Extend human responses\n",
     "    all_human_responses.extend(batch[\"response\"])\n",
     "    \n",
+    "    # Tokenize the batch of prompts with padding and truncation\n",
+    "    inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True, max_length=512).to(device)\n",
     "    \n",
+    "    # Generate outputs for the batch for both models\n",
+    "    orig_ids = original_model.generate(\n",
+    "        input_ids=inputs[\"input_ids\"],\n",
+    "        max_new_tokens=100,\n",
+    "        num_beams=5,\n",
+    "        repetition_penalty=1.2,\n",
+    "        temperature=0.1,\n",
+    "        early_stopping=True\n",
+    "    )\n",
+    "    finetuned_ids = finetuned_model.generate(\n",
+    "        input_ids=inputs[\"input_ids\"],\n",
+    "        max_new_tokens=100,\n",
+    "        num_beams=5,\n",
+    "        repetition_penalty=1.2,\n",
+    "        temperature=0.1,\n",
+    "        early_stopping=True\n",
+    "    )\n",
     "    \n",
+    "    # Decode and post-process each sample in the batch\n",
     "    orig_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in orig_ids]\n",
+    "    finetuned_texts = [post_process_output(tokenizer.decode(ids, skip_special_tokens=True)) for ids in finetuned_ids]\n",
     "    \n",
     "    all_original_responses.extend(orig_texts)\n",
     "    all_finetuned_responses.extend(finetuned_texts)\n",
     "# Create a DataFrame for a quick comparison of results\n",
     "zipped_all = list(zip(all_human_responses, all_original_responses, all_finetuned_responses))\n",
     "df_full = pd.DataFrame(zipped_all, columns=[\"Human Response\", \"Original Model Output\", \"Fine-Tuned Model Output\"])\n",
+    "df_full.to_csv('evaluation_results.csv', index=False)\n",
     "clear_memory()\n",
     "\n",
     "# --- Compute Evaluation Metrics ---\n",
     "rouge = evaluate.load(\"rouge\")\n",
     "bleu = evaluate.load(\"bleu\")\n",
     "\n",
     "finetuned_fuzzy = compute_fuzzy_match(all_finetuned_responses, all_human_responses)\n",
     "finetuned_exact = compute_exact_match(all_finetuned_responses, all_human_responses)\n",
     "\n",
+    "print(\"\\n\" + \"=\" * 100)\n",
     "print(\"Evaluation Metrics:\")\n",
+    "print(\"=\" * 100)\n",
     "print(\"ORIGINAL MODEL:\")\n",
     "print(f\"  ROUGE: {orig_rouge}\")\n",
     "print(f\"  BLEU: {orig_bleu}\")\n",
     "print(f\"  BLEU: {finetuned_bleu}\")\n",
     "print(f\"  Fuzzy Match Score: {finetuned_fuzzy:.2f}%\")\n",
     "print(f\"  Exact Match Accuracy: {finetuned_exact:.2f}%\")\n",
+    "print(\"=\" * 100)\n",
+    "clear_memory()"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "id": "462546a7-6928-4723-b00e-23c3a4091d99",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "2025-03-19 16:51:05,225 - INFO - Running inference with deterministic decoding and beam search.\n"
      ]
     },
     {
       "Retrieve the total order amount for each customer, showing only customers from the USA, and sort the result by total order amount in descending order.\n",
       "\n",
       "Response:\n",
+      "SELECT customer_id, SUM(total_amount) as total_amount FROM orders JOIN customers ON orders.customer_id = customers.id WHERE customers.country = 'USA' GROUP BY customer_id ORDER BY total_amount DESC;\n"
      ]
     }
    ],
     "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
     "\n",
     "# Load the fine-tuned model and tokenizer\n",
+    "model_name = \"text2sql_flant5base_finetuned\" \n",
     "finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-base\")\n",
     "finetuned_model.to(device)\n",
     "    inputs = tokenizer(prompt_text, return_tensors=\"pt\").to(device)\n",
     "    generated_ids = finetuned_model.generate(\n",
     "        input_ids=inputs[\"input_ids\"],\n",
+    "        max_new_tokens=100,   # Adjust based on query complexity\n",
+    "        temperature=0.1,      # Only used when do_sample=True; it has no effect on this beam-search call (see the UserWarning)\n",
+    "        num_beams=5,          # Beam search for better output quality\n",
     "        early_stopping=True,  # Stop early if possible\n",
     "    )\n",
+    "    generated_sql = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n",
+    "\n",
+    "    # Post-processing to remove repeated text\n",
+    "    generated_sql = generated_sql.split(\";\")[0] + \";\"  # Keep the text before the first semicolon; a semicolon is appended even if none was generated\n",
+    "\n",
+    "    return generated_sql\n",
     "\n",
     "# Sample context and query (example)\n",
     "context = (\n",
     "logger.info(\"Running inference with deterministic decoding and beam search.\")\n",
     "generated_sql = run_inference(sample_prompt)\n",
     "\n",
     "# Print output in the given format\n",
     "print(\"Prompt:\")\n",
     "print(\"Context:\")\n",
     "print(\"\\nQuery:\")\n",
     "print(query)\n",
     "print(\"\\nResponse:\")\n",
+    "print(generated_sql)\n"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "id": "a69f268e-bc69-4633-9c15-4e118c20178e",
    "metadata": {},
    "outputs": [
     "# Load fine-tuned LoRA adapter model\n",
     "lora_model = PeftModel.from_pretrained(base_model, lora_model_path)\n",
     "\n",
+    "# ✅ Save the LoRA adapter separately (for users who want lightweight adapters)\n",
     "lora_model.save_pretrained(lora_model_path)\n",
     "tokenizer.save_pretrained(lora_model_path)\n",
     "\n",
+    "# ✅ Merge LoRA into the base model to create a fully fine-tuned model\n",
     "merged_model = lora_model.merge_and_unload()\n",
     "\n",
+    "# ✅ Save the full fine-tuned model\n",
     "merged_model.save_pretrained(full_model_output_path)\n",
     "tokenizer.save_pretrained(full_model_output_path)\n",
     "\n",
+    "# ✅ Save generation config (optional but recommended for inference settings)\n",
     "generation_config = {\n",
+    "    \"max_new_tokens\": 100,\n",
+    "    \"temperature\": 0.1,\n",
+    "    \"num_beams\": 5,\n",
     "    \"early_stopping\": True\n",
     "}\n",
     "with open(f\"{full_model_output_path}/generation_config.json\", \"w\") as f:\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "f1c95dfc-6662-44d8-8ecc-bff414fecee5",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "/venv/main/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:629: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
+      "  warnings.warn(\n",
+      "/venv/main/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:629: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
+      "  warnings.warn(\n",
+      "2025-03-19 16:51:49,933 - INFO - Running inference with beam search decoding.\n"
      ]
     }
    ],
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "97425ac4-ad46-4f38-b22d-071e161da20a",
    "metadata": {},
    "outputs": [],
    "source": []