Updated RAG evaluation function to match the one used in baseline testing
- __pycache__/rag_metadata.cpython-312.pyc  +0 -0
- test_rag.py  +102 -39

__pycache__/rag_metadata.cpython-312.pyc ADDED
Binary file (3.32 kB)
test_rag.py CHANGED
@@ -24,7 +24,7 @@ print("\n")
 # ------------------------------
 # Load tokenizer and model
 # ------------------------------
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 tokenizer = AutoTokenizer.from_pretrained("./deepseek-coder-1.3b-instruct")
 model = AutoModelForCausalLM.from_pretrained(
     "./deepseek-coder-1.3b-instruct",
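The only functional change in this hunk is pinning inference to the first GPU ("cuda:0"). As a hedged aside, a more defensive version of that line could look like the sketch below; resolve_device is a hypothetical helper written for illustration, not something defined in test_rag.py:

import torch

def resolve_device(preferred: str = "cuda:0") -> torch.device:
    # Use the requested GPU only when CUDA is actually available; otherwise fall back to CPU.
    if torch.cuda.is_available():
        return torch.device(preferred)
    return torch.device("cpu")

device = resolve_device()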
@@ -160,48 +160,111 @@ retriever.add_documents(metadata_docs)
 # ------------------------------
 # Define a function to compare model output to ground truth
 # ------------------------------
-def compare_result(sample_query, sample_result, ...
-    # ...
-    if ...
-        query = ...
-    elif ...
-        query = ...
-    else:
-        query = ...
-
-    # ...
-    ...
-    # Simple function to clean strings: removes whitespace and lowercases.
-    clean_str = lambda s: "".join(s.split()).lower()
-
-    # Compare the generated query text with the sample query.
-    query_match = (clean_str(query) == clean_str(sample_query))
-
-    # ...
-    try:
-        ...
-    except Exception:
-        # Otherwise, do a cleaned string comparison.
-        result_match = (clean_str(str(sample_result)) == clean_str(str(actual_result)))
-
-    overall_valid = query_match and result_match
-    ...
+def compare_result(sample_query, sample_result, query_output):
+    # Clean model output to only have the query output
+    if query_output[0:8] == "SQLite:\n":
+        query = query_output[8:]
+    elif query_output[0:8] == "SQLite: ":
+        query = query_output[8:]
+    elif query_output[0:7] == "SQLite:":
+        query = query_output[7:]
+    elif query_output[0:5] == "SQL:\n":
+        query = query_output[5:]
+    elif query_output[0:5] == "SQL: ":
+        query = query_output[5:]
+    elif query_output[0:4] == "SQL:":
+        query = query_output[4:]
+    else:
+        query = query_output
+
+    # Clean any excess text after the query semicolon
+    for i in range(len(query)):
+        if query[i] == ";":
+            query = query[:i+1]
+            break
+
+    # Try to execute query, if it fails, then this is a failure of the model
+    try:
+        # Execute query and obtain result
+        cursor.execute(query)
+        rows = cursor.fetchall()
+
+        # Strip all whitespace before comparing queries since there may be differences in spacing, newlines, tabs, etc.
+        query = query.replace(" ", "").replace("\n", "").replace("\t", "")
+        sample_query = sample_query.replace(" ", "").replace("\n", "").replace("\t", "")
+        query_match = (query == sample_query)
+
+        # If the queries match, the results clearly also match
+        if query_match:
+            return True, True, True
+
+        # Check if this is a multi-line query
+        if "|" in sample_result or "(" in sample_result:
+            #print(rows)
+            # Create list of results by stripping separators and splitting on them
+            if "(" in sample_result:
+                sample_result = sample_result.replace("(", "").replace(")", "")
+                result_list = sample_result.split(",")
+            else:
+                result_list = sample_result.split("|")
+
+            # Strip all results in list
+            for i in range(len(result_list)):
+                result_list[i] = str(result_list[i]).strip()
+
+            # Loop through model result and see if it matches training example
+            result = False
+            for row in rows:
+                for r in row:
+                    for res in result_list:
+                        try:
+                            if math.isclose(float(r), float(res), abs_tol=0.5):
+                                return True, query_match, True
+                        except:
+                            if str(r) in res or res in str(r):
+                                return True, query_match, True
+
+            # Check if the model returned a sum of examples as opposed to the whole thing
+            if len(rows) == 1:
+                for r in rows[0]:
+                    if r == str(len(result_list)):
+                        return True, query_match, True
+
+            return True, query_match, result
+        # Else the sample result is a single value or string
+        else:
+            #print(rows)
+            result = False
+            # Loop through model result and see if it contains the sample result
+            for row in rows:
+                for r in row:
+                    # Check by string
+                    if str(r) in str(sample_result):
+                        try:
+                            if math.isclose(float(r), float(sample_result), abs_tol=0.5):
+                                return True, query_match, True
+                        except:
+                            return True, query_match, True
+                    # Check by number, using try incase the cast as float fails
+                    try:
+                        if math.isclose(float(r), float(sample_result), abs_tol=0.5):
+                            return True, query_match, True
+                    except:
+                        pass
+
+            # Check if the model returned a list of examples instead of a total sum (both acceptable)
+            try:
+                if len(rows) > 1 and len(rows) == int(sample_result):
+                    return True, query_match, True
+                if len(rows[0]) > 1 and rows[0][1] is not None and len(rows[0]) == int(sample_result):
+                    return True, query_match, True
+            except:
+                pass
+
+            # Compare results and return
+            return True, query_match, result
+    except:
+        return False, False, False
 
 # ------------------------------
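For reference, here is a minimal sketch of how the new compare_result is meant to be called. It assumes the function (and the module-level cursor it reads) is pasted into the same script; the table, rows, and model output below are invented purely for illustration:

import sqlite3

connection = sqlite3.connect(":memory:")
cursor = connection.cursor()  # compare_result executes queries through this module-level cursor
cursor.execute("CREATE TABLE team (name TEXT, wins INTEGER)")
cursor.execute("INSERT INTO team VALUES ('Hawks', 41), ('Bulls', 40)")

sample_query = "SELECT wins FROM team WHERE name = 'Hawks';"
sample_result = "41"
model_output = "SQLite: SELECT wins FROM team WHERE name = 'Hawks'; -- extra text the model appended"

valid, query_match, result_match = compare_result(sample_query, sample_result, model_output)
print(valid, query_match, result_match)  # should print: True True True

Because the prefix stripping, semicolon truncation, and whitespace-insensitive comparison all succeed here, the call short-circuits on the query match and never needs to compare result rows.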
@@ -313,7 +376,7 @@ Request: {row["natural_query"]}
 actual_result = "Error executing query: " + str(e)
 
 # Compare the ground truth query and expected result to the generated query and actual result.
-valid, sql_matched, result_matched = compare_result(row["sql_query"], row["result"], generated_query
+valid, sql_matched, result_matched = compare_result(row["sql_query"], row["result"], generated_query)
 print("=============================================")
 print(f"Overall Valid: {valid}")
 print(f"SQL Query Matched: {sql_matched}")