Mubbashir Ahmed committed on
Commit
ec635d6
·
1 Parent(s): a1b742b
Files changed (1)
  1. app.py +74 -75
app.py CHANGED
@@ -3,6 +3,8 @@ import random
 import time
 import json
 import gradio as gr
+import csv
+from datetime import datetime
 from huggingface_hub import InferenceClient
 
 # ------------------------
@@ -36,6 +38,7 @@ model_list = {
 # ------------------------
 # Prompt Template for SQL Generation
 # ------------------------
+
 def build_prompt(user_question):
     return f"""You are an expert SQL assistant. Convert the given question into a valid SQL query.
 
@@ -57,89 +60,85 @@ Q: {user_question}
 A:"""
 
 # ------------------------
-# Inference + Evaluation Logic
+# Evaluation + Batch Logic
 # ------------------------
-def evaluate_all_models(user_input, expected_sql, chat_history):
-    evaluations = []
-    full_chat_transcript = ""
-    prompt = build_prompt(user_input)
-
-    for model_name, model_config in model_list.items():
-        client = model_config["client"]
-        model_id = model_config["model_id"]
-
-        messages = chat_history + [{"role": "user", "content": prompt}]
-        try:
-            start_time = time.time()
-
-            result = client.chat.completions.create(
-                model=model_id,
-                messages=messages
-            )
-            model_sql = result.choices[0].message.content
-            latency = int((time.time() - start_time) * 1000)
-
-        except Exception as e:
-            model_sql = f"⚠️ Error: {str(e)}"
-            latency = -1
-
-        # Evaluation criteria (simulated)
-        sql_gen_accuracy = "✅" if expected_sql.strip().lower() in model_sql.strip().lower() else "❌"
-        exec_response_accuracy = "✅" if sql_gen_accuracy == "✅" else "❌"
-        intent_clarity = "✅" if len(user_input.strip().split()) < 5 and "SELECT" in model_sql.upper() else "❌"
-        semantic_clarity = "✅" if any(word in model_sql.lower() for word in ["from", "join", "group by"]) else "❌"
-        latency_status = "✅" if latency <= 1000 else "❌"
-
-        summary = (
-            f"### 🤖 {model_name} Evaluation\n"
-            f"- SQL Generation Match: {sql_gen_accuracy}\n"
-            f"- Execution Accuracy: {exec_response_accuracy}\n"
-            f"- Intent Clarification: {intent_clarity}\n"
-            f"- Semantic Mapping: {semantic_clarity}\n"
-            f"- Response Latency: {latency} ms ({latency_status})\n"
-        )
-        evaluations.append(summary)
-
-        full_chat_transcript += f"\n👤 User: {user_input}\n🤖 {model_name}: {model_sql}\n"
-
-    return full_chat_transcript.strip(), chat_history, "\n\n".join(evaluations)
-
-# ------------------------
-# Load Random Spider Prompt
-# ------------------------
-def get_random_spider_prompt():
-    sample = random.choice(spider_dataset)
-    return sample["question"], sample["query"], sample["query"]
+def evaluate_batch(n=50):
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    output_path = f"evaluation_results_{timestamp}.csv"
+
+    results = []
+    selected_samples = random.sample(spider_dataset, n)
+
+    for idx, sample in enumerate(selected_samples):
+        user_question = sample["question"]
+        expected_sql = sample["query"]
+        prompt = build_prompt(user_question)
+
+        row = {
+            "question": user_question,
+            "gold_sql": expected_sql
+        }
+
+        for model_name, model_config in model_list.items():
+            client = model_config["client"]
+            model_id = model_config["model_id"]
+            try:
+                start_time = time.time()
+                result = client.chat.completions.create(
+                    model=model_id,
+                    messages=[{"role": "user", "content": prompt}]
+                )
+                model_sql = result.choices[0].message.content
+                latency = int((time.time() - start_time) * 1000)
+            except Exception as e:
+                model_sql = f"ERROR: {str(e)}"
+                latency = -1
+
+            sql_gen_accuracy = "✅" if expected_sql.strip().lower() in model_sql.strip().lower() else "❌"
+            exec_response_accuracy = "✅" if sql_gen_accuracy == "✅" else "❌"
+            intent_clarity = "✅" if len(user_question.strip().split()) < 5 and "SELECT" in model_sql.upper() else "❌"
+            semantic_clarity = "✅" if any(word in model_sql.lower() for word in ["from", "join", "group by"]) else "❌"
+            latency_status = "✅" if latency <= 1000 else "❌"
+
+            row.update({
+                f"{model_name}_sql": model_sql,
+                f"{model_name}_sql_match": sql_gen_accuracy,
+                f"{model_name}_exec_match": exec_response_accuracy,
+                f"{model_name}_intent_clarity": intent_clarity,
+                f"{model_name}_semantic_clarity": semantic_clarity,
+                f"{model_name}_latency_ms": latency,
+                f"{model_name}_latency_status": latency_status
+            })
+
+        results.append(row)
+        print(f"[{idx+1}/{n}] Done: {user_question}")
+
+    # Save to CSV
+    fieldnames = results[0].keys()
+    with open(output_path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(results)
+
+    print(f"\n✅ Evaluation completed and saved to: {output_path}")
+    return output_path
 
 # ------------------------
-# Gradio UI
+# Gradio UI for batch evaluation
 # ------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Spider Dataset Model Evaluation")
-
-    prompt_input = gr.Textbox(label="Your Prompt", lines=3, placeholder="Ask your BI question...")
-    expected_sql_display = gr.Textbox(label="Expected SQL", lines=2, interactive=False)
-
-    load_spider_btn = gr.Button("🔀 Load Random Spider Prompt")
-    run_button = gr.Button("Send & Evaluate All Models")
-
-    chat_display = gr.Textbox(label="Chat History", lines=20, interactive=False)
-    evaluation_display = gr.Markdown()
-
-    chat_memory = gr.State([])
-    expected_sql = gr.State("")
-
-    load_spider_btn.click(
-        fn=get_random_spider_prompt,
-        inputs=[],
-        outputs=[prompt_input, expected_sql, expected_sql_display]
-    )
-
-    run_button.click(
-        fn=evaluate_all_models,
-        inputs=[prompt_input, expected_sql, chat_memory],
-        outputs=[chat_display, chat_memory, evaluation_display]
-    )
+    gr.Markdown("## 🧠 Run Batch Evaluation on Spider Dataset")
+
+    num_samples = gr.Slider(10, 100, value=50, step=10, label="Number of Random Samples")
+    run_button = gr.Button("🚀 Run Evaluation")
+    download_output = gr.File(label="Download Evaluation CSV")
+
+    def run_eval(n):
+        return evaluate_batch(n)
+
+    run_button.click(fn=run_eval, inputs=[num_samples], outputs=[download_output])
 
 # Launch
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
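
A minimal sketch (not part of this commit) of how the `evaluation_results_*.csv` file written by `evaluate_batch()` could be summarized after a run. It assumes pandas is installed and relies only on the `{model_name}_sql_match` / `{model_name}_latency_ms` column pattern from the diff above; the filename is hypothetical.

```python
import pandas as pd

# Hypothetical output file from a previous evaluate_batch() run.
df = pd.read_csv("evaluation_results_20240101120000.csv")

for col in df.columns:
    if col.endswith("_sql_match"):
        model = col[: -len("_sql_match")]
        # Share of rows where the gold SQL appeared inside the model output.
        match_rate = (df[col] == "✅").mean()
        # Mean latency, ignoring the -1 sentinel written on request errors.
        lat = df.loc[df[f"{model}_latency_ms"] >= 0, f"{model}_latency_ms"]
        print(f"{model}: match rate {match_rate:.0%}, mean latency {lat.mean():.0f} ms")
```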