Mubbashir Ahmed committed
Commit 26fe788 · 1 Parent(s): f34278a

evaluating each model together

Files changed (1)
  1. app.py +48 -55
app.py CHANGED
@@ -21,71 +21,70 @@ spider_dataset = load_dataset("spider", split="train")
 llama_client = InferenceClient(provider="fireworks-ai", api_key=HF_TOKEN)
 qwen_client = InferenceClient(provider="featherless-ai", api_key=HF_TOKEN)
 
+model_list = {
+    "LLaMA 4": {
+        "client": llama_client,
+        "model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
+    },
+    "Qwen3 14B": {
+        "client": qwen_client,
+        "model_id": "Qwen/Qwen3-14B"
+    }
+}
+
 # ------------------------
 # Inference + Evaluation Logic
 # ------------------------
-def evaluate_model(model_name, user_input, expected_sql, chat_history):
-    messages = chat_history + [{"role": "user", "content": user_input}]
-
-    try:
-        start_time = time.time()
-
-        if model_name == "LLaMA 4":
-            result = llama_client.chat.completions.create(
-                model="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
-                messages=messages
-            )
-            model_sql = result.choices[0].message.content
-
-        elif model_name == "Qwen3 14B":
-            result = qwen_client.chat.completions.create(
-                model="Qwen/Qwen3-14B",
-                messages=messages
-            )
-            model_sql = result.choices[0].message.content
-
-        else:
-            model_sql = "❌ Invalid model selected."
-
-        end_time = time.time()
-        latency = int((end_time - start_time) * 1000)  # ms
-
-    except Exception as e:
-        model_sql = f"⚠️ Error: {str(e)}"
-        latency = -1
-
-    # Evaluation criteria (simulated, can be replaced with real validation)
-    sql_gen_accuracy = "✅" if expected_sql.strip().lower() in model_sql.strip().lower() else "❌"
-    exec_response_accuracy = "✅" if sql_gen_accuracy == "✅" else "❌"
-    intent_clarity = "✅" if len(user_input.strip().split()) < 5 and "SELECT" in model_sql.upper() else "❌"
-    semantic_clarity = "✅" if any(word in model_sql.lower() for word in ["from", "join", "group by"]) else "❌"
-    latency_status = "✅" if latency <= 1000 else "❌"
-
-    evaluation_summary = (
-        f"📊 **Evaluation Summary**\n"
-        f"- SQL Generation Match: {sql_gen_accuracy}\n"
-        f"- Execution Accuracy: {exec_response_accuracy}\n"
-        f"- Intent Clarification: {intent_clarity}\n"
-        f"- Semantic Mapping: {semantic_clarity}\n"
-        f"- Response Latency: {latency} ms ({latency_status})\n"
-    )
-
-    chat_history.append({"role": "user", "content": user_input})
-    chat_history.append({"role": "assistant", "content": model_sql})
-
-    chat_transcript = "\n".join([
-        f"👤 User: {msg['content']}" if msg["role"] == "user" else f"🤖 Assistant: {msg['content']}"
-        for msg in chat_history
-    ])
-
-    return chat_transcript, chat_history, evaluation_summary
+def evaluate_all_models(user_input, expected_sql, chat_history):
+    evaluations = []
+    full_chat_transcript = ""
+
+    for model_name, model_config in model_list.items():
+        client = model_config["client"]
+        model_id = model_config["model_id"]
+
+        messages = chat_history + [{"role": "user", "content": user_input}]
+        try:
+            start_time = time.time()
+
+            result = client.chat.completions.create(
+                model=model_id,
+                messages=messages
+            )
+            model_sql = result.choices[0].message.content
+            latency = int((time.time() - start_time) * 1000)
+
+        except Exception as e:
+            model_sql = f"⚠️ Error: {str(e)}"
+            latency = -1
+
+        # Evaluation criteria (simulated)
+        sql_gen_accuracy = "✅" if expected_sql.strip().lower() in model_sql.strip().lower() else "❌"
+        exec_response_accuracy = "✅" if sql_gen_accuracy == "✅" else "❌"
+        intent_clarity = "✅" if len(user_input.strip().split()) < 5 and "SELECT" in model_sql.upper() else "❌"
+        semantic_clarity = "✅" if any(word in model_sql.lower() for word in ["from", "join", "group by"]) else "❌"
+        latency_status = "✅" if latency <= 1000 else "❌"
+
+        summary = (
+            f"### 🤖 {model_name} Evaluation\n"
+            f"- SQL Generation Match: {sql_gen_accuracy}\n"
+            f"- Execution Accuracy: {exec_response_accuracy}\n"
+            f"- Intent Clarification: {intent_clarity}\n"
+            f"- Semantic Mapping: {semantic_clarity}\n"
+            f"- Response Latency: {latency} ms ({latency_status})\n"
+        )
+        evaluations.append(summary)
+
+        full_chat_transcript += f"\n👤 User: {user_input}\n🤖 {model_name}: {model_sql}\n"
+
+    return full_chat_transcript.strip(), chat_history, "\n\n".join(evaluations)
 
 # ------------------------
 # Load Random Spider Prompt
 # ------------------------
 def get_random_spider_prompt():
     sample = random.choice(spider_dataset)
-    return sample["question"], sample["query"], sample["query"]  # Return expected SQL twice
+    return sample["question"], sample["query"], sample["query"]
 
 # ------------------------
 # Gradio UI
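The `# Evaluation criteria (simulated)` block above marks a model correct when the expected SQL appears verbatim, case-insensitively, inside the model's reply. A minimal sketch of a slightly stricter variant that also normalizes whitespace and trailing semicolons before comparing; the helper name `normalized_sql_match` is illustrative, not part of this repo:

    import re

    def normalized_sql_match(expected_sql: str, model_reply: str) -> bool:
        # Collapse runs of whitespace, lowercase, and drop trailing
        # semicolons so cosmetic differences don't fail the match.
        # Still a heuristic: semantically equivalent but differently
        # written SQL (reordered joins, aliases) won't be recognized.
        def normalize(sql: str) -> str:
            s = re.sub(r"\s+", " ", sql).strip().lower()
            return s.rstrip("; ")
        return normalize(expected_sql) in normalize(model_reply)

For Spider specifically, execution accuracy (running both queries against the sample's database and comparing result sets) is the standard stricter metric that the code's "simulated" caveat points toward.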
@@ -93,17 +92,11 @@ def get_random_spider_prompt():
 with gr.Blocks() as demo:
     gr.Markdown("## 🧠 Spider Dataset Model Evaluation")
 
-    model_choice = gr.Dropdown(
-        choices=["LLaMA 4", "Qwen3 14B"],
-        label="Select Model",
-        value="LLaMA 4"
-    )
-
     prompt_input = gr.Textbox(label="Your Prompt", lines=3, placeholder="Ask your BI question...")
     expected_sql_display = gr.Textbox(label="Expected SQL", lines=2, interactive=False)
 
     load_spider_btn = gr.Button("🔀 Load Random Spider Prompt")
-    run_button = gr.Button("Send & Evaluate")
+    run_button = gr.Button("Send & Evaluate All Models")
 
     chat_display = gr.Textbox(label="Chat History", lines=20, interactive=False)
     evaluation_display = gr.Markdown()
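The `expected_sql` and `chat_memory` components referenced by `run_button.click` in the next hunk are defined outside the changed region. In a Gradio Blocks app they would typically be invisible `gr.State` holders; a sketch of the assumed wiring (an assumption, not shown in this commit):

    import gradio as gr

    with gr.Blocks() as demo:
        # Assumed definitions: hidden per-session value holders threaded
        # through the callbacks; gr.State renders no UI of its own.
        expected_sql = gr.State("")   # filled by get_random_spider_prompt
        chat_memory = gr.State([])    # list of {"role", "content"} messages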
@@ -118,8 +111,8 @@ with gr.Blocks() as demo:
     )
 
     run_button.click(
-        fn=evaluate_model,
-        inputs=[model_choice, prompt_input, expected_sql, chat_memory],
+        fn=evaluate_all_models,
+        inputs=[prompt_input, expected_sql, chat_memory],
         outputs=[chat_display, chat_memory, evaluation_display]
     )
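With both providers folded into `model_list`, the loop in `evaluate_all_models` picks up new models automatically, so adding one is a single dictionary entry with no UI changes. A hypothetical example (the Mistral entry and `together` provider choice below are illustrative, not part of this commit):

    # Hypothetical third model; any provider/model pair accepted by
    # huggingface_hub.InferenceClient slots in the same way.
    mistral_client = InferenceClient(provider="together", api_key=HF_TOKEN)

    model_list["Mistral Small"] = {
        "client": mistral_client,
        "model_id": "mistralai/Mistral-Small-24B-Instruct-2501"
    }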