Linker1907 committed on
Commit c5bf87e · 1 Parent(s): 9b8ac8f
Files changed (2)
  1. app.py +26 -8
  2. experiments.json +87 -3
app.py CHANGED
@@ -2,7 +2,6 @@ from datasets import load_dataset
 from collections import defaultdict
 import json
 import gradio as gr
-from functools import lru_cache
 
 # Load models and experiments
 
@@ -10,9 +9,8 @@ with open("experiments.json") as f:
     experiments = json.load(f)
 
 MODELS = list(experiments.keys())
-MODELS = [m for m in MODELS if m != "claude-3-7-sonnet-20250219"]
+MODELS = [m for m in MODELS if m.startswith("google/gemma-3")]
 
-@lru_cache
 def load_details_and_results(model, benchmark, experiment_tag):
     def worker(example):
         example["predictions"] = example["predictions"]
@@ -20,7 +18,7 @@ def load_details_and_results(model, benchmark, experiment_tag):
         example["metrics"] = example["metrics"]
         return example
 
-    repo = f"SaylorTwift/details_{model.replace('/', '__')}_private"
+    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")
 
@@ -49,8 +47,14 @@ def display_model_comparison(selected_models, benchmark, example_index):
     if not selected_models:
         return "Please select at least one model to compare."
 
+    # Filter out models that don't have the selected benchmark
+    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
+
+    if not available_models:
+        return f"No models have results for benchmark: {benchmark}"
+
     outputs = []
-    for model in selected_models:
+    for model in available_models:  # Changed from selected_models to available_models
         try:
             example = experiment_details[model][benchmark][example_index]
             outputs.append({
@@ -111,7 +115,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
                 html_output += "<div style='margin-bottom: 10px;'>\n"
                 html_output += f"<strong>{role}:</strong>\n"
                 html_output += "<div style='overflow-x: auto;'>\n"
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{msg['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
                 html_output += "</div>\n"
                 html_output += "</div>\n"
             else:
@@ -123,8 +129,13 @@ def display_model_comparison(selected_models, benchmark, example_index):
         else:
             html_output += "<div style='overflow-x: auto;'>\n"
             if isinstance(prompt_text, dict) and 'content' in prompt_text:
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
             else:
+                # Escape HTML if prompt_text is a string
+                if isinstance(prompt_text, str):
+                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                 html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
             html_output += "</div>\n"
 
@@ -140,7 +151,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
             html_output += "</summary>\n"
             html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
             html_output += "<div style='overflow-x: auto;'>\n"
-            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{output['Prediction']}</code></pre>\n"
+            # Escape HTML in prediction
+            prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
+            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
             html_output += "</div>\n"
             html_output += "</div>\n"
             html_output += "</details>\n"
@@ -156,6 +169,11 @@ available_benchmarks = list(set(
     for benchmark in experiment_details[model].keys()
 ))
 
+# Update the Gradio interface to dynamically filter models based on benchmark
+def update_model_choices(benchmark):
+    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
+    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))
+
 # Create the Gradio interface
 demo = gr.Interface(
     fn=display_model_comparison,
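
Note: the manual `.replace('<', '&lt;').replace('>', '&gt;')` calls added in this commit do not escape `&`, so text that already contains entities (e.g. `&lt;`) would render incorrectly. A minimal sketch of the same escaping done with the standard library's `html.escape`; the helper name `render_code_block` is illustrative, not part of the commit:

```python
import html

def render_code_block(text) -> str:
    # html.escape covers &, < and > (and quotes when quote=True),
    # unlike the two manual replace() calls in the diff above.
    escaped = html.escape(str(text), quote=False)
    return (
        "<pre style='white-space: pre-wrap; word-wrap: break-word; "
        f"margin: 5px 0;'><code>{escaped}</code></pre>\n"
    )

# Usage inside display_model_comparison:
#     html_output += render_code_block(msg["content"])
```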
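
The new `update_model_choices` helper is defined but `gr.Interface` exposes no change-event hook to call it from. A sketch of how it could be wired up, assuming a `gr.Blocks` layout instead and reusing the names defined in app.py (`available_benchmarks`, `MODELS`, `update_model_choices`); the component names are hypothetical:

```python
import gradio as gr

with gr.Blocks() as demo:
    benchmark_dd = gr.Dropdown(choices=sorted(available_benchmarks), label="Benchmark")
    models_dd = gr.Dropdown(choices=sorted(MODELS), multiselect=True, label="Models")
    # Re-populate the model dropdown whenever the selected benchmark changes.
    benchmark_dd.change(update_model_choices, inputs=benchmark_dd, outputs=models_dd)
```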
experiments.json CHANGED
@@ -3,6 +3,7 @@
     "display_name": "gpt 4o",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -55,6 +56,7 @@
     "display_name": "Claude 3.7 Sonnet",
     "provider": "anthropic",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -112,6 +114,7 @@
     "display_name": "o3-mini",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -164,6 +167,7 @@
     "display_name": "Moonlight",
     "provider": "moonshotai",
     "open": true,
+    "size": "16B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -216,6 +220,7 @@
     "display_name": "Llama 3.3 70B",
     "provider": "meta",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -258,6 +263,7 @@
     "display_name": "DeepSeek Llama 70B",
     "provider": "deepseek",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -300,6 +306,7 @@
     "display_name": "TinyR1 32B",
     "provider": "qihoo360",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -342,6 +349,7 @@
     "display_name": "gpt 4.5",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -384,6 +392,7 @@
     "display_name": "DeepSeek Qwen 32B",
     "provider": "deepseek",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -426,6 +435,7 @@
     "display_name": "DeepSeek R1",
     "provider": "deepseek",
     "open": true,
+    "size": "671B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -468,19 +478,20 @@
     "display_name": "QwQ 32B",
     "provider": "Qwen",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "gpqa_diamond": {
         "subset": "lighteval|gpqa:diamond|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "aime_24": {
@@ -501,7 +512,80 @@
         "subset": "extended|ifeval|0",
         "metrics": ["prompt_level_strict_acc"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T12-21-36.862202"
+        }
+      }
+    }
+  },
+  "google/gemma-3-1b-it": {
+    "display_name": "Gemma 3",
+    "provider": "google",
+    "open": true,
+    "size": "1B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-25-56.178612"
+        }
+      }
+    }
+  },
+  "google/gemma-3-12b-it": {
+    "display_name": "Gemma 3 12B",
+    "provider": "google",
+    "open": true,
+    "size": "12B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-36-23.368081"
+        }
+      }
+    }
+  },
+  "google/gemma-3-27b-it": {
+    "display_name": "Gemma 3 27B",
+    "provider": "google",
+    "open": true,
+    "size": "27B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-41-33.181467"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-11-34.174477"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "math_500": {
+        "subset": "lighteval|math_500|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      }
+    }
+  }
         }
       }
     }
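
For reference, a minimal sketch (not part of the commit) showing how the updated experiments.json entries, including the new "size" field, line up with the gemma-3 filter added in app.py:

```python
import json

with open("experiments.json") as f:
    experiments = json.load(f)

# Same filter the commit adds in app.py.
models = [m for m in experiments if m.startswith("google/gemma-3")]

for model in models:
    entry = experiments[model]
    # "size" is the field this commit adds; "?B" marks unknown sizes.
    print(model, entry["size"], sorted(entry["benchmarks"]))
```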