# Linker1907's picture
import html
import json
from collections import defaultdict

import gradio as gr
from datasets import load_dataset
# Load models and experiments
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's locale default.
with open("experiments.json", encoding="utf-8") as f:
    experiments = json.load(f)

# Model identifiers are the top-level keys of the experiments mapping.
MODELS = list(experiments)
def load_details_and_results(model, benchmark, experiment_tag):
    """Load per-example details and aggregate results for one experiment run.

    The dataset repository, config name and split are derived from the
    module-level ``experiments`` mapping.

    Args:
        model: Key of ``experiments``; ``/`` is replaced by ``__`` to build
            the repository name.
        benchmark: Key of ``experiments[model]["benchmarks"]``.
        experiment_tag: Key of that benchmark's ``"tags"`` mapping; the mapped
            value (with ``-`` replaced by ``_``) is used as the split name.

    Returns:
        A ``(details, results)`` tuple. ``details`` is the dataset restricted
        to the ``full_prompt``, ``gold``, ``metrics`` and ``predictions``
        columns, with ``gold`` reduced to its element 0. ``results`` is the
        evaluated ``"results"`` field of the first row of the ``"results"``
        config.

    Raises:
        KeyError: If ``model``, ``benchmark`` or ``experiment_tag`` is not
            present in ``experiments``.
    """
    def worker(example):
        # Only element 0 of ``gold`` is kept; every other column passes
        # through unchanged.
        example["gold"] = example["gold"][0]
        return example

    benchmark_config = experiments[model]["benchmarks"][benchmark]
    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = benchmark_config["subset"].replace("|", "_").replace(":", "_")
    split = benchmark_config["tags"][experiment_tag].replace("-", "_")

    details = load_dataset(repo, subset, split=split)
    results = load_dataset(repo, "results", split=split)

    # SECURITY NOTE(review): eval() executes whatever expression the dataset
    # row contains. It is kept because the stored format is not visible here;
    # switch to ast.literal_eval or json.loads once the payload is confirmed
    # to be a plain literal.
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    # The right-hand side of this assignment was missing. Mapping ``worker``
    # over the rows is the only use of the nested helper — TODO confirm
    # against the original file.
    details = details.map(worker)

    return details, results
# Load all experiment details
# Layout: experiment_details[model][subset] -> details dataset.
experiment_details = defaultdict(dict)
for model in MODELS:
    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
        subset = benchmark_details["subset"]
        for experiment_tag in benchmark_details["tags"]:
            # Only the per-example details are stored; the aggregate results
            # returned alongside them are discarded.
            # NOTE(review): every tag writes to the same [model][subset] slot,
            # so only the details of the last tag iterated are kept — confirm
            # this is intended.
            details, _ = load_details_and_results(model, benchmark, experiment_tag)
            experiment_details[model][subset] = details
def _escape(value):
    """Return ``str(value)`` with ``&``, ``<`` and ``>`` escaped for HTML text."""
    return html.escape(str(value), quote=False)


def _pre_block(text, margin):
    """Wrap escaped *text* in the wrapping ``<pre><code>`` element used on the page."""
    return (
        "<pre style='white-space: pre-wrap; word-wrap: break-word; "
        f"margin: {margin};'><code>{_escape(text)}</code></pre>\n"
    )


def _render_gold(gold):
    """Render the highlighted ground-truth box shown above the model cards."""
    return "".join([
        "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n",
        "<h3 style='margin-top: 0;'>Ground Truth</h3>\n",
        "<div style='overflow-x: auto; max-width: 100%;'>\n",
        _pre_block(gold, "0"),
        "</div>\n",
        "</div>\n",
    ])


def _render_metrics(metrics):
    """Render the metrics mapping as an expanded two-column table."""
    if isinstance(metrics, str):
        # SECURITY NOTE(review): eval() executes whatever expression the
        # stored string contains. Kept because the stored format is not
        # visible here; prefer ast.literal_eval once it is confirmed to be a
        # plain literal.
        metrics = eval(metrics)

    cell = "<td style='padding: 5px; border-bottom: 1px solid #ddd;'>"
    parts = [
        "<details open style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n",
        "<div style='overflow-x: auto;'>\n",
        "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n",
    ]
    for key, value in metrics.items():
        if isinstance(value, float):
            value = f"{value:.3f}"
        parts.append(
            f"<tr>{cell}<strong>{_escape(key)}</strong></td>"
            f"{cell}{_escape(value)}</td></tr>\n"
        )
    parts.extend(["</table>\n", "</div>\n", "</details>\n\n"])
    return "".join(parts)


def _render_prompt(prompt):
    """Render the prompt (message list, single message dict, or other value), collapsed."""
    parts = [
        "<details style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n",
        "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n",
    ]
    if isinstance(prompt, list):
        for msg in prompt:
            parts.append("<div style='margin-bottom: 10px;'>\n")
            if isinstance(msg, dict) and 'content' in msg:
                role = str(msg.get('role', 'message')).title()
                parts.append(f"<strong>{_escape(role)}:</strong>\n")
                body = msg['content']
            else:
                # Entries without a 'content' key are shown as a JSON dump.
                body = json.dumps(msg, indent=2)
            parts.append("<div style='overflow-x: auto;'>\n")
            parts.append(_pre_block(body, "5px 0"))
            parts.append("</div>\n")
            parts.append("</div>\n")
    else:
        if isinstance(prompt, dict) and 'content' in prompt:
            body = prompt['content']
        else:
            body = prompt
        parts.append("<div style='overflow-x: auto;'>\n")
        parts.append(_pre_block(body, "5px 0"))
        parts.append("</div>\n")
    parts.extend(["</div>\n", "</details>\n\n"])
    return "".join(parts)


def _render_prediction(prediction):
    """Render the prediction with its whitespace-separated word count, expanded."""
    word_count = len(prediction.split())
    return "".join([
        "<details open style='margin-bottom: 15px;'>\n",
        "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>",
        f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>",
        "</summary>\n",
        "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n",
        "<div style='overflow-x: auto;'>\n",
        _pre_block(prediction, "0"),
        "</div>\n",
        "</div>\n",
        "</details>\n",
    ])


def display_model_comparison(selected_models, benchmark, example_index):
    """Build an HTML page comparing several models on one benchmark example.

    Args:
        selected_models: Model identifiers to compare; an empty or missing
            selection yields a hint message instead of HTML.
        benchmark: Key looked up in ``experiment_details[model]``.
        example_index: Index of the example inside each model's details.

    Returns:
        A plain message string when nothing can be shown, otherwise an HTML
        string made of a ground-truth box (taken from the first model that
        produced a result) followed by one card per model with its metrics,
        prompt and prediction. All interpolated text is HTML-escaped.
    """
    if not selected_models:
        return "Please select at least one model to compare."

    # Filter out models that don't have the selected benchmark
    available_models = [
        model for model in selected_models
        if benchmark in experiment_details[model]
    ]
    if not available_models:
        return f"No models have results for benchmark: {benchmark}"

    outputs = []
    for model in available_models:
        try:
            example = experiment_details[model][benchmark][example_index]
            outputs.append({
                'Model': model.split('/')[-1],
                'Prediction': example['predictions'][0] if example['predictions'] else '',
                'Prompt': example['full_prompt'],
                'Metrics': example['metrics'],
                'Gold': example['gold'],
            })
        except (KeyError, IndexError):
            # A model without this example (or column) is left out of the
            # comparison rather than failing the whole page.
            continue

    if not outputs:
        return "No results found for the selected combination."

    # Create HTML output with all models; parts are joined once at the end.
    parts = ["<div style='max-width: 800px; margin: 0 auto;'>\n\n"]

    # Show gold answer at the top with distinct styling
    parts.append(_render_gold(outputs[0]['Gold']))

    for output in outputs:
        parts.append("<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n")
        parts.append(f"<h2 style='margin-top: 0;'>{_escape(output['Model'])}</h2>\n")
        parts.append(_render_metrics(output['Metrics']))
        parts.append(_render_prompt(output['Prompt']))
        parts.append(_render_prediction(output['Prediction']))
        parts.append("</div>\n\n")

    parts.append("</div>")
    return "".join(parts)
# Get unique benchmarks
# The element expression and closing brackets were missing; the set
# comprehension collects every benchmark key seen for any model. The result
# is sorted so the list has a deterministic order across runs.
available_benchmarks = sorted({
    benchmark
    for model in MODELS
    for benchmark in experiment_details[model]
})
# Update the Gradio interface to dynamically filter models based on benchmark
def update_model_choices(benchmark):
    """Return a Dropdown listing, and pre-selecting, the models that have *benchmark*."""
    matching = sorted(
        name for name in MODELS if benchmark in experiment_details[name]
    )
    # ``value`` gets its own list so choices and selection are separate objects.
    return gr.Dropdown(choices=matching, value=list(matching))
# Create the Gradio interface
# NOTE(review): this call is visibly incomplete in this view. The callback, the
# input/output component constructors and the closing parenthesis are missing,
# and the keyword arguments below are fragments of those components. Restore
# the missing lines from the original file before running; the missing
# arguments cannot be determined from what is shown here.
demo = gr.Interface(
# Fragment of a component whose constructor line is not visible.
info="Select models to compare"
# Default selection: the alphabetically first benchmark, or None when
# ``available_benchmarks`` is empty.
value=sorted(available_benchmarks)[0] if available_benchmarks else None,
info="Choose the evaluation benchmark"
# Fragment of the example-index component; its constructor is not visible.
label="Example Index",
info="Navigate through different examples"
# Interface-level settings: page title, description and custom button CSS.
title="Model Generation Comparison",
description="Compare model outputs across different benchmarks and prompts",
css="button { margin: 0 10px; padding: 5px 15px; }"
if __name__ == "__main__":