from datasets import load_dataset
from collections import defaultdict
import json
import gradio as gr
# Load models and experiments
with open("experiments.json") as f:
    experiments = json.load(f)

MODELS = list(experiments.keys())
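
# Illustrative sketch of the experiments.json layout this script expects
# (field names inferred from the accesses below; the actual file may differ):
# {
#   "org/model-name": {
#     "benchmarks": {
#       "benchmark_name": {
#         "subset": "lighteval|task|0",
#         "tags": {"experiment-tag": "split-name"}
#       }
#     }
#   }
# }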
def load_details_and_results(model, benchmark, experiment_tag):
    def worker(example):
        # Unwrap the gold answer, which is stored as a single-element list
        example["gold"] = example["gold"][0]
        return example

    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")

    details = load_dataset(repo, subset, split=split)
    results = load_dataset(repo, "results", split=split)

    # The aggregated results are stored as a stringified Python dict
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results
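
# Example usage (hypothetical model/benchmark/tag names, for illustration only):
#   details, results = load_details_and_results(
#       "org/model-name", "benchmark_name", "experiment-tag"
#   )
#   print(results)       # aggregated metrics dict for the run
#   print(details[0])    # first example with prompt, gold, prediction, metrics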
# Load all experiment details
experiment_details = defaultdict(dict)
for model in MODELS:
    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
        subset = benchmark_details["subset"]
        for experiment_tag in benchmark_details["tags"]:
            details, _ = load_details_and_results(model, benchmark, experiment_tag)
            # Note: if a benchmark has several tags, later tags overwrite earlier ones here
            experiment_details[model][subset] = details
def display_model_comparison(selected_models, benchmark, example_index):
    if not selected_models:
        return "Please select at least one model to compare."

    # gr.Number may deliver the index as a float; cast it before dataset indexing
    example_index = int(example_index)

    # Filter out models that don't have the selected benchmark
    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
    if not available_models:
        return f"No models have results for benchmark: {benchmark}"

    outputs = []
    for model in available_models:
        try:
            example = experiment_details[model][benchmark][example_index]
            outputs.append({
                'Model': model.split('/')[-1],
                'Prediction': example['predictions'][0] if example['predictions'] else '',
                'Prompt': example['full_prompt'],
                'Metrics': example['metrics'],
                'Gold': example['gold']
            })
        except (KeyError, IndexError):
            # Skip models that do not have this example index
            continue

    if not outputs:
        return "No results found for the selected combination."
    # Create HTML output with all models
    html_output = "<div style='max-width: 1200px; margin: 0 auto;'>\n\n"

    # Show gold answer at the top with distinct styling
    if outputs:
        html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
        html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += f"<pre style='white-space: pre-wrap; margin: 0;'><code>{outputs[0]['Gold']}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"

    for output in outputs:
        html_output += "<div style='background: #ffffff; padding: 20px; border-radius: 10px; margin-bottom: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1);'>\n"
        html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"

        # Format metrics as a clean table
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline;'>Metrics</h3></summary>\n"
        metrics = output['Metrics']
        if isinstance(metrics, str):
            # Metrics may be stored as a stringified dict
            metrics = eval(metrics)
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
        for key, value in metrics.items():
            if isinstance(value, float):
                value = f"{value:.3f}"
            html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #eee;'>{key}</td><td style='padding: 5px; border-bottom: 1px solid #eee;'>{value}</td></tr>\n"
        html_output += "</table>\n"
        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Handle prompt formatting with better styling
        html_output += "<details style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline;'>Prompt</h3></summary>\n"
        html_output += "<div style='background: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        prompt_text = output['Prompt']
        if isinstance(prompt_text, list):
            # Chat-style prompt: render each message with its role
            for msg in prompt_text:
                if isinstance(msg, dict) and 'content' in msg:
                    role = msg.get('role', 'message').title()
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += f"<strong>{role}:</strong>\n"
                    # Escape HTML in content
                    content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                    html_output += f"<pre style='white-space: pre-wrap; margin: 5px 0;'><code>{content}</code></pre>\n"
                    html_output += "</div>\n"
                else:
                    # Fall back to raw JSON for unexpected message formats
                    html_output += f"<pre style='white-space: pre-wrap; margin: 5px 0;'><code>{json.dumps(msg, indent=2)}</code></pre>\n"
        else:
            if isinstance(prompt_text, dict) and 'content' in prompt_text:
                # Escape HTML in content
                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; margin: 0;'><code>{content}</code></pre>\n"
            else:
                # Escape HTML if prompt_text is a string
                if isinstance(prompt_text, str):
                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; margin: 0;'><code>{prompt_text}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Style prediction output - shown in a collapsible section
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        # Add word count in a muted style
        word_count = len(output['Prediction'].split())
        html_output += f"<summary><h3 style='display: inline;'>Prediction</h3> <span style='color: #666; font-size: 0.9em;'>({word_count} words)</span></summary>\n"
        html_output += "<div style='background: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        # Escape HTML in prediction
        prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; margin: 0;'><code>{prediction}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</details>\n"

        html_output += "</div>\n\n"

    html_output += "</div>"
    return html_output
# Get unique benchmarks
available_benchmarks = list(set(
benchmark
for model in MODELS
for benchmark in experiment_details[model].keys()
))
# Update the Gradio interface to dynamically filter models based on benchmark
def update_model_choices(benchmark):
    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))
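
# Note: gr.Interface does not wire this helper to any event by itself. A rough
# sketch of how it could be hooked up with gr.Blocks (assumed layout, not part
# of the original app):
#
#   with gr.Blocks() as demo:
#       benchmark_dd = gr.Dropdown(choices=sorted(available_benchmarks), label="Benchmark")
#       model_dd = gr.Dropdown(choices=sorted(MODELS), multiselect=True, label="Models")
#       benchmark_dd.change(update_model_choices, inputs=benchmark_dd, outputs=model_dd)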
# Create the Gradio interface
demo = gr.Interface(
    fn=display_model_comparison,
    inputs=[
        gr.Dropdown(
            choices=sorted(MODELS),
            label="Models",
            multiselect=True,
            value=MODELS,
            info="Select models to compare"
        ),
        gr.Dropdown(
            choices=sorted(available_benchmarks),
            label="Benchmark",
            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
            info="Choose the evaluation benchmark"
        ),
        gr.Number(
            label="Example Index",
            value=0,
            step=1,
            info="Navigate through different examples"
        )
    ],
    outputs=gr.HTML(),
    title="Model Generation Comparison",
    description="Compare model outputs across different benchmarks and prompts",
    theme=gr.themes.Soft(),
    css="button { margin: 0 10px; padding: 5px 15px; }"
)
if __name__ == "__main__":
    demo.launch()