import ast
import json
from collections import defaultdict

import gradio as gr
from datasets import load_dataset

# Load models and experiments
with open("experiments.json") as f:
    experiments = json.load(f)

MODELS = list(experiments.keys())
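# The experiments.json layout is assumed (inferred from the lookups below) to map
# each model to its benchmarks, a lighteval-style subset name, and one dataset
# split per experiment tag, roughly (all names here are placeholders):
#
#     {
#         "org/model-name": {
#             "benchmarks": {
#                 "benchmark-name": {
#                     "subset": "suite|task:subset|0",
#                     "tags": {"tag-name": "split-name"}
#                 }
#             }
#         }
#     }
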
def load_details_and_results(model, benchmark, experiment_tag):
    def worker(example):
        # Keep only the first gold answer for display
        example["gold"] = example["gold"][0]
        return example

    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")

    details = load_dataset(repo, subset, split=split)
    results = load_dataset(repo, "results", split=split)
    # The aggregate results are stored as the string repr of a dict;
    # ast.literal_eval parses it without executing arbitrary code
    results = ast.literal_eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results
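
# Example usage (all names hypothetical):
#     details, results = load_details_and_results("org/model-name", "benchmark-name", "tag-name")
#     details[0]["gold"]  -> first gold answer of the first example
#     results             -> parsed aggregate results for that run
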
# Load all experiment details
experiment_details = defaultdict(dict)
for model in MODELS:
    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
        subset = benchmark_details["subset"]
        for experiment_tag in benchmark_details["tags"]:
            # Details are keyed by subset only, so if a benchmark has several
            # experiment tags, the last tag's details overwrite the earlier ones
            details, _ = load_details_and_results(model, benchmark, experiment_tag)
            experiment_details[model][subset] = details

def display_model_comparison(selected_models, benchmark, example_index):
    if not selected_models:
        return "Please select at least one model to compare."

    # gr.Number may pass a float; cast so it can be used as a dataset index
    example_index = int(example_index)

    # Filter out models that don't have the selected benchmark
    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
    if not available_models:
        return f"No models have results for benchmark: {benchmark}"

    outputs = []
    for model in available_models:
        try:
            example = experiment_details[model][benchmark][example_index]
            outputs.append({
                'Model': model.split('/')[-1],
                'Prediction': example['predictions'][0] if example['predictions'] else '',
                'Prompt': example['full_prompt'],
                'Metrics': example['metrics'],
                'Gold': example['gold']
            })
        except (KeyError, IndexError):
            # Skip models that don't have this example
            continue

    if not outputs:
        return "No results found for the selected combination."

    # Create HTML output with all models
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"

    # Show gold answer at the top with distinct styling
    if outputs:
        # Escape HTML in the gold answer as well before embedding it
        gold = outputs[0]['Gold'].replace('<', '&lt;').replace('>', '&gt;')
        html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
        html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
        html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{gold}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"

    for output in outputs:
        html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
        html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"

        # Format metrics as a clean table
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
        metrics = output['Metrics']
        if isinstance(metrics, str):
            # Metrics may be stored as the string repr of a dict
            metrics = ast.literal_eval(metrics)
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
        for key, value in metrics.items():
            if isinstance(value, float):
                value = f"{value:.3f}"
            html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
        html_output += "</table>\n"
        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Handle prompt formatting with better styling
        html_output += "<details style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"

        prompt_text = output['Prompt']
        if isinstance(prompt_text, list):
            # Chat-style prompt: a list of {role, content} messages
            for msg in prompt_text:
                if isinstance(msg, dict) and 'content' in msg:
                    role = msg.get('role', 'message').title()
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += f"<strong>{role}:</strong>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    # Escape HTML in content
                    content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
                else:
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{json.dumps(msg, indent=2)}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
        else:
            html_output += "<div style='overflow-x: auto;'>\n"
            if isinstance(prompt_text, dict) and 'content' in prompt_text:
                # Escape HTML in content
                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
            else:
                # Escape HTML if prompt_text is a string
                if isinstance(prompt_text, str):
                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
            html_output += "</div>\n"

        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Style prediction output - now in a collapsible section
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
        # Add word count in a muted style
        word_count = len(output['Prediction'].split())
        html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
        html_output += "</summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        html_output += "<div style='overflow-x: auto;'>\n"
        # Escape HTML in prediction
        prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"
        html_output += "</details>\n"

        html_output += "</div>\n\n"

    html_output += "</div>"
    return html_output

# Get unique benchmarks
available_benchmarks = list(set(
    benchmark
    for model in MODELS
    for benchmark in experiment_details[model].keys()
))

# Dynamically filter model choices based on the selected benchmark
def update_model_choices(benchmark):
    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))
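
# Note: gr.Interface does not call update_model_choices on its own. A minimal sketch
# of how it could be wired up in a gr.Blocks layout, assuming hypothetical component
# names `benchmark_dropdown` and `model_dropdown` (not defined in this file):
#
#     benchmark_dropdown.change(
#         update_model_choices,
#         inputs=benchmark_dropdown,
#         outputs=model_dropdown,
#     )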

# Create the Gradio interface
demo = gr.Interface(
    fn=display_model_comparison,
    inputs=[
        gr.Dropdown(
            choices=sorted(MODELS),
            label="Models",
            multiselect=True,
            value=MODELS,
            info="Select models to compare"
        ),
        gr.Dropdown(
            choices=sorted(available_benchmarks),
            label="Benchmark",
            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
            info="Choose the evaluation benchmark"
        ),
        gr.Number(
            label="Example Index",
            value=0,
            step=1,
            info="Navigate through different examples"
        )
    ],
    outputs=gr.HTML(),
    title="Model Generation Comparison",
    description="Compare model outputs across different benchmarks and prompts",
    theme=gr.themes.Soft(),
    css="button { margin: 0 10px; padding: 5px 15px; }"
)

if __name__ == "__main__":
    demo.launch()