from datasets import load_dataset from collections import defaultdict import json import gradio as gr # Load models and experiments with open("experiments.json") as f: experiments = json.load(f) MODELS = list(experiments.keys()) def load_details_and_results(model, benchmark, experiment_tag): def worker(example): example["predictions"] = example["predictions"] example["gold"] = example["gold"][0] example["metrics"] = example["metrics"] return example repo = f"OpenEvals/details_{model.replace('/', '__')}_private" subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_") split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_") details = load_dataset(repo, subset, split=split) results = load_dataset(repo, "results", split=split) results = eval(results[0]["results"]) columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions'] details = details.select_columns(columns_to_keep) details = details.map(worker) return details, results # Load all experiment details experiment_details = defaultdict(dict) for model in MODELS: for benchmark, benchmark_details in experiments[model]["benchmarks"].items(): subset = benchmark_details["subset"] for experiment_tag in benchmark_details["tags"]: details, _ = load_details_and_results(model, benchmark, experiment_tag) experiment_details[model][subset] = details def display_model_comparison(selected_models, benchmark, example_index): if not selected_models: return "Please select at least one model to compare." # Filter out models that don't have the selected benchmark available_models = [model for model in selected_models if benchmark in experiment_details[model]] if not available_models: return f"No models have results for benchmark: {benchmark}" outputs = [] for model in available_models: # Changed from selected_models to available_models try: example = experiment_details[model][benchmark][example_index] outputs.append({ 'Model': model.split('/')[-1], 'Prediction': example['predictions'][0] if example['predictions'] else '', 'Prompt': example['full_prompt'], 'Metrics': example['metrics'], 'Gold': example['gold'] }) except (KeyError, IndexError): continue if not outputs: return "No results found for the selected combination." # Create HTML output with all models html_output = "
\n\n" # Show gold answer at the top with distinct styling if outputs: html_output += "
\n" html_output += "

Ground Truth

\n" html_output += "
\n" html_output += f"
{outputs[0]['Gold']}
\n" html_output += "
\n" html_output += "
\n" for output in outputs: html_output += "
\n" html_output += f"

{output['Model']}

\n" # Format metrics as a clean table html_output += "
\n" html_output += "

Metrics

\n" metrics = output['Metrics'] if isinstance(metrics, str): metrics = eval(metrics) html_output += "
\n" html_output += "\n" for key, value in metrics.items(): if isinstance(value, float): value = f"{value:.3f}" html_output += f"\n" html_output += "
{key}{value}
\n" html_output += "
\n" html_output += "
\n\n" # Handle prompt formatting with better styling html_output += "
\n" html_output += "

Prompt

\n" html_output += "
\n" prompt_text = output['Prompt'] if isinstance(prompt_text, list): for i, msg in enumerate(prompt_text): if isinstance(msg, dict) and 'content' in msg: role = msg.get('role', 'message').title() html_output += "
\n" html_output += f"{role}:\n" html_output += "
\n" # Escape HTML in content content = msg['content'].replace('<', '<').replace('>', '>') html_output += f"
{content}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" html_output += "
\n" html_output += f"
{json.dumps(msg, indent=2)}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" if isinstance(prompt_text, dict) and 'content' in prompt_text: # Escape HTML in content content = prompt_text['content'].replace('<', '<').replace('>', '>') html_output += f"
{content}
\n" else: # Escape HTML if prompt_text is a string if isinstance(prompt_text, str): prompt_text = prompt_text.replace('<', '<').replace('>', '>') html_output += f"
{prompt_text}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" # Style prediction output - now in a collapsible section html_output += "
\n" html_output += "

Prediction

" # Add word count in a muted style word_count = len(output['Prediction'].split()) html_output += f"({word_count} words)" html_output += "
\n" html_output += "
\n" html_output += "
\n" # Escape HTML in prediction prediction = output['Prediction'].replace('<', '<').replace('>', '>') html_output += f"
{prediction}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" html_output += "
" return html_output # Get unique benchmarks available_benchmarks = list(set( benchmark for model in MODELS for benchmark in experiment_details[model].keys() )) # Update the Gradio interface to dynamically filter models based on benchmark def update_model_choices(benchmark): available_models = [model for model in MODELS if benchmark in experiment_details[model]] return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models)) # Create the Gradio interface demo = gr.Interface( fn=display_model_comparison, inputs=[ gr.Dropdown( choices=sorted(MODELS), label="Models", multiselect=True, value=MODELS, info="Select models to compare" ), gr.Dropdown( choices=sorted(available_benchmarks), label="Benchmark", value=sorted(available_benchmarks)[0] if available_benchmarks else None, info="Choose the evaluation benchmark" ), gr.Number( label="Example Index", value=0, step=1, info="Navigate through different examples" ) ], outputs=gr.HTML(), title="Model Generation Comparison", description="Compare model outputs across different benchmarks and prompts", theme=gr.themes.Soft(), css="button { margin: 0 10px; padding: 5px 15px; }" ) if __name__ == "__main__": demo.launch()