from datasets import load_dataset
from collections import defaultdict
import json
import gradio as gr

# Load models and experiments

with open("experiments.json") as f:
    experiments = json.load(f)

MODELS = list(experiments.keys())
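
# Illustrative only: the structure of experiments.json is not documented in this
# file, but from how the dict is indexed below it is assumed to look roughly like:
#
# {
#     "org/model-name": {
#         "benchmarks": {
#             "benchmark_name": {
#                 "subset": "suite|task:0",
#                 "tags": {"tag-name": "run-identifier"}
#             }
#         }
#     }
# }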

def load_details_and_results(model, benchmark, experiment_tag):
    def worker(example):
        # Only the gold answer needs transformation: it is stored as a
        # single-element list, so unwrap it.
        example["gold"] = example["gold"][0]
        return example

    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")

    details = load_dataset(repo, subset, split=split)
    results = load_dataset(repo, "results", split=split)

    # The aggregate results are stored as a stringified dict; eval() assumes the
    # results dataset comes from a trusted source.
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results
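
# Hypothetical usage sketch (the model, benchmark, and tag names below are
# placeholders and must exist in experiments.json and the matching details repo):
#
#   details, results = load_details_and_results(
#       "org/model-name", "benchmark_name", "tag-name"
#   )
#   print(results)             # aggregate metrics dict for the run
#   print(details[0]["gold"])  # gold answer of the first example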

# Load all experiment details
experiment_details = defaultdict(dict)

for model in MODELS:
    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
        subset = benchmark_details["subset"]
        for experiment_tag in benchmark_details["tags"]:
            details, _ = load_details_and_results(model, benchmark, experiment_tag)
            # Entries are keyed by subset, so if a benchmark has several tags,
            # only the details of the last tag are kept.
            experiment_details[model][subset] = details

def display_model_comparison(selected_models, benchmark, example_index):
    # gr.Number passes a float; cast it so it can be used as a dataset index
    example_index = int(example_index)
    if not selected_models:
        return "Please select at least one model to compare."
    
    # Filter out models that don't have the selected benchmark
    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
    
    if not available_models:
        return f"No models have results for benchmark: {benchmark}"
    
    outputs = []
    for model in available_models:
        try:
            example = experiment_details[model][benchmark][example_index]
            outputs.append({
                'Model': model.split('/')[-1],
                'Prediction': example['predictions'][0] if example['predictions'] else '',
                'Prompt': example['full_prompt'],
                'Metrics': example['metrics'],
                'Gold': example['gold']
            })
        except (KeyError, IndexError):
            continue
    
    if not outputs:
        return "No results found for the selected combination."
    
    # Create HTML output with all models
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"
    
    # Show gold answer at the top with distinct styling
    if outputs:
        html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
        html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
        html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{outputs[0]['Gold']}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"
    
    for output in outputs:
        html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
        html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"
        
        # Format metrics as a clean table
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
        metrics = output['Metrics']
        if isinstance(metrics, str):
            metrics = eval(metrics)  # per-example metrics may also arrive as a stringified dict
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
        for key, value in metrics.items():
            if isinstance(value, float):
                value = f"{value:.3f}"
            html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
        html_output += "</table>\n"
        html_output += "</div>\n"
        html_output += "</details>\n\n"
        
        # Handle prompt formatting with better styling
        html_output += "<details style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        
        prompt_text = output['Prompt']
        if isinstance(prompt_text, list):
            for i, msg in enumerate(prompt_text):
                if isinstance(msg, dict) and 'content' in msg:
                    role = msg.get('role', 'message').title()
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += f"<strong>{role}:</strong>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    # Escape HTML in content
                    content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
                else:
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{json.dumps(msg, indent=2)}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
        else:
            html_output += "<div style='overflow-x: auto;'>\n"
            if isinstance(prompt_text, dict) and 'content' in prompt_text:
                # Escape HTML in content
                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
            else:
                # Escape HTML if prompt_text is a string
                if isinstance(prompt_text, str):
                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
            html_output += "</div>\n"
        
        html_output += "</div>\n"
        html_output += "</details>\n\n"
        
        # Style prediction output - now in a collapsible section
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
        # Add word count in a muted style
        word_count = len(output['Prediction'].split())
        html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
        html_output += "</summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        html_output += "<div style='overflow-x: auto;'>\n"
        # Escape HTML in prediction
        prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"
        html_output += "</details>\n"
        html_output += "</div>\n\n"
    
    html_output += "</div>"
    return html_output

# Get unique benchmarks
available_benchmarks = list(set(
    benchmark 
    for model in MODELS 
    for benchmark in experiment_details[model].keys()
))

# Helper intended to filter the model dropdown by the selected benchmark; note
# that it is not attached to any change event below, so it is currently unused.
def update_model_choices(benchmark):
    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))

# Create the Gradio interface
demo = gr.Interface(
    fn=display_model_comparison,
    inputs=[
        gr.Dropdown(
            choices=sorted(MODELS), 
            label="Models", 
            multiselect=True, 
            value=MODELS,
            info="Select models to compare"
        ),
        gr.Dropdown(
            choices=sorted(available_benchmarks), 
            label="Benchmark",
            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
            info="Choose the evaluation benchmark"
        ),
        gr.Number(
            label="Example Index", 
            value=0,
            step=1,
            info="Navigate through different examples"
        )
    ],
    outputs=gr.HTML(),
    title="Model Generation Comparison",
    description="Compare model outputs across different benchmarks and prompts",
    theme=gr.themes.Soft(),
    css="button { margin: 0 10px; padding: 5px 15px; }"
)
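
# Note: demo.launch() below serves the app locally; if a temporary public link
# is needed (e.g. when running on a remote machine), Gradio also supports
# demo.launch(share=True).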

if __name__ == "__main__":
    demo.launch()