Armeddinosaur committed on
Commit
bd9c702
·
1 Parent(s): fc0a17a

updating metric definition

Browse files
Files changed (1) hide show
  1. src/components/leaderboard.py +25 -11
src/components/leaderboard.py CHANGED
@@ -186,16 +186,23 @@ def render_leaderboard_table(display_df, metric_columns, primary_metric):
186
  formula_html = """
187
  <div style="margin: 15px 0;">
188
  <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
189
- <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
190
- Relative Improvement to Human = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / (P<sub>human</sub> - P<sub>baseline</sub>)) × 100%
 
 
 
 
 
 
 
191
  </div>
192
  <p style="margin-top: 10px; font-weight: 500;">Where:</p>
193
  <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
194
- <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
195
- <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
196
- <li style="margin-bottom: 5px;">P<sub>human</sub> is the human performance benchmark</li>
197
- <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
198
  </ul>
 
199
  </div>
200
  """
201
 
@@ -206,15 +213,22 @@ def render_leaderboard_table(display_df, metric_columns, primary_metric):
206
  formula_html = """
207
  <div style="margin: 15px 0;">
208
  <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
209
- <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
210
- Absolute Improvement to Baseline = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / P<sub>baseline</sub>) × 100%
 
 
 
 
 
 
 
211
  </div>
212
  <p style="margin-top: 10px; font-weight: 500;">Where:</p>
213
  <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
214
- <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
215
- <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
216
- <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
217
  </ul>
 
218
  </div>
219
  """
220
 
 
186
  formula_html = """
187
  <div style="margin: 15px 0;">
188
  <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
189
+ <div style="background-color: #111; padding: 20px; border-radius: 5px; text-align: center; margin-bottom: 15px; font-size: 18px; line-height: 1.5; border: 1px solid #333;">
190
+ <div style="display: flex; align-items: center; justify-content: center;">
191
+ <div style="margin-right: 10px;">Relative Improvement to Human =</div>
192
+ <div style="display: inline-block; text-align: center; padding: 0 10px;">
193
+ <div style="border-bottom: 1px solid #aaa; padding-bottom: 5px;">s<sub>agent</sub> - s<sub>baseline</sub></div>
194
+ <div style="padding-top: 5px;">s<sub>top_human</sub> - s<sub>baseline</sub></div>
195
+ </div>
196
+ <div style="margin-left: 10px;">× 100%</div>
197
+ </div>
198
  </div>
199
  <p style="margin-top: 10px; font-weight: 500;">Where:</p>
200
  <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
201
+ <li style="margin-bottom: 5px;">s<sub>agent</sub> is the agent's test performance</li>
202
+ <li style="margin-bottom: 5px;">s<sub>baseline</sub> is the baseline test performance</li>
203
+ <li style="margin-bottom: 5px;">s<sub>top_human</sub> is the top human performance in competition</li>
 
204
  </ul>
205
+ <p style="margin-top: 10px;">This metric normalizes scores by setting the baseline solution to 0 and the top human solution to 100.</p>
206
  </div>
207
  """
208
 
 
213
  formula_html = """
214
  <div style="margin: 15px 0;">
215
  <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
216
+ <div style="background-color: #111; padding: 20px; border-radius: 5px; text-align: center; margin-bottom: 15px; font-size: 18px; line-height: 1.5; border: 1px solid #333;">
217
+ <div style="display: flex; align-items: center; justify-content: center;">
218
+ <div style="margin-right: 10px;">Absolute Improvement to Baseline =</div>
219
+ <div style="display: inline-block; text-align: center; padding: 0 10px;">
220
+ <div style="border-bottom: 1px solid #aaa; padding-bottom: 5px;">s<sub>agent</sub> - s<sub>baseline</sub></div>
221
+ <div style="padding-top: 5px;">s<sub>baseline</sub></div>
222
+ </div>
223
+ <div style="margin-left: 10px;">× 100%</div>
224
+ </div>
225
  </div>
226
  <p style="margin-top: 10px; font-weight: 500;">Where:</p>
227
  <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
228
+ <li style="margin-bottom: 5px;">s<sub>agent</sub> is the agent's test performance</li>
229
+ <li style="margin-bottom: 5px;">s<sub>baseline</sub> is the baseline test performance</li>
 
230
  </ul>
231
+ <p style="margin-top: 10px;">This metric measures the percentage improvement of an agent's performance over the baseline solution.</p>
232
  </div>
233
  """
234