Oleksandr Shchur committed on
Commit 218d801
Parent: 079b094

Highlight zero-shot models

Files changed (1)
  1. app.py +12 -4
app.py CHANGED
@@ -44,8 +44,16 @@ rename_cols = {
 selected_cols = list(rename_cols.keys())


-def is_zero_shot(model_name):
-    return model_name.startswith("chronos") or model_name in {"timesfm"}
+def highlight_zeroshot(styler):
+    """Highlight training overlap for zero-shot models with bold green."""
+
+    def style_func(val):
+        if val == 0:
+            return "color: green; font-weight: bold"
+        else:
+            return "color: black"
+
+    return styler.map(style_func, subset=["Training corpus overlap (%)"])


 leaderboards = {}
@@ -54,7 +62,7 @@ for metric in ["WQL", "MASE"]:
     format_dict = {}
     for col in lb.columns:
         format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
-    leaderboards[metric] = lb.reset_index().style.format(format_dict)
+    leaderboards[metric] = highlight_zeroshot(lb.reset_index().style.format(format_dict))


 with gr.Blocks() as demo:
@@ -71,7 +79,7 @@ with gr.Blocks() as demo:
     * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
     * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
     * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
-    * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus.
+    * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus. Zero-shot models are highlighted in green.

     Lower values are better for all of the above metrics.
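
For context, the new helper relies on the pandas Styler API: `Styler.map` (pandas ≥ 2.1; older releases expose the same behaviour as `Styler.applymap`) applies a function that returns CSS strings element-wise to the selected subset of columns. Below is a minimal, self-contained sketch of how the commit's helper behaves. The toy DataFrame and its values are invented for illustration; only `highlight_zeroshot` itself is taken from the diff.

```python
import pandas as pd

# Toy leaderboard; column names mirror app.py, the values are made up.
lb = pd.DataFrame(
    {
        "model": ["chronos_base", "timesfm", "seasonal_naive"],
        "Average relative error": [0.712, 0.748, 1.000],
        "Training corpus overlap (%)": [0.0, 0.0, 0.55],
    }
).set_index("model")


def highlight_zeroshot(styler):
    """Highlight training overlap for zero-shot models with bold green."""

    def style_func(val):
        if val == 0:
            return "color: green; font-weight: bold"
        else:
            return "color: black"

    # Styler.map applies style_func element-wise to the selected column
    # (use Styler.applymap instead on pandas < 2.1).
    return styler.map(style_func, subset=["Training corpus overlap (%)"])


# Mirrors the loop body in app.py: format the numeric columns, then apply the highlight.
format_dict = {
    col: "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
    for col in lb.columns
}
styled = highlight_zeroshot(lb.reset_index().style.format(format_dict))
print(styled.to_html())  # rows with zero overlap are rendered green and bold
```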
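The leaderboard description touched by the last hunk also spells out how the aggregate columns are computed. As a quick sketch of the "Average relative error" column, with invented per-task errors (the benchmark's actual tasks and numbers are not shown here):

```python
import numpy as np

# Invented per-task errors (e.g. WQL) for one model and for the baseline.
model_error = np.array([0.08, 0.12, 0.30])
baseline_error = np.array([0.10, 0.10, 0.25])

# Average relative error = geometric mean of per-task model_error / baseline_error.
rel_error = model_error / baseline_error
avg_rel_error = np.exp(np.log(rel_error).mean())

print(round(avg_rel_error, 3))  # below 1.0 means the model beats the baseline on average
```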