Oleksandr Shchur committed
Commit 079b094
1 parent: a6d6654

Update leaderboard

Files changed (2)
  1. app.py +36 -24
  2. requirements.txt +1 -1
app.py CHANGED
@@ -35,48 +35,60 @@ summary_urls = [
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
 ]
 
-selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
 rename_cols = {
     "gmean_relative_error": "Average relative error",
     "avg_rank": "Average rank",
     "median_inference_time_s": "Median inference time (s)",
+    "training_corpus_overlap": "Training corpus overlap (%)",
 }
+selected_cols = list(rename_cols.keys())
 
-lb_mase = (
-    fev.leaderboard(summary_urls, metric_column="MASE")[selected_cols]
-    .rename(columns=rename_cols)
-    .round(3)
-    .reset_index()
-    .astype(str)
-)
-lb_wql = (
-    fev.leaderboard(summary_urls, metric_column="WQL")[selected_cols]
-    .rename(columns=rename_cols)
-    .round(3)
-    .reset_index()
-    .astype(str)
-)
+
+def is_zero_shot(model_name):
+    return model_name.startswith("chronos") or model_name in {"timesfm"}
+
+
+leaderboards = {}
+for metric in ["WQL", "MASE"]:
+    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
+    format_dict = {}
+    for col in lb.columns:
+        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
+    leaderboards[metric] = lb.reset_index().style.format(format_dict)
 
 
 with gr.Blocks() as demo:
-    with gr.Tab("Leaderboard"):
+    with gr.Tab("Chronos Benchmark II"):
         gr.Markdown("""
-        ## Chronos zero-shot benchmark results
+        ## Chronos Benchmark II results
 
-        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II (zero-shot evaluation) in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+
+        These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
+
+        Each table contains the following information:
+
+        * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
+        * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
+        * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
+        * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus.
+
+        Lower values are better for all of the above metrics.
+
+        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
 
-        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
-        """)
-        gr.Markdown("""### Point forecast accuracy (measured by MASE)
         """)
+        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
         gr.Dataframe(
-            value=lb_mase,
+            value=leaderboards["WQL"],
+            datatype=["str", "number", "number", "number"],
             interactive=False,
         )
 
-        gr.Markdown("### Probabilistic forecast accuracy (measured by WQL)")
+        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
+        """)
         gr.Dataframe(
-            value=lb_wql,
+            value=leaderboards["MASE"],
             interactive=False,
         )
 
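To make the aggregate columns described in the new Markdown text concrete, here is a minimal illustrative sketch (not fev's actual implementation) of how "Average relative error" and "Average rank" can be computed from per-task scores. The toy data and the column names `model`, `task`, and `error` are hypothetical.

```python
# Illustration only (not fev's implementation): computing the two aggregate
# columns described in the leaderboard text from per-task error scores.
# The toy data and the column names "model", "task", "error" are hypothetical.
import numpy as np
import pandas as pd

scores = pd.DataFrame({
    "model": ["chronos", "chronos", "timesfm", "timesfm", "baseline", "baseline"],
    "task": ["t1", "t2", "t1", "t2", "t1", "t2"],
    "error": [0.80, 1.10, 0.90, 1.30, 1.00, 1.50],
})

# One row per task, one column per model.
errors = scores.pivot(index="task", columns="model", values="error")

# Relative error per task: model_error / baseline_error.
rel_error = errors.div(errors["baseline"], axis=0)

# "Average relative error": geometric mean of the per-task relative errors.
avg_rel_error = np.exp(np.log(rel_error).mean())

# "Average rank": arithmetic mean of each model's rank on each task (1 = best).
avg_rank = errors.rank(axis=1).mean()

print(avg_rel_error.round(3))
print(avg_rank.round(3))
```

By construction the baseline's average relative error is exactly 1.0, so values below 1 indicate a model that beats the baseline on average; as the app's text notes, lower is better for both columns.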
requirements.txt CHANGED
@@ -8,4 +8,4 @@ huggingface-hub>=0.18.0
 matplotlib
 numpy
 pandas
-fev==0.1.0
+fev==0.2.0
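As a usage note, the same `fev.leaderboard(...)` call used in app.py can reproduce these tables outside Gradio once the pinned `fev==0.2.0` is installed. A rough sketch, assuming the full list of summary URLs from app.py (only one is repeated here for brevity):

```python
# Rough reproduction sketch using the same fev.leaderboard(...) call as app.py.
# Pass the full summary_urls list from app.py; a single file is shown below only
# as a placeholder, and the relative-error column also needs the baseline results.
import fev

summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
    # ... remaining result files listed in app.py ...
]

lb = fev.leaderboard(summary_urls, metric_column="WQL")
print(lb[["gmean_relative_error", "avg_rank", "median_inference_time_s"]])
```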