Oleksandr Shchur committed
Commit 079b094 • 1 parent: a6d6654
Update leaderboard

Files changed:
- app.py (+36, -24)
- requirements.txt (+1, -1)
app.py
CHANGED
@@ -35,48 +35,60 @@ summary_urls = [
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
 ]
 
-selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
 rename_cols = {
     "gmean_relative_error": "Average relative error",
     "avg_rank": "Average rank",
     "median_inference_time_s": "Median inference time (s)",
+    "training_corpus_overlap": "Training corpus overlap (%)",
 }
+selected_cols = list(rename_cols.keys())
 
-…
-    .reset_index()
-    .astype(str)
-)
+
+def is_zero_shot(model_name):
+    return model_name.startswith("chronos") or model_name in {"timesfm"}
+
+
+leaderboards = {}
+for metric in ["WQL", "MASE"]:
+    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
+    format_dict = {}
+    for col in lb.columns:
+        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
+    leaderboards[metric] = lb.reset_index().style.format(format_dict)
 
 
 with gr.Blocks() as demo:
-    with gr.Tab("…
+    with gr.Tab("Chronos Benchmark II"):
         gr.Markdown("""
-        ## Chronos …
+        ## Chronos Benchmark II results
 
-        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II …
+        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+
+        These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
+
+        Each table contains the following information:
+
+        * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
+        * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
+        * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
+        * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus.
+
+        Lower values are better for all of the above metrics.
 
-        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
-        """)
-        gr.Markdown("""### Point forecast accuracy (measured by MASE)
+        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
+
         """)
+        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
         gr.Dataframe(
-            value=…
+            value=leaderboards["WQL"],
+            datatype=["str", "number", "number", "number"],
             interactive=False,
         )
 
-        gr.Markdown("### …
+        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
+        """)
         gr.Dataframe(
-            value=…
+            value=leaderboards["MASE"],
             interactive=False,
         )
 
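The new tab text defines "Average relative error" as the geometric mean of per-task `model_error / baseline_error` ratios, and "Average rank" as the arithmetic mean of per-task ranks. `fev.leaderboard` computes these aggregates internally; the sketch below only illustrates the arithmetic, with invented task names, error values, and a hypothetical `seasonal_naive` baseline column:

import numpy as np
import pandas as pd

# Per-task errors (e.g. WQL or MASE): one row per task, one column per model.
# Task names, values, and the `seasonal_naive` baseline are illustrative.
errors = pd.DataFrame(
    {
        "chronos_small": [0.11, 0.42, 0.08],
        "timesfm": [0.13, 0.40, 0.10],
        "seasonal_naive": [0.20, 0.55, 0.12],  # assumed baseline model
    },
    index=["task_1", "task_2", "task_3"],
)

# Relative error per task: model_error / baseline_error
rel_error = errors.div(errors["seasonal_naive"], axis=0)

# "Average relative error": geometric mean of the per-task ratios (lower is better)
gmean_relative_error = np.exp(np.log(rel_error).mean(axis=0))

# "Average rank": rank models within each task (1 = lowest error), then average
avg_rank = errors.rank(axis=1).mean(axis=0)

print(pd.DataFrame({"gmean_relative_error": gmean_relative_error, "avg_rank": avg_rank}))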
requirements.txt
CHANGED
@@ -8,4 +8,4 @@ huggingface-hub>=0.18.0
 matplotlib
 numpy
 pandas
-fev==0.…
+fev==0.2.0
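A side note on the formatting loop added in app.py: it builds a dict of per-column format strings and hands it to pandas' `Styler.format`, which accepts such a mapping directly. A minimal standalone sketch with invented sample values (the real app passes the resulting Styler to `gr.Dataframe` as `value`):

import pandas as pd

# Leaderboard-like frame with invented values; the model column stays a string.
lb = pd.DataFrame(
    {
        "model_name": ["chronos_small", "timesfm"],
        "Average relative error": [0.612, 0.655],
        "Average rank": [2.25, 3.50],
        "Training corpus overlap (%)": [0.0, 0.321],
    }
)

# Three decimals for numeric columns, percent formatting for the overlap column.
format_dict = {
    col: "{:.1%}" if col == "Training corpus overlap (%)" else "{:.3f}"
    for col in lb.columns
    if col != "model_name"  # don't apply a float format to the string column
}
styled = lb.style.format(format_dict)
print(styled.to_html()[:300])  # Styler renders as HTML; 0.321 shows as "32.1%"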