import fev
import gradio as gr
import pandas as pd

# Load the CSV data into a pandas DataFrame
df = pd.read_csv(
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv"
)

markdown_text = """
This space hosts evaluation results for time series forecasting models.

Benchmark definitions, model implementations, and evaluation results for the individual tasks are available at https://github.com/autogluon/fev.

Currently, the results in this space are a minimal proof of concept. Stay tuned for more benchmarks, results for new models, and instructions on how to contribute your results.
"""

# Per-model summary files with task-level results
summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]

# Map raw summary column names to human-readable leaderboard headers
rename_cols = {
    "gmean_relative_error": "Average relative error",
    "avg_rank": "Average rank",
    "median_inference_time_s": "Median inference time (s)",
    "training_corpus_overlap": "Training corpus overlap (%)",
}
selected_cols = list(rename_cols.keys())


def highlight_zeroshot(styler):
    """Highlight training overlap for zero-shot models with bold green."""

    def style_func(val):
        if val == 0:
            return "color: green; font-weight: bold"
        else:
            return "color: black"

    return styler.map(style_func, subset=["Training corpus overlap (%)"])


# Build one styled leaderboard per evaluation metric
leaderboards = {}
for metric in ["WQL", "MASE"]:
    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
    format_dict = {}
    for col in lb.columns:
        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
    leaderboards[metric] = highlight_zeroshot(lb.reset_index().style.format(format_dict))
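
# Illustrative sketch (not used by the app): the "Average relative error" column shown in
# the leaderboard tables below is described as the geometric mean of per-task relative
# errors, where each relative error is model_error / baseline_error. The helper only
# mirrors that description; the actual aggregation happens inside `fev.leaderboard`,
# whose exact behavior may differ.
def _gmean_relative_error_sketch(model_errors: pd.Series, baseline_errors: pd.Series) -> float:
    """Geometric mean of per-task relative errors (both inputs indexed by task)."""
    import numpy as np  # numpy is a pandas dependency, so no extra requirement is assumed

    relative_errors = model_errors / baseline_errors
    return float(np.exp(np.log(relative_errors).mean()))
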
with gr.Blocks() as demo:
    with gr.Tab("Chronos Benchmark II"):
        gr.Markdown("""
## Chronos Benchmark II results

This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815). These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some of the other models did include certain datasets in their training corpus.

Each table contains the following information:

* **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
* **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
* **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
* **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus. Zero-shot models are highlighted in green.

Lower values are better for all of the above metrics.

Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information about the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
""")
        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
        gr.Dataframe(
            value=leaderboards["WQL"],
            datatype=["str", "number", "number", "number"],
            interactive=False,
        )
        gr.Markdown("### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).")
        gr.Dataframe(
            value=leaderboards["MASE"],
            interactive=False,
        )
    with gr.Tab("About"):
        gr.Markdown(markdown_text)

if __name__ == "__main__":
    demo.launch()
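

# Illustrative sketch (not used by the app): the "Average rank" column is described above
# as the arithmetic mean of the ranks a model achieves across tasks. The helper below
# mirrors that description for a hypothetical table of per-task errors (rows = tasks,
# columns = models); `fev.leaderboard` computes this internally and its tie-handling
# may differ.
def _average_rank_sketch(errors_per_task: pd.DataFrame) -> pd.Series:
    """Rank models within each task (1 = lowest error), then average the ranks over tasks."""
    ranks = errors_per_task.rank(axis=1, method="average", ascending=True)
    return ranks.mean(axis=0)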