# Original code by https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard
# Modified by EffiBench
import pandas as pd
import gradio as gr


def make_default_md_1():
    link_color = "#1976D2"  # This color should be clear in both light and dark mode.
    leaderboard_md = f"""
# 🏆 LMSYS Chatbot Arena Leaderboard
Blog | Paper | GitHub | Dataset | Twitter | Discord
"""
    return leaderboard_md


def make_default_md_2():
    leaderboard_md = f"""
LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 800,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings on an Elo scale. You can find more details in our paper.

**Chatbot Arena depends on community participation, so please contribute by casting your vote!**
"""
    return leaderboard_md


leaderboard_md = """
Three benchmarks are displayed: **Arena Elo**, **MT-Bench**, and **MMLU**.

- [Chatbot Arena](https://chat.lmsys.org/?arena): a crowdsourced, randomized battle platform. We use 500K+ user votes to compute model strength.
- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.

💻 Code: The MT-Bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean the score is not available.
"""

acknowledgment_md = """
### Terms of Service

Users are required to agree to the following terms before using the service:

The service is a research preview. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Please do not upload any private information. The service collects user dialogue data, including both text and images, and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) or a similar license.

### Acknowledgment

We thank [UC Berkeley SkyLab](https://sky.cs.berkeley.edu/), [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/), [Together AI](https://www.together.ai/), [Hyperbolic](https://hyperbolic.xyz/), [Anyscale](https://www.anyscale.com/), and [HuggingFace](https://huggingface.co/) for their generous [sponsorship](https://lmsys.org/donations/).
"""

citation_md = """
### Citation

Please cite the following paper if you find our leaderboard or dataset helpful.

```
@misc{chiang2024chatbot,
    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
    year={2024},
    eprint={2403.04132},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
}
```
"""
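
# Expected shape of the CSV behind `leaderboard_table_file` (a sketch: only
# the 'Timeout' and 'Dataset' columns are required by the filtering logic
# below; the model and metric columns shown here are hypothetical):
#
#     Model,Dataset,Timeout,Score
#     model-a,dataset-1,10,0.42
#     model-b,dataset-1,10,0.37
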
def build_leaderboard_tab(leaderboard_table_file):
    gr.Markdown(make_default_md_1(), elem_id="leaderboard_markdown")
    gr.Markdown(make_default_md_2(), elem_id="leaderboard_markdown")

    df = pd.read_csv(leaderboard_table_file)

    def filter_leaderboard(timeout, dataset):
        # Keep only the rows for the selected settings, then hide the filter columns.
        filtered_df = df[(df['Timeout'] == timeout) & (df['Dataset'] == dataset)]
        return filtered_df.drop(columns=['Timeout', 'Dataset'])

    timeouts = df['Timeout'].unique().tolist()
    datasets = df['Dataset'].unique().tolist()

    with gr.Tab("Leaderboard"):
        gr.Markdown(leaderboard_md, elem_id="leaderboard_markdown")
        with gr.Row():
            timeout_dropdown = gr.Dropdown(label="Timeout", choices=timeouts, value=timeouts[0])
            dataset_dropdown = gr.Dropdown(label="Dataset", choices=datasets, value=datasets[0])

        initial_data = filter_leaderboard(timeouts[0], datasets[0])
        leaderboard = gr.Dataframe(value=initial_data)

        def update_leaderboard(timeout, dataset):
            # Event handlers must return the new component value; calling
            # leaderboard.update(...) inside the handler does not refresh the table.
            return filter_leaderboard(timeout, dataset)

        timeout_dropdown.change(update_leaderboard, [timeout_dropdown, dataset_dropdown], leaderboard)
        dataset_dropdown.change(update_leaderboard, [timeout_dropdown, dataset_dropdown], leaderboard)

    with gr.Accordion("Citation", open=True):
        gr.Markdown(citation_md, elem_id="leaderboard_markdown")
    gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
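

# A minimal entry point (a sketch, not part of the original file):
# build_leaderboard_tab only creates components, so it must be called inside
# a gr.Blocks context. The CSV path below is a hypothetical placeholder.
if __name__ == "__main__":
    with gr.Blocks(title="Leaderboard") as demo:
        build_leaderboard_tab("leaderboard.csv")  # hypothetical path
    demo.launch()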