"""Gradio application that renders the benchmark leaderboard.

The module loads every evaluation result once at import time, builds the
"QA" and "Long Doc" result tabs, and wires the widgets to the table-update
helpers in ``src.utils``.

NOTE(review): this file was reconstructed from a whitespace-collapsed source.
The nesting of the ``with`` blocks (in particular "QA"/"Long Doc" living
inside the "Results" tab) follows the order of the statements - confirm it
against the intended layout.
"""
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler

from src.about import (
    INTRODUCTION_TEXT,
    TITLE
)
from src.benchmarks import (
    QABenchmarks,
    LongDocBenchmarks
)
from src.display.css_html_js import custom_css
from src.envs import (
    API,
    EVAL_RESULTS_PATH,
    REPO_ID,
    DEFAULT_METRIC_QA,
    DEFAULT_METRIC_LONG_DOC,
    METRIC_LIST,
    LATEST_BENCHMARK_VERSION,
    COL_NAME_RERANKING_MODEL,
    COL_NAME_RETRIEVAL_MODEL,
    BM25_LINK
)
from src.loaders import (
    load_eval_results
)
from src.utils import (
    update_metric,
    set_listeners,
    reset_rank,
    remove_html
)
from src.display.gradio_formatting import (
    get_version_dropdown,
    get_search_bar,
    get_reranking_dropdown,
    get_noreranking_dropdown,
    get_metric_dropdown,
    get_domain_dropdown,
    get_language_dropdown,
    get_anonymous_checkbox,
    get_revision_and_ts_checkbox,
    get_leaderboard_table
)

# Value of the reranking-model column for runs that used no reranker.
NO_RERANKER = "NoReranker"


def restart_space():
    """Restart the Space identified by ``REPO_ID`` through ``API``."""
    API.restart_space(repo_id=REPO_ID)


# try:
#     snapshot_download(
#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
#         token=TOKEN
#     )
# except Exception as e:
#     print(f'failed to download')
#     restart_space()

# All loaded results, indexed by benchmark version (see ``data[version]`` below).
data = load_eval_results(EVAL_RESULTS_PATH)
# Results of the currently selected version; rebound by the version handlers.
datastore = data[LATEST_BENCHMARK_VERSION]


def update_metric_qa(
        metric: str,
        domains: list,
        langs: list,
        reranking_model: list,
        query: str,
        show_anonymous: bool,
        show_revision_and_timestamp: bool,
):
    """Delegate to ``update_metric`` for the "qa" task on the current datastore."""
    return update_metric(
        datastore, 'qa', metric, domains, langs, reranking_model, query,
        show_anonymous, show_revision_and_timestamp
    )


def update_metric_long_doc(
        metric: str,
        domains: list,
        langs: list,
        reranking_model: list,
        query: str,
        show_anonymous: bool,
        show_revision_and_timestamp: bool,
):
    """Delegate to ``update_metric`` for the "long-doc" task on the current datastore."""
    return update_metric(
        datastore, "long-doc", metric, domains, langs, reranking_model, query,
        show_anonymous, show_revision_and_timestamp
    )


def _select_rows(df, column, value):
    """Return the rows of ``df`` whose ``column`` equals ``value``, passed through ``reset_rank``."""
    return reset_rank(df[df[column] == value])


def _rebuild_for_version(version, task, column=None, value=None, rerankers_from_rows=False):
    """Switch the active datastore and rebuild the widgets of one result tab.

    The rebuilt widgets mirror what the tab renders at start-up: the visible
    table uses the ``leaderboard_df_*`` frame, the hidden one the ``raw_df_*``
    frame, and an optional ``column == value`` filter is applied to both.

    Args:
        version: key into ``data`` selecting the benchmark version.
        task: ``"qa"`` or ``"long-doc"``.
        column: optional column name to filter both frames on.
        value: value ``column`` must equal when ``column`` is given.
        rerankers_from_rows: when true (and a filter is given), the reranking
            dropdown lists only the models present in the filtered rows.

    Returns:
        A 5-tuple ``(domain dropdown, language dropdown, reranking dropdown,
        visible table, hidden table)``.

    Raises:
        ValueError: if ``task`` is neither ``"qa"`` nor ``"long-doc"``.
    """
    global datastore
    datastore = data[version]

    if task == "qa":
        benchmarks = QABenchmarks[datastore.slug]
        shown_df = datastore.leaderboard_df_qa
        raw_df = datastore.raw_df_qa
        types = datastore.types_qa
    elif task == "long-doc":
        benchmarks = LongDocBenchmarks[datastore.slug]
        shown_df = datastore.leaderboard_df_long_doc
        raw_df = datastore.raw_df_long_doc
        types = datastore.types_long_doc
    else:
        raise ValueError(f"unknown task: {task!r}")

    reranking_models = datastore.reranking_models
    if column is not None:
        shown_df = _select_rows(shown_df, column, value)
        raw_df = _select_rows(raw_df, column, value)
        if rerankers_from_rows:
            reranking_models = (
                shown_df[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
            )

    return (
        get_domain_dropdown(benchmarks),
        get_language_dropdown(benchmarks),
        get_reranking_dropdown(reranking_models),
        get_leaderboard_table(shown_df, types),
        get_leaderboard_table(raw_df, types, visible=False),
    )


def update_datastore(version):
    """Version handler for the QA "Retrieval + Reranking" tab."""
    return _rebuild_for_version(version, "qa")


def update_datastore_long_doc(version):
    """Version handler for the Long Doc "Retrieval + Reranking" tab."""
    return _rebuild_for_version(version, "long-doc")


def update_datastore_qa_retriever(version):
    """Version handler for the QA "Retrieval Only" tab (rows without a reranker)."""
    return _rebuild_for_version(
        version, "qa", column=COL_NAME_RERANKING_MODEL, value=NO_RERANKER
    )


def update_datastore_qa_reranker(version):
    """Version handler for the QA "Reranking Only" tab (rows retrieved with BM25)."""
    return _rebuild_for_version(
        version, "qa", column=COL_NAME_RETRIEVAL_MODEL, value=BM25_LINK,
        rerankers_from_rows=True
    )


# DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(LongDocBenchmarks)]))
# LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(LongDocBenchmarks)]))

demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Results", elem_id="results-tab-table"):
            with gr.Row():
                selected_version = get_version_dropdown()

            with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
                        # select language
                        with gr.Row():
                            selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
                    with gr.Column():
                        # select the metric
                        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_revision_and_timestamp = get_revision_and_ts_checkbox()

                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
                    with gr.TabItem("Retrieval + Reranking", id=10):
                        with gr.Row():
                            # search retrieval models
                            with gr.Column():
                                search_bar = get_search_bar()
                            # select reranking models
                            with gr.Column():
                                selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
                        # shown_table
                        lb_table = get_leaderboard_table(
                            datastore.leaderboard_df_qa, datastore.types_qa)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        hidden_lb_table = get_leaderboard_table(
                            datastore.raw_df_qa, datastore.types_qa, visible=False)

                        selected_version.change(
                            update_datastore,
                            [selected_version, ],
                            [selected_domains, selected_langs, selected_rerankings,
                             lb_table, hidden_lb_table]
                        )

                        set_listeners(
                            "qa",
                            lb_table,
                            hidden_lb_table,
                            search_bar,
                            selected_version,
                            selected_domains,
                            selected_langs,
                            selected_rerankings,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )

                        # set metric listener
                        selected_metric.change(
                            update_metric_qa,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_rerankings,
                                search_bar,
                                show_anonymous,
                                show_revision_and_timestamp,
                            ],
                            lb_table,
                            queue=True
                        )

                    with gr.TabItem("Retrieval Only", id=11):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_retriever = get_search_bar()
                            with gr.Column(scale=1):
                                selected_noreranker = get_noreranking_dropdown()

                        lb_df_retriever = _select_rows(
                            datastore.leaderboard_df_qa, COL_NAME_RERANKING_MODEL, NO_RERANKER)
                        lb_table_retriever = get_leaderboard_table(lb_df_retriever, datastore.types_qa)

                        # Dummy leaderboard for handling the case when the user uses backspace key
                        hidden_lb_df_retriever = _select_rows(
                            datastore.raw_df_qa, COL_NAME_RERANKING_MODEL, NO_RERANKER)
                        hidden_lb_table_retriever = get_leaderboard_table(
                            hidden_lb_df_retriever, datastore.types_qa, visible=False)

                        # Use the filtered handler so a version switch keeps this
                        # tab restricted to the no-reranker rows.
                        selected_version.change(
                            update_datastore_qa_retriever,
                            [selected_version, ],
                            [selected_domains, selected_langs, selected_rerankings,
                             lb_table_retriever, hidden_lb_table_retriever]
                        )

                        set_listeners(
                            "qa",
                            lb_table_retriever,
                            hidden_lb_table_retriever,
                            search_bar_retriever,
                            selected_version,
                            selected_domains,
                            selected_langs,
                            selected_noreranker,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )

                        # set metric listener
                        selected_metric.change(
                            update_metric_qa,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_noreranker,
                                search_bar_retriever,
                                show_anonymous,
                                show_revision_and_timestamp,
                            ],
                            lb_table_retriever,
                            queue=True
                        )

                    with gr.TabItem("Reranking Only", id=12):
                        lb_df_reranker = _select_rows(
                            datastore.leaderboard_df_qa, COL_NAME_RETRIEVAL_MODEL, BM25_LINK)
                        reranking_models_reranker = (
                            lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        )
                        with gr.Row():
                            with gr.Column(scale=1):
                                selected_rerankings_reranker = get_reranking_dropdown(
                                    reranking_models_reranker)
                            with gr.Column(scale=1):
                                # Invisible textbox standing in for the search bar on this tab.
                                search_bar_reranker = gr.Textbox(show_label=False, visible=False)
                        lb_table_reranker = get_leaderboard_table(lb_df_reranker, datastore.types_qa)

                        hidden_lb_df_reranker = _select_rows(
                            datastore.raw_df_qa, COL_NAME_RETRIEVAL_MODEL, BM25_LINK)
                        hidden_lb_table_reranker = get_leaderboard_table(
                            hidden_lb_df_reranker, datastore.types_qa, visible=False
                        )

                        # Use the filtered handler so a version switch keeps this
                        # tab restricted to the BM25 rows and their rerankers.
                        selected_version.change(
                            update_datastore_qa_reranker,
                            [selected_version, ],
                            [selected_domains, selected_langs, selected_rerankings_reranker,
                             lb_table_reranker, hidden_lb_table_reranker]
                        )

                        set_listeners(
                            "qa",
                            lb_table_reranker,
                            hidden_lb_table_reranker,
                            search_bar_reranker,
                            selected_version,
                            selected_domains,
                            selected_langs,
                            selected_rerankings_reranker,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )

                        # set metric listener
                        selected_metric.change(
                            update_metric_qa,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_rerankings_reranker,
                                search_bar_reranker,
                                show_anonymous,
                                show_revision_and_timestamp,
                            ],
                            lb_table_reranker,
                            queue=True
                        )

            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            selected_domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
                        # select language
                        with gr.Row():
                            selected_langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
                    with gr.Column():
                        # select the metric
                        with gr.Row():
                            selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_revision_and_timestamp = get_revision_and_ts_checkbox()

                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
                    with gr.TabItem("Retrieval + Reranking", id=20):
                        with gr.Row():
                            with gr.Column():
                                search_bar = get_search_bar()
                            # select reranking model
                            with gr.Column():
                                selected_rerankings = get_reranking_dropdown(datastore.reranking_models)

                        lb_table = get_leaderboard_table(
                            datastore.leaderboard_df_long_doc, datastore.types_long_doc
                        )

                        # Dummy leaderboard for handling the case when the user uses backspace key
                        hidden_lb_table = get_leaderboard_table(
                            datastore.raw_df_long_doc, datastore.types_long_doc, visible=False
                        )

                        # The long-doc handler rebuilds these widgets from the
                        # long-doc benchmarks and frames (not the QA ones).
                        selected_version.change(
                            update_datastore_long_doc,
                            [selected_version, ],
                            [selected_domains, selected_langs, selected_rerankings,
                             lb_table, hidden_lb_table]
                        )

                        set_listeners(
                            "long-doc",
                            lb_table,
                            hidden_lb_table,
                            search_bar,
                            selected_version,
                            selected_domains,
                            selected_langs,
                            selected_rerankings,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )

                        # set metric listener
                        selected_metric.change(
                            update_metric_long_doc,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_rerankings,
                                search_bar,
                                show_anonymous,
                                show_revision_and_timestamp
                            ],
                            lb_table,
                            queue=True
                        )

                    # NOTE: everything inside the bare string literal below is
                    # disabled code. It references names that are not among the
                    # imports of this module (EVALUATION_QUEUE_TEXT,
                    # BENCHMARK_VERSION_LIST, upload_file, submit_results,
                    # BENCHMARKS_TEXT) and calls set_listeners without the
                    # version argument, so it cannot be re-enabled as is.
                    """
                    with gr.TabItem("Retrieval Only", id=21):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_retriever = get_search_bar()
                            with gr.Column(scale=1):
                                selected_noreranker = get_noreranking_dropdown()
                        lb_df_retriever_long_doc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[
                            data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                        ]
                        lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
                        hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].raw_df_long_doc[
                            data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                        ]
                        hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
                        lb_table_retriever_long_doc = get_leaderboard_table(
                            lb_df_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc)
                        hidden_lb_table_retriever_long_doc = get_leaderboard_table(
                            hidden_lb_db_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
                        )

                        set_listeners(
                            "long-doc",
                            lb_table_retriever_long_doc,
                            hidden_lb_table_retriever_long_doc,
                            search_bar_retriever,
                            selected_domains,
                            selected_langs,
                            selected_noreranker,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )

                        selected_metric.change(
                            update_metric_long_doc,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_noreranker,
                                search_bar_retriever,
                                show_anonymous,
                                show_revision_and_timestamp,
                            ],
                            lb_table_retriever_long_doc,
                            queue=True
                        )
                    with gr.TabItem("Reranking Only", id=22):
                        lb_df_reranker_ldoc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[
                            data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
                        ]
                        lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
                        reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        with gr.Row():
                            with gr.Column(scale=1):
                                selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
                            with gr.Column(scale=1):
                                search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
                        lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
                        hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].raw_df_long_doc[data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                        hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
                        hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                            hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
                        )

                        set_listeners(
                            "long-doc",
                            lb_table_reranker_ldoc,
                            hidden_lb_table_reranker_ldoc,
                            search_bar_reranker_ldoc,
                            selected_domains,
                            selected_langs,
                            selected_rerankings_reranker_ldoc,
                            show_anonymous,
                            show_revision_and_timestamp,
                        )
                        selected_metric.change(
                            update_metric_long_doc,
                            [
                                selected_metric,
                                selected_domains,
                                selected_langs,
                                selected_rerankings_reranker_ldoc,
                                search_bar_reranker_ldoc,
                                show_anonymous,
                                show_revision_and_timestamp,
                            ],
                            lb_table_reranker_ldoc,
                            queue=True
                        )

        with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
                with gr.Row():
                    with gr.Column():
                        model_name = gr.Textbox(label="Retrieval Method name")
                    with gr.Column():
                        model_url = gr.Textbox(label="Retrieval Method URL")
                with gr.Row():
                    with gr.Column():
                        reranking_model_name = gr.Textbox(
                            label="Reranking Model name",
                            info="Optional",
                            value="NoReranker"
                        )
                    with gr.Column():
                        reranking_model_url = gr.Textbox(
                            label="Reranking Model URL",
                            info="Optional",
                            value=""
                        )
                with gr.Row():
                    with gr.Column():
                        benchmark_version = gr.Dropdown(
                            BENCHMARK_VERSION_LIST,
                            value=LATEST_BENCHMARK_VERSION,
                            interactive=True,
                            label="AIR-Bench Version")
                with gr.Row():
                    upload_button = gr.UploadButton("Click to upload search results", file_count="single")
                with gr.Row():
                    file_output = gr.File()
                with gr.Row():
                    is_anonymous = gr.Checkbox(
                        label="Nope. I want to submit anonymously 🥷",
                        value=False,
                        info="Do you want to shown on the leaderboard by default?")
                with gr.Row():
                    submit_button = gr.Button("Submit")
                with gr.Row():
                    submission_result = gr.Markdown()
                upload_button.upload(
                    upload_file,
                    [
                        upload_button,
                    ],
                    file_output)
                submit_button.click(
                    submit_results,
                    [
                        file_output,
                        model_name,
                        model_url,
                        reranking_model_name,
                        reranking_model_url,
                        benchmark_version,
                        is_anonymous
                    ],
                    submission_result,
                    show_progress="hidden"
                )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
                    """

if __name__ == "__main__":
    # Restart the Space every 30 minutes (1800 s) from a background thread.
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
    demo.queue(default_concurrency_limit=40)
    demo.launch()