# JavaBench / app.py
import gradio as gr
import pandas as pd
import json
from gradio_leaderboard import Leaderboard, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
AutoEvalColumn,
fields
)
from src.envs import API, REPO_ID
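
# Restart the Space; the BackgroundScheduler at the bottom of this file calls
# this every 30 minutes.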
def restart_space():
API.restart_space(repo_id=REPO_ID)
def init_leaderboard(data_file):
    # The JSON file maps each metric name to a list of records with
    # 'Context', 'Method', 'Model', and 'Pass_at_1' fields.
    with open(data_file, "r") as fp:
        data = json.load(fp)

    # Merge the per-metric tables into one wide dataframe: each metric's
    # 'Pass_at_1' values become a column named after the metric.
    dataframe = pd.DataFrame()
    for key, value in data.items():
        col_df = pd.DataFrame(value)
        col_df.rename(columns={"Pass_at_1": key}, inplace=True)
        dataframe = col_df if dataframe.empty else dataframe.merge(col_df, on=['Context', 'Method', 'Model'], how='outer')

    # Sum the metric columns and divide by five for the aggregate 'Score', then
    # express all numeric columns as percentages rounded to one decimal place.
    dataframe['Score'] = dataframe.drop(columns=['Context', 'Method', 'Model']).sum(axis=1) / 5
    numeric_cols = dataframe.select_dtypes(include='number').columns
    dataframe[numeric_cols] = dataframe[numeric_cols].apply(lambda x: x * 100).round(1)

    # Move 'Score' right after the identifier columns and rank entries best-first.
    cols = list(dataframe.columns)
    cols.insert(3, cols.pop(cols.index('Score')))
    dataframe = dataframe[cols]
    dataframe = dataframe.sort_values(by='Score', ascending=False)
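    # Render as a read-only Gradio table; headers and datatypes come from the
    # AutoEvalColumn display schema.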
return gr.components.DataFrame(
value=dataframe,
headers=[c.name for c in fields(AutoEvalColumn) if not c.hidden],
datatype=[c.type for c in fields(AutoEvalColumn)],
interactive=False,
)
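
# A minimal sketch of the expected input layout, inferred from init_leaderboard
# above ("SomeMetric" is a hypothetical key, not taken from the actual data files):
#
# {
#   "SomeMetric": [
#     {"Context": "...", "Method": "...", "Model": "...", "Pass_at_1": 0.42},
#     ...
#   ],
#   ...
# }

# Build the UI: one leaderboard tab per evaluation setting, plus a submission tab.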
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.HTML(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("[Method] Evaluation", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard("./data/data_method.json")
with gr.TabItem("[Context] Evaluation", elem_id="llm-benchmark-tab-table", id=1):
leaderboard = init_leaderboard("./data/data_context.json")
with gr.TabItem("[Incremental] Evaluation", elem_id="llm-benchmark-tab-table", id=2):
leaderboard = init_leaderboard("./data/data_incr-order.json")
# with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submission", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
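
# Schedule restart_space() every 1800 seconds (30 minutes).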
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
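
# Enable the request queue with a default per-event concurrency limit of 40, then launch.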
demo.queue(default_concurrency_limit=40).launch()