Signed-off-by: Jonathan Bnayahu <[email protected]>
- app.py +6 -5
- src/display/utils.py +1 -14
app.py CHANGED
@@ -30,11 +30,12 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         search_columns=[AutoEvalColumn.model.name],
-        interactive=False
+        interactive=False
     )
 
 def download_csv():
@@ -42,15 +43,15 @@ def download_csv():
     LEADERBOARD_DF.to_csv(buffer, index=False)
     return buffer.getvalue()
 
-demo = gr.Blocks(css=custom_css)
-with demo:
+gui = gr.Blocks(css=custom_css)
+with gui:
     gr.HTML(TITLE_IMAGE)
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            leaderboard = init_leaderboard(LEADERBOARD_DF.style.highlight_max(color = 'lightgreen', axis=0).data)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -82,4 +83,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+gui.queue(default_concurrency_limit=40).launch()
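The reworked init_leaderboard call routes LEADERBOARD_DF through a pandas Styler before handing the frame to the Leaderboard component. A minimal sketch of that chain, using a hypothetical toy frame (df, styler, plain are illustrative names, not from the repo): .style returns a Styler, highlight_max(axis=0) flags each column's maximum with the given background color, and .data is the plain DataFrame the Styler wraps.

import pandas as pd

# Toy stand-in for LEADERBOARD_DF; the values are illustrative only.
df = pd.DataFrame({"Model": ["model-a", "model-b"], "Average": [71.2, 68.9]})

# .style returns a pandas Styler; highlight_max(axis=0) marks each column's
# maximum with the given CSS background color when the Styler is rendered.
styler = df.style.highlight_max(color="lightgreen", axis=0, subset=["Average"])

# .data is the underlying DataFrame the Styler wraps; the highlighting itself
# lives on the Styler object, not on this frame.
plain = styler.data
print(plain.equals(df))  # True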
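The tail of app.py keeps the existing pattern: an APScheduler background job restarts the Space every 1800 seconds, and the Blocks app (now bound to gui) is launched with a request queue. A minimal self-contained sketch of that pattern, with restart_space stubbed out for illustration (it stands in for the repo's real function):

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    # Stand-in for the repo's restart_space(); stubbed here for illustration.
    print("restart requested")

gui = gr.Blocks()
with gui:
    gr.Markdown("placeholder UI")

# Run the restart job every 30 minutes in a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# queue() bounds how many event handlers run concurrently; launch() starts the app.
gui.queue(default_concurrency_limit=40).launch()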
src/display/utils.py CHANGED
@@ -25,23 +25,13 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn: # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
@@ -52,8 +42,5 @@ class ModelDetails:
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
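For context on the retained column plumbing: make_dataclass accepts (field_name, field_type, default) triples, so each ColumnContent instance in auto_eval_column_dict becomes the default of a field on the generated AutoEvalColumn class, and display-facing lists such as COLS are read back from those defaults. A minimal sketch with simplified, hypothetical stand-ins for ColumnContent and the repo's field-listing helper (column_defaults is an illustrative name):

from dataclasses import dataclass, make_dataclass

# Simplified stand-in for the repo's ColumnContent dataclass.
@dataclass(frozen=True)
class ColumnContent:
    name: str                  # display name, used as the DataFrame column header
    type: str                  # column datatype ("markdown", "number", ...)
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Each entry is (field_name, field_type, default), the triple make_dataclass accepts.
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average", "number", True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# The defaults end up as class attributes, so the per-column metadata can be
# read straight off the generated class.
def column_defaults(cls):
    return [v for v in vars(cls).values() if isinstance(v, ColumnContent)]

COLS = [c.name for c in column_defaults(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['Model', 'Average']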