Files changed:
- app.py: +2 -1
- src/populate.py: +2 -2
app.py (CHANGED)

@@ -50,6 +50,7 @@ except Exception:
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF_N_CORRECT = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, version="n_correct")
 
 (
     finished_eval_queue_df,
@@ -86,7 +87,7 @@ with demo:
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            leaderboard = init_leaderboard(LEADERBOARD_DF_N_CORRECT)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table-n-correct", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
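With this change, app.py builds two leaderboard dataframes from the same results tree and shows the n_correct variant on the second tab. A minimal sketch of the call pattern; the constants (EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) come from the rest of the repo, and the per-version folder layout is an assumption, not shown in this diff:

# Sketch only; assumes EVAL_RESULTS_PATH contains one subfolder per scoring
# variant (e.g. 1_correct/ and n_correct/) holding the per-model result JSONs.
from src.populate import get_leaderboard_df

# Default tab: version falls back to "1_correct" in the patched function.
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

# New tab: same request queue and columns, results read from the n_correct/ subfolder.
LEADERBOARD_DF_N_CORRECT = get_leaderboard_df(
    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, version="n_correct"
)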
src/populate.py (CHANGED)

@@ -8,9 +8,9 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, version="1_correct") -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path+"/"+version, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     print(all_data_json)
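The new version argument is appended to results_path with plain string concatenation, so the per-version folder must exist under the results directory. A hypothetical helper (not part of this commit) that does the same resolution with os.path.join and fails early if the folder is missing:

import os

# Hypothetical helper, not in the repo: resolve the per-version results
# folder that get_leaderboard_df now reads, and validate it exists.
def resolve_results_path(results_path: str, version: str = "1_correct") -> str:
    versioned = os.path.join(results_path, version)  # same effect as results_path + "/" + version
    if not os.path.isdir(versioned):
        raise FileNotFoundError(f"No results folder for version {version!r}: {versioned}")
    return versioned

# Usage sketch:
# raw_data = get_raw_eval_results(resolve_results_path(EVAL_RESULTS_PATH, "n_correct"), EVAL_REQUESTS_PATH)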