Signed-off-by: Jonathan Bnayahu <[email protected]>
- src/about.py +6 -0
- src/leaderboard/read_evals.py +4 -1

src/about.py:

```diff
@@ -46,6 +46,12 @@ As a dynamic and evolving benchmark, BlueBench currently encompasses diverse dom
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
+BlueBench was designed with four goals in mind: representativeness, validity, robustness, and efficiency.
+* **Representative**: the task distribution reflects the skills required in an enterprise setting
+* **Valid**: tasks measure what they aim to measure
+* **Robust**: goes beyond single-prompt evaluation to account for model brittleness
+* **Efficient**: evaluation is fast (cheap)
+
 BlueBench is comprised of the following subtasks:
 
 <style>
```
src/leaderboard/read_evals.py:

```diff
@@ -69,9 +69,12 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
+        # skip anything not results
+        files = [f for f in files if (f.endswith("_evaluation_results.json"))]
+
         # Sort the files by date
         try:
-            files.sort(key=lambda x: x.removesuffix(".json"))
+            files.sort(key=lambda x: x.removesuffix("_evaluation_results.json"))
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
```
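
For context, here is a minimal sketch of what the patched filter-and-sort does, assuming result files carry an ISO-like timestamp prefix followed by `_evaluation_results.json` (the filenames below are hypothetical examples, not taken from the repository):

```python
# Hypothetical directory listing; only the naming convention is assumed.
files = [
    "2024-05-01T10-00-00_evaluation_results.json",
    "run_metadata.json",  # not a results file: dropped by the new filter
    "2024-06-02T09-30-00_evaluation_results.json",
]

# The new filter: keep only evaluation-result files.
files = [f for f in files if f.endswith("_evaluation_results.json")]

# The new sort key strips the full suffix, leaving the timestamp prefix,
# so lexicographic order matches chronological order.
files.sort(key=lambda x: x.removesuffix("_evaluation_results.json"))

print(files[-1])  # -> "2024-06-02T09-30-00_evaluation_results.json", the latest run
```

Stripping the full `_evaluation_results.json` suffix (rather than just `.json`, as before) keeps the sort key aligned with the new filename convention, and filtering first means unrelated JSON files no longer disturb the date-based ordering.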