jbnayahu commited on
Commit
382809d
·
unverified ·
1 Parent(s): 460efe2

Signed-off-by: Jonathan Bnayahu <[email protected]>

Files changed (2) hide show
  1. src/about.py +6 -0
  2. src/leaderboard/read_evals.py +4 -1
src/about.py CHANGED
@@ -46,6 +46,12 @@ As a dynamic and evolving benchmark, BlueBench currently encompasses diverse dom
46
  LLM_BENCHMARKS_TEXT = """
47
  ## How it works
48
 
 
 
 
 
 
 
49
  BlueBench is comprised of the following subtasks:
50
 
51
  <style>
 
46
  LLM_BENCHMARKS_TEXT = """
47
  ## How it works
48
 
49
+ BlueBench was designed with four goals in mind: representativeness, validity, robustness, and efficiency.
50
+ * **Representative**: the task distribution represents the skills required in an enterprise setting
51
+ * **Valid**: tasks measure what they aim to measure
52
+ * **Robust**: going beyond single-prompt evaluation due to models’ brittleness
53
+ * **Efficient**: evaluation is fast (cheap)
54
+
55
  BlueBench is comprised of the following subtasks:
56
 
57
  <style>
src/leaderboard/read_evals.py CHANGED
@@ -69,9 +69,12 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
69
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
70
  continue
71
 
 
 
 
72
  # Sort the files by date
73
  try:
74
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
75
  except dateutil.parser._parser.ParserError:
76
  files = [files[-1]]
77
 
 
69
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
70
  continue
71
 
72
+ # skip anything not results
73
+ files = [f for f in files if (f.endswith("_evaluation_results.json"))]
74
+
75
  # Sort the files by date
76
  try:
77
+ files.sort(key=lambda x: x.removesuffix("_evaluation_results.json"))
78
  except dateutil.parser._parser.ParserError:
79
  files = [files[-1]]
80