Clémentine committed on
Commit
3e6770c
·
1 Parent(s): 95c19d6

simplified backend

Browse files
main_backend_harness.py CHANGED
@@ -10,9 +10,10 @@ from src.backend.manage_requests import check_completed_evals, get_eval_requests
10
  from src.backend.sort_queue import sort_models_by_priority
11
 
12
  from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
13
- from src.about import Tasks, NUM_FEWSHOT
14
  from src.logging import setup_logger
15
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
16
 
17
  # logging.basicConfig(level=logging.ERROR)
18
  logger = setup_logger(__name__)
 
10
  from src.backend.sort_queue import sort_models_by_priority
11
 
12
  from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
13
+ from src.envs import TASKS_HARNESS, NUM_FEWSHOT
14
  from src.logging import setup_logger
15
+
16
+
17
 
18
  # logging.basicConfig(level=logging.ERROR)
19
  logger = setup_logger(__name__)
main_backend_lighteval.py CHANGED
@@ -9,8 +9,7 @@ from src.backend.run_eval_suite_lighteval import run_evaluation
9
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
  from src.backend.sort_queue import sort_models_by_priority
11
 
12
- from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
13
- from src.about import TASKS_LIGHTEVAL
14
  from src.logging import setup_logger
15
 
16
  logger = setup_logger(__name__)
 
9
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
  from src.backend.sort_queue import sort_models_by_priority
11
 
12
+ from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION, TASKS_LIGHTEVAL
 
13
  from src.logging import setup_logger
14
 
15
  logger = setup_logger(__name__)
src/about.py DELETED
@@ -1,24 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Change for your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
-
20
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
21
- # ---------------------------------------------------
22
-
23
- TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
24
- #custom|myothertask|0|0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/envs.py CHANGED
@@ -4,20 +4,24 @@ from huggingface_hub import HfApi
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
- TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
10
 
11
  # For harness evaluations
12
  DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
13
- LIMIT = 20 # !!!! Should be None for actual evaluations!!!
 
 
14
 
15
  # For lighteval evaluations
16
  ACCELERATOR = "cpu"
17
  REGION = "us-east-1"
18
  VENDOR = "aws"
19
- # ----------------------------------
 
20
 
 
21
  REPO_ID = f"{OWNER}/leaderboard-backend"
22
  QUEUE_REPO = f"{OWNER}/requests"
23
  RESULTS_REPO = f"{OWNER}/results"
 
4
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset
10
 
11
  # For harness evaluations
12
  DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
13
+ LIMIT = 20 # !!!! For testing, should be None for actual evaluations!!!
14
+ NUM_FEWSHOT = 0 # Change with your few shot for the Harness evaluations
15
+ TASKS_HARNESS = ["anli_r1", "logiqa"]
16
 
17
  # For lighteval evaluations
18
  ACCELERATOR = "cpu"
19
  REGION = "us-east-1"
20
  VENDOR = "aws"
21
+ TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
22
+ # To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0`
23
 
24
+ # ---------------------------------------------------
25
  REPO_ID = f"{OWNER}/leaderboard-backend"
26
  QUEUE_REPO = f"{OWNER}/requests"
27
  RESULTS_REPO = f"{OWNER}/results"
src/populate.py DELETED
@@ -1,56 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- raw_data = get_raw_eval_results(results_path, requests_path)
13
- all_data_json = [v.to_dict() for v in raw_data]
14
-
15
- df = pd.DataFrame.from_records(all_data_json)
16
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
17
- df = df[cols].round(decimals=2)
18
-
19
- # filter out if any of the benchmarks have not been produced
20
- df = df[has_no_nan_values(df, benchmark_cols)]
21
- return raw_data, df
22
-
23
-
24
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
25
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
26
- all_evals = []
27
-
28
- for entry in entries:
29
- if ".json" in entry:
30
- file_path = os.path.join(save_path, entry)
31
- with open(file_path) as fp:
32
- data = json.load(fp)
33
-
34
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
35
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
36
-
37
- all_evals.append(data)
38
- elif ".md" not in entry:
39
- # this is a folder
40
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
41
- for sub_entry in sub_entries:
42
- file_path = os.path.join(save_path, entry, sub_entry)
43
- with open(file_path) as fp:
44
- data = json.load(fp)
45
-
46
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
47
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
48
- all_evals.append(data)
49
-
50
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
51
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
52
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
53
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
54
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
55
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
56
- return df_finished[cols], df_running[cols], df_pending[cols]