pminervini committed on
Commit
6c79b12
·
1 Parent(s): 84fb473
Files changed (2) hide show
  1. backend-cli.py +18 -12
  2. src/backend/envs.py +5 -4
backend-cli.py CHANGED
@@ -8,14 +8,14 @@ from huggingface_hub import snapshot_download
8
  from src.backend.run_eval_suite import run_evaluation
9
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
  from src.backend.sort_queue import sort_models_by_priority
11
- from src.backend.envs import Tasks, NUM_FEWSHOT, EVAL_REQUESTS_PATH_BACKEND,EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
12
 
13
  from src.envs import QUEUE_REPO, RESULTS_REPO, API
14
 
15
  import logging
16
  import pprint
17
 
18
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
19
 
20
  logging.getLogger("openai").setLevel(logging.WARNING)
21
 
@@ -56,19 +56,25 @@ def run_auto_eval():
56
  set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
57
  local_dir=EVAL_REQUESTS_PATH_BACKEND)
58
 
59
- results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
60
- batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
61
 
62
- dumped = json.dumps(results, indent=2)
63
- print(dumped)
64
 
65
- output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
66
- os.makedirs(os.path.dirname(output_path), exist_ok=True)
67
- with open(output_path, "w") as f:
68
- f.write(dumped)
69
 
70
- API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
71
- repo_id=RESULTS_REPO, repo_type="dataset")
 
 
 
 
 
 
 
 
72
 
73
  set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
74
  local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
8
  from src.backend.run_eval_suite import run_evaluation
9
  from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
  from src.backend.sort_queue import sort_models_by_priority
11
+ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND,EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
12
 
13
  from src.envs import QUEUE_REPO, RESULTS_REPO, API
14
 
15
  import logging
16
  import pprint
17
 
18
+ # TASKS_HARNESS = [task.value.benchmark for task in Tasks]
19
 
20
  logging.getLogger("openai").setLevel(logging.WARNING)
21
 
 
56
  set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
57
  local_dir=EVAL_REQUESTS_PATH_BACKEND)
58
 
59
+ # results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
60
+ # batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
61
 
62
+ TASKS_HARNESS = [task.value for task in Tasks]
 
63
 
64
+ for task in TASKS_HARNESS:
65
+ results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
66
+ batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
 
67
 
68
+ dumped = json.dumps(results, indent=2)
69
+ print(dumped)
70
+
71
+ output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
72
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
73
+ with open(output_path, "w") as f:
74
+ f.write(dumped)
75
+
76
+ API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
77
+ repo_id=RESULTS_REPO, repo_type="dataset")
78
 
79
  set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
80
  local_dir=EVAL_REQUESTS_PATH_BACKEND)
src/backend/envs.py CHANGED
@@ -13,21 +13,22 @@ class Task:
13
  benchmark: str
14
  metric: str
15
  col_name: str
 
16
 
17
 
18
  class Tasks(Enum):
19
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
20
  # task0 = Task("anli_r1", "acc", "ANLI")
21
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
22
- task0 = Task("nq_open", "em", "NQ Open")
23
- task1 = Task("triviaqa", "em", "TriviaQA")
24
 
25
 
26
- NUM_FEWSHOT = 64 # Change with your few shot
27
 
28
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
29
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
30
 
31
  DEVICE = "cuda:0" if torch.cuda.is_available() else 'cpu'
32
 
33
- LIMIT = 32 # Testing; needs to be None
 
13
  benchmark: str
14
  metric: str
15
  col_name: str
16
+ num_fewshot: int
17
 
18
 
19
  class Tasks(Enum):
20
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
21
  # task0 = Task("anli_r1", "acc", "ANLI")
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
+ task0 = Task("nq_open", "em", "NQ Open", 64)
24
+ task1 = Task("triviaqa", "em", "TriviaQA", 64)
25
 
26
 
27
+ # NUM_FEWSHOT = 64 # Change with your few shot
28
 
29
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
30
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
31
 
32
  DEVICE = "cuda:0" if torch.cuda.is_available() else 'cpu'
33
 
34
+ LIMIT = None # Testing; needs to be None