eval-leaderboard

Running

xeon27 committed on Jan 31

Commit

954d8ee

1 Parent(s): 8471f6d

Change model names to reflect version

Files changed (2) hide show

refactor_eval_results.py CHANGED Viewed

@@ -30,7 +30,7 @@ METRIC_NAME = {
 MODEL_SHA_MAP = {
     # open source models
-    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", # TODO: verify for the 08-2024 version
     "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
     "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
     "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
@@ -44,6 +44,22 @@ MODEL_SHA_MAP = {
     "o1": "https://openai.com/o1",
 }
 AGENTIC_LOG_MODEL_NAME_MAP = {
     "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
     "gemini-1.5-pro": "gemini-1.5-pro-002",
@@ -150,6 +166,7 @@ def main():
         requests = {
             "model": model_name,
             "model_sha": MODEL_SHA_MAP[model_name],
             "base_model": "",
             "revision": "main",
             "private": False,

 MODEL_SHA_MAP = {
     # open source models
+    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
     "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
     "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
     "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
     "o1": "https://openai.com/o1",
 }
+MODEL_VERSION_MAP = {
+    # open source models
+    "c4ai-command-r-plus": "c4ai-command-r-plus",
+    "Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
+    "Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
+    "Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",
+    # closed source models
+    "claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
+    "gemini-1.5-flash": "Gemini-1.5-Flash",
+    "gemini-1.5-pro": "Gemini-1.5-Pro-002",
+    "gpt-4o": "GPT-4o-20240806",
+    "gpt-4o-mini": "GPT-4o-mini-20240718",
+    "o1": "o1-20241217",
+}
 AGENTIC_LOG_MODEL_NAME_MAP = {
     "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
     "gemini-1.5-pro": "gemini-1.5-pro-002",
         requests = {
             "model": model_name,
             "model_sha": MODEL_SHA_MAP[model_name],
+            "model_version": MODEL_VERSION_MAP[model_name],
             "base_model": "",
             "revision": "main",
             "private": False,

src/leaderboard/read_evals.py CHANGED Viewed

@@ -20,6 +20,7 @@ class EvalResult:
     full_model: str # org/model (path on hub)
     org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
@@ -103,6 +104,7 @@ class EvalResult:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
@@ -115,7 +117,7 @@ class EvalResult:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.revision),
         }
         for task in Tasks:

     full_model: str # org/model (path on hub)
     org: str
     model: str
+    model_version: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.model_version = request.get("model_version", "")
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.model.name: make_clickable_model(self.model_version, self.revision),
         }
         for task in Tasks: