Clémentine committed on
Commit
51195ac
·
1 Parent(s): 2ff76cf

reordered file saving step + added new catch

Browse files
Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -34,7 +34,8 @@ ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3:
34
 
35
  os.makedirs("scored", exist_ok=True)
36
 
37
- LOCAL_DEBUG = False
 
38
 
39
  # Display the results
40
  eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
@@ -111,7 +112,7 @@ def add_new_eval(
111
  if path_to_file is None:
112
  return format_warning("Please attach a file.")
113
 
114
- # Save submitted file
115
  if LOCAL_DEBUG:
116
  print("mock uploaded submission")
117
  else:
@@ -123,7 +124,23 @@ def add_new_eval(
123
  token=TOKEN
124
  )
125
 
126
- # Compute score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  file_path = path_to_file.name
128
  scores = {"all": 0, 1: 0, 2: 0, 3: 0}
129
  num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
@@ -169,7 +186,7 @@ def add_new_eval(
169
  if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
170
  return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
171
 
172
- # Save scored file
173
  if LOCAL_DEBUG:
174
  print("mock uploaded scored submission")
175
  else:
@@ -191,7 +208,7 @@ def add_new_eval(
191
  token=TOKEN
192
  )
193
 
194
- # Actual submission
195
  eval_entry = {
196
  "model": model,
197
  "model_family": model_family,
@@ -206,6 +223,9 @@ def add_new_eval(
206
  }
207
  if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
208
  return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
 
 
 
209
 
210
  # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
211
  #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
@@ -220,20 +240,6 @@ def add_new_eval(
220
  else:
221
  eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
222
 
223
- contact_info = {
224
- "model": model,
225
- "model_family": model_family,
226
- "url": url,
227
- "organisation": organisation,
228
- "username": profile.username,
229
- "mail": mail,
230
- "date": datetime.datetime.today().strftime('%Y-%m-%d')
231
- }
232
- contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
233
- if LOCAL_DEBUG:
234
- print("mock uploaded contact info")
235
- else:
236
- contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
237
 
238
  return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
239
 
 
34
 
35
  os.makedirs("scored", exist_ok=True)
36
 
37
+ # Should be False on spaces and True outside
38
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
39
 
40
  # Display the results
41
  eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
 
112
  if path_to_file is None:
113
  return format_warning("Please attach a file.")
114
 
115
+ # SAVE UNSCORED SUBMISSION
116
  if LOCAL_DEBUG:
117
  print("mock uploaded submission")
118
  else:
 
124
  token=TOKEN
125
  )
126
 
127
+ # SAVE CONTACT
128
+ contact_info = {
129
+ "model": model,
130
+ "model_family": model_family,
131
+ "url": url,
132
+ "organisation": organisation,
133
+ "username": profile.username,
134
+ "mail": mail,
135
+ "date": datetime.datetime.today().strftime('%Y-%m-%d')
136
+ }
137
+ contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
138
+ if LOCAL_DEBUG:
139
+ print("mock uploaded contact info")
140
+ else:
141
+ contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
142
+
143
+ # SCORE SUBMISSION
144
  file_path = path_to_file.name
145
  scores = {"all": 0, 1: 0, 2: 0, 3: 0}
146
  num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
 
186
  if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
187
  return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
188
 
189
+ # SAVE SCORED SUBMISSION
190
  if LOCAL_DEBUG:
191
  print("mock uploaded scored submission")
192
  else:
 
208
  token=TOKEN
209
  )
210
 
211
+ # SAVE TO LEADERBOARD DATA
212
  eval_entry = {
213
  "model": model,
214
  "model_family": model_family,
 
223
  }
224
  if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
225
  return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
226
+ # Catching spam submissions of 100%
227
+ if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
228
+ return format_error(f"There was a problem with your submission. Please open a discussion.")
229
 
230
  # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
231
  #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
 
240
  else:
241
  eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
245