# long-code-arena / src/submission_uploader.py
import json
import os
from typing import List, Optional
from huggingface_hub import CommitOperationAdd, Discussion, HfApi, HfFileSystem
from .tasks import TASKS_PRETTY_REVERSE
class AlreadyExists(Exception):
    """Raised when an identical submission already exists on the Hub."""
class SubmissionUploader:
    """Adds new files to a dataset on the Hub and opens a PR for them.

    Heavily influenced by these amazing spaces:
    * https://huggingface.co/spaces/safetensors/convert
    * https://huggingface.co/spaces/gaia-benchmark/leaderboard
    """

    def __init__(self, dataset_id: str):
        # Both clients authenticate with the same token from the environment;
        # a missing HF_TOKEN fails fast with a KeyError here.
        hf_token = os.environ["HF_TOKEN"]
        self._api = HfApi(token=hf_token)
        self._fs = HfFileSystem(token=hf_token)
        self._dataset_id = dataset_id
def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
"""Searches among discussions of dataset repo for a PR with the given title."""
try:
discussions = self._api.get_repo_discussions(
repo_id=self._dataset_id, repo_type="dataset"
)
except Exception:
return None
for discussion in discussions:
if (
discussion.status == "open"
and discussion.is_pull_request
and discussion.title == pr_title
):
return discussion
def _upload_files(
self,
task_id: str,
model_folder: str,
model_name_pretty: str,
model_availability: str,
urls: str,
context_size: str,
submitted_by: str,
filenames: Optional[List[str]],
) -> List[CommitOperationAdd]:
# add predictions files
commit_operations = [
CommitOperationAdd(
path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
path_or_fileobj=filename,
)
for filename in filenames
]
# add metadata file
metadata_dict = {
"model_name": model_name_pretty,
"model_availability": model_availability,
"urls": urls,
"context_size": context_size,
"submitted_by": submitted_by,
}
with open("metadata.json", "w") as f:
json.dump(metadata_dict, f)
commit_operations.append(
CommitOperationAdd(
path_in_repo=f"{task_id}/predictions/{model_folder}/metadata.json",
path_or_fileobj="metadata.json",
)
)
return commit_operations
def upload_files(
self,
task_pretty: str,
model_folder: str,
model_name_pretty: str,
model_availability: str,
urls: str,
context_size: str,
submitted_by: str,
filenames: Optional[List[str]],
force: bool = False,
) -> str:
try:
pr_title = f"πŸš€ New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
task_id = TASKS_PRETTY_REVERSE[task_pretty]
if not force:
if model_name_pretty in self._fs.ls(
f"datasets/{self._dataset_id}/{task_id}/predictions"
) and all(
filename
in self._fs.ls(
f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}"
)
for filename in filenames + ["metadata.json"]
):
return (
f"{model_name_pretty} is already present in {self._dataset_id}."
)
prev_pr = self._get_previous_pr(pr_title)
if prev_pr is not None:
url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
return f"{self._dataset_id} already has an open PR for this submission: {url}."
commit_operations = self._upload_files(
task_id=task_id,
model_folder=model_folder,
model_name_pretty=model_name_pretty,
model_availability=model_availability,
urls=urls,
context_size=context_size,
submitted_by=submitted_by,
filenames=filenames,
)
new_pr = self._api.create_commit(
repo_id=self._dataset_id,
operations=commit_operations,
commit_message=pr_title,
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!
* Model name: {model_name_pretty}
* Model availability: {model_availability}
* Context Size: {context_size}
* Relevant URLs: {urls}
* Submitted By: {submitted_by}
""",
create_pr=True,
repo_type="dataset",
)
return f"πŸŽ‰ PR created at {new_pr.pr_url}."
except Exception:
return "An exception occured."