import os from datetime import datetime from pathlib import Path import numpy as np import pandas as pd import streamlit as st from datasets import get_dataset_config_names from dotenv import load_dotenv from huggingface_hub import DatasetFilter, list_datasets if Path(".env").is_file(): load_dotenv(".env") auth_token = os.getenv("HF_HUB_TOKEN") TASKS = sorted(get_dataset_config_names("ought/raft")) # Split and capitalize the task names, e.g. banking_77 => Banking 77 FORMATTED_TASK_NAMES = sorted([" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]) def extract_tags(dataset): tags = {} for tag in dataset.tags: k, v = tuple(tag.split(":", 1)) tags[k] = v return tags def download_submissions(): filt = DatasetFilter(benchmark="raft") all_submissions = list_datasets(filter=filt, cardData=True, use_auth_token=auth_token) submissions = [] for dataset in all_submissions: tags = extract_tags(dataset) if tags.get("type") == "evaluation": submissions.append(dataset) return submissions def format_submissions(submissions): submission_data = { **{"Submitter": []}, **{"Submission Name": []}, **{"Submission Date": []}, **{t: [] for t in TASKS}, } # The following picks the latest submissions which adhere to the model card schema for submission in submissions: submission_id = submission.id card_data = submission.cardData username = card_data["submission_dataset"].split("/")[0] submission_data["Submitter"].append(username) submission_id = card_data["submission_id"] submission_name, sha, timestamp = submission_id.split("__") # Format submission names with new backend constraints # TODO(lewtun): make this less hacky! if "_XXX_" in submission_name: submission_name = submission_name.replace("_XXX_", " ") if "_DDD_" in submission_name: submission_name = submission_name.replace("_DDD_", "--") submission_data["Submission Name"].append(submission_name) # Handle mismatch in epoch microseconds vs epoch seconds in new AutoTrain API if len(timestamp) > 10: timestamp = pd.to_datetime(int(timestamp)) else: timestamp = pd.to_datetime(int(timestamp), unit="s") submission_data["Submission Date"].append(datetime.date(timestamp).strftime("%b %d, %Y")) for task in card_data["results"]: task_data = task["task"] task_name = task_data["name"] score = task_data["metrics"][0]["value"] submission_data[task_name].append(score) df = pd.DataFrame(submission_data) df.insert(3, "Overall", df[TASKS].mean(axis=1)) df = df.copy().sort_values("Overall", ascending=False) df.rename(columns={k: v for k, v in zip(TASKS, FORMATTED_TASK_NAMES)}, inplace=True) # Start ranking from 1 df.insert(0, "Rank", np.arange(1, len(df) + 1)) return df ########### ### APP ### ########### st.set_page_config(layout="wide") st.title("RAFT: Real-world Annotated Few-shot Tasks") st.markdown( """ Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? [RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models: - across multiple domains (lit review, tweets, customer interaction, etc.) - on economically valuable classification tasks (someone inherently cares about the task) - in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set) To submit to RAFT, follow the instruction posted on [this page](https://huggingface.co/datasets/ought/raft-submission). """ ) submissions = download_submissions() df = format_submissions(submissions) styler = df.style.set_precision(3).set_properties(**{"white-space": "pre-wrap", "text-align": "center"}) # hack to remove index column: https://discuss.streamlit.io/t/questions-on-st-table/6878/3 st.markdown( """ """, unsafe_allow_html=True, ) st.table(styler)