In [89]:
from datasets import load_dataset, get_dataset_config_names

In [32]:
import os
from datetime import datetime
from pathlib import Path
from re import sub

import pandas as pd
import requests
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv

if Path(".env").is_file():
    load_dotenv(".env")

auth_token = os.getenv("HF_HUB_TOKEN")
header = {"Authorization": "Bearer " + auth_token}

TASKS = get_dataset_config_names("ought/raft")
# Split and capitalize the task names, e.g. banking_77 => Banking 77
FORMATTED_TASK_NAMES = [" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]


def extract_tags(dataset):
    tags = {}
    for tag in dataset["tags"]:
        k, v = tuple(tag.split(":", 1))
        tags[k] = v
    return tags


def download_submissions():
    response = requests.get("http://huggingface.co/api/datasets", headers=header)
    all_datasets = response.json()

    submissions = []

    for dataset in all_datasets:
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "ought/raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)
    return submissions


def format_submissions(submissions):
    submission_data = {**{"Submission": []}, **{"Date": []}, **{t: [] for t in TASKS}}

    # TODO(lewtun): delete / filter all the junk repos from development
    # The following picks the latest submissions which adhere to the model card schema
    for submission in submissions:
        submission_id = submission["id"]
        response = requests.get(
            f"http://huggingface.co/api/datasets/{submission_id}?full=true",
            headers=header,
        )
        data = response.json()
        card_data = data["card_data"]
        submission_name = card_data["submission_dataset"]
        submission_data["Submission"].append(submission_name)
        submission_id = card_data["submission_id"]
        timestamp = submission_id.split("-")[-1]
        timestamp = pd.to_datetime(int(timestamp))
        submission_data["Date"].append(datetime.date(timestamp))

        for task in card_data["results"]:
            task_data = task["task"]
            task_name = task_data["name"]
            score = task_data["metrics"][0]["value"]
            submission_data[task_name].append(score)

    df = pd.DataFrame(submission_data)
    df.insert(2, "Overall", df[TASKS].mean(axis=1))
    df = df.copy().sort_values("Overall", ascending=False).reset_index().rename(columns={"index": "Rank"})
    df.rename(columns={k: v for k, v in zip(TASKS, FORMATTED_TASK_NAMES)}, inplace=True)
    return df

In [28]:
submissions = download_submissions()

In [29]:
len(submissions)

2

In [34]:
df = format_submissions(submissions)

In [35]:
df

Unnamed: 0,Rank,Submission,Date,Overall,Ade Corpus V2,Banking 77,Terms Of Service,Tai Safety Research,Neurips Impact Statement Risks,Overruling,Systematic Review Inclusion,One Stop English,Tweet Eval Hate,Twitter Complaints,Semiconductor Org Types
0,1,Human baseline (crowdsourced),2021-08-27,0.735273,0.83,0.607,0.627,0.609,0.857,0.917,0.468,0.646,0.722,0.897,0.908
1,0,GPT-3 baseline,2021-08-27,0.631,0.688,0.295,0.579,0.667,0.595,0.94,0.535,0.407,0.529,0.822,0.884


In [45]:
df.style.format(precision=3)

Unnamed: 0,Rank,Submission,Date,Overall,Ade Corpus V2,Banking 77,Terms Of Service,Tai Safety Research,Neurips Impact Statement Risks,Overruling,Systematic Review Inclusion,One Stop English,Tweet Eval Hate,Twitter Complaints,Semiconductor Org Types
0,1,Human baseline (crowdsourced),2021-08-27,0.735,0.83,0.607,0.627,0.609,0.857,0.917,0.468,0.646,0.722,0.897,0.908
1,0,GPT-3 baseline,2021-08-27,0.631,0.688,0.295,0.579,0.667,0.595,0.94,0.535,0.407,0.529,0.822,0.884


In [47]:
df2 = df.assign(hack="").set_index("hack")

In [48]:
df2.style.format(precision=3)

Unnamed: 0_level_0,Rank,Submission,Date,Overall,Ade Corpus V2,Banking 77,Terms Of Service,Tai Safety Research,Neurips Impact Statement Risks,Overruling,Systematic Review Inclusion,One Stop English,Tweet Eval Hate,Twitter Complaints,Semiconductor Org Types
hack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,1,Human baseline (crowdsourced),2021-08-27,0.735,0.83,0.607,0.627,0.609,0.857,0.917,0.468,0.646,0.722,0.897,0.908
,0,GPT-3 baseline,2021-08-27,0.631,0.688,0.295,0.579,0.667,0.595,0.94,0.535,0.407,0.529,0.822,0.884


In [186]:
for submission in submissions[-1:]:
    submission_id = submission["id"]
    response = requests.get(
        f"http://huggingface.co/api/datasets/{submission_id}?full=true",
        headers=header,
    )
    data = response.json()

In [188]:
{**{"Submission": []}, **{"foo":[]}, **{"bar": []}}

{'Submission': [], 'foo': [], 'bar': []}

In [191]:
sub_id = data["card_data"]["submission_id"]
sub_id

'f5a21c3fcb58ac17c8a47cfffd509b55cbad7ccf-1629986165000000000'

In [195]:
t = sub_id.split("-")[-1]
t

'1629986165000000000'

In [197]:
d = pd.to_datetime(int(t))
d

Timestamp('2021-08-26 13:56:05')

In [201]:
import datetime

In [205]:
pd.DataFrame({"d":[datetime.datetime.date(d)]})

Unnamed: 0,d
0,2021-08-26


In [147]:
timestamp = data["lastModified"]

In [149]:
import pandas as pd

In [160]:
t = pd.to_datetime(timestamp)
t

Timestamp('2021-08-04 22:52:57+0000', tz='UTC')

In [157]:
timestamp

'2021-08-04T22:52:57.000Z'

In [161]:
t_int = int(t.timestamp() * 10 **9)

In [162]:
pd.to_datetime(t_int)

Timestamp('2021-08-04 22:52:57')

In [96]:
submissions = download_submissions()

In [97]:
len(submissions)

16

In [35]:
submissions[-1:]

[{'id': 'autonlp/autonlp-benchmark-raft-ought__raft-ought__raft-dummy-predictions-642',
  'private': True,
  'tags': ['benchmark:ought/raft',
   'type:evaluation',
   'submission_dataset:ought/raft-dummy-predictions',
   'tags:autonlp',
   'tags:evaluation',
   'tags:benchmark'],
  'author': 'autonlp',
  'key': ''}]

In [98]:
df = format_submissions(submissions[-2:])

In [109]:
df

Unnamed: 0,Submission,Overall,banking_77,medical_subdomain_of_clinical_notes,overruling,gpai_initiatives,semiconductor_org_types,twitter_complaints,neurips_impact_statement_risks,systematic_review_inclusion,terms_of_service,tai_safety_research,one_stop_english
1,lewtun/my-raft-dummy-predictions,0.605079,0.948903,0.716526,0.064395,0.529422,0.643723,0.873478,0.756919,0.381609,0.461302,0.624133,0.655457
0,ought/raft-dummy-predictions,0.407345,0.009504,0.591213,0.55239,0.594769,0.339822,0.728116,0.878378,0.291842,0.144772,0.089622,0.260366


In [100]:
df.insert(1, "Overall", df[TASKS].mean(axis=1))

In [110]:
df.copy().sort_values("Overall", ascending=False).reset_index().rename(columns={"index":"Rank"})

Unnamed: 0,Rank,Submission,Overall,banking_77,medical_subdomain_of_clinical_notes,overruling,gpai_initiatives,semiconductor_org_types,twitter_complaints,neurips_impact_statement_risks,systematic_review_inclusion,terms_of_service,tai_safety_research,one_stop_english
0,1,lewtun/my-raft-dummy-predictions,0.605079,0.948903,0.716526,0.064395,0.529422,0.643723,0.873478,0.756919,0.381609,0.461302,0.624133,0.655457
1,0,ought/raft-dummy-predictions,0.407345,0.009504,0.591213,0.55239,0.594769,0.339822,0.728116,0.878378,0.291842,0.144772,0.089622,0.260366


In [119]:
task_names = [" ".join(t.capitalize() for t in task.split("_")) for task in TASKS]

In [121]:
df.rename(columns={k:v for k,v in zip(TASKS, task_names)})

Unnamed: 0,Submission,Overall,Banking 77,Medical Subdomain Of Clinical Notes,Overruling,Gpai Initiatives,Semiconductor Org Types,Twitter Complaints,Neurips Impact Statement Risks,Systematic Review Inclusion,Terms Of Service,Tai Safety Research,One Stop English
1,lewtun/my-raft-dummy-predictions,0.605079,0.948903,0.716526,0.064395,0.529422,0.643723,0.873478,0.756919,0.381609,0.461302,0.624133,0.655457
0,ought/raft-dummy-predictions,0.407345,0.009504,0.591213,0.55239,0.594769,0.339822,0.728116,0.878378,0.291842,0.144772,0.089622,0.260366


In [88]:
df.sort_values("Overall")

Unnamed: 0,Submission,Overall,banking_77,medical_subdomain_of_clinical_notes,overruling,gpai_initiatives,semiconductor_org_types,twitter_complaints,neurips_impact_statement_risks,systematic_review_inclusion,terms_of_service,tai_safety_research,one_stop_english
0,ought/raft-dummy-predictions,0.407345,0.009504,0.591213,0.55239,0.594769,0.339822,0.728116,0.878378,0.291842,0.144772,0.089622,0.260366


In [None]:
df["Overall"] = df.mean()

In [None]:
df["Submission"]

In [38]:
data = format_submissions(submissions[-1:])

In [48]:
pd.DataFrame({"bank":[0.2]})

Unnamed: 0,bank
0,0.2


In [60]:
from datasets import get_dataset_config_names

configs = get_dataset_config_names("glue")
print(configs)

['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']


In [50]:
from datasets import get_dataset_config_names

In [51]:
tasks = get_dataset_config_names("ought/raft")

In [55]:
submission_data = {t:[] for t in tasks}

for task in data["card_data"]["results"]:
    task_data = task["task"]
    task_name = task_data["name"]
    score = task_data["metrics"][0]["value"]
    submission_data[task_name].append(score)

In [56]:
submission_data

{'banking_77': [0.009504218288713173],
 'medical_subdomain_of_clinical_notes': [0.5912133593265538],
 'overruling': [0.5523904885287522],
 'gpai_initiatives': [0.5947694876413803],
 'semiconductor_org_types': [0.33982211621333613],
 'twitter_complaints': [0.7281156178656647],
 'neurips_impact_statement_risks': [0.8783775228874845],
 'systematic_review_inclusion': [0.2918416872180052],
 'terms_of_service': [0.14477157391911066],
 'tai_safety_research': [0.08962249895220364],
 'one_stop_english': [0.2603661495335281]}

In [61]:
data["card_data"]

{'benchmark': 'ought/raft',
 'type': 'evaluation',
 'submission_dataset': 'ought/raft-dummy-predictions',
 'tags': ['autonlp', 'evaluation', 'benchmark'],
 'model-index': None,
 'results': [{'task': {'metrics': [{'name': 'f1',
      'type': 'f1',
      'value': 0.009504218288713173}],
    'name': 'banking_77',
    'type': 'text-classification'}},
  {'task': {'metrics': [{'name': 'f1',
      'type': 'f1',
      'value': 0.5912133593265538}],
    'name': 'medical_subdomain_of_clinical_notes',
    'type': 'text-classification'}},
  {'task': {'metrics': [{'name': 'f1',
      'type': 'f1',
      'value': 0.5523904885287522}],
    'name': 'overruling',
    'type': 'text-classification'}},
  {'task': {'metrics': [{'name': 'f1',
      'type': 'f1',
      'value': 0.5947694876413803}],
    'name': 'gpai_initiatives',
    'type': 'text-classification'}},
  {'task': {'metrics': [{'name': 'f1',
      'type': 'f1',
      'value': 0.33982211621333613}],
    'name': 'semiconductor_org_types',
    'ty

In [2]:
response = requests.get("http://huggingface.co/api/datasets", headers=header)
all_datasets = response.json()

In [3]:
len(all_datasets)

2510

In [21]:
all_datasets[154]

{'id': 'disfl_qa',
 'tags': ['annotations_creators:expert-generated',
  'language_creators:found',
  'languages:en',
  'licenses:cc-by-4.0',
  'multilinguality:monolingual',
  'pretty_name:DISFL-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering',
  'size_categories:10K<n<100K',
  'source_datasets:original',
  'task_categories:question-answering',
  'task_ids:extractive-qa',
  'task_ids:open-domain-qa'],
 'citation': '@inproceedings{gupta-etal-2021-disflqa,\n    title = "{Disfl-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering}",\n    author = "Gupta, Aditya and Xu, Jiacheng and Upadhyay, Shyam and Yang, Diyi and Faruqui, Manaal",\n    booktitle = "Findings of ACL",\n    year = "2021"\n}',
 'description': 'Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each quest

In [22]:
def extract_tags(dataset):
    tags = {}
    for tag in dataset["tags"]:
        k,v = tuple(tag.split(":", 1))
        tags[k] = v
    return tags

In [24]:
tags = extract_tags(all_datasets[0])

In [27]:
tags.get("benchmark") == "raft"

False

In [23]:
for idx, dset in enumerate(all_datasets):
    try:
        extract_tags(dset)
    except:
        print(dset["id"], idx)

In [5]:
{i[0]:i[1] for t.split(":") in all_datasets[0]["tags"]}

{'annotations_creators': 'expert-generated',
 'language_creators': 'found',
 'languages': 'en',
 'licenses': 'mit',
 'multilinguality': 'monolingual',
 'size_categories': '10K<n<100K',
 'source_datasets': 'original',
 'task_categories': 'structure-prediction',
 'task_ids': 'structure-prediction-other-acronym-identification'}

In [11]:
for i in zip("a:b".split(":")):
    print(i)

('a',)
('b',)


In [15]:
a, b = zip(*["a", "b"])

ValueError: not enough values to unpack (expected 2, got 1)

In [12]:
for tag in all_datasets[0]["tags"]:
    k,v = tuple(tag.split(":"))
    print(k,v)

annotations_creators expert-generated
language_creators found
languages en
licenses mit
multilinguality monolingual
size_categories 10K<n<100K
source_datasets original
task_categories structure-prediction
task_ids structure-prediction-other-acronym-identification


In [138]:
from transformers import AutoTokenizer

model_ckpt = "bertin-project/bertin-roberta-base-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)
input_ids = tokenizer("¡hola, me llamo lewis!").input_ids
for token in input_ids:
    print(token, tokenizer.decode(token))

0 <s>
1922 ¡
11884 hola
16 ,
378  me
13496  llamo
466  le
91 w
350 is
5 !
2 </s>


In [None]:
# 1922 ¡
# 11884 hola
# 16 ,
# 378  me
# 13496  llamo
# 466  le
# 91 w
# 350 is
# 5 !

In [130]:
tokenizer.vocab["<s>"]

0

In [131]:
tokenizer.bos_token

'<s>'