{
"cells": [
{
"cell_type": "code",
"execution_count": 89,
"id": "c0cdda73-430c-4e18-bce4-b2218e2597b9",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, get_dataset_config_names"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4981ce75-5d13-4fd2-b08f-af077066f7d3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 32,
"id": "13e20072-0304-424a-923d-ac31a1769e94",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from datetime import datetime\n",
"from pathlib import Path\n",
"from re import sub\n",
"\n",
"import pandas as pd\n",
"import requests\n",
"import streamlit as st\n",
"from datasets import get_dataset_config_names\n",
"from dotenv import load_dotenv\n",
"\n",
"# Pick up local credentials from a .env file when one is present.\n",
"if Path(\".env\").is_file():\n",
"    load_dotenv(\".env\")\n",
"\n",
"# Every Hub API call below is authenticated, so fail fast with a readable message\n",
"# instead of the TypeError raised by concatenating a str with None.\n",
"auth_token = os.getenv(\"HF_HUB_TOKEN\")\n",
"if not auth_token:\n",
"    raise RuntimeError(\"HF_HUB_TOKEN is not set; export it or add it to a .env file\")\n",
"header = {\"Authorization\": f\"Bearer {auth_token}\"}\n",
"\n",
"# Configuration names of the ought/raft dataset; used below as the task column names.\n",
"TASKS = get_dataset_config_names(\"ought/raft\")\n",
"# Split and capitalize the task names, e.g. banking_77 => Banking 77\n",
"FORMATTED_TASK_NAMES = [\" \".join(word.capitalize() for word in task.split(\"_\")) for task in TASKS]\n",
"\n",
"\n",
"def extract_tags(dataset):\n",
"    \"\"\"Turn a dataset entry's flat tag list into a dict.\n",
"\n",
"    Each tag of the form `key:value` is split on its first colon, so a value may\n",
"    itself contain colons. When a key occurs more than once the last value wins.\n",
"    Tags without a colon are skipped, and an entry with no `tags` field (or an\n",
"    empty one) yields an empty dict.\n",
"\n",
"    Args:\n",
"        dataset: mapping that may hold a `tags` list of strings.\n",
"\n",
"    Returns:\n",
"        dict mapping each tag key to its value.\n",
"    \"\"\"\n",
"    tags = {}\n",
"    for tag in dataset.get(\"tags\") or []:\n",
"        # partition never raises: `sep` is empty when the tag has no colon\n",
"        key, sep, value = tag.partition(\":\")\n",
"        if sep:\n",
"            tags[key] = value\n",
"    return tags\n",
"\n",
"\n",
"def download_submissions():\n",
"    \"\"\"Return the Hub dataset entries that are RAFT evaluation submissions.\n",
"\n",
"    Fetches the dataset listing from the Hub API using the module-level `header`\n",
"    and keeps the entries whose tags include `benchmark:ought/raft` and\n",
"    `type:evaluation`.\n",
"\n",
"    Returns:\n",
"        list of dataset entries (dicts) exactly as returned by the API.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the API answers with an error status.\n",
"    \"\"\"\n",
"    # HTTPS so the bearer token is never sent over a plaintext connection.\n",
"    response = requests.get(\"https://huggingface.co/api/datasets\", headers=header)\n",
"    # Surface auth / server errors here instead of failing later on an unexpected payload.\n",
"    response.raise_for_status()\n",
"\n",
"    submissions = []\n",
"    for dataset in response.json():\n",
"        tags = extract_tags(dataset)\n",
"        if tags.get(\"benchmark\") == \"ought/raft\" and tags.get(\"type\") == \"evaluation\":\n",
"            submissions.append(dataset)\n",
"    return submissions\n",
"\n",
"\n",
"def format_submissions(submissions):\n",
"    \"\"\"Build the leaderboard table for a list of submission dataset entries.\n",
"\n",
"    For every entry the full dataset info is fetched from the Hub API. Its card\n",
"    data supplies the submission name, the submission date (parsed from the\n",
"    integer timestamp that ends `submission_id`) and one score per task (the\n",
"    first metric of each result).\n",
"\n",
"    Args:\n",
"        submissions: dataset entries as returned by `download_submissions`; only\n",
"            their `id` field is read.\n",
"\n",
"    Returns:\n",
"        DataFrame sorted by descending `Overall` with the columns `Rank` (1-based\n",
"        position after sorting), `Submission`, `Date`, `Overall` (mean of the task\n",
"        scores) and one column per task, renamed via FORMATTED_TASK_NAMES. A task\n",
"        a submission did not report is NaN, which also makes its `Overall` NaN so\n",
"        that incomplete submissions sort last.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the API answers with an error status.\n",
"        KeyError: if the card data lacks `submission_dataset`, `submission_id`\n",
"            or `results`.\n",
"    \"\"\"\n",
"    rows = []\n",
"\n",
"    # TODO(lewtun): delete / filter all the junk repos from development\n",
"    for submission in submissions:\n",
"        repo_id = submission[\"id\"]\n",
"        # HTTPS so the bearer token is never sent over a plaintext connection.\n",
"        response = requests.get(\n",
"            f\"https://huggingface.co/api/datasets/{repo_id}?full=true\",\n",
"            headers=header,\n",
"        )\n",
"        response.raise_for_status()\n",
"        card_data = response.json()[\"card_data\"]\n",
"\n",
"        # submission_id ends in \"-<integer timestamp>\"; pandas reads the integer as epoch nanoseconds\n",
"        timestamp = pd.to_datetime(int(card_data[\"submission_id\"].split(\"-\")[-1]))\n",
"        # Timestamp.date() avoids relying on the name `datetime`, which a later cell rebinds\n",
"        row = {\"Submission\": card_data[\"submission_dataset\"], \"Date\": timestamp.date()}\n",
"\n",
"        for result in card_data[\"results\"]:\n",
"            task_data = result[\"task\"]\n",
"            # Scores for tasks that are not in TASKS are ignored rather than raising\n",
"            if task_data[\"name\"] in TASKS:\n",
"                row[task_data[\"name\"]] = task_data[\"metrics\"][0][\"value\"]\n",
"        rows.append(row)\n",
"\n",
"    # One record per submission: missing task scores become NaN instead of a length mismatch\n",
"    df = pd.DataFrame(rows, columns=[\"Submission\", \"Date\", *TASKS])\n",
"    df.insert(2, \"Overall\", df[TASKS].mean(axis=1, skipna=False))\n",
"    df = df.sort_values(\"Overall\", ascending=False).reset_index(drop=True)\n",
"    # Rank is the position after sorting, not the pre-sort row index\n",
"    df.insert(0, \"Rank\", list(range(1, len(df) + 1)))\n",
"    return df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8dccc419-7b18-4a10-a4bf-2d69cc3b5888",
"metadata": {},
"outputs": [],
"source": [
"submissions = download_submissions()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "934ea3b9-76dd-4d8f-a62d-8e2fa5959111",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(submissions)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "c3803890-d664-4d24-86bc-8fb095cad40a",
"metadata": {},
"outputs": [],
"source": [
"df = format_submissions(submissions)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2de6f903-c327-42b6-a1ca-a530a62cc412",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Rank | \n",
" Submission | \n",
" Date | \n",
" Overall | \n",
" Ade Corpus V2 | \n",
" Banking 77 | \n",
" Terms Of Service | \n",
" Tai Safety Research | \n",
" Neurips Impact Statement Risks | \n",
" Overruling | \n",
" Systematic Review Inclusion | \n",
" One Stop English | \n",
" Tweet Eval Hate | \n",
" Twitter Complaints | \n",
" Semiconductor Org Types | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Human baseline (crowdsourced) | \n",
" 2021-08-27 | \n",
" 0.735273 | \n",
" 0.830 | \n",
" 0.607 | \n",
" 0.627 | \n",
" 0.609 | \n",
" 0.857 | \n",
" 0.917 | \n",
" 0.468 | \n",
" 0.646 | \n",
" 0.722 | \n",
" 0.897 | \n",
" 0.908 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" GPT-3 baseline | \n",
" 2021-08-27 | \n",
" 0.631000 | \n",
" 0.688 | \n",
" 0.295 | \n",
" 0.579 | \n",
" 0.667 | \n",
" 0.595 | \n",
" 0.940 | \n",
" 0.535 | \n",
" 0.407 | \n",
" 0.529 | \n",
" 0.822 | \n",
" 0.884 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Rank Submission Date Overall Ade Corpus V2 \\\n",
"0 1 Human baseline (crowdsourced) 2021-08-27 0.735273 0.830 \n",
"1 0 GPT-3 baseline 2021-08-27 0.631000 0.688 \n",
"\n",
" Banking 77 Terms Of Service Tai Safety Research \\\n",
"0 0.607 0.627 0.609 \n",
"1 0.295 0.579 0.667 \n",
"\n",
" Neurips Impact Statement Risks Overruling Systematic Review Inclusion \\\n",
"0 0.857 0.917 0.468 \n",
"1 0.595 0.940 0.535 \n",
"\n",
" One Stop English Tweet Eval Hate Twitter Complaints \\\n",
"0 0.646 0.722 0.897 \n",
"1 0.407 0.529 0.822 \n",
"\n",
" Semiconductor Org Types \n",
"0 0.908 \n",
"1 0.884 "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "ca6ba762-047f-4074-a5c3-b4168c13d398",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Rank | \n",
" Submission | \n",
" Date | \n",
" Overall | \n",
" Ade Corpus V2 | \n",
" Banking 77 | \n",
" Terms Of Service | \n",
" Tai Safety Research | \n",
" Neurips Impact Statement Risks | \n",
" Overruling | \n",
" Systematic Review Inclusion | \n",
" One Stop English | \n",
" Tweet Eval Hate | \n",
" Twitter Complaints | \n",
" Semiconductor Org Types | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Human baseline (crowdsourced) | \n",
" 2021-08-27 | \n",
" 0.735 | \n",
" 0.830 | \n",
" 0.607 | \n",
" 0.627 | \n",
" 0.609 | \n",
" 0.857 | \n",
" 0.917 | \n",
" 0.468 | \n",
" 0.646 | \n",
" 0.722 | \n",
" 0.897 | \n",
" 0.908 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" GPT-3 baseline | \n",
" 2021-08-27 | \n",
" 0.631 | \n",
" 0.688 | \n",
" 0.295 | \n",
" 0.579 | \n",
" 0.667 | \n",
" 0.595 | \n",
" 0.940 | \n",
" 0.535 | \n",
" 0.407 | \n",
" 0.529 | \n",
" 0.822 | \n",
" 0.884 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.style.format(precision=3)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "094e757c-1c6a-4d01-abb1-872face8c72b",
"metadata": {},
"outputs": [],
"source": [
"df2 = df.assign(hack=\"\").set_index(\"hack\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "2ff434e2-5bf6-453f-8470-28c7b1034154",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Rank | \n",
" Submission | \n",
" Date | \n",
" Overall | \n",
" Ade Corpus V2 | \n",
" Banking 77 | \n",
" Terms Of Service | \n",
" Tai Safety Research | \n",
" Neurips Impact Statement Risks | \n",
" Overruling | \n",
" Systematic Review Inclusion | \n",
" One Stop English | \n",
" Tweet Eval Hate | \n",
" Twitter Complaints | \n",
" Semiconductor Org Types | \n",
"
\n",
" \n",
" hack | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | \n",
" 1 | \n",
" Human baseline (crowdsourced) | \n",
" 2021-08-27 | \n",
" 0.735 | \n",
" 0.830 | \n",
" 0.607 | \n",
" 0.627 | \n",
" 0.609 | \n",
" 0.857 | \n",
" 0.917 | \n",
" 0.468 | \n",
" 0.646 | \n",
" 0.722 | \n",
" 0.897 | \n",
" 0.908 | \n",
"
\n",
" \n",
" | \n",
" 0 | \n",
" GPT-3 baseline | \n",
" 2021-08-27 | \n",
" 0.631 | \n",
" 0.688 | \n",
" 0.295 | \n",
" 0.579 | \n",
" 0.667 | \n",
" 0.595 | \n",
" 0.940 | \n",
" 0.535 | \n",
" 0.407 | \n",
" 0.529 | \n",
" 0.822 | \n",
" 0.884 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.style.format(precision=3)"
]
},
{
"cell_type": "code",
"execution_count": 186,
"id": "8be02c77-bda3-499b-9ac4-d50ec35644a5",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the full Hub metadata of the last listed submission into `data`.\n",
"for submission in submissions[-1:]:\n",
"    submission_id = submission[\"id\"]\n",
"    # HTTPS so the bearer token is never sent over a plaintext connection.\n",
"    response = requests.get(\n",
"        f\"https://huggingface.co/api/datasets/{submission_id}?full=true\",\n",
"        headers=header,\n",
"    )\n",
"    # Stop on an API error instead of storing an error payload in `data`.\n",
"    response.raise_for_status()\n",
"    data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 188,
"id": "7ab07904-0f7e-401b-96f8-3558433e479a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Submission': [], 'foo': [], 'bar': []}"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{**{\"Submission\": []}, **{\"foo\":[]}, **{\"bar\": []}}"
]
},
{
"cell_type": "code",
"execution_count": 191,
"id": "69ffb778-09cf-4eb8-ab95-739700d68420",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'f5a21c3fcb58ac17c8a47cfffd509b55cbad7ccf-1629986165000000000'"
]
},
"execution_count": 191,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub_id = data[\"card_data\"][\"submission_id\"]\n",
"sub_id"
]
},
{
"cell_type": "code",
"execution_count": 195,
"id": "f7c3e8c0-68c7-4bad-802b-1b39703e100d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1629986165000000000'"
]
},
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = sub_id.split(\"-\")[-1]\n",
"t"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "34a7483c-0b00-42a7-99b9-ee6bdf34048a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2021-08-26 13:56:05')"
]
},
"execution_count": 197,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d = pd.to_datetime(int(t))\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "95f1ecff-025f-4b42-8761-2c0964dfac5f",
"metadata": {},
"outputs": [],
"source": [
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 205,
"id": "c606cfea-165d-4b58-ba37-6fc9b06795cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2021-08-26 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" d\n",
"0 2021-08-26"
]
},
"execution_count": 205,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\"d\":[datetime.datetime.date(d)]})"
]
},
{
"cell_type": "code",
"execution_count": 147,
"id": "eebefd5a-6451-44b9-bc0f-d0663f321e34",
"metadata": {},
"outputs": [],
"source": [
"timestamp = data[\"lastModified\"]"
]
},
{
"cell_type": "code",
"execution_count": 149,
"id": "bf663ca3-12e8-4178-9aef-aba46621477a",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 160,
"id": "4723aeb6-3993-49b1-b779-c1394b54d776",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2021-08-04 22:52:57+0000', tz='UTC')"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = pd.to_datetime(timestamp)\n",
"t"
]
},
{
"cell_type": "code",
"execution_count": 157,
"id": "cbcc2bf8-e2c8-449c-9f00-38ed80e46ae0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2021-08-04T22:52:57.000Z'"
]
},
"execution_count": 157,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timestamp"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "177574a9-327e-4999-a1db-c316bb741c8c",
"metadata": {},
"outputs": [],
"source": [
"t_int = int(t.timestamp() * 10 **9)"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "b3aa4f70-50af-47b3-a492-c77f65266a5b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2021-08-04 22:52:57')"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.to_datetime(t_int)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "ae7c9100-a630-4b4e-a060-331914f86055",
"metadata": {},
"outputs": [],
"source": [
"submissions = download_submissions()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "831077a1-7f44-4d31-94b3-49257a62c5f7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(submissions)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "4a8a9ff9-7f94-4abb-8194-9d570ad2216b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'id': 'autonlp/autonlp-benchmark-raft-ought__raft-ought__raft-dummy-predictions-642',\n",
" 'private': True,\n",
" 'tags': ['benchmark:ought/raft',\n",
" 'type:evaluation',\n",
" 'submission_dataset:ought/raft-dummy-predictions',\n",
" 'tags:autonlp',\n",
" 'tags:evaluation',\n",
" 'tags:benchmark'],\n",
" 'author': 'autonlp',\n",
" 'key': ''}]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submissions[-1:]"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "2d6e56cb-fca3-4e9e-9a8b-9d2e26816773",
"metadata": {},
"outputs": [],
"source": [
"df = format_submissions(submissions[-2:])"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "7d8a3402-f7b8-4edb-8d1e-afb704dc3c67",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Submission | \n",
" Overall | \n",
" banking_77 | \n",
" medical_subdomain_of_clinical_notes | \n",
" overruling | \n",
" gpai_initiatives | \n",
" semiconductor_org_types | \n",
" twitter_complaints | \n",
" neurips_impact_statement_risks | \n",
" systematic_review_inclusion | \n",
" terms_of_service | \n",
" tai_safety_research | \n",
" one_stop_english | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" lewtun/my-raft-dummy-predictions | \n",
" 0.605079 | \n",
" 0.948903 | \n",
" 0.716526 | \n",
" 0.064395 | \n",
" 0.529422 | \n",
" 0.643723 | \n",
" 0.873478 | \n",
" 0.756919 | \n",
" 0.381609 | \n",
" 0.461302 | \n",
" 0.624133 | \n",
" 0.655457 | \n",
"
\n",
" \n",
" 0 | \n",
" ought/raft-dummy-predictions | \n",
" 0.407345 | \n",
" 0.009504 | \n",
" 0.591213 | \n",
" 0.552390 | \n",
" 0.594769 | \n",
" 0.339822 | \n",
" 0.728116 | \n",
" 0.878378 | \n",
" 0.291842 | \n",
" 0.144772 | \n",
" 0.089622 | \n",
" 0.260366 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Submission Overall banking_77 \\\n",
"1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
"0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
"\n",
" medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
"1 0.716526 0.064395 0.529422 \n",
"0 0.591213 0.552390 0.594769 \n",
"\n",
" semiconductor_org_types twitter_complaints \\\n",
"1 0.643723 0.873478 \n",
"0 0.339822 0.728116 \n",
"\n",
" neurips_impact_statement_risks systematic_review_inclusion \\\n",
"1 0.756919 0.381609 \n",
"0 0.878378 0.291842 \n",
"\n",
" terms_of_service tai_safety_research one_stop_english \n",
"1 0.461302 0.624133 0.655457 \n",
"0 0.144772 0.089622 0.260366 "
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "f60f453b-2457-4597-9eee-324d4c3a2f2e",
"metadata": {},
"outputs": [],
"source": [
"df.insert(1, \"Overall\", df[TASKS].mean(axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "1fd83f7a-b554-4e7d-aef6-4338b01f3eec",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Rank | \n",
" Submission | \n",
" Overall | \n",
" banking_77 | \n",
" medical_subdomain_of_clinical_notes | \n",
" overruling | \n",
" gpai_initiatives | \n",
" semiconductor_org_types | \n",
" twitter_complaints | \n",
" neurips_impact_statement_risks | \n",
" systematic_review_inclusion | \n",
" terms_of_service | \n",
" tai_safety_research | \n",
" one_stop_english | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" lewtun/my-raft-dummy-predictions | \n",
" 0.605079 | \n",
" 0.948903 | \n",
" 0.716526 | \n",
" 0.064395 | \n",
" 0.529422 | \n",
" 0.643723 | \n",
" 0.873478 | \n",
" 0.756919 | \n",
" 0.381609 | \n",
" 0.461302 | \n",
" 0.624133 | \n",
" 0.655457 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" ought/raft-dummy-predictions | \n",
" 0.407345 | \n",
" 0.009504 | \n",
" 0.591213 | \n",
" 0.552390 | \n",
" 0.594769 | \n",
" 0.339822 | \n",
" 0.728116 | \n",
" 0.878378 | \n",
" 0.291842 | \n",
" 0.144772 | \n",
" 0.089622 | \n",
" 0.260366 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Rank Submission Overall banking_77 \\\n",
"0 1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
"1 0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
"\n",
" medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
"0 0.716526 0.064395 0.529422 \n",
"1 0.591213 0.552390 0.594769 \n",
"\n",
" semiconductor_org_types twitter_complaints \\\n",
"0 0.643723 0.873478 \n",
"1 0.339822 0.728116 \n",
"\n",
" neurips_impact_statement_risks systematic_review_inclusion \\\n",
"0 0.756919 0.381609 \n",
"1 0.878378 0.291842 \n",
"\n",
" terms_of_service tai_safety_research one_stop_english \n",
"0 0.461302 0.624133 0.655457 \n",
"1 0.144772 0.089622 0.260366 "
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\":\"Rank\"})"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "e1262ff5-6ea3-41ca-affc-b106dd9df5fd",
"metadata": {},
"outputs": [],
"source": [
"task_names = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "45d74b9c-c472-4494-aadc-909976d13b08",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Submission | \n",
" Overall | \n",
" Banking 77 | \n",
" Medical Subdomain Of Clinical Notes | \n",
" Overruling | \n",
" Gpai Initiatives | \n",
" Semiconductor Org Types | \n",
" Twitter Complaints | \n",
" Neurips Impact Statement Risks | \n",
" Systematic Review Inclusion | \n",
" Terms Of Service | \n",
" Tai Safety Research | \n",
" One Stop English | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" lewtun/my-raft-dummy-predictions | \n",
" 0.605079 | \n",
" 0.948903 | \n",
" 0.716526 | \n",
" 0.064395 | \n",
" 0.529422 | \n",
" 0.643723 | \n",
" 0.873478 | \n",
" 0.756919 | \n",
" 0.381609 | \n",
" 0.461302 | \n",
" 0.624133 | \n",
" 0.655457 | \n",
"
\n",
" \n",
" 0 | \n",
" ought/raft-dummy-predictions | \n",
" 0.407345 | \n",
" 0.009504 | \n",
" 0.591213 | \n",
" 0.552390 | \n",
" 0.594769 | \n",
" 0.339822 | \n",
" 0.728116 | \n",
" 0.878378 | \n",
" 0.291842 | \n",
" 0.144772 | \n",
" 0.089622 | \n",
" 0.260366 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Submission Overall Banking 77 \\\n",
"1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
"0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
"\n",
" Medical Subdomain Of Clinical Notes Overruling Gpai Initiatives \\\n",
"1 0.716526 0.064395 0.529422 \n",
"0 0.591213 0.552390 0.594769 \n",
"\n",
" Semiconductor Org Types Twitter Complaints \\\n",
"1 0.643723 0.873478 \n",
"0 0.339822 0.728116 \n",
"\n",
" Neurips Impact Statement Risks Systematic Review Inclusion \\\n",
"1 0.756919 0.381609 \n",
"0 0.878378 0.291842 \n",
"\n",
" Terms Of Service Tai Safety Research One Stop English \n",
"1 0.461302 0.624133 0.655457 \n",
"0 0.144772 0.089622 0.260366 "
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={k:v for k,v in zip(TASKS, task_names)})"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "d31c2bde-1645-4c1b-982b-c9daac40311d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Submission | \n",
" Overall | \n",
" banking_77 | \n",
" medical_subdomain_of_clinical_notes | \n",
" overruling | \n",
" gpai_initiatives | \n",
" semiconductor_org_types | \n",
" twitter_complaints | \n",
" neurips_impact_statement_risks | \n",
" systematic_review_inclusion | \n",
" terms_of_service | \n",
" tai_safety_research | \n",
" one_stop_english | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ought/raft-dummy-predictions | \n",
" 0.407345 | \n",
" 0.009504 | \n",
" 0.591213 | \n",
" 0.55239 | \n",
" 0.594769 | \n",
" 0.339822 | \n",
" 0.728116 | \n",
" 0.878378 | \n",
" 0.291842 | \n",
" 0.144772 | \n",
" 0.089622 | \n",
" 0.260366 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Submission Overall banking_77 \\\n",
"0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
"\n",
" medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
"0 0.591213 0.55239 0.594769 \n",
"\n",
" semiconductor_org_types twitter_complaints \\\n",
"0 0.339822 0.728116 \n",
"\n",
" neurips_impact_statement_risks systematic_review_inclusion \\\n",
"0 0.878378 0.291842 \n",
"\n",
" terms_of_service tai_safety_research one_stop_english \n",
"0 0.144772 0.089622 0.260366 "
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(\"Overall\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4df33059-020a-43cf-aa3a-de6939268cc7",
"metadata": {},
"outputs": [],
"source": [
"df[\"Overall\"] = df.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "327539f3-3bf7-4a2e-ac10-89973a2ba37f",
"metadata": {},
"outputs": [],
"source": [
"df[\"Submission\"]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "f07ec556-2ebf-400e-85f7-c978d03b0dc1",
"metadata": {},
"outputs": [],
"source": [
"data = format_submissions(submissions[-1:])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a982e024-ab16-4752-984a-5368fa238f1d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bank | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bank\n",
"0 0.2"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\"bank\":[0.2]})"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "b7c73606-d7f9-4f17-bf4d-17cfbb3aa664",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']\n"
]
}
],
"source": [
"from datasets import get_dataset_config_names\n",
"\n",
"configs = get_dataset_config_names(\"glue\")\n",
"print(configs)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "92eea464-6b63-4613-ab4d-aa5003e0bb3b",
"metadata": {},
"outputs": [],
"source": [
"from datasets import get_dataset_config_names"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "4f9c2924-001a-4b76-a8ed-a072b43eedbd",
"metadata": {},
"outputs": [],
"source": [
"tasks = get_dataset_config_names(\"ought/raft\")"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "9b27374f-b118-440a-acab-6e4aa09f42a4",
"metadata": {},
"outputs": [],
"source": [
"# Gather, per task name, the first metric value of every result in the card data.\n",
"submission_data = {name: [] for name in tasks}\n",
"\n",
"for result in data[\"card_data\"][\"results\"]:\n",
"    entry = result[\"task\"]\n",
"    submission_data[entry[\"name\"]].append(entry[\"metrics\"][0][\"value\"])"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "6b7cf2e0-ee92-4647-8d9b-6edef48e06f8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'banking_77': [0.009504218288713173],\n",
" 'medical_subdomain_of_clinical_notes': [0.5912133593265538],\n",
" 'overruling': [0.5523904885287522],\n",
" 'gpai_initiatives': [0.5947694876413803],\n",
" 'semiconductor_org_types': [0.33982211621333613],\n",
" 'twitter_complaints': [0.7281156178656647],\n",
" 'neurips_impact_statement_risks': [0.8783775228874845],\n",
" 'systematic_review_inclusion': [0.2918416872180052],\n",
" 'terms_of_service': [0.14477157391911066],\n",
" 'tai_safety_research': [0.08962249895220364],\n",
" 'one_stop_english': [0.2603661495335281]}"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submission_data"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "5df282e4-87c4-4ea7-833e-6a87886e2f76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'benchmark': 'ought/raft',\n",
" 'type': 'evaluation',\n",
" 'submission_dataset': 'ought/raft-dummy-predictions',\n",
" 'tags': ['autonlp', 'evaluation', 'benchmark'],\n",
" 'model-index': None,\n",
" 'results': [{'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.009504218288713173}],\n",
" 'name': 'banking_77',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.5912133593265538}],\n",
" 'name': 'medical_subdomain_of_clinical_notes',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.5523904885287522}],\n",
" 'name': 'overruling',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.5947694876413803}],\n",
" 'name': 'gpai_initiatives',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.33982211621333613}],\n",
" 'name': 'semiconductor_org_types',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.7281156178656647}],\n",
" 'name': 'twitter_complaints',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.8783775228874845}],\n",
" 'name': 'neurips_impact_statement_risks',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.2918416872180052}],\n",
" 'name': 'systematic_review_inclusion',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.14477157391911066}],\n",
" 'name': 'terms_of_service',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.08962249895220364}],\n",
" 'name': 'tai_safety_research',\n",
" 'type': 'text-classification'}},\n",
" {'task': {'metrics': [{'name': 'f1',\n",
" 'type': 'f1',\n",
" 'value': 0.2603661495335281}],\n",
" 'name': 'one_stop_english',\n",
" 'type': 'text-classification'}}]}"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"card_data\"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b07a4fa9-176e-4ff3-bc3f-eb2a6fc9efda",
"metadata": {},
"outputs": [],
"source": [
"# HTTPS so the bearer token is never sent over a plaintext connection.\n",
"response = requests.get(\"https://huggingface.co/api/datasets\", headers=header)\n",
"# Stop on an API error instead of treating an error payload as the dataset list.\n",
"response.raise_for_status()\n",
"all_datasets = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "63dc07ec-2f28-483f-8163-c97e8a6a4005",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2510"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(all_datasets)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "296f68c1-608d-4ea6-8d0e-cc35fb7d74c4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': 'disfl_qa',\n",
" 'tags': ['annotations_creators:expert-generated',\n",
" 'language_creators:found',\n",
" 'languages:en',\n",
" 'licenses:cc-by-4.0',\n",
" 'multilinguality:monolingual',\n",
" 'pretty_name:DISFL-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering',\n",
" 'size_categories:10K\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"a\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
]
}
],
"source": [
"a, b = zip(*[\"a\", \"b\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4990ce09-a53f-47dd-b662-3f498352b641",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"annotations_creators expert-generated\n",
"language_creators found\n",
"languages en\n",
"licenses mit\n",
"multilinguality monolingual\n",
"size_categories 10K\n",
"1922 ¡\n",
"11884 hola\n",
"16 ,\n",
"378 me\n",
"13496 llamo\n",
"466 le\n",
"91 w\n",
"350 is\n",
"5 !\n",
"2 \n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"model_ckpt = \"bertin-project/bertin-roberta-base-spanish\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)\n",
"# Encode one sentence, then print each id next to the text it decodes to.\n",
"input_ids = tokenizer(\"¡hola, me llamo lewis!\")[\"input_ids\"]\n",
"for token_id in input_ids:\n",
"    print(token_id, tokenizer.decode(token_id))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "430400f2-2c04-48d7-bf8e-63528441d410",
"metadata": {},
"outputs": [],
"source": [
"# 1922 ¡\n",
"# 11884 hola\n",
"# 16 ,\n",
"# 378 me\n",
"# 13496 llamo\n",
"# 466 le\n",
"# 91 w\n",
"# 350 is\n",
"# 5 !"
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "2ecdd872-af9b-4258-8a5e-d867f3785520",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.vocab[\"\"]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "16941c33-5e22-485f-9d24-ac8f8542c368",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.bos_token"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71929465-5ad5-444d-8c77-22f586b1ba23",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}