{ "cells": [ { "cell_type": "code", "execution_count": 89, "id": "c0cdda73-430c-4e18-bce4-b2218e2597b9", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, get_dataset_config_names" ] }, { "cell_type": "code", "execution_count": null, "id": "4981ce75-5d13-4fd2-b08f-af077066f7d3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 32, "id": "13e20072-0304-424a-923d-ac31a1769e94", "metadata": {}, "outputs": [], "source": [ "import os\n", "from datetime import datetime\n", "from pathlib import Path\n", "from re import sub\n", "\n", "import pandas as pd\n", "import requests\n", "import streamlit as st\n", "from datasets import get_dataset_config_names\n", "from dotenv import load_dotenv\n", "\n", "if Path(\".env\").is_file():\n", " load_dotenv(\".env\")\n", "\n", "auth_token = os.getenv(\"HF_HUB_TOKEN\")\n", "header = {\"Authorization\": \"Bearer \" + auth_token}\n", "\n", "TASKS = get_dataset_config_names(\"ought/raft\")\n", "# Split and capitalize the task names, e.g. banking_77 => Banking 77\n", "FORMATTED_TASK_NAMES = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]\n", "\n", "\n", "def extract_tags(dataset):\n", " tags = {}\n", " for tag in dataset[\"tags\"]:\n", " k, v = tuple(tag.split(\":\", 1))\n", " tags[k] = v\n", " return tags\n", "\n", "\n", "def download_submissions():\n", " response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n", " all_datasets = response.json()\n", "\n", " submissions = []\n", "\n", " for dataset in all_datasets:\n", " tags = extract_tags(dataset)\n", " if tags.get(\"benchmark\") == \"ought/raft\" and tags.get(\"type\") == \"evaluation\":\n", " submissions.append(dataset)\n", " return submissions\n", "\n", "\n", "def format_submissions(submissions):\n", " submission_data = {**{\"Submission\": []}, **{\"Date\": []}, **{t: [] for t in TASKS}}\n", "\n", " # TODO(lewtun): delete / filter all the junk repos from development\n", " # The following picks the latest submissions which adhere to the model card schema\n", " for submission in submissions:\n", " submission_id = submission[\"id\"]\n", " response = requests.get(\n", " f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n", " headers=header,\n", " )\n", " data = response.json()\n", " card_data = data[\"card_data\"]\n", " submission_name = card_data[\"submission_dataset\"]\n", " submission_data[\"Submission\"].append(submission_name)\n", " submission_id = card_data[\"submission_id\"]\n", " timestamp = submission_id.split(\"-\")[-1]\n", " timestamp = pd.to_datetime(int(timestamp))\n", " submission_data[\"Date\"].append(datetime.date(timestamp))\n", "\n", " for task in card_data[\"results\"]:\n", " task_data = task[\"task\"]\n", " task_name = task_data[\"name\"]\n", " score = task_data[\"metrics\"][0][\"value\"]\n", " submission_data[task_name].append(score)\n", "\n", " df = pd.DataFrame(submission_data)\n", " df.insert(2, \"Overall\", df[TASKS].mean(axis=1))\n", " df = df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\": \"Rank\"})\n", " df.rename(columns={k: v for k, v in zip(TASKS, FORMATTED_TASK_NAMES)}, inplace=True)\n", " return df" ] }, { "cell_type": "code", "execution_count": 28, "id": "8dccc419-7b18-4a10-a4bf-2d69cc3b5888", "metadata": {}, "outputs": [], "source": [ "submissions = download_submissions()" ] }, { "cell_type": "code", "execution_count": 29, "id": "934ea3b9-76dd-4d8f-a62d-8e2fa5959111", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(submissions)" ] }, { "cell_type": "code", "execution_count": 34, "id": "c3803890-d664-4d24-86bc-8fb095cad40a", "metadata": {}, "outputs": [], "source": [ "df = format_submissions(submissions)" ] }, { "cell_type": "code", "execution_count": 35, "id": "2de6f903-c327-42b6-a1ca-a530a62cc412", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RankSubmissionDateOverallAde Corpus V2Banking 77Terms Of ServiceTai Safety ResearchNeurips Impact Statement RisksOverrulingSystematic Review InclusionOne Stop EnglishTweet Eval HateTwitter ComplaintsSemiconductor Org Types
01Human baseline (crowdsourced)2021-08-270.7352730.8300.6070.6270.6090.8570.9170.4680.6460.7220.8970.908
10GPT-3 baseline2021-08-270.6310000.6880.2950.5790.6670.5950.9400.5350.4070.5290.8220.884
\n", "
" ], "text/plain": [ " Rank Submission Date Overall Ade Corpus V2 \\\n", "0 1 Human baseline (crowdsourced) 2021-08-27 0.735273 0.830 \n", "1 0 GPT-3 baseline 2021-08-27 0.631000 0.688 \n", "\n", " Banking 77 Terms Of Service Tai Safety Research \\\n", "0 0.607 0.627 0.609 \n", "1 0.295 0.579 0.667 \n", "\n", " Neurips Impact Statement Risks Overruling Systematic Review Inclusion \\\n", "0 0.857 0.917 0.468 \n", "1 0.595 0.940 0.535 \n", "\n", " One Stop English Tweet Eval Hate Twitter Complaints \\\n", "0 0.646 0.722 0.897 \n", "1 0.407 0.529 0.822 \n", "\n", " Semiconductor Org Types \n", "0 0.908 \n", "1 0.884 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 45, "id": "ca6ba762-047f-4074-a5c3-b4168c13d398", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 RankSubmissionDateOverallAde Corpus V2Banking 77Terms Of ServiceTai Safety ResearchNeurips Impact Statement RisksOverrulingSystematic Review InclusionOne Stop EnglishTweet Eval HateTwitter ComplaintsSemiconductor Org Types
01Human baseline (crowdsourced)2021-08-270.7350.8300.6070.6270.6090.8570.9170.4680.6460.7220.8970.908
10GPT-3 baseline2021-08-270.6310.6880.2950.5790.6670.5950.9400.5350.4070.5290.8220.884
\n" ], "text/plain": [ "" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.style.format(precision=3)" ] }, { "cell_type": "code", "execution_count": 47, "id": "094e757c-1c6a-4d01-abb1-872face8c72b", "metadata": {}, "outputs": [], "source": [ "df2 = df.assign(hack=\"\").set_index(\"hack\")" ] }, { "cell_type": "code", "execution_count": 48, "id": "2ff434e2-5bf6-453f-8470-28c7b1034154", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 RankSubmissionDateOverallAde Corpus V2Banking 77Terms Of ServiceTai Safety ResearchNeurips Impact Statement RisksOverrulingSystematic Review InclusionOne Stop EnglishTweet Eval HateTwitter ComplaintsSemiconductor Org Types
hack               
1Human baseline (crowdsourced)2021-08-270.7350.8300.6070.6270.6090.8570.9170.4680.6460.7220.8970.908
0GPT-3 baseline2021-08-270.6310.6880.2950.5790.6670.5950.9400.5350.4070.5290.8220.884
\n" ], "text/plain": [ "" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.style.format(precision=3)" ] }, { "cell_type": "code", "execution_count": 186, "id": "8be02c77-bda3-499b-9ac4-d50ec35644a5", "metadata": {}, "outputs": [], "source": [ "for submission in submissions[-1:]:\n", " submission_id = submission[\"id\"]\n", " response = requests.get(\n", " f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n", " headers=header,\n", " )\n", " data = response.json()" ] }, { "cell_type": "code", "execution_count": 188, "id": "7ab07904-0f7e-401b-96f8-3558433e479a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Submission': [], 'foo': [], 'bar': []}" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{**{\"Submission\": []}, **{\"foo\":[]}, **{\"bar\": []}}" ] }, { "cell_type": "code", "execution_count": 191, "id": "69ffb778-09cf-4eb8-ab95-739700d68420", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'f5a21c3fcb58ac17c8a47cfffd509b55cbad7ccf-1629986165000000000'" ] }, "execution_count": 191, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sub_id = data[\"card_data\"][\"submission_id\"]\n", "sub_id" ] }, { "cell_type": "code", "execution_count": 195, "id": "f7c3e8c0-68c7-4bad-802b-1b39703e100d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1629986165000000000'" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = sub_id.split(\"-\")[-1]\n", "t" ] }, { "cell_type": "code", "execution_count": 197, "id": "34a7483c-0b00-42a7-99b9-ee6bdf34048a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2021-08-26 13:56:05')" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = pd.to_datetime(int(t))\n", "d" ] }, { "cell_type": "code", "execution_count": 201, "id": "95f1ecff-025f-4b42-8761-2c0964dfac5f", "metadata": {}, "outputs": [], "source": [ "import datetime" ] }, { "cell_type": "code", "execution_count": 205, "id": "c606cfea-165d-4b58-ba37-6fc9b06795cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
d
02021-08-26
\n", "
" ], "text/plain": [ " d\n", "0 2021-08-26" ] }, "execution_count": 205, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({\"d\":[datetime.datetime.date(d)]})" ] }, { "cell_type": "code", "execution_count": 147, "id": "eebefd5a-6451-44b9-bc0f-d0663f321e34", "metadata": {}, "outputs": [], "source": [ "timestamp = data[\"lastModified\"]" ] }, { "cell_type": "code", "execution_count": 149, "id": "bf663ca3-12e8-4178-9aef-aba46621477a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 160, "id": "4723aeb6-3993-49b1-b779-c1394b54d776", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2021-08-04 22:52:57+0000', tz='UTC')" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = pd.to_datetime(timestamp)\n", "t" ] }, { "cell_type": "code", "execution_count": 157, "id": "cbcc2bf8-e2c8-449c-9f00-38ed80e46ae0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2021-08-04T22:52:57.000Z'" ] }, "execution_count": 157, "metadata": {}, "output_type": "execute_result" } ], "source": [ "timestamp" ] }, { "cell_type": "code", "execution_count": 161, "id": "177574a9-327e-4999-a1db-c316bb741c8c", "metadata": {}, "outputs": [], "source": [ "t_int = int(t.timestamp() * 10 **9)" ] }, { "cell_type": "code", "execution_count": 162, "id": "b3aa4f70-50af-47b3-a492-c77f65266a5b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2021-08-04 22:52:57')" ] }, "execution_count": 162, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_datetime(t_int)" ] }, { "cell_type": "code", "execution_count": 96, "id": "ae7c9100-a630-4b4e-a060-331914f86055", "metadata": {}, "outputs": [], "source": [ "submissions = download_submissions()" ] }, { "cell_type": "code", "execution_count": 97, "id": "831077a1-7f44-4d31-94b3-49257a62c5f7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(submissions)" ] }, { "cell_type": "code", "execution_count": 35, "id": "4a8a9ff9-7f94-4abb-8194-9d570ad2216b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'id': 'autonlp/autonlp-benchmark-raft-ought__raft-ought__raft-dummy-predictions-642',\n", " 'private': True,\n", " 'tags': ['benchmark:ought/raft',\n", " 'type:evaluation',\n", " 'submission_dataset:ought/raft-dummy-predictions',\n", " 'tags:autonlp',\n", " 'tags:evaluation',\n", " 'tags:benchmark'],\n", " 'author': 'autonlp',\n", " 'key': ''}]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submissions[-1:]" ] }, { "cell_type": "code", "execution_count": 98, "id": "2d6e56cb-fca3-4e9e-9a8b-9d2e26816773", "metadata": {}, "outputs": [], "source": [ "df = format_submissions(submissions[-2:])" ] }, { "cell_type": "code", "execution_count": 109, "id": "7d8a3402-f7b8-4edb-8d1e-afb704dc3c67", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SubmissionOverallbanking_77medical_subdomain_of_clinical_notesoverrulinggpai_initiativessemiconductor_org_typestwitter_complaintsneurips_impact_statement_riskssystematic_review_inclusionterms_of_servicetai_safety_researchone_stop_english
1lewtun/my-raft-dummy-predictions0.6050790.9489030.7165260.0643950.5294220.6437230.8734780.7569190.3816090.4613020.6241330.655457
0ought/raft-dummy-predictions0.4073450.0095040.5912130.5523900.5947690.3398220.7281160.8783780.2918420.1447720.0896220.260366
\n", "
" ], "text/plain": [ " Submission Overall banking_77 \\\n", "1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n", "0 ought/raft-dummy-predictions 0.407345 0.009504 \n", "\n", " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n", "1 0.716526 0.064395 0.529422 \n", "0 0.591213 0.552390 0.594769 \n", "\n", " semiconductor_org_types twitter_complaints \\\n", "1 0.643723 0.873478 \n", "0 0.339822 0.728116 \n", "\n", " neurips_impact_statement_risks systematic_review_inclusion \\\n", "1 0.756919 0.381609 \n", "0 0.878378 0.291842 \n", "\n", " terms_of_service tai_safety_research one_stop_english \n", "1 0.461302 0.624133 0.655457 \n", "0 0.144772 0.089622 0.260366 " ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 100, "id": "f60f453b-2457-4597-9eee-324d4c3a2f2e", "metadata": {}, "outputs": [], "source": [ "df.insert(1, \"Overall\", df[TASKS].mean(axis=1))" ] }, { "cell_type": "code", "execution_count": 110, "id": "1fd83f7a-b554-4e7d-aef6-4338b01f3eec", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RankSubmissionOverallbanking_77medical_subdomain_of_clinical_notesoverrulinggpai_initiativessemiconductor_org_typestwitter_complaintsneurips_impact_statement_riskssystematic_review_inclusionterms_of_servicetai_safety_researchone_stop_english
01lewtun/my-raft-dummy-predictions0.6050790.9489030.7165260.0643950.5294220.6437230.8734780.7569190.3816090.4613020.6241330.655457
10ought/raft-dummy-predictions0.4073450.0095040.5912130.5523900.5947690.3398220.7281160.8783780.2918420.1447720.0896220.260366
\n", "
" ], "text/plain": [ " Rank Submission Overall banking_77 \\\n", "0 1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n", "1 0 ought/raft-dummy-predictions 0.407345 0.009504 \n", "\n", " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n", "0 0.716526 0.064395 0.529422 \n", "1 0.591213 0.552390 0.594769 \n", "\n", " semiconductor_org_types twitter_complaints \\\n", "0 0.643723 0.873478 \n", "1 0.339822 0.728116 \n", "\n", " neurips_impact_statement_risks systematic_review_inclusion \\\n", "0 0.756919 0.381609 \n", "1 0.878378 0.291842 \n", "\n", " terms_of_service tai_safety_research one_stop_english \n", "0 0.461302 0.624133 0.655457 \n", "1 0.144772 0.089622 0.260366 " ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\":\"Rank\"})" ] }, { "cell_type": "code", "execution_count": 119, "id": "e1262ff5-6ea3-41ca-affc-b106dd9df5fd", "metadata": {}, "outputs": [], "source": [ "task_names = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]" ] }, { "cell_type": "code", "execution_count": 121, "id": "45d74b9c-c472-4494-aadc-909976d13b08", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SubmissionOverallBanking 77Medical Subdomain Of Clinical NotesOverrulingGpai InitiativesSemiconductor Org TypesTwitter ComplaintsNeurips Impact Statement RisksSystematic Review InclusionTerms Of ServiceTai Safety ResearchOne Stop English
1lewtun/my-raft-dummy-predictions0.6050790.9489030.7165260.0643950.5294220.6437230.8734780.7569190.3816090.4613020.6241330.655457
0ought/raft-dummy-predictions0.4073450.0095040.5912130.5523900.5947690.3398220.7281160.8783780.2918420.1447720.0896220.260366
\n", "
" ], "text/plain": [ " Submission Overall Banking 77 \\\n", "1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n", "0 ought/raft-dummy-predictions 0.407345 0.009504 \n", "\n", " Medical Subdomain Of Clinical Notes Overruling Gpai Initiatives \\\n", "1 0.716526 0.064395 0.529422 \n", "0 0.591213 0.552390 0.594769 \n", "\n", " Semiconductor Org Types Twitter Complaints \\\n", "1 0.643723 0.873478 \n", "0 0.339822 0.728116 \n", "\n", " Neurips Impact Statement Risks Systematic Review Inclusion \\\n", "1 0.756919 0.381609 \n", "0 0.878378 0.291842 \n", "\n", " Terms Of Service Tai Safety Research One Stop English \n", "1 0.461302 0.624133 0.655457 \n", "0 0.144772 0.089622 0.260366 " ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rename(columns={k:v for k,v in zip(TASKS, task_names)})" ] }, { "cell_type": "code", "execution_count": 88, "id": "d31c2bde-1645-4c1b-982b-c9daac40311d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SubmissionOverallbanking_77medical_subdomain_of_clinical_notesoverrulinggpai_initiativessemiconductor_org_typestwitter_complaintsneurips_impact_statement_riskssystematic_review_inclusionterms_of_servicetai_safety_researchone_stop_english
0ought/raft-dummy-predictions0.4073450.0095040.5912130.552390.5947690.3398220.7281160.8783780.2918420.1447720.0896220.260366
\n", "
" ], "text/plain": [ " Submission Overall banking_77 \\\n", "0 ought/raft-dummy-predictions 0.407345 0.009504 \n", "\n", " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n", "0 0.591213 0.55239 0.594769 \n", "\n", " semiconductor_org_types twitter_complaints \\\n", "0 0.339822 0.728116 \n", "\n", " neurips_impact_statement_risks systematic_review_inclusion \\\n", "0 0.878378 0.291842 \n", "\n", " terms_of_service tai_safety_research one_stop_english \n", "0 0.144772 0.089622 0.260366 " ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(\"Overall\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4df33059-020a-43cf-aa3a-de6939268cc7", "metadata": {}, "outputs": [], "source": [ "df[\"Overall\"] = df.mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "327539f3-3bf7-4a2e-ac10-89973a2ba37f", "metadata": {}, "outputs": [], "source": [ "df[\"Submission\"]" ] }, { "cell_type": "code", "execution_count": 38, "id": "f07ec556-2ebf-400e-85f7-c978d03b0dc1", "metadata": {}, "outputs": [], "source": [ "data = format_submissions(submissions[-1:])" ] }, { "cell_type": "code", "execution_count": 48, "id": "a982e024-ab16-4752-984a-5368fa238f1d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bank
00.2
\n", "
" ], "text/plain": [ " bank\n", "0 0.2" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({\"bank\":[0.2]})" ] }, { "cell_type": "code", "execution_count": 60, "id": "b7c73606-d7f9-4f17-bf4d-17cfbb3aa664", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']\n" ] } ], "source": [ "from datasets import get_dataset_config_names\n", "\n", "configs = get_dataset_config_names(\"glue\")\n", "print(configs)" ] }, { "cell_type": "code", "execution_count": 50, "id": "92eea464-6b63-4613-ab4d-aa5003e0bb3b", "metadata": {}, "outputs": [], "source": [ "from datasets import get_dataset_config_names" ] }, { "cell_type": "code", "execution_count": 51, "id": "4f9c2924-001a-4b76-a8ed-a072b43eedbd", "metadata": {}, "outputs": [], "source": [ "tasks = get_dataset_config_names(\"ought/raft\")" ] }, { "cell_type": "code", "execution_count": 55, "id": "9b27374f-b118-440a-acab-6e4aa09f42a4", "metadata": {}, "outputs": [], "source": [ "submission_data = {t:[] for t in tasks}\n", "\n", "for task in data[\"card_data\"][\"results\"]:\n", " task_data = task[\"task\"]\n", " task_name = task_data[\"name\"]\n", " score = task_data[\"metrics\"][0][\"value\"]\n", " submission_data[task_name].append(score)" ] }, { "cell_type": "code", "execution_count": 56, "id": "6b7cf2e0-ee92-4647-8d9b-6edef48e06f8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'banking_77': [0.009504218288713173],\n", " 'medical_subdomain_of_clinical_notes': [0.5912133593265538],\n", " 'overruling': [0.5523904885287522],\n", " 'gpai_initiatives': [0.5947694876413803],\n", " 'semiconductor_org_types': [0.33982211621333613],\n", " 'twitter_complaints': [0.7281156178656647],\n", " 'neurips_impact_statement_risks': [0.8783775228874845],\n", " 'systematic_review_inclusion': [0.2918416872180052],\n", " 'terms_of_service': [0.14477157391911066],\n", " 'tai_safety_research': [0.08962249895220364],\n", " 'one_stop_english': [0.2603661495335281]}" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "submission_data" ] }, { "cell_type": "code", "execution_count": 61, "id": "5df282e4-87c4-4ea7-833e-6a87886e2f76", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'benchmark': 'ought/raft',\n", " 'type': 'evaluation',\n", " 'submission_dataset': 'ought/raft-dummy-predictions',\n", " 'tags': ['autonlp', 'evaluation', 'benchmark'],\n", " 'model-index': None,\n", " 'results': [{'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.009504218288713173}],\n", " 'name': 'banking_77',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.5912133593265538}],\n", " 'name': 'medical_subdomain_of_clinical_notes',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.5523904885287522}],\n", " 'name': 'overruling',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.5947694876413803}],\n", " 'name': 'gpai_initiatives',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.33982211621333613}],\n", " 'name': 'semiconductor_org_types',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.7281156178656647}],\n", " 'name': 'twitter_complaints',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.8783775228874845}],\n", " 'name': 'neurips_impact_statement_risks',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.2918416872180052}],\n", " 'name': 'systematic_review_inclusion',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.14477157391911066}],\n", " 'name': 'terms_of_service',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.08962249895220364}],\n", " 'name': 'tai_safety_research',\n", " 'type': 'text-classification'}},\n", " {'task': {'metrics': [{'name': 'f1',\n", " 'type': 'f1',\n", " 'value': 0.2603661495335281}],\n", " 'name': 'one_stop_english',\n", " 'type': 'text-classification'}}]}" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[\"card_data\"]" ] }, { "cell_type": "code", "execution_count": 2, "id": "b07a4fa9-176e-4ff3-bc3f-eb2a6fc9efda", "metadata": {}, "outputs": [], "source": [ "response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n", "all_datasets = response.json()" ] }, { "cell_type": "code", "execution_count": 3, "id": "63dc07ec-2f28-483f-8163-c97e8a6a4005", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2510" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(all_datasets)" ] }, { "cell_type": "code", "execution_count": 21, "id": "296f68c1-608d-4ea6-8d0e-cc35fb7d74c4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'disfl_qa',\n", " 'tags': ['annotations_creators:expert-generated',\n", " 'language_creators:found',\n", " 'languages:en',\n", " 'licenses:cc-by-4.0',\n", " 'multilinguality:monolingual',\n", " 'pretty_name:DISFL-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering',\n", " 'size_categories:10K\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"a\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)" ] } ], "source": [ "a, b = zip(*[\"a\", \"b\"])" ] }, { "cell_type": "code", "execution_count": 12, "id": "4990ce09-a53f-47dd-b662-3f498352b641", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "annotations_creators expert-generated\n", "language_creators found\n", "languages en\n", "licenses mit\n", "multilinguality monolingual\n", "size_categories 10K\n", "1922 ¡\n", "11884 hola\n", "16 ,\n", "378 me\n", "13496 llamo\n", "466 le\n", "91 w\n", "350 is\n", "5 !\n", "2 \n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_ckpt = \"bertin-project/bertin-roberta-base-spanish\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)\n", "input_ids = tokenizer(\"¡hola, me llamo lewis!\").input_ids\n", "for token in input_ids:\n", " print(token, tokenizer.decode(token))" ] }, { "cell_type": "code", "execution_count": null, "id": "430400f2-2c04-48d7-bf8e-63528441d410", "metadata": {}, "outputs": [], "source": [ "# 1922 ¡\n", "# 11884 hola\n", "# 16 ,\n", "# 378 me\n", "# 13496 llamo\n", "# 466 le\n", "# 91 w\n", "# 350 is\n", "# 5 !" ] }, { "cell_type": "code", "execution_count": 130, "id": "2ecdd872-af9b-4258-8a5e-d867f3785520", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.vocab[\"\"]" ] }, { "cell_type": "code", "execution_count": 131, "id": "16941c33-5e22-485f-9d24-ac8f8542c368", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.bos_token" ] }, { "cell_type": "code", "execution_count": null, "id": "71929465-5ad5-444d-8c77-22f586b1ba23", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }