lewtun HF staff commited on
Commit
7cfc852
1 Parent(s): b3c67da

Tweak submission column names

Browse files
Files changed (2) hide show
  1. Untitled.ipynb +1833 -0
  2. app.py +3 -4
Untitled.ipynb ADDED
@@ -0,0 +1,1833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 89,
6
+ "id": "c0cdda73-430c-4e18-bce4-b2218e2597b9",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from datasets import load_dataset, get_dataset_config_names"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "4981ce75-5d13-4fd2-b08f-af077066f7d3",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": []
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 32,
24
+ "id": "13e20072-0304-424a-923d-ac31a1769e94",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import os\n",
29
+ "from datetime import datetime\n",
30
+ "from pathlib import Path\n",
31
+ "from re import sub\n",
32
+ "\n",
33
+ "import pandas as pd\n",
34
+ "import requests\n",
35
+ "import streamlit as st\n",
36
+ "from datasets import get_dataset_config_names\n",
37
+ "from dotenv import load_dotenv\n",
38
+ "\n",
39
+ "if Path(\".env\").is_file():\n",
40
+ " load_dotenv(\".env\")\n",
41
+ "\n",
42
+ "auth_token = os.getenv(\"HF_HUB_TOKEN\")\n",
43
+ "header = {\"Authorization\": \"Bearer \" + auth_token}\n",
44
+ "\n",
45
+ "TASKS = get_dataset_config_names(\"ought/raft\")\n",
46
+ "# Split and capitalize the task names, e.g. banking_77 => Banking 77\n",
47
+ "FORMATTED_TASK_NAMES = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]\n",
48
+ "\n",
49
+ "\n",
50
+ "def extract_tags(dataset):\n",
51
+ " tags = {}\n",
52
+ " for tag in dataset[\"tags\"]:\n",
53
+ " k, v = tuple(tag.split(\":\", 1))\n",
54
+ " tags[k] = v\n",
55
+ " return tags\n",
56
+ "\n",
57
+ "\n",
58
+ "def download_submissions():\n",
59
+ " response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n",
60
+ " all_datasets = response.json()\n",
61
+ "\n",
62
+ " submissions = []\n",
63
+ "\n",
64
+ " for dataset in all_datasets:\n",
65
+ " tags = extract_tags(dataset)\n",
66
+ " if tags.get(\"benchmark\") == \"ought/raft\" and tags.get(\"type\") == \"evaluation\":\n",
67
+ " submissions.append(dataset)\n",
68
+ " return submissions\n",
69
+ "\n",
70
+ "\n",
71
+ "def format_submissions(submissions):\n",
72
+ " submission_data = {**{\"Submission\": []}, **{\"Date\": []}, **{t: [] for t in TASKS}}\n",
73
+ "\n",
74
+ " # TODO(lewtun): delete / filter all the junk repos from development\n",
75
+ " # The following picks the latest submissions which adhere to the model card schema\n",
76
+ " for submission in submissions:\n",
77
+ " submission_id = submission[\"id\"]\n",
78
+ " response = requests.get(\n",
79
+ " f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n",
80
+ " headers=header,\n",
81
+ " )\n",
82
+ " data = response.json()\n",
83
+ " card_data = data[\"card_data\"]\n",
84
+ " submission_name = card_data[\"submission_dataset\"]\n",
85
+ " submission_data[\"Submission\"].append(submission_name)\n",
86
+ " submission_id = card_data[\"submission_id\"]\n",
87
+ " timestamp = submission_id.split(\"-\")[-1]\n",
88
+ " timestamp = pd.to_datetime(int(timestamp))\n",
89
+ " submission_data[\"Date\"].append(datetime.date(timestamp))\n",
90
+ "\n",
91
+ " for task in card_data[\"results\"]:\n",
92
+ " task_data = task[\"task\"]\n",
93
+ " task_name = task_data[\"name\"]\n",
94
+ " score = task_data[\"metrics\"][0][\"value\"]\n",
95
+ " submission_data[task_name].append(score)\n",
96
+ "\n",
97
+ " df = pd.DataFrame(submission_data)\n",
98
+ " df.insert(2, \"Overall\", df[TASKS].mean(axis=1))\n",
99
+ " df = df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\": \"Rank\"})\n",
100
+ " df.rename(columns={k: v for k, v in zip(TASKS, FORMATTED_TASK_NAMES)}, inplace=True)\n",
101
+ " return df"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 28,
107
+ "id": "8dccc419-7b18-4a10-a4bf-2d69cc3b5888",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "submissions = download_submissions()"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 29,
117
+ "id": "934ea3b9-76dd-4d8f-a62d-8e2fa5959111",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "data": {
122
+ "text/plain": [
123
+ "2"
124
+ ]
125
+ },
126
+ "execution_count": 29,
127
+ "metadata": {},
128
+ "output_type": "execute_result"
129
+ }
130
+ ],
131
+ "source": [
132
+ "len(submissions)"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 34,
138
+ "id": "c3803890-d664-4d24-86bc-8fb095cad40a",
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "df = format_submissions(submissions)"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": 35,
148
+ "id": "2de6f903-c327-42b6-a1ca-a530a62cc412",
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "data": {
153
+ "text/html": [
154
+ "<div>\n",
155
+ "<style scoped>\n",
156
+ " .dataframe tbody tr th:only-of-type {\n",
157
+ " vertical-align: middle;\n",
158
+ " }\n",
159
+ "\n",
160
+ " .dataframe tbody tr th {\n",
161
+ " vertical-align: top;\n",
162
+ " }\n",
163
+ "\n",
164
+ " .dataframe thead th {\n",
165
+ " text-align: right;\n",
166
+ " }\n",
167
+ "</style>\n",
168
+ "<table border=\"1\" class=\"dataframe\">\n",
169
+ " <thead>\n",
170
+ " <tr style=\"text-align: right;\">\n",
171
+ " <th></th>\n",
172
+ " <th>Rank</th>\n",
173
+ " <th>Submission</th>\n",
174
+ " <th>Date</th>\n",
175
+ " <th>Overall</th>\n",
176
+ " <th>Ade Corpus V2</th>\n",
177
+ " <th>Banking 77</th>\n",
178
+ " <th>Terms Of Service</th>\n",
179
+ " <th>Tai Safety Research</th>\n",
180
+ " <th>Neurips Impact Statement Risks</th>\n",
181
+ " <th>Overruling</th>\n",
182
+ " <th>Systematic Review Inclusion</th>\n",
183
+ " <th>One Stop English</th>\n",
184
+ " <th>Tweet Eval Hate</th>\n",
185
+ " <th>Twitter Complaints</th>\n",
186
+ " <th>Semiconductor Org Types</th>\n",
187
+ " </tr>\n",
188
+ " </thead>\n",
189
+ " <tbody>\n",
190
+ " <tr>\n",
191
+ " <th>0</th>\n",
192
+ " <td>1</td>\n",
193
+ " <td>Human baseline (crowdsourced)</td>\n",
194
+ " <td>2021-08-27</td>\n",
195
+ " <td>0.735273</td>\n",
196
+ " <td>0.830</td>\n",
197
+ " <td>0.607</td>\n",
198
+ " <td>0.627</td>\n",
199
+ " <td>0.609</td>\n",
200
+ " <td>0.857</td>\n",
201
+ " <td>0.917</td>\n",
202
+ " <td>0.468</td>\n",
203
+ " <td>0.646</td>\n",
204
+ " <td>0.722</td>\n",
205
+ " <td>0.897</td>\n",
206
+ " <td>0.908</td>\n",
207
+ " </tr>\n",
208
+ " <tr>\n",
209
+ " <th>1</th>\n",
210
+ " <td>0</td>\n",
211
+ " <td>GPT-3 baseline</td>\n",
212
+ " <td>2021-08-27</td>\n",
213
+ " <td>0.631000</td>\n",
214
+ " <td>0.688</td>\n",
215
+ " <td>0.295</td>\n",
216
+ " <td>0.579</td>\n",
217
+ " <td>0.667</td>\n",
218
+ " <td>0.595</td>\n",
219
+ " <td>0.940</td>\n",
220
+ " <td>0.535</td>\n",
221
+ " <td>0.407</td>\n",
222
+ " <td>0.529</td>\n",
223
+ " <td>0.822</td>\n",
224
+ " <td>0.884</td>\n",
225
+ " </tr>\n",
226
+ " </tbody>\n",
227
+ "</table>\n",
228
+ "</div>"
229
+ ],
230
+ "text/plain": [
231
+ " Rank Submission Date Overall Ade Corpus V2 \\\n",
232
+ "0 1 Human baseline (crowdsourced) 2021-08-27 0.735273 0.830 \n",
233
+ "1 0 GPT-3 baseline 2021-08-27 0.631000 0.688 \n",
234
+ "\n",
235
+ " Banking 77 Terms Of Service Tai Safety Research \\\n",
236
+ "0 0.607 0.627 0.609 \n",
237
+ "1 0.295 0.579 0.667 \n",
238
+ "\n",
239
+ " Neurips Impact Statement Risks Overruling Systematic Review Inclusion \\\n",
240
+ "0 0.857 0.917 0.468 \n",
241
+ "1 0.595 0.940 0.535 \n",
242
+ "\n",
243
+ " One Stop English Tweet Eval Hate Twitter Complaints \\\n",
244
+ "0 0.646 0.722 0.897 \n",
245
+ "1 0.407 0.529 0.822 \n",
246
+ "\n",
247
+ " Semiconductor Org Types \n",
248
+ "0 0.908 \n",
249
+ "1 0.884 "
250
+ ]
251
+ },
252
+ "execution_count": 35,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "source": [
258
+ "df"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 45,
264
+ "id": "ca6ba762-047f-4074-a5c3-b4168c13d398",
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "data": {
269
+ "text/html": [
270
+ "<style type=\"text/css\">\n",
271
+ "</style>\n",
272
+ "<table id=\"T_b6d1f_\">\n",
273
+ " <thead>\n",
274
+ " <tr>\n",
275
+ " <th class=\"blank level0\" >&nbsp;</th>\n",
276
+ " <th class=\"col_heading level0 col0\" >Rank</th>\n",
277
+ " <th class=\"col_heading level0 col1\" >Submission</th>\n",
278
+ " <th class=\"col_heading level0 col2\" >Date</th>\n",
279
+ " <th class=\"col_heading level0 col3\" >Overall</th>\n",
280
+ " <th class=\"col_heading level0 col4\" >Ade Corpus V2</th>\n",
281
+ " <th class=\"col_heading level0 col5\" >Banking 77</th>\n",
282
+ " <th class=\"col_heading level0 col6\" >Terms Of Service</th>\n",
283
+ " <th class=\"col_heading level0 col7\" >Tai Safety Research</th>\n",
284
+ " <th class=\"col_heading level0 col8\" >Neurips Impact Statement Risks</th>\n",
285
+ " <th class=\"col_heading level0 col9\" >Overruling</th>\n",
286
+ " <th class=\"col_heading level0 col10\" >Systematic Review Inclusion</th>\n",
287
+ " <th class=\"col_heading level0 col11\" >One Stop English</th>\n",
288
+ " <th class=\"col_heading level0 col12\" >Tweet Eval Hate</th>\n",
289
+ " <th class=\"col_heading level0 col13\" >Twitter Complaints</th>\n",
290
+ " <th class=\"col_heading level0 col14\" >Semiconductor Org Types</th>\n",
291
+ " </tr>\n",
292
+ " </thead>\n",
293
+ " <tbody>\n",
294
+ " <tr>\n",
295
+ " <th id=\"T_b6d1f_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
296
+ " <td id=\"T_b6d1f_row0_col0\" class=\"data row0 col0\" >1</td>\n",
297
+ " <td id=\"T_b6d1f_row0_col1\" class=\"data row0 col1\" >Human baseline (crowdsourced)</td>\n",
298
+ " <td id=\"T_b6d1f_row0_col2\" class=\"data row0 col2\" >2021-08-27</td>\n",
299
+ " <td id=\"T_b6d1f_row0_col3\" class=\"data row0 col3\" >0.735</td>\n",
300
+ " <td id=\"T_b6d1f_row0_col4\" class=\"data row0 col4\" >0.830</td>\n",
301
+ " <td id=\"T_b6d1f_row0_col5\" class=\"data row0 col5\" >0.607</td>\n",
302
+ " <td id=\"T_b6d1f_row0_col6\" class=\"data row0 col6\" >0.627</td>\n",
303
+ " <td id=\"T_b6d1f_row0_col7\" class=\"data row0 col7\" >0.609</td>\n",
304
+ " <td id=\"T_b6d1f_row0_col8\" class=\"data row0 col8\" >0.857</td>\n",
305
+ " <td id=\"T_b6d1f_row0_col9\" class=\"data row0 col9\" >0.917</td>\n",
306
+ " <td id=\"T_b6d1f_row0_col10\" class=\"data row0 col10\" >0.468</td>\n",
307
+ " <td id=\"T_b6d1f_row0_col11\" class=\"data row0 col11\" >0.646</td>\n",
308
+ " <td id=\"T_b6d1f_row0_col12\" class=\"data row0 col12\" >0.722</td>\n",
309
+ " <td id=\"T_b6d1f_row0_col13\" class=\"data row0 col13\" >0.897</td>\n",
310
+ " <td id=\"T_b6d1f_row0_col14\" class=\"data row0 col14\" >0.908</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th id=\"T_b6d1f_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
314
+ " <td id=\"T_b6d1f_row1_col0\" class=\"data row1 col0\" >0</td>\n",
315
+ " <td id=\"T_b6d1f_row1_col1\" class=\"data row1 col1\" >GPT-3 baseline</td>\n",
316
+ " <td id=\"T_b6d1f_row1_col2\" class=\"data row1 col2\" >2021-08-27</td>\n",
317
+ " <td id=\"T_b6d1f_row1_col3\" class=\"data row1 col3\" >0.631</td>\n",
318
+ " <td id=\"T_b6d1f_row1_col4\" class=\"data row1 col4\" >0.688</td>\n",
319
+ " <td id=\"T_b6d1f_row1_col5\" class=\"data row1 col5\" >0.295</td>\n",
320
+ " <td id=\"T_b6d1f_row1_col6\" class=\"data row1 col6\" >0.579</td>\n",
321
+ " <td id=\"T_b6d1f_row1_col7\" class=\"data row1 col7\" >0.667</td>\n",
322
+ " <td id=\"T_b6d1f_row1_col8\" class=\"data row1 col8\" >0.595</td>\n",
323
+ " <td id=\"T_b6d1f_row1_col9\" class=\"data row1 col9\" >0.940</td>\n",
324
+ " <td id=\"T_b6d1f_row1_col10\" class=\"data row1 col10\" >0.535</td>\n",
325
+ " <td id=\"T_b6d1f_row1_col11\" class=\"data row1 col11\" >0.407</td>\n",
326
+ " <td id=\"T_b6d1f_row1_col12\" class=\"data row1 col12\" >0.529</td>\n",
327
+ " <td id=\"T_b6d1f_row1_col13\" class=\"data row1 col13\" >0.822</td>\n",
328
+ " <td id=\"T_b6d1f_row1_col14\" class=\"data row1 col14\" >0.884</td>\n",
329
+ " </tr>\n",
330
+ " </tbody>\n",
331
+ "</table>\n"
332
+ ],
333
+ "text/plain": [
334
+ "<pandas.io.formats.style.Styler at 0x7fba946d44c0>"
335
+ ]
336
+ },
337
+ "execution_count": 45,
338
+ "metadata": {},
339
+ "output_type": "execute_result"
340
+ }
341
+ ],
342
+ "source": [
343
+ "df.style.format(precision=3)"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": 47,
349
+ "id": "094e757c-1c6a-4d01-abb1-872face8c72b",
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "df2 = df.assign(hack=\"\").set_index(\"hack\")"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 48,
359
+ "id": "2ff434e2-5bf6-453f-8470-28c7b1034154",
360
+ "metadata": {},
361
+ "outputs": [
362
+ {
363
+ "data": {
364
+ "text/html": [
365
+ "<style type=\"text/css\">\n",
366
+ "</style>\n",
367
+ "<table id=\"T_59a1f_\">\n",
368
+ " <thead>\n",
369
+ " <tr>\n",
370
+ " <th class=\"blank level0\" >&nbsp;</th>\n",
371
+ " <th class=\"col_heading level0 col0\" >Rank</th>\n",
372
+ " <th class=\"col_heading level0 col1\" >Submission</th>\n",
373
+ " <th class=\"col_heading level0 col2\" >Date</th>\n",
374
+ " <th class=\"col_heading level0 col3\" >Overall</th>\n",
375
+ " <th class=\"col_heading level0 col4\" >Ade Corpus V2</th>\n",
376
+ " <th class=\"col_heading level0 col5\" >Banking 77</th>\n",
377
+ " <th class=\"col_heading level0 col6\" >Terms Of Service</th>\n",
378
+ " <th class=\"col_heading level0 col7\" >Tai Safety Research</th>\n",
379
+ " <th class=\"col_heading level0 col8\" >Neurips Impact Statement Risks</th>\n",
380
+ " <th class=\"col_heading level0 col9\" >Overruling</th>\n",
381
+ " <th class=\"col_heading level0 col10\" >Systematic Review Inclusion</th>\n",
382
+ " <th class=\"col_heading level0 col11\" >One Stop English</th>\n",
383
+ " <th class=\"col_heading level0 col12\" >Tweet Eval Hate</th>\n",
384
+ " <th class=\"col_heading level0 col13\" >Twitter Complaints</th>\n",
385
+ " <th class=\"col_heading level0 col14\" >Semiconductor Org Types</th>\n",
386
+ " </tr>\n",
387
+ " <tr>\n",
388
+ " <th class=\"index_name level0\" >hack</th>\n",
389
+ " <th class=\"blank col0\" >&nbsp;</th>\n",
390
+ " <th class=\"blank col1\" >&nbsp;</th>\n",
391
+ " <th class=\"blank col2\" >&nbsp;</th>\n",
392
+ " <th class=\"blank col3\" >&nbsp;</th>\n",
393
+ " <th class=\"blank col4\" >&nbsp;</th>\n",
394
+ " <th class=\"blank col5\" >&nbsp;</th>\n",
395
+ " <th class=\"blank col6\" >&nbsp;</th>\n",
396
+ " <th class=\"blank col7\" >&nbsp;</th>\n",
397
+ " <th class=\"blank col8\" >&nbsp;</th>\n",
398
+ " <th class=\"blank col9\" >&nbsp;</th>\n",
399
+ " <th class=\"blank col10\" >&nbsp;</th>\n",
400
+ " <th class=\"blank col11\" >&nbsp;</th>\n",
401
+ " <th class=\"blank col12\" >&nbsp;</th>\n",
402
+ " <th class=\"blank col13\" >&nbsp;</th>\n",
403
+ " <th class=\"blank col14\" >&nbsp;</th>\n",
404
+ " </tr>\n",
405
+ " </thead>\n",
406
+ " <tbody>\n",
407
+ " <tr>\n",
408
+ " <th id=\"T_59a1f_level0_row0\" class=\"row_heading level0 row0\" ></th>\n",
409
+ " <td id=\"T_59a1f_row0_col0\" class=\"data row0 col0\" >1</td>\n",
410
+ " <td id=\"T_59a1f_row0_col1\" class=\"data row0 col1\" >Human baseline (crowdsourced)</td>\n",
411
+ " <td id=\"T_59a1f_row0_col2\" class=\"data row0 col2\" >2021-08-27</td>\n",
412
+ " <td id=\"T_59a1f_row0_col3\" class=\"data row0 col3\" >0.735</td>\n",
413
+ " <td id=\"T_59a1f_row0_col4\" class=\"data row0 col4\" >0.830</td>\n",
414
+ " <td id=\"T_59a1f_row0_col5\" class=\"data row0 col5\" >0.607</td>\n",
415
+ " <td id=\"T_59a1f_row0_col6\" class=\"data row0 col6\" >0.627</td>\n",
416
+ " <td id=\"T_59a1f_row0_col7\" class=\"data row0 col7\" >0.609</td>\n",
417
+ " <td id=\"T_59a1f_row0_col8\" class=\"data row0 col8\" >0.857</td>\n",
418
+ " <td id=\"T_59a1f_row0_col9\" class=\"data row0 col9\" >0.917</td>\n",
419
+ " <td id=\"T_59a1f_row0_col10\" class=\"data row0 col10\" >0.468</td>\n",
420
+ " <td id=\"T_59a1f_row0_col11\" class=\"data row0 col11\" >0.646</td>\n",
421
+ " <td id=\"T_59a1f_row0_col12\" class=\"data row0 col12\" >0.722</td>\n",
422
+ " <td id=\"T_59a1f_row0_col13\" class=\"data row0 col13\" >0.897</td>\n",
423
+ " <td id=\"T_59a1f_row0_col14\" class=\"data row0 col14\" >0.908</td>\n",
424
+ " </tr>\n",
425
+ " <tr>\n",
426
+ " <th id=\"T_59a1f_level0_row1\" class=\"row_heading level0 row1\" ></th>\n",
427
+ " <td id=\"T_59a1f_row1_col0\" class=\"data row1 col0\" >0</td>\n",
428
+ " <td id=\"T_59a1f_row1_col1\" class=\"data row1 col1\" >GPT-3 baseline</td>\n",
429
+ " <td id=\"T_59a1f_row1_col2\" class=\"data row1 col2\" >2021-08-27</td>\n",
430
+ " <td id=\"T_59a1f_row1_col3\" class=\"data row1 col3\" >0.631</td>\n",
431
+ " <td id=\"T_59a1f_row1_col4\" class=\"data row1 col4\" >0.688</td>\n",
432
+ " <td id=\"T_59a1f_row1_col5\" class=\"data row1 col5\" >0.295</td>\n",
433
+ " <td id=\"T_59a1f_row1_col6\" class=\"data row1 col6\" >0.579</td>\n",
434
+ " <td id=\"T_59a1f_row1_col7\" class=\"data row1 col7\" >0.667</td>\n",
435
+ " <td id=\"T_59a1f_row1_col8\" class=\"data row1 col8\" >0.595</td>\n",
436
+ " <td id=\"T_59a1f_row1_col9\" class=\"data row1 col9\" >0.940</td>\n",
437
+ " <td id=\"T_59a1f_row1_col10\" class=\"data row1 col10\" >0.535</td>\n",
438
+ " <td id=\"T_59a1f_row1_col11\" class=\"data row1 col11\" >0.407</td>\n",
439
+ " <td id=\"T_59a1f_row1_col12\" class=\"data row1 col12\" >0.529</td>\n",
440
+ " <td id=\"T_59a1f_row1_col13\" class=\"data row1 col13\" >0.822</td>\n",
441
+ " <td id=\"T_59a1f_row1_col14\" class=\"data row1 col14\" >0.884</td>\n",
442
+ " </tr>\n",
443
+ " </tbody>\n",
444
+ "</table>\n"
445
+ ],
446
+ "text/plain": [
447
+ "<pandas.io.formats.style.Styler at 0x7fba946d4910>"
448
+ ]
449
+ },
450
+ "execution_count": 48,
451
+ "metadata": {},
452
+ "output_type": "execute_result"
453
+ }
454
+ ],
455
+ "source": [
456
+ "df2.style.format(precision=3)"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "code",
461
+ "execution_count": 186,
462
+ "id": "8be02c77-bda3-499b-9ac4-d50ec35644a5",
463
+ "metadata": {},
464
+ "outputs": [],
465
+ "source": [
466
+ "for submission in submissions[-1:]:\n",
467
+ " submission_id = submission[\"id\"]\n",
468
+ " response = requests.get(\n",
469
+ " f\"http://huggingface.co/api/datasets/{submission_id}?full=true\",\n",
470
+ " headers=header,\n",
471
+ " )\n",
472
+ " data = response.json()"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 188,
478
+ "id": "7ab07904-0f7e-401b-96f8-3558433e479a",
479
+ "metadata": {},
480
+ "outputs": [
481
+ {
482
+ "data": {
483
+ "text/plain": [
484
+ "{'Submission': [], 'foo': [], 'bar': []}"
485
+ ]
486
+ },
487
+ "execution_count": 188,
488
+ "metadata": {},
489
+ "output_type": "execute_result"
490
+ }
491
+ ],
492
+ "source": [
493
+ "{**{\"Submission\": []}, **{\"foo\":[]}, **{\"bar\": []}}"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 191,
499
+ "id": "69ffb778-09cf-4eb8-ab95-739700d68420",
500
+ "metadata": {},
501
+ "outputs": [
502
+ {
503
+ "data": {
504
+ "text/plain": [
505
+ "'f5a21c3fcb58ac17c8a47cfffd509b55cbad7ccf-1629986165000000000'"
506
+ ]
507
+ },
508
+ "execution_count": 191,
509
+ "metadata": {},
510
+ "output_type": "execute_result"
511
+ }
512
+ ],
513
+ "source": [
514
+ "sub_id = data[\"card_data\"][\"submission_id\"]\n",
515
+ "sub_id"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 195,
521
+ "id": "f7c3e8c0-68c7-4bad-802b-1b39703e100d",
522
+ "metadata": {},
523
+ "outputs": [
524
+ {
525
+ "data": {
526
+ "text/plain": [
527
+ "'1629986165000000000'"
528
+ ]
529
+ },
530
+ "execution_count": 195,
531
+ "metadata": {},
532
+ "output_type": "execute_result"
533
+ }
534
+ ],
535
+ "source": [
536
+ "t = sub_id.split(\"-\")[-1]\n",
537
+ "t"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 197,
543
+ "id": "34a7483c-0b00-42a7-99b9-ee6bdf34048a",
544
+ "metadata": {},
545
+ "outputs": [
546
+ {
547
+ "data": {
548
+ "text/plain": [
549
+ "Timestamp('2021-08-26 13:56:05')"
550
+ ]
551
+ },
552
+ "execution_count": 197,
553
+ "metadata": {},
554
+ "output_type": "execute_result"
555
+ }
556
+ ],
557
+ "source": [
558
+ "d = pd.to_datetime(int(t))\n",
559
+ "d"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": 201,
565
+ "id": "95f1ecff-025f-4b42-8761-2c0964dfac5f",
566
+ "metadata": {},
567
+ "outputs": [],
568
+ "source": [
569
+ "import datetime"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 205,
575
+ "id": "c606cfea-165d-4b58-ba37-6fc9b06795cf",
576
+ "metadata": {},
577
+ "outputs": [
578
+ {
579
+ "data": {
580
+ "text/html": [
581
+ "<div>\n",
582
+ "<style scoped>\n",
583
+ " .dataframe tbody tr th:only-of-type {\n",
584
+ " vertical-align: middle;\n",
585
+ " }\n",
586
+ "\n",
587
+ " .dataframe tbody tr th {\n",
588
+ " vertical-align: top;\n",
589
+ " }\n",
590
+ "\n",
591
+ " .dataframe thead th {\n",
592
+ " text-align: right;\n",
593
+ " }\n",
594
+ "</style>\n",
595
+ "<table border=\"1\" class=\"dataframe\">\n",
596
+ " <thead>\n",
597
+ " <tr style=\"text-align: right;\">\n",
598
+ " <th></th>\n",
599
+ " <th>d</th>\n",
600
+ " </tr>\n",
601
+ " </thead>\n",
602
+ " <tbody>\n",
603
+ " <tr>\n",
604
+ " <th>0</th>\n",
605
+ " <td>2021-08-26</td>\n",
606
+ " </tr>\n",
607
+ " </tbody>\n",
608
+ "</table>\n",
609
+ "</div>"
610
+ ],
611
+ "text/plain": [
612
+ " d\n",
613
+ "0 2021-08-26"
614
+ ]
615
+ },
616
+ "execution_count": 205,
617
+ "metadata": {},
618
+ "output_type": "execute_result"
619
+ }
620
+ ],
621
+ "source": [
622
+ "pd.DataFrame({\"d\":[datetime.datetime.date(d)]})"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "execution_count": 147,
628
+ "id": "eebefd5a-6451-44b9-bc0f-d0663f321e34",
629
+ "metadata": {},
630
+ "outputs": [],
631
+ "source": [
632
+ "timestamp = data[\"lastModified\"]"
633
+ ]
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": 149,
638
+ "id": "bf663ca3-12e8-4178-9aef-aba46621477a",
639
+ "metadata": {},
640
+ "outputs": [],
641
+ "source": [
642
+ "import pandas as pd"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 160,
648
+ "id": "4723aeb6-3993-49b1-b779-c1394b54d776",
649
+ "metadata": {},
650
+ "outputs": [
651
+ {
652
+ "data": {
653
+ "text/plain": [
654
+ "Timestamp('2021-08-04 22:52:57+0000', tz='UTC')"
655
+ ]
656
+ },
657
+ "execution_count": 160,
658
+ "metadata": {},
659
+ "output_type": "execute_result"
660
+ }
661
+ ],
662
+ "source": [
663
+ "t = pd.to_datetime(timestamp)\n",
664
+ "t"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": 157,
670
+ "id": "cbcc2bf8-e2c8-449c-9f00-38ed80e46ae0",
671
+ "metadata": {},
672
+ "outputs": [
673
+ {
674
+ "data": {
675
+ "text/plain": [
676
+ "'2021-08-04T22:52:57.000Z'"
677
+ ]
678
+ },
679
+ "execution_count": 157,
680
+ "metadata": {},
681
+ "output_type": "execute_result"
682
+ }
683
+ ],
684
+ "source": [
685
+ "timestamp"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 161,
691
+ "id": "177574a9-327e-4999-a1db-c316bb741c8c",
692
+ "metadata": {},
693
+ "outputs": [],
694
+ "source": [
695
+ "t_int = int(t.timestamp() * 10 **9)"
696
+ ]
697
+ },
698
+ {
699
+ "cell_type": "code",
700
+ "execution_count": 162,
701
+ "id": "b3aa4f70-50af-47b3-a492-c77f65266a5b",
702
+ "metadata": {},
703
+ "outputs": [
704
+ {
705
+ "data": {
706
+ "text/plain": [
707
+ "Timestamp('2021-08-04 22:52:57')"
708
+ ]
709
+ },
710
+ "execution_count": 162,
711
+ "metadata": {},
712
+ "output_type": "execute_result"
713
+ }
714
+ ],
715
+ "source": [
716
+ "pd.to_datetime(t_int)"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": 96,
722
+ "id": "ae7c9100-a630-4b4e-a060-331914f86055",
723
+ "metadata": {},
724
+ "outputs": [],
725
+ "source": [
726
+ "submissions = download_submissions()"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 97,
732
+ "id": "831077a1-7f44-4d31-94b3-49257a62c5f7",
733
+ "metadata": {},
734
+ "outputs": [
735
+ {
736
+ "data": {
737
+ "text/plain": [
738
+ "16"
739
+ ]
740
+ },
741
+ "execution_count": 97,
742
+ "metadata": {},
743
+ "output_type": "execute_result"
744
+ }
745
+ ],
746
+ "source": [
747
+ "len(submissions)"
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "code",
752
+ "execution_count": 35,
753
+ "id": "4a8a9ff9-7f94-4abb-8194-9d570ad2216b",
754
+ "metadata": {},
755
+ "outputs": [
756
+ {
757
+ "data": {
758
+ "text/plain": [
759
+ "[{'id': 'autonlp/autonlp-benchmark-raft-ought__raft-ought__raft-dummy-predictions-642',\n",
760
+ " 'private': True,\n",
761
+ " 'tags': ['benchmark:ought/raft',\n",
762
+ " 'type:evaluation',\n",
763
+ " 'submission_dataset:ought/raft-dummy-predictions',\n",
764
+ " 'tags:autonlp',\n",
765
+ " 'tags:evaluation',\n",
766
+ " 'tags:benchmark'],\n",
767
+ " 'author': 'autonlp',\n",
768
+ " 'key': ''}]"
769
+ ]
770
+ },
771
+ "execution_count": 35,
772
+ "metadata": {},
773
+ "output_type": "execute_result"
774
+ }
775
+ ],
776
+ "source": [
777
+ "submissions[-1:]"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": 98,
783
+ "id": "2d6e56cb-fca3-4e9e-9a8b-9d2e26816773",
784
+ "metadata": {},
785
+ "outputs": [],
786
+ "source": [
787
+ "df = format_submissions(submissions[-2:])"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": 109,
793
+ "id": "7d8a3402-f7b8-4edb-8d1e-afb704dc3c67",
794
+ "metadata": {},
795
+ "outputs": [
796
+ {
797
+ "data": {
798
+ "text/html": [
799
+ "<div>\n",
800
+ "<style scoped>\n",
801
+ " .dataframe tbody tr th:only-of-type {\n",
802
+ " vertical-align: middle;\n",
803
+ " }\n",
804
+ "\n",
805
+ " .dataframe tbody tr th {\n",
806
+ " vertical-align: top;\n",
807
+ " }\n",
808
+ "\n",
809
+ " .dataframe thead th {\n",
810
+ " text-align: right;\n",
811
+ " }\n",
812
+ "</style>\n",
813
+ "<table border=\"1\" class=\"dataframe\">\n",
814
+ " <thead>\n",
815
+ " <tr style=\"text-align: right;\">\n",
816
+ " <th></th>\n",
817
+ " <th>Submission</th>\n",
818
+ " <th>Overall</th>\n",
819
+ " <th>banking_77</th>\n",
820
+ " <th>medical_subdomain_of_clinical_notes</th>\n",
821
+ " <th>overruling</th>\n",
822
+ " <th>gpai_initiatives</th>\n",
823
+ " <th>semiconductor_org_types</th>\n",
824
+ " <th>twitter_complaints</th>\n",
825
+ " <th>neurips_impact_statement_risks</th>\n",
826
+ " <th>systematic_review_inclusion</th>\n",
827
+ " <th>terms_of_service</th>\n",
828
+ " <th>tai_safety_research</th>\n",
829
+ " <th>one_stop_english</th>\n",
830
+ " </tr>\n",
831
+ " </thead>\n",
832
+ " <tbody>\n",
833
+ " <tr>\n",
834
+ " <th>1</th>\n",
835
+ " <td>lewtun/my-raft-dummy-predictions</td>\n",
836
+ " <td>0.605079</td>\n",
837
+ " <td>0.948903</td>\n",
838
+ " <td>0.716526</td>\n",
839
+ " <td>0.064395</td>\n",
840
+ " <td>0.529422</td>\n",
841
+ " <td>0.643723</td>\n",
842
+ " <td>0.873478</td>\n",
843
+ " <td>0.756919</td>\n",
844
+ " <td>0.381609</td>\n",
845
+ " <td>0.461302</td>\n",
846
+ " <td>0.624133</td>\n",
847
+ " <td>0.655457</td>\n",
848
+ " </tr>\n",
849
+ " <tr>\n",
850
+ " <th>0</th>\n",
851
+ " <td>ought/raft-dummy-predictions</td>\n",
852
+ " <td>0.407345</td>\n",
853
+ " <td>0.009504</td>\n",
854
+ " <td>0.591213</td>\n",
855
+ " <td>0.552390</td>\n",
856
+ " <td>0.594769</td>\n",
857
+ " <td>0.339822</td>\n",
858
+ " <td>0.728116</td>\n",
859
+ " <td>0.878378</td>\n",
860
+ " <td>0.291842</td>\n",
861
+ " <td>0.144772</td>\n",
862
+ " <td>0.089622</td>\n",
863
+ " <td>0.260366</td>\n",
864
+ " </tr>\n",
865
+ " </tbody>\n",
866
+ "</table>\n",
867
+ "</div>"
868
+ ],
869
+ "text/plain": [
870
+ " Submission Overall banking_77 \\\n",
871
+ "1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
872
+ "0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
873
+ "\n",
874
+ " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
875
+ "1 0.716526 0.064395 0.529422 \n",
876
+ "0 0.591213 0.552390 0.594769 \n",
877
+ "\n",
878
+ " semiconductor_org_types twitter_complaints \\\n",
879
+ "1 0.643723 0.873478 \n",
880
+ "0 0.339822 0.728116 \n",
881
+ "\n",
882
+ " neurips_impact_statement_risks systematic_review_inclusion \\\n",
883
+ "1 0.756919 0.381609 \n",
884
+ "0 0.878378 0.291842 \n",
885
+ "\n",
886
+ " terms_of_service tai_safety_research one_stop_english \n",
887
+ "1 0.461302 0.624133 0.655457 \n",
888
+ "0 0.144772 0.089622 0.260366 "
889
+ ]
890
+ },
891
+ "execution_count": 109,
892
+ "metadata": {},
893
+ "output_type": "execute_result"
894
+ }
895
+ ],
896
+ "source": [
897
+ "df"
898
+ ]
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "execution_count": 100,
903
+ "id": "f60f453b-2457-4597-9eee-324d4c3a2f2e",
904
+ "metadata": {},
905
+ "outputs": [],
906
+ "source": [
907
+ "df.insert(1, \"Overall\", df[TASKS].mean(axis=1))"
908
+ ]
909
+ },
910
+ {
911
+ "cell_type": "code",
912
+ "execution_count": 110,
913
+ "id": "1fd83f7a-b554-4e7d-aef6-4338b01f3eec",
914
+ "metadata": {},
915
+ "outputs": [
916
+ {
917
+ "data": {
918
+ "text/html": [
919
+ "<div>\n",
920
+ "<style scoped>\n",
921
+ " .dataframe tbody tr th:only-of-type {\n",
922
+ " vertical-align: middle;\n",
923
+ " }\n",
924
+ "\n",
925
+ " .dataframe tbody tr th {\n",
926
+ " vertical-align: top;\n",
927
+ " }\n",
928
+ "\n",
929
+ " .dataframe thead th {\n",
930
+ " text-align: right;\n",
931
+ " }\n",
932
+ "</style>\n",
933
+ "<table border=\"1\" class=\"dataframe\">\n",
934
+ " <thead>\n",
935
+ " <tr style=\"text-align: right;\">\n",
936
+ " <th></th>\n",
937
+ " <th>Rank</th>\n",
938
+ " <th>Submission</th>\n",
939
+ " <th>Overall</th>\n",
940
+ " <th>banking_77</th>\n",
941
+ " <th>medical_subdomain_of_clinical_notes</th>\n",
942
+ " <th>overruling</th>\n",
943
+ " <th>gpai_initiatives</th>\n",
944
+ " <th>semiconductor_org_types</th>\n",
945
+ " <th>twitter_complaints</th>\n",
946
+ " <th>neurips_impact_statement_risks</th>\n",
947
+ " <th>systematic_review_inclusion</th>\n",
948
+ " <th>terms_of_service</th>\n",
949
+ " <th>tai_safety_research</th>\n",
950
+ " <th>one_stop_english</th>\n",
951
+ " </tr>\n",
952
+ " </thead>\n",
953
+ " <tbody>\n",
954
+ " <tr>\n",
955
+ " <th>0</th>\n",
956
+ " <td>1</td>\n",
957
+ " <td>lewtun/my-raft-dummy-predictions</td>\n",
958
+ " <td>0.605079</td>\n",
959
+ " <td>0.948903</td>\n",
960
+ " <td>0.716526</td>\n",
961
+ " <td>0.064395</td>\n",
962
+ " <td>0.529422</td>\n",
963
+ " <td>0.643723</td>\n",
964
+ " <td>0.873478</td>\n",
965
+ " <td>0.756919</td>\n",
966
+ " <td>0.381609</td>\n",
967
+ " <td>0.461302</td>\n",
968
+ " <td>0.624133</td>\n",
969
+ " <td>0.655457</td>\n",
970
+ " </tr>\n",
971
+ " <tr>\n",
972
+ " <th>1</th>\n",
973
+ " <td>0</td>\n",
974
+ " <td>ought/raft-dummy-predictions</td>\n",
975
+ " <td>0.407345</td>\n",
976
+ " <td>0.009504</td>\n",
977
+ " <td>0.591213</td>\n",
978
+ " <td>0.552390</td>\n",
979
+ " <td>0.594769</td>\n",
980
+ " <td>0.339822</td>\n",
981
+ " <td>0.728116</td>\n",
982
+ " <td>0.878378</td>\n",
983
+ " <td>0.291842</td>\n",
984
+ " <td>0.144772</td>\n",
985
+ " <td>0.089622</td>\n",
986
+ " <td>0.260366</td>\n",
987
+ " </tr>\n",
988
+ " </tbody>\n",
989
+ "</table>\n",
990
+ "</div>"
991
+ ],
992
+ "text/plain": [
993
+ " Rank Submission Overall banking_77 \\\n",
994
+ "0 1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
995
+ "1 0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
996
+ "\n",
997
+ " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
998
+ "0 0.716526 0.064395 0.529422 \n",
999
+ "1 0.591213 0.552390 0.594769 \n",
1000
+ "\n",
1001
+ " semiconductor_org_types twitter_complaints \\\n",
1002
+ "0 0.643723 0.873478 \n",
1003
+ "1 0.339822 0.728116 \n",
1004
+ "\n",
1005
+ " neurips_impact_statement_risks systematic_review_inclusion \\\n",
1006
+ "0 0.756919 0.381609 \n",
1007
+ "1 0.878378 0.291842 \n",
1008
+ "\n",
1009
+ " terms_of_service tai_safety_research one_stop_english \n",
1010
+ "0 0.461302 0.624133 0.655457 \n",
1011
+ "1 0.144772 0.089622 0.260366 "
1012
+ ]
1013
+ },
1014
+ "execution_count": 110,
1015
+ "metadata": {},
1016
+ "output_type": "execute_result"
1017
+ }
1018
+ ],
1019
+ "source": [
1020
+ "df.copy().sort_values(\"Overall\", ascending=False).reset_index().rename(columns={\"index\":\"Rank\"})"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "cell_type": "code",
1025
+ "execution_count": 119,
1026
+ "id": "e1262ff5-6ea3-41ca-affc-b106dd9df5fd",
1027
+ "metadata": {},
1028
+ "outputs": [],
1029
+ "source": [
1030
+ "task_names = [\" \".join(t.capitalize() for t in task.split(\"_\")) for task in TASKS]"
1031
+ ]
1032
+ },
1033
+ {
1034
+ "cell_type": "code",
1035
+ "execution_count": 121,
1036
+ "id": "45d74b9c-c472-4494-aadc-909976d13b08",
1037
+ "metadata": {},
1038
+ "outputs": [
1039
+ {
1040
+ "data": {
1041
+ "text/html": [
1042
+ "<div>\n",
1043
+ "<style scoped>\n",
1044
+ " .dataframe tbody tr th:only-of-type {\n",
1045
+ " vertical-align: middle;\n",
1046
+ " }\n",
1047
+ "\n",
1048
+ " .dataframe tbody tr th {\n",
1049
+ " vertical-align: top;\n",
1050
+ " }\n",
1051
+ "\n",
1052
+ " .dataframe thead th {\n",
1053
+ " text-align: right;\n",
1054
+ " }\n",
1055
+ "</style>\n",
1056
+ "<table border=\"1\" class=\"dataframe\">\n",
1057
+ " <thead>\n",
1058
+ " <tr style=\"text-align: right;\">\n",
1059
+ " <th></th>\n",
1060
+ " <th>Submission</th>\n",
1061
+ " <th>Overall</th>\n",
1062
+ " <th>Banking 77</th>\n",
1063
+ " <th>Medical Subdomain Of Clinical Notes</th>\n",
1064
+ " <th>Overruling</th>\n",
1065
+ " <th>Gpai Initiatives</th>\n",
1066
+ " <th>Semiconductor Org Types</th>\n",
1067
+ " <th>Twitter Complaints</th>\n",
1068
+ " <th>Neurips Impact Statement Risks</th>\n",
1069
+ " <th>Systematic Review Inclusion</th>\n",
1070
+ " <th>Terms Of Service</th>\n",
1071
+ " <th>Tai Safety Research</th>\n",
1072
+ " <th>One Stop English</th>\n",
1073
+ " </tr>\n",
1074
+ " </thead>\n",
1075
+ " <tbody>\n",
1076
+ " <tr>\n",
1077
+ " <th>1</th>\n",
1078
+ " <td>lewtun/my-raft-dummy-predictions</td>\n",
1079
+ " <td>0.605079</td>\n",
1080
+ " <td>0.948903</td>\n",
1081
+ " <td>0.716526</td>\n",
1082
+ " <td>0.064395</td>\n",
1083
+ " <td>0.529422</td>\n",
1084
+ " <td>0.643723</td>\n",
1085
+ " <td>0.873478</td>\n",
1086
+ " <td>0.756919</td>\n",
1087
+ " <td>0.381609</td>\n",
1088
+ " <td>0.461302</td>\n",
1089
+ " <td>0.624133</td>\n",
1090
+ " <td>0.655457</td>\n",
1091
+ " </tr>\n",
1092
+ " <tr>\n",
1093
+ " <th>0</th>\n",
1094
+ " <td>ought/raft-dummy-predictions</td>\n",
1095
+ " <td>0.407345</td>\n",
1096
+ " <td>0.009504</td>\n",
1097
+ " <td>0.591213</td>\n",
1098
+ " <td>0.552390</td>\n",
1099
+ " <td>0.594769</td>\n",
1100
+ " <td>0.339822</td>\n",
1101
+ " <td>0.728116</td>\n",
1102
+ " <td>0.878378</td>\n",
1103
+ " <td>0.291842</td>\n",
1104
+ " <td>0.144772</td>\n",
1105
+ " <td>0.089622</td>\n",
1106
+ " <td>0.260366</td>\n",
1107
+ " </tr>\n",
1108
+ " </tbody>\n",
1109
+ "</table>\n",
1110
+ "</div>"
1111
+ ],
1112
+ "text/plain": [
1113
+ " Submission Overall Banking 77 \\\n",
1114
+ "1 lewtun/my-raft-dummy-predictions 0.605079 0.948903 \n",
1115
+ "0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
1116
+ "\n",
1117
+ " Medical Subdomain Of Clinical Notes Overruling Gpai Initiatives \\\n",
1118
+ "1 0.716526 0.064395 0.529422 \n",
1119
+ "0 0.591213 0.552390 0.594769 \n",
1120
+ "\n",
1121
+ " Semiconductor Org Types Twitter Complaints \\\n",
1122
+ "1 0.643723 0.873478 \n",
1123
+ "0 0.339822 0.728116 \n",
1124
+ "\n",
1125
+ " Neurips Impact Statement Risks Systematic Review Inclusion \\\n",
1126
+ "1 0.756919 0.381609 \n",
1127
+ "0 0.878378 0.291842 \n",
1128
+ "\n",
1129
+ " Terms Of Service Tai Safety Research One Stop English \n",
1130
+ "1 0.461302 0.624133 0.655457 \n",
1131
+ "0 0.144772 0.089622 0.260366 "
1132
+ ]
1133
+ },
1134
+ "execution_count": 121,
1135
+ "metadata": {},
1136
+ "output_type": "execute_result"
1137
+ }
1138
+ ],
1139
+ "source": [
1140
+ "df.rename(columns={k:v for k,v in zip(TASKS, task_names)})"
1141
+ ]
1142
+ },
1143
+ {
1144
+ "cell_type": "code",
1145
+ "execution_count": 88,
1146
+ "id": "d31c2bde-1645-4c1b-982b-c9daac40311d",
1147
+ "metadata": {},
1148
+ "outputs": [
1149
+ {
1150
+ "data": {
1151
+ "text/html": [
1152
+ "<div>\n",
1153
+ "<style scoped>\n",
1154
+ " .dataframe tbody tr th:only-of-type {\n",
1155
+ " vertical-align: middle;\n",
1156
+ " }\n",
1157
+ "\n",
1158
+ " .dataframe tbody tr th {\n",
1159
+ " vertical-align: top;\n",
1160
+ " }\n",
1161
+ "\n",
1162
+ " .dataframe thead th {\n",
1163
+ " text-align: right;\n",
1164
+ " }\n",
1165
+ "</style>\n",
1166
+ "<table border=\"1\" class=\"dataframe\">\n",
1167
+ " <thead>\n",
1168
+ " <tr style=\"text-align: right;\">\n",
1169
+ " <th></th>\n",
1170
+ " <th>Submission</th>\n",
1171
+ " <th>Overall</th>\n",
1172
+ " <th>banking_77</th>\n",
1173
+ " <th>medical_subdomain_of_clinical_notes</th>\n",
1174
+ " <th>overruling</th>\n",
1175
+ " <th>gpai_initiatives</th>\n",
1176
+ " <th>semiconductor_org_types</th>\n",
1177
+ " <th>twitter_complaints</th>\n",
1178
+ " <th>neurips_impact_statement_risks</th>\n",
1179
+ " <th>systematic_review_inclusion</th>\n",
1180
+ " <th>terms_of_service</th>\n",
1181
+ " <th>tai_safety_research</th>\n",
1182
+ " <th>one_stop_english</th>\n",
1183
+ " </tr>\n",
1184
+ " </thead>\n",
1185
+ " <tbody>\n",
1186
+ " <tr>\n",
1187
+ " <th>0</th>\n",
1188
+ " <td>ought/raft-dummy-predictions</td>\n",
1189
+ " <td>0.407345</td>\n",
1190
+ " <td>0.009504</td>\n",
1191
+ " <td>0.591213</td>\n",
1192
+ " <td>0.55239</td>\n",
1193
+ " <td>0.594769</td>\n",
1194
+ " <td>0.339822</td>\n",
1195
+ " <td>0.728116</td>\n",
1196
+ " <td>0.878378</td>\n",
1197
+ " <td>0.291842</td>\n",
1198
+ " <td>0.144772</td>\n",
1199
+ " <td>0.089622</td>\n",
1200
+ " <td>0.260366</td>\n",
1201
+ " </tr>\n",
1202
+ " </tbody>\n",
1203
+ "</table>\n",
1204
+ "</div>"
1205
+ ],
1206
+ "text/plain": [
1207
+ " Submission Overall banking_77 \\\n",
1208
+ "0 ought/raft-dummy-predictions 0.407345 0.009504 \n",
1209
+ "\n",
1210
+ " medical_subdomain_of_clinical_notes overruling gpai_initiatives \\\n",
1211
+ "0 0.591213 0.55239 0.594769 \n",
1212
+ "\n",
1213
+ " semiconductor_org_types twitter_complaints \\\n",
1214
+ "0 0.339822 0.728116 \n",
1215
+ "\n",
1216
+ " neurips_impact_statement_risks systematic_review_inclusion \\\n",
1217
+ "0 0.878378 0.291842 \n",
1218
+ "\n",
1219
+ " terms_of_service tai_safety_research one_stop_english \n",
1220
+ "0 0.144772 0.089622 0.260366 "
1221
+ ]
1222
+ },
1223
+ "execution_count": 88,
1224
+ "metadata": {},
1225
+ "output_type": "execute_result"
1226
+ }
1227
+ ],
1228
+ "source": [
1229
+ "df.sort_values(\"Overall\")"
1230
+ ]
1231
+ },
1232
+ {
1233
+ "cell_type": "code",
1234
+ "execution_count": null,
1235
+ "id": "4df33059-020a-43cf-aa3a-de6939268cc7",
1236
+ "metadata": {},
1237
+ "outputs": [],
1238
+ "source": [
1239
+ "df[\"Overall\"] = df.mean()"
1240
+ ]
1241
+ },
1242
+ {
1243
+ "cell_type": "code",
1244
+ "execution_count": null,
1245
+ "id": "327539f3-3bf7-4a2e-ac10-89973a2ba37f",
1246
+ "metadata": {},
1247
+ "outputs": [],
1248
+ "source": [
1249
+ "df[\"Submission\"]"
1250
+ ]
1251
+ },
1252
+ {
1253
+ "cell_type": "code",
1254
+ "execution_count": 38,
1255
+ "id": "f07ec556-2ebf-400e-85f7-c978d03b0dc1",
1256
+ "metadata": {},
1257
+ "outputs": [],
1258
+ "source": [
1259
+ "data = format_submissions(submissions[-1:])"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "cell_type": "code",
1264
+ "execution_count": 48,
1265
+ "id": "a982e024-ab16-4752-984a-5368fa238f1d",
1266
+ "metadata": {},
1267
+ "outputs": [
1268
+ {
1269
+ "data": {
1270
+ "text/html": [
1271
+ "<div>\n",
1272
+ "<style scoped>\n",
1273
+ " .dataframe tbody tr th:only-of-type {\n",
1274
+ " vertical-align: middle;\n",
1275
+ " }\n",
1276
+ "\n",
1277
+ " .dataframe tbody tr th {\n",
1278
+ " vertical-align: top;\n",
1279
+ " }\n",
1280
+ "\n",
1281
+ " .dataframe thead th {\n",
1282
+ " text-align: right;\n",
1283
+ " }\n",
1284
+ "</style>\n",
1285
+ "<table border=\"1\" class=\"dataframe\">\n",
1286
+ " <thead>\n",
1287
+ " <tr style=\"text-align: right;\">\n",
1288
+ " <th></th>\n",
1289
+ " <th>bank</th>\n",
1290
+ " </tr>\n",
1291
+ " </thead>\n",
1292
+ " <tbody>\n",
1293
+ " <tr>\n",
1294
+ " <th>0</th>\n",
1295
+ " <td>0.2</td>\n",
1296
+ " </tr>\n",
1297
+ " </tbody>\n",
1298
+ "</table>\n",
1299
+ "</div>"
1300
+ ],
1301
+ "text/plain": [
1302
+ " bank\n",
1303
+ "0 0.2"
1304
+ ]
1305
+ },
1306
+ "execution_count": 48,
1307
+ "metadata": {},
1308
+ "output_type": "execute_result"
1309
+ }
1310
+ ],
1311
+ "source": [
1312
+ "pd.DataFrame({\"bank\":[0.2]})"
1313
+ ]
1314
+ },
1315
+ {
1316
+ "cell_type": "code",
1317
+ "execution_count": 60,
1318
+ "id": "b7c73606-d7f9-4f17-bf4d-17cfbb3aa664",
1319
+ "metadata": {},
1320
+ "outputs": [
1321
+ {
1322
+ "name": "stdout",
1323
+ "output_type": "stream",
1324
+ "text": [
1325
+ "['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']\n"
1326
+ ]
1327
+ }
1328
+ ],
1329
+ "source": [
1330
+ "from datasets import get_dataset_config_names\n",
1331
+ "\n",
1332
+ "configs = get_dataset_config_names(\"glue\")\n",
1333
+ "print(configs)"
1334
+ ]
1335
+ },
1336
+ {
1337
+ "cell_type": "code",
1338
+ "execution_count": 50,
1339
+ "id": "92eea464-6b63-4613-ab4d-aa5003e0bb3b",
1340
+ "metadata": {},
1341
+ "outputs": [],
1342
+ "source": [
1343
+ "from datasets import get_dataset_config_names"
1344
+ ]
1345
+ },
1346
+ {
1347
+ "cell_type": "code",
1348
+ "execution_count": 51,
1349
+ "id": "4f9c2924-001a-4b76-a8ed-a072b43eedbd",
1350
+ "metadata": {},
1351
+ "outputs": [],
1352
+ "source": [
1353
+ "tasks = get_dataset_config_names(\"ought/raft\")"
1354
+ ]
1355
+ },
1356
+ {
1357
+ "cell_type": "code",
1358
+ "execution_count": 55,
1359
+ "id": "9b27374f-b118-440a-acab-6e4aa09f42a4",
1360
+ "metadata": {},
1361
+ "outputs": [],
1362
+ "source": [
1363
+ "submission_data = {t:[] for t in tasks}\n",
1364
+ "\n",
1365
+ "for task in data[\"card_data\"][\"results\"]:\n",
1366
+ " task_data = task[\"task\"]\n",
1367
+ " task_name = task_data[\"name\"]\n",
1368
+ " score = task_data[\"metrics\"][0][\"value\"]\n",
1369
+ " submission_data[task_name].append(score)"
1370
+ ]
1371
+ },
1372
+ {
1373
+ "cell_type": "code",
1374
+ "execution_count": 56,
1375
+ "id": "6b7cf2e0-ee92-4647-8d9b-6edef48e06f8",
1376
+ "metadata": {},
1377
+ "outputs": [
1378
+ {
1379
+ "data": {
1380
+ "text/plain": [
1381
+ "{'banking_77': [0.009504218288713173],\n",
1382
+ " 'medical_subdomain_of_clinical_notes': [0.5912133593265538],\n",
1383
+ " 'overruling': [0.5523904885287522],\n",
1384
+ " 'gpai_initiatives': [0.5947694876413803],\n",
1385
+ " 'semiconductor_org_types': [0.33982211621333613],\n",
1386
+ " 'twitter_complaints': [0.7281156178656647],\n",
1387
+ " 'neurips_impact_statement_risks': [0.8783775228874845],\n",
1388
+ " 'systematic_review_inclusion': [0.2918416872180052],\n",
1389
+ " 'terms_of_service': [0.14477157391911066],\n",
1390
+ " 'tai_safety_research': [0.08962249895220364],\n",
1391
+ " 'one_stop_english': [0.2603661495335281]}"
1392
+ ]
1393
+ },
1394
+ "execution_count": 56,
1395
+ "metadata": {},
1396
+ "output_type": "execute_result"
1397
+ }
1398
+ ],
1399
+ "source": [
1400
+ "submission_data"
1401
+ ]
1402
+ },
1403
+ {
1404
+ "cell_type": "code",
1405
+ "execution_count": 61,
1406
+ "id": "5df282e4-87c4-4ea7-833e-6a87886e2f76",
1407
+ "metadata": {},
1408
+ "outputs": [
1409
+ {
1410
+ "data": {
1411
+ "text/plain": [
1412
+ "{'benchmark': 'ought/raft',\n",
1413
+ " 'type': 'evaluation',\n",
1414
+ " 'submission_dataset': 'ought/raft-dummy-predictions',\n",
1415
+ " 'tags': ['autonlp', 'evaluation', 'benchmark'],\n",
1416
+ " 'model-index': None,\n",
1417
+ " 'results': [{'task': {'metrics': [{'name': 'f1',\n",
1418
+ " 'type': 'f1',\n",
1419
+ " 'value': 0.009504218288713173}],\n",
1420
+ " 'name': 'banking_77',\n",
1421
+ " 'type': 'text-classification'}},\n",
1422
+ " {'task': {'metrics': [{'name': 'f1',\n",
1423
+ " 'type': 'f1',\n",
1424
+ " 'value': 0.5912133593265538}],\n",
1425
+ " 'name': 'medical_subdomain_of_clinical_notes',\n",
1426
+ " 'type': 'text-classification'}},\n",
1427
+ " {'task': {'metrics': [{'name': 'f1',\n",
1428
+ " 'type': 'f1',\n",
1429
+ " 'value': 0.5523904885287522}],\n",
1430
+ " 'name': 'overruling',\n",
1431
+ " 'type': 'text-classification'}},\n",
1432
+ " {'task': {'metrics': [{'name': 'f1',\n",
1433
+ " 'type': 'f1',\n",
1434
+ " 'value': 0.5947694876413803}],\n",
1435
+ " 'name': 'gpai_initiatives',\n",
1436
+ " 'type': 'text-classification'}},\n",
1437
+ " {'task': {'metrics': [{'name': 'f1',\n",
1438
+ " 'type': 'f1',\n",
1439
+ " 'value': 0.33982211621333613}],\n",
1440
+ " 'name': 'semiconductor_org_types',\n",
1441
+ " 'type': 'text-classification'}},\n",
1442
+ " {'task': {'metrics': [{'name': 'f1',\n",
1443
+ " 'type': 'f1',\n",
1444
+ " 'value': 0.7281156178656647}],\n",
1445
+ " 'name': 'twitter_complaints',\n",
1446
+ " 'type': 'text-classification'}},\n",
1447
+ " {'task': {'metrics': [{'name': 'f1',\n",
1448
+ " 'type': 'f1',\n",
1449
+ " 'value': 0.8783775228874845}],\n",
1450
+ " 'name': 'neurips_impact_statement_risks',\n",
1451
+ " 'type': 'text-classification'}},\n",
1452
+ " {'task': {'metrics': [{'name': 'f1',\n",
1453
+ " 'type': 'f1',\n",
1454
+ " 'value': 0.2918416872180052}],\n",
1455
+ " 'name': 'systematic_review_inclusion',\n",
1456
+ " 'type': 'text-classification'}},\n",
1457
+ " {'task': {'metrics': [{'name': 'f1',\n",
1458
+ " 'type': 'f1',\n",
1459
+ " 'value': 0.14477157391911066}],\n",
1460
+ " 'name': 'terms_of_service',\n",
1461
+ " 'type': 'text-classification'}},\n",
1462
+ " {'task': {'metrics': [{'name': 'f1',\n",
1463
+ " 'type': 'f1',\n",
1464
+ " 'value': 0.08962249895220364}],\n",
1465
+ " 'name': 'tai_safety_research',\n",
1466
+ " 'type': 'text-classification'}},\n",
1467
+ " {'task': {'metrics': [{'name': 'f1',\n",
1468
+ " 'type': 'f1',\n",
1469
+ " 'value': 0.2603661495335281}],\n",
1470
+ " 'name': 'one_stop_english',\n",
1471
+ " 'type': 'text-classification'}}]}"
1472
+ ]
1473
+ },
1474
+ "execution_count": 61,
1475
+ "metadata": {},
1476
+ "output_type": "execute_result"
1477
+ }
1478
+ ],
1479
+ "source": [
1480
+ "data[\"card_data\"]"
1481
+ ]
1482
+ },
1483
+ {
1484
+ "cell_type": "code",
1485
+ "execution_count": 2,
1486
+ "id": "b07a4fa9-176e-4ff3-bc3f-eb2a6fc9efda",
1487
+ "metadata": {},
1488
+ "outputs": [],
1489
+ "source": [
1490
+ "response = requests.get(\"http://huggingface.co/api/datasets\", headers=header)\n",
1491
+ "all_datasets = response.json()"
1492
+ ]
1493
+ },
1494
+ {
1495
+ "cell_type": "code",
1496
+ "execution_count": 3,
1497
+ "id": "63dc07ec-2f28-483f-8163-c97e8a6a4005",
1498
+ "metadata": {},
1499
+ "outputs": [
1500
+ {
1501
+ "data": {
1502
+ "text/plain": [
1503
+ "2510"
1504
+ ]
1505
+ },
1506
+ "execution_count": 3,
1507
+ "metadata": {},
1508
+ "output_type": "execute_result"
1509
+ }
1510
+ ],
1511
+ "source": [
1512
+ "len(all_datasets)"
1513
+ ]
1514
+ },
1515
+ {
1516
+ "cell_type": "code",
1517
+ "execution_count": 21,
1518
+ "id": "296f68c1-608d-4ea6-8d0e-cc35fb7d74c4",
1519
+ "metadata": {},
1520
+ "outputs": [
1521
+ {
1522
+ "data": {
1523
+ "text/plain": [
1524
+ "{'id': 'disfl_qa',\n",
1525
+ " 'tags': ['annotations_creators:expert-generated',\n",
1526
+ " 'language_creators:found',\n",
1527
+ " 'languages:en',\n",
1528
+ " 'licenses:cc-by-4.0',\n",
1529
+ " 'multilinguality:monolingual',\n",
1530
+ " 'pretty_name:DISFL-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering',\n",
1531
+ " 'size_categories:10K<n<100K',\n",
1532
+ " 'source_datasets:original',\n",
1533
+ " 'task_categories:question-answering',\n",
1534
+ " 'task_ids:extractive-qa',\n",
1535
+ " 'task_ids:open-domain-qa'],\n",
1536
+ " 'citation': '@inproceedings{gupta-etal-2021-disflqa,\\n title = \"{Disfl-QA: A Benchmark Dataset for Understanding Disfluencies in Question Answering}\",\\n author = \"Gupta, Aditya and Xu, Jiacheng and Upadhyay, Shyam and Yang, Diyi and Faruqui, Manaal\",\\n booktitle = \"Findings of ACL\",\\n year = \"2021\"\\n}',\n",
1537
+ " 'description': 'Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\\na source of distractors.\\n\\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\\ntesting robustness of models against disfluent inputs.\\n\\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\\nDisfl-QA. Detailed experiments and analyses can be found in our paper.',\n",
1538
+ " 'key': ''}"
1539
+ ]
1540
+ },
1541
+ "execution_count": 21,
1542
+ "metadata": {},
1543
+ "output_type": "execute_result"
1544
+ }
1545
+ ],
1546
+ "source": [
1547
+ "all_datasets[154]"
1548
+ ]
1549
+ },
1550
+ {
1551
+ "cell_type": "code",
1552
+ "execution_count": 22,
1553
+ "id": "8c73c912-c903-48f9-9ccf-fdb70d0bd556",
1554
+ "metadata": {},
1555
+ "outputs": [],
1556
+ "source": [
1557
+ "def extract_tags(dataset):\n",
1558
+ " tags = {}\n",
1559
+ " for tag in dataset[\"tags\"]:\n",
1560
+ " k,v = tuple(tag.split(\":\", 1))\n",
1561
+ " tags[k] = v\n",
1562
+ " return tags"
1563
+ ]
1564
+ },
1565
+ {
1566
+ "cell_type": "code",
1567
+ "execution_count": 24,
1568
+ "id": "d4aa1b62-1501-4f3d-8613-e2dfb5fef79d",
1569
+ "metadata": {},
1570
+ "outputs": [],
1571
+ "source": [
1572
+ "tags = extract_tags(all_datasets[0])"
1573
+ ]
1574
+ },
1575
+ {
1576
+ "cell_type": "code",
1577
+ "execution_count": 27,
1578
+ "id": "b8c25fe5-d0d5-4ca9-afc6-8d5cf68f20fd",
1579
+ "metadata": {},
1580
+ "outputs": [
1581
+ {
1582
+ "data": {
1583
+ "text/plain": [
1584
+ "False"
1585
+ ]
1586
+ },
1587
+ "execution_count": 27,
1588
+ "metadata": {},
1589
+ "output_type": "execute_result"
1590
+ }
1591
+ ],
1592
+ "source": [
1593
+ "tags.get(\"benchmark\") == \"raft\""
1594
+ ]
1595
+ },
1596
+ {
1597
+ "cell_type": "code",
1598
+ "execution_count": 23,
1599
+ "id": "441f0b74-68a4-4b82-862d-2fcc69331cc0",
1600
+ "metadata": {},
1601
+ "outputs": [],
1602
+ "source": [
1603
+ "for idx, dset in enumerate(all_datasets):\n",
1604
+ " try:\n",
1605
+ " extract_tags(dset)\n",
1606
+ " except:\n",
1607
+ " print(dset[\"id\"], idx)"
1608
+ ]
1609
+ },
1610
+ {
1611
+ "cell_type": "code",
1612
+ "execution_count": 5,
1613
+ "id": "b43f6131-6509-455f-ac02-1efabd9cdd1c",
1614
+ "metadata": {},
1615
+ "outputs": [
1616
+ {
1617
+ "data": {
1618
+ "text/plain": [
1619
+ "{'annotations_creators': 'expert-generated',\n",
1620
+ " 'language_creators': 'found',\n",
1621
+ " 'languages': 'en',\n",
1622
+ " 'licenses': 'mit',\n",
1623
+ " 'multilinguality': 'monolingual',\n",
1624
+ " 'size_categories': '10K<n<100K',\n",
1625
+ " 'source_datasets': 'original',\n",
1626
+ " 'task_categories': 'structure-prediction',\n",
1627
+ " 'task_ids': 'structure-prediction-other-acronym-identification'}"
1628
+ ]
1629
+ },
1630
+ "execution_count": 5,
1631
+ "metadata": {},
1632
+ "output_type": "execute_result"
1633
+ }
1634
+ ],
1635
+ "source": [
1636
+ "{i[0]:i[1] for t.split(\":\") in all_datasets[0][\"tags\"]}"
1637
+ ]
1638
+ },
1639
+ {
1640
+ "cell_type": "code",
1641
+ "execution_count": 11,
1642
+ "id": "63420516-9870-4ecf-80d8-d922994e4b17",
1643
+ "metadata": {},
1644
+ "outputs": [
1645
+ {
1646
+ "name": "stdout",
1647
+ "output_type": "stream",
1648
+ "text": [
1649
+ "('a',)\n",
1650
+ "('b',)\n"
1651
+ ]
1652
+ }
1653
+ ],
1654
+ "source": [
1655
+ "for i in zip(\"a:b\".split(\":\")):\n",
1656
+ " print(i)"
1657
+ ]
1658
+ },
1659
+ {
1660
+ "cell_type": "code",
1661
+ "execution_count": 15,
1662
+ "id": "dc43998b-c93b-48c0-bee4-e80845950246",
1663
+ "metadata": {},
1664
+ "outputs": [
1665
+ {
1666
+ "ename": "ValueError",
1667
+ "evalue": "not enough values to unpack (expected 2, got 1)",
1668
+ "output_type": "error",
1669
+ "traceback": [
1670
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1671
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
1672
+ "\u001b[0;32m/var/folders/28/k4cy5q7s2hs92xq7_h89_vgm0000gn/T/ipykernel_19497/2621214275.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"a\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
1673
+ "\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
1674
+ ]
1675
+ }
1676
+ ],
1677
+ "source": [
1678
+ "a, b = zip(*[\"a\", \"b\"])"
1679
+ ]
1680
+ },
1681
+ {
1682
+ "cell_type": "code",
1683
+ "execution_count": 12,
1684
+ "id": "4990ce09-a53f-47dd-b662-3f498352b641",
1685
+ "metadata": {},
1686
+ "outputs": [
1687
+ {
1688
+ "name": "stdout",
1689
+ "output_type": "stream",
1690
+ "text": [
1691
+ "annotations_creators expert-generated\n",
1692
+ "language_creators found\n",
1693
+ "languages en\n",
1694
+ "licenses mit\n",
1695
+ "multilinguality monolingual\n",
1696
+ "size_categories 10K<n<100K\n",
1697
+ "source_datasets original\n",
1698
+ "task_categories structure-prediction\n",
1699
+ "task_ids structure-prediction-other-acronym-identification\n"
1700
+ ]
1701
+ }
1702
+ ],
1703
+ "source": [
1704
+ "for tag in all_datasets[0][\"tags\"]:\n",
1705
+ " k,v = tuple(tag.split(\":\"))\n",
1706
+ " print(k,v)"
1707
+ ]
1708
+ },
1709
+ {
1710
+ "cell_type": "code",
1711
+ "execution_count": 138,
1712
+ "id": "cf6b59da-02ff-4522-892d-8fe0aa254d01",
1713
+ "metadata": {},
1714
+ "outputs": [
1715
+ {
1716
+ "name": "stdout",
1717
+ "output_type": "stream",
1718
+ "text": [
1719
+ "0 <s>\n",
1720
+ "1922 ¡\n",
1721
+ "11884 hola\n",
1722
+ "16 ,\n",
1723
+ "378 me\n",
1724
+ "13496 llamo\n",
1725
+ "466 le\n",
1726
+ "91 w\n",
1727
+ "350 is\n",
1728
+ "5 !\n",
1729
+ "2 </s>\n"
1730
+ ]
1731
+ }
1732
+ ],
1733
+ "source": [
1734
+ "from transformers import AutoTokenizer\n",
1735
+ "\n",
1736
+ "model_ckpt = \"bertin-project/bertin-roberta-base-spanish\"\n",
1737
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=False)\n",
1738
+ "input_ids = tokenizer(\"¡hola, me llamo lewis!\").input_ids\n",
1739
+ "for token in input_ids:\n",
1740
+ " print(token, tokenizer.decode(token))"
1741
+ ]
1742
+ },
1743
+ {
1744
+ "cell_type": "code",
1745
+ "execution_count": null,
1746
+ "id": "430400f2-2c04-48d7-bf8e-63528441d410",
1747
+ "metadata": {},
1748
+ "outputs": [],
1749
+ "source": [
1750
+ "# 1922 ¡\n",
1751
+ "# 11884 hola\n",
1752
+ "# 16 ,\n",
1753
+ "# 378 me\n",
1754
+ "# 13496 llamo\n",
1755
+ "# 466 le\n",
1756
+ "# 91 w\n",
1757
+ "# 350 is\n",
1758
+ "# 5 !"
1759
+ ]
1760
+ },
1761
+ {
1762
+ "cell_type": "code",
1763
+ "execution_count": 130,
1764
+ "id": "2ecdd872-af9b-4258-8a5e-d867f3785520",
1765
+ "metadata": {},
1766
+ "outputs": [
1767
+ {
1768
+ "data": {
1769
+ "text/plain": [
1770
+ "0"
1771
+ ]
1772
+ },
1773
+ "execution_count": 130,
1774
+ "metadata": {},
1775
+ "output_type": "execute_result"
1776
+ }
1777
+ ],
1778
+ "source": [
1779
+ "tokenizer.vocab[\"<s>\"]"
1780
+ ]
1781
+ },
1782
+ {
1783
+ "cell_type": "code",
1784
+ "execution_count": 131,
1785
+ "id": "16941c33-5e22-485f-9d24-ac8f8542c368",
1786
+ "metadata": {},
1787
+ "outputs": [
1788
+ {
1789
+ "data": {
1790
+ "text/plain": [
1791
+ "'<s>'"
1792
+ ]
1793
+ },
1794
+ "execution_count": 131,
1795
+ "metadata": {},
1796
+ "output_type": "execute_result"
1797
+ }
1798
+ ],
1799
+ "source": [
1800
+ "tokenizer.bos_token"
1801
+ ]
1802
+ },
1803
+ {
1804
+ "cell_type": "code",
1805
+ "execution_count": null,
1806
+ "id": "71929465-5ad5-444d-8c77-22f586b1ba23",
1807
+ "metadata": {},
1808
+ "outputs": [],
1809
+ "source": []
1810
+ }
1811
+ ],
1812
+ "metadata": {
1813
+ "kernelspec": {
1814
+ "display_name": "Python 3 (ipykernel)",
1815
+ "language": "python",
1816
+ "name": "python3"
1817
+ },
1818
+ "language_info": {
1819
+ "codemirror_mode": {
1820
+ "name": "ipython",
1821
+ "version": 3
1822
+ },
1823
+ "file_extension": ".py",
1824
+ "mimetype": "text/x-python",
1825
+ "name": "python",
1826
+ "nbconvert_exporter": "python",
1827
+ "pygments_lexer": "ipython3",
1828
+ "version": "3.8.10"
1829
+ }
1830
+ },
1831
+ "nbformat": 4,
1832
+ "nbformat_minor": 5
1833
+ }
app.py CHANGED
@@ -42,9 +42,8 @@ def download_submissions():
42
 
43
 
44
  def format_submissions(submissions):
45
- submission_data = {**{"Team": []}, **{"Model": []}, **{"Submission Date": []}, **{t: [] for t in TASKS}}
46
 
47
- # TODO(lewtun): delete / filter all the junk repos from development
48
  # The following picks the latest submissions which adhere to the model card schema
49
  for submission in submissions:
50
  submission_id = submission["id"]
@@ -55,10 +54,10 @@ def format_submissions(submissions):
55
  data = response.json()
56
  card_data = data["card_data"]
57
  username = card_data["submission_dataset"].split("/")[0]
58
- submission_data["Team"].append(username)
59
  submission_id = card_data["submission_id"]
60
  submission_name, sha, timestamp = submission_id.split("__")
61
- submission_data["Model"].append(submission_name)
62
  timestamp = pd.to_datetime(int(timestamp))
63
  submission_data["Submission Date"].append(datetime.date(timestamp).strftime("%b %d, %Y"))
64
 
 
42
 
43
 
44
  def format_submissions(submissions):
45
+ submission_data = {**{"Submitter": []}, **{"Submission Name": []}, **{"Submission Date": []}, **{t: [] for t in TASKS}}
46
 
 
47
  # The following picks the latest submissions which adhere to the model card schema
48
  for submission in submissions:
49
  submission_id = submission["id"]
 
54
  data = response.json()
55
  card_data = data["card_data"]
56
  username = card_data["submission_dataset"].split("/")[0]
57
+ submission_data["Submitter"].append(username)
58
  submission_id = card_data["submission_id"]
59
  submission_name, sha, timestamp = submission_id.split("__")
60
+ submission_data["Submission Name"].append(submission_name)
61
  timestamp = pd.to_datetime(int(timestamp))
62
  submission_data["Submission Date"].append(datetime.date(timestamp).strftime("%b %d, %Y"))
63