Upload from GitHub Actions: Use task subset for average score
Browse files
evals/backend.py
CHANGED
@@ -28,6 +28,8 @@ task_metrics = [
|
|
28 |
"mgsm_accuracy",
|
29 |
]
|
30 |
|
|
|
|
|
31 |
|
32 |
def compute_normalized_average(df, metrics):
|
33 |
"""Compute average of min-max normalized metric columns."""
|
@@ -52,7 +54,7 @@ def make_model_table(df, models):
|
|
52 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
53 |
df = df.drop(columns=["task", "metric"])
|
54 |
df = df.pivot(index="model", columns="task_metric", values="score")
|
55 |
-
df["average"] = compute_normalized_average(df, task_metrics)
|
56 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
57 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
58 |
df["rank"] = df.index + 1
|
@@ -84,7 +86,7 @@ def make_language_table(df, languages):
|
|
84 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
85 |
df = df.drop(columns=["task", "metric"])
|
86 |
df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
|
87 |
-
df["average"] = compute_normalized_average(df, task_metrics)
|
88 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
89 |
df = df.sort_values(by="speakers", ascending=False)
|
90 |
df = df[
|
|
|
28 |
"mgsm_accuracy",
|
29 |
]
|
30 |
|
31 |
+
task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
|
32 |
+
|
33 |
|
34 |
def compute_normalized_average(df, metrics):
|
35 |
"""Compute average of min-max normalized metric columns."""
|
|
|
54 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
55 |
df = df.drop(columns=["task", "metric"])
|
56 |
df = df.pivot(index="model", columns="task_metric", values="score")
|
57 |
+
df["average"] = compute_normalized_average(df, task_metrics_basic)
|
58 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
59 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
60 |
df["rank"] = df.index + 1
|
|
|
86 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
87 |
df = df.drop(columns=["task", "metric"])
|
88 |
df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
|
89 |
+
df["average"] = compute_normalized_average(df, task_metrics_basic)
|
90 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
91 |
df = df.sort_values(by="speakers", ascending=False)
|
92 |
df = df[
|
frontend/src/components/LanguagePlot.js
CHANGED
@@ -3,38 +3,38 @@ import * as Plot from '@observablehq/plot'
|
|
3 |
|
4 |
const LanguagePlot = ({ data, width = 750, height = 500 }) => {
|
5 |
const containerRef = useRef()
|
6 |
-
const languages = data.language_table.filter(a => a.
|
7 |
const families = [...new Set(languages.map(a => a.family))]
|
8 |
|
9 |
useEffect(() => {
|
10 |
const plot = Plot.plot({
|
11 |
width: width,
|
12 |
height: height,
|
13 |
-
subtitle: '
|
14 |
x: {
|
15 |
label: 'Number of Speakers',
|
16 |
type: 'log'
|
17 |
},
|
18 |
y: {
|
19 |
-
label: '
|
20 |
},
|
21 |
marks: [
|
22 |
Plot.dot(languages, {
|
23 |
x: 'speakers',
|
24 |
-
y: d => d.
|
25 |
r: 'speakers',
|
26 |
fill: 'family',
|
27 |
title: d =>
|
28 |
`${d.language_name}\n${d.speakers.toLocaleString('en-US', {
|
29 |
notation: 'compact'
|
30 |
-
})} speakers\nScore: ${d.
|
31 |
tip: true
|
32 |
}),
|
33 |
Plot.text(
|
34 |
languages.filter(a => a.speakers > 1e8),
|
35 |
{
|
36 |
x: 'speakers',
|
37 |
-
y: d => d.
|
38 |
text: d => d.language_name,
|
39 |
fill: 'black',
|
40 |
frameAnchor: 'left',
|
|
|
3 |
|
4 |
const LanguagePlot = ({ data, width = 750, height = 500 }) => {
|
5 |
const containerRef = useRef()
|
6 |
+
const languages = data.language_table.filter(a => a.average > 0)
|
7 |
const families = [...new Set(languages.map(a => a.family))]
|
8 |
|
9 |
useEffect(() => {
|
10 |
const plot = Plot.plot({
|
11 |
width: width,
|
12 |
height: height,
|
13 |
+
subtitle: 'Proficiency scores by language',
|
14 |
x: {
|
15 |
label: 'Number of Speakers',
|
16 |
type: 'log'
|
17 |
},
|
18 |
y: {
|
19 |
+
label: 'Language proficiency score'
|
20 |
},
|
21 |
marks: [
|
22 |
Plot.dot(languages, {
|
23 |
x: 'speakers',
|
24 |
+
y: d => d.average,
|
25 |
r: 'speakers',
|
26 |
fill: 'family',
|
27 |
title: d =>
|
28 |
`${d.language_name}\n${d.speakers.toLocaleString('en-US', {
|
29 |
notation: 'compact'
|
30 |
+
})} speakers\nScore: ${d.average.toFixed(2)}`,
|
31 |
tip: true
|
32 |
}),
|
33 |
Plot.text(
|
34 |
languages.filter(a => a.speakers > 1e8),
|
35 |
{
|
36 |
x: 'speakers',
|
37 |
+
y: d => d.average,
|
38 |
text: d => d.language_name,
|
39 |
fill: 'black',
|
40 |
frameAnchor: 'left',
|
frontend/src/components/ScoreColumns.js
CHANGED
@@ -13,8 +13,8 @@ const scoreBodyTemplate = (field, options = {}) => {
|
|
13 |
const ScoreColumns = [
|
14 |
<Column
|
15 |
field='average'
|
16 |
-
header='
|
17 |
-
headerTooltip='Language Proficiency Score (average
|
18 |
sortable
|
19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
13 |
const ScoreColumns = [
|
14 |
<Column
|
15 |
field='average'
|
16 |
+
header='Proficiency'
|
17 |
+
headerTooltip='Language Proficiency Score (average translation and classification scores, after min-max normalization)'
|
18 |
sortable
|
19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|