davidpomerenke committed on
Commit
b1e5b40
·
verified ·
1 Parent(s): 941d5c5

Upload from GitHub Actions: Use task subset for average score

Browse files
evals/backend.py CHANGED
@@ -28,6 +28,8 @@ task_metrics = [
28
  "mgsm_accuracy",
29
  ]
30
 
 
 
31
 
32
  def compute_normalized_average(df, metrics):
33
  """Compute average of min-max normalized metric columns."""
@@ -52,7 +54,7 @@ def make_model_table(df, models):
52
  df["task_metric"] = df["task"] + "_" + df["metric"]
53
  df = df.drop(columns=["task", "metric"])
54
  df = df.pivot(index="model", columns="task_metric", values="score")
55
- df["average"] = compute_normalized_average(df, task_metrics)
56
  df = df.sort_values(by="average", ascending=False).reset_index()
57
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
58
  df["rank"] = df.index + 1
@@ -84,7 +86,7 @@ def make_language_table(df, languages):
84
  df["task_metric"] = df["task"] + "_" + df["metric"]
85
  df = df.drop(columns=["task", "metric"])
86
  df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
87
- df["average"] = compute_normalized_average(df, task_metrics)
88
  df = pd.merge(languages, df, on="bcp_47", how="outer")
89
  df = df.sort_values(by="speakers", ascending=False)
90
  df = df[
 
28
  "mgsm_accuracy",
29
  ]
30
 
31
+ task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
32
+
33
 
34
  def compute_normalized_average(df, metrics):
35
  """Compute average of min-max normalized metric columns."""
 
54
  df["task_metric"] = df["task"] + "_" + df["metric"]
55
  df = df.drop(columns=["task", "metric"])
56
  df = df.pivot(index="model", columns="task_metric", values="score")
57
+ df["average"] = compute_normalized_average(df, task_metrics_basic)
58
  df = df.sort_values(by="average", ascending=False).reset_index()
59
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
60
  df["rank"] = df.index + 1
 
86
  df["task_metric"] = df["task"] + "_" + df["metric"]
87
  df = df.drop(columns=["task", "metric"])
88
  df = df.pivot(index="bcp_47", columns="task_metric", values="score").reset_index()
89
+ df["average"] = compute_normalized_average(df, task_metrics_basic)
90
  df = pd.merge(languages, df, on="bcp_47", how="outer")
91
  df = df.sort_values(by="speakers", ascending=False)
92
  df = df[
frontend/src/components/LanguagePlot.js CHANGED
@@ -3,38 +3,38 @@ import * as Plot from '@observablehq/plot'
3
 
4
  const LanguagePlot = ({ data, width = 750, height = 500 }) => {
5
  const containerRef = useRef()
6
- const languages = data.language_table.filter(a => a.translation_from_bleu > 0)
7
  const families = [...new Set(languages.map(a => a.family))]
8
 
9
  useEffect(() => {
10
  const plot = Plot.plot({
11
  width: width,
12
  height: height,
13
- subtitle: 'Translation quality by language',
14
  x: {
15
  label: 'Number of Speakers',
16
  type: 'log'
17
  },
18
  y: {
19
- label: 'Translation quality (spBLEU score for translating from the given language to other languages)'
20
  },
21
  marks: [
22
  Plot.dot(languages, {
23
  x: 'speakers',
24
- y: d => d.translation_from_bleu,
25
  r: 'speakers',
26
  fill: 'family',
27
  title: d =>
28
  `${d.language_name}\n${d.speakers.toLocaleString('en-US', {
29
  notation: 'compact'
30
- })} speakers\nScore: ${d.translation_from_bleu.toFixed(2)}`,
31
  tip: true
32
  }),
33
  Plot.text(
34
  languages.filter(a => a.speakers > 1e8),
35
  {
36
  x: 'speakers',
37
- y: d => d.translation_from_bleu,
38
  text: d => d.language_name,
39
  fill: 'black',
40
  frameAnchor: 'left',
 
3
 
4
  const LanguagePlot = ({ data, width = 750, height = 500 }) => {
5
  const containerRef = useRef()
6
+ const languages = data.language_table.filter(a => a.average > 0)
7
  const families = [...new Set(languages.map(a => a.family))]
8
 
9
  useEffect(() => {
10
  const plot = Plot.plot({
11
  width: width,
12
  height: height,
13
+ subtitle: 'Proficiency scores by language',
14
  x: {
15
  label: 'Number of Speakers',
16
  type: 'log'
17
  },
18
  y: {
19
+ label: 'Language proficiency score'
20
  },
21
  marks: [
22
  Plot.dot(languages, {
23
  x: 'speakers',
24
+ y: d => d.average,
25
  r: 'speakers',
26
  fill: 'family',
27
  title: d =>
28
  `${d.language_name}\n${d.speakers.toLocaleString('en-US', {
29
  notation: 'compact'
30
+ })} speakers\nScore: ${d.average.toFixed(2)}`,
31
  tip: true
32
  }),
33
  Plot.text(
34
  languages.filter(a => a.speakers > 1e8),
35
  {
36
  x: 'speakers',
37
+ y: d => d.average,
38
  text: d => d.language_name,
39
  fill: 'black',
40
  frameAnchor: 'left',
frontend/src/components/ScoreColumns.js CHANGED
@@ -13,8 +13,8 @@ const scoreBodyTemplate = (field, options = {}) => {
13
  const ScoreColumns = [
14
  <Column
15
  field='average'
16
- header='Overall'
17
- headerTooltip='Language Proficiency Score (average of all displayed scores, after min-max normalization)'
18
  sortable
19
  body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
20
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
 
13
  const ScoreColumns = [
14
  <Column
15
  field='average'
16
+ header='Proficiency'
17
+ headerTooltip='Language Proficiency Score (average translation and classification scores, after min-max normalization)'
18
  sortable
19
  body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
20
  style={{ minWidth: '5rem', maxWidth: '10rem' }}