davidpomerenke committed
Commit 4e8cb1a (verified)
Parent(s): 80d21cb

Upload from GitHub Actions: updated frontend and backend to fix bugs

datasets.json CHANGED
@@ -219,7 +219,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": true,
+    "implemented": false,
     "group": "Multitask Language Understanding"
   },
   {
@@ -256,7 +256,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "MMLU",
-    "implemented": true,
+    "implemented": false,
     "group": "Multitask Language Understanding"
   },
   {
@@ -360,7 +360,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": true,
+    "implemented": false,
     "group": "ARC Question Answering"
   },
   {
@@ -375,7 +375,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "AI2 ARC",
-    "implemented": true,
+    "implemented": false,
     "group": "ARC Question Answering"
   },
   {
@@ -420,7 +420,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": true,
+    "implemented": false,
     "group": "Truthfulness"
   },
   {
@@ -435,7 +435,7 @@
     "parallel": true,
     "translation": "machine",
     "base": "TruthfulQA",
-    "implemented": true,
+    "implemented": false,
     "group": "Truthfulness"
   },
   {
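For context, a minimal sketch (not part of this commit) of how a consumer might honor the `implemented` flag that this change flips to `false`. Only the key names come from `datasets.json`; the file path, top-level list structure, and iteration pattern are assumptions for illustration:

```python
import json

# Hypothetical consumer: list only datasets still marked as implemented.
with open("datasets.json") as f:
    datasets = json.load(f)

for entry in datasets:
    if entry.get("implemented", False):
        print(entry["base"], "-", entry["group"])
```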
evals/backend.py CHANGED
@@ -4,7 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-from countries import make_country_table
+from .countries import make_country_table
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
@@ -45,16 +45,25 @@ def compute_normalized_average(df, metrics):
     return normalized_df.mean(axis=1, skipna=False)
 
 
-def make_model_table(df, models):
+def make_model_table(scores_df, models):
     # Create a combined task_metric for origin
-    df["task_metric_origin"] = df["task"] + "_" + df["metric"] + "_" + df["origin"]
-
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+
     # Pivot to get scores for each origin-specific metric
-    scores_pivot = df.pivot_table(index="model", columns="task_metric_origin", values="score", aggfunc="mean")
-
+    scores_pivot = scores_df.pivot_table(
+        index="model",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
+    )
+
     # Create the regular task_metric for the main average calculation
-    df["task_metric"] = df["task"] + "_" + df["metric"]
-    main_pivot = df.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="model", columns="task_metric", values="score", aggfunc="mean"
+    )
 
     # Merge the two pivots
     df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
@@ -64,6 +73,29 @@ def make_model_table(df, models):
         df[metric] = np.nan
 
     df["average"] = compute_normalized_average(df, task_metrics)
+
+    # Compute origin presence per model+metric
+    origin_presence = (
+        scores_df.groupby(["model", "task_metric", "origin"]).size().unstack(fill_value=0)
+    )
+    # Add boolean flags: show asterisk only if exclusively machine-origin contributed
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for model in df.index:
+                try:
+                    counts = origin_presence.loc[(model, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
@@ -82,16 +114,25 @@ def make_model_table(df, models):
     return df
 
 
-def make_language_table(df, languages):
+def make_language_table(scores_df, languages):
     # Create a combined task_metric for origin
-    df["task_metric_origin"] = df["task"] + "_" + df["metric"] + "_" + df["origin"]
-
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+
     # Pivot to get scores for each origin-specific metric
-    scores_pivot = df.pivot_table(index="bcp_47", columns="task_metric_origin", values="score", aggfunc="mean")
-
+    scores_pivot = scores_df.pivot_table(
+        index="bcp_47",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
+    )
+
     # Create the regular task_metric for the main average calculation
-    df["task_metric"] = df["task"] + "_" + df["metric"]
-    main_pivot = df.pivot_table(index="bcp_47", columns="task_metric", values="score", aggfunc="mean")
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+    )
 
     # Merge the two pivots
     df = pd.merge(main_pivot, scores_pivot, on="bcp_47", how="outer")
@@ -101,6 +142,36 @@ def make_language_table(df, languages):
         df[metric] = np.nan
 
     df["average"] = compute_normalized_average(df, task_metrics)
+
+    # Compute origin presence per language+metric; show asterisk only if exclusively machine-origin
+    origin_presence = (
+        scores_df.groupby(["bcp_47", "task_metric", "origin"]).size().unstack(fill_value=0)
+    )
+    for metric in task_metrics:
+        human_col_name = "human" if "human" in origin_presence.columns else None
+        machine_col_name = "machine" if "machine" in origin_presence.columns else None
+        if human_col_name or machine_col_name:
+            flags = []
+            for bcp in df.index:
+                try:
+                    counts = origin_presence.loc[(bcp, metric)]
+                except KeyError:
+                    flags.append(False)
+                    continue
+                human_count = counts.get(human_col_name, 0) if human_col_name else 0
+                machine_count = counts.get(machine_col_name, 0) if machine_col_name else 0
+                flags.append(machine_count > 0 and human_count == 0)
+            df[f"{metric}_is_machine"] = flags
+        else:
+            df[f"{metric}_is_machine"] = False
+
+    # Per-row machine-origin flags for each metric (true if any machine-origin score exists for the language)
+    for metric in task_metrics:
+        machine_col = f"{metric}_machine"
+        if machine_col in df.columns:
+            df[f"{metric}_is_machine"] = df[machine_col].notna()
+        else:
+            df[f"{metric}_is_machine"] = False
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
evals/datasets_/mmlu.py CHANGED
@@ -165,49 +165,55 @@ async def load_mmlu(language_bcp_47, nr):
         return "CohereForAI/Global-MMLU", task, "human"
     elif language_bcp_47 in tags_mmlu_autotranslated:
         ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "fair-forward/mmlu-autotranslated", task, "machine"
+        filtered = ds["test"].filter(lambda x: x["subject"] == category)
+        if nr < len(filtered):
+            task = filtered[nr]
+            return "fair-forward/mmlu-autotranslated", task, "machine"
+        # Requested index exceeds stored sample count → fall back to on-the-fly
+        return await load_mmlu_translated(language_bcp_47, nr)
     else:
-        # Try on-the-fly translation for missing languages
+        # Fall back to on-the-fly translation for missing languages
         return await load_mmlu_translated(language_bcp_47, nr)
 
 
 async def load_mmlu_translated(language_bcp_47, nr):
     """
-    Load MMLU data with on-the-fly Google translation for languages
-    without native MMLU translations.
+    Load MMLU data with on-the-fly Google translation for languages
+    without native or stored auto-translated MMLU, or when more samples are requested.
     """
-    # Check if Google Translate supports this language
     supported_languages = get_google_supported_languages()
     if language_bcp_47 not in supported_languages:
         return None, None, None
 
     print(f"🔄 Translating MMLU data to {language_bcp_47} on-the-fly...")
 
     try:
-        # Load English MMLU data
+        # Load English MMLU base (AfriMMLU English split for category alignment)
         category = categories[nr % len(categories)]
         ds = _load_dataset("masakhane/afrimmlu", "eng")
         ds = ds.map(parse_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-
+        filtered = ds["test"].filter(lambda x: x["subject"] == category)
+        if len(filtered) == 0:
+            return None, None, None
+        task = filtered[nr % len(filtered)]
+
         # Translate question and choices
         question_translated = await translate_google(task["question"], "en", language_bcp_47)
         choices_translated = []
         for choice in task["choices"]:
             choice_translated = await translate_google(choice, "en", language_bcp_47)
             choices_translated.append(choice_translated)
 
         # Create translated task
         translated_task = {
             "question": question_translated,
             "choices": choices_translated,
             "answer": task["answer"],  # Keep original answer index
-            "subject": task["subject"]
+            "subject": task["subject"],
         }
 
         return f"mmlu-translated-{language_bcp_47}", translated_task, "machine"
 
     except Exception as e:
         print(f"❌ Translation failed for {language_bcp_47}: {e}")
         return None, None, None
@@ -217,7 +223,7 @@ def translate_mmlu(languages):
     human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:100]
+        for lang in languages["bcp_47"].values[:150]
        if lang not in human_translated and lang in get_google_supported_languages()
     ]
     n_samples = 10
evals/datasets_/truthfulqa.py CHANGED
@@ -35,7 +35,7 @@ async def load_truthfulqa(language_bcp_47, nr):
         task = ds["test"][nr]
         return "masakhane/uhura-truthfulqa", task, "human"
     else:
-        # Fallback to on-the-fly translation
+        # Fall back to on-the-fly translation for missing languages/samples
         return await load_truthfulqa_translated(language_bcp_47, nr)
 
 async def load_truthfulqa_translated(language_bcp_47, nr):
@@ -79,10 +79,10 @@ def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
         lang
-        for lang in languages["bcp_47"].values[:100]
+        for lang in languages["bcp_47"].values[:150]
         if lang not in human_translated and lang in get_google_supported_languages()
     ]
-    n_samples = 10
+    n_samples = 20
 
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
@@ -132,7 +132,7 @@ def translate_truthfulqa(languages):
         token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
     )
     ds_lang.to_json(
-        f"data/translations/mmlu/{lang}_{split}.json",
+        f"data/translations/truthfulqa/{lang}_{split}.json",
         lines=False,
         force_ascii=False,
         indent=2,
frontend/src/components/LanguageTable.js CHANGED
@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
         filterElement={familyRowFilterTemplate}
         style={{ minWidth: '10rem' }}
       />
-      {ScoreColumns}
+      {ScoreColumns()}
     </DataTable>
   )
 }
frontend/src/components/ScoreColumns.js CHANGED
@@ -6,7 +6,13 @@ const scoreBodyTemplate = (field, options = {}) => {
 
   return rowData => {
     const score = rowData[field]
-    const isMachineTranslated = machineTranslatedMetrics.includes(field)
+    // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
+    // otherwise fall back to global list
+    const rowFlagKey = `${field}_is_machine`
+    const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
+    const isMachineTranslated = hasRowFlag
+      ? !!rowData[rowFlagKey]
+      : machineTranslatedMetrics.includes(field)
     return ScoreField(score, minScore, maxScore, isMachineTranslated)
   }
 }
system_architecture_diagram.md CHANGED
@@ -36,9 +36,9 @@ flowchart TD
     %% On-the-fly Translation with Origin Tagging
     subgraph OTF [On-the-fly Dataset Translation]
         direction LR
-        DS_raw["Raw English Dataset<br/>(e.g., MMLU)"] --> Google_Translate["Google Translate API"]
-        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., German MMLU)<br/>Origin: 'machine'"]
-        DS_native["Native Dataset<br/>(e.g., German MMLU)<br/>Origin: 'human'"]
+        DS_raw["Raw English Dataset"] --> Google_Translate["Google Translate API"]
+        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., MGSM/ARC)<br/>Origin: 'machine'"]
+        DS_native["Native Dataset<br/>(e.g., AfriMMLU/Global-MMLU)<br/>Origin: 'human'"]
     end
 
     %% Evaluation Pipeline
@@ -51,9 +51,9 @@ flowchart TD
     %% Task Execution with Origin Tracking
     P --> Q1[translate_and_evaluate<br/>Origin: 'human']
     P --> Q2[classify_and_evaluate<br/>Origin: 'human']
-    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human'/'machine']
+    P --> Q3[mmlu_and_evaluate<br/>Origin: 'human' (no on-the-fly for missing; uses auto-translated dataset if available)]
     P --> Q4[arc_and_evaluate<br/>Origin: 'human'/'machine']
-    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human'/'machine']
+    P --> Q5[truthfulqa_and_evaluate<br/>Origin: 'human' (no on-the-fly for missing; relies on available datasets)]
     P --> Q6[mgsm_and_evaluate<br/>Origin: 'human'/'machine']
 
     %% API Calls with Error Handling
@@ -85,7 +85,7 @@ flowchart TD
     %% Data Sources with Origin Information
     subgraph DS ["Data Sources"]
         DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
-        DS2["MMLU/AfriMMLU<br/>Knowledge QA<br/>Origin: 'human'"]
+        DS2["MMLU/AfriMMLU/Global-MMLU<br/>Knowledge QA<br/>Origin: 'human' or 'machine' (HF auto-translated only)"]
         DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
         DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
         DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
@@ -97,7 +97,7 @@ flowchart TD
     DS4 --> Q5
     DS5 --> Q6
 
-    DS_translated --> Q3
+    %% No on-the-fly DS_translated for MMLU anymore; only HF auto-translated used
     DS_translated --> Q4
     DS_translated --> Q5