David Pomerenke committed on
Commit
ada20e2
·
2 Parent(s): 790c5f2 c527cda

Merge branch 'feature-observable'

Browse files
.gitignore CHANGED
@@ -4,6 +4,12 @@ ScriptCodes.csv
4
  .cache
5
  .env
6
 
 
 
 
 
 
 
7
  # Python-generated files
8
  __pycache__/
9
  *.py[oc]
 
4
  .cache
5
  .env
6
 
7
+ # Observable
8
+ .DS_Store
9
+ /dist/
10
+ node_modules/
11
+ yarn-error.log
12
+
13
  # Python-generated files
14
  __pycache__/
15
  *.py[oc]
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: AI Language Monitor
3
- emoji: 👀
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: static
 
1
  ---
2
  title: AI Language Monitor
3
+ emoji: 🌍
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: static
index.html DELETED
@@ -1,254 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
-
4
- <head>
5
- <title>AI Language Monitor</title>
6
- <script src="https://cdn.tailwindcss.com"></script>
7
- <style>
8
- body {
9
- margin: 0 auto;
10
- padding: 20px;
11
- font-family: sans-serif;
12
- }
13
-
14
- .language-header {
15
- margin-bottom: 10px;
16
- }
17
-
18
- .speaker-count {
19
- font-size: 0.8em;
20
- color: #666;
21
- font-weight: normal;
22
- margin: 0;
23
- }
24
- </style>
25
- <link rel="icon"
26
- href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>🌍</text></svg>">
27
- </head>
28
-
29
- <body>
30
- <nav>
31
- <div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
32
- <!-- Mobile menu button -->
33
- <div class="sm:hidden absolute left-4 top-4">
34
- <button onclick="toggleMobileMenu()" class="text-gray-500 hover:text-gray-700 focus:outline-none">
35
- <svg class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor">
36
- <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 6h16M4 12h16M4 18h16" />
37
- </svg>
38
- </button>
39
- </div>
40
-
41
- <!-- Mobile menu (hidden by default) -->
42
- <div id="mobileMenu" class="hidden sm:hidden absolute left-0 top-16 bg-white shadow-lg py-4 mx-4 rounded-lg border border-gray-200">
43
- <div class="flex flex-col">
44
- <h3 class="px-6 py-2 text-gray-400 text-sm font-medium">Navigation</h3>
45
- <a href="#" onclick="showSection('coverage'); toggleMobileMenu()" class="nav-link px-6 py-3 text-gray-600 hover:bg-gray-50">
46
- Language Coverage
47
- </a>
48
- <a href="#" onclick="showSection('comparison'); toggleMobileMenu()" class="nav-link px-6 py-3 text-gray-600 hover:bg-gray-50">
49
- LLM Comparison
50
- </a>
51
- <a href="#" onclick="showSection('results'); toggleMobileMenu()" class="nav-link px-6 py-3 text-gray-600 hover:bg-gray-50">
52
- Results by Language
53
- </a>
54
- </div>
55
- </div>
56
-
57
- <!-- Desktop menu -->
58
- <div class="hidden sm:flex justify-center h-16 border-b border-gray-200">
59
- <div class="flex">
60
- <div class="flex space-x-8">
61
- <a href="#" onclick="showSection('coverage')" class="nav-link active inline-flex items-center px-1 pt-1 border-b-2 border-indigo-500 text-sm font-medium text-gray-900">
62
- Language Coverage
63
- </a>
64
- <a href="#" onclick="showSection('comparison')" class="nav-link inline-flex items-center px-1 pt-1 border-b-2 border-transparent text-sm font-medium text-gray-500 hover:border-gray-300 hover:text-gray-700">
65
- LLM Comparison
66
- </a>
67
- <a href="#" onclick="showSection('results')" class="nav-link inline-flex items-center px-1 pt-1 border-b-2 border-transparent text-sm font-medium text-gray-500 hover:border-gray-300 hover:text-gray-700">
68
- Results by Language
69
- </a>
70
- </div>
71
- </div>
72
- </div>
73
- </div>
74
- </nav>
75
-
76
- <div class="p-6">
77
- <section id="coverage" class="section">
78
- <div id="summary-chart"></div>
79
- </section>
80
-
81
- <section id="comparison" class="section hidden">
82
- <p class="text-gray-600">Coming soon...</p>
83
- <!--
84
- - Leaderboard
85
- - Filters
86
- - commercial vs open source
87
- - Eval results per task (across all languages)
88
- - Timeline
89
- -->
90
- </section>
91
-
92
- <section id="results" class="section hidden">
93
- <div id="language-list"></div>
94
- <!--
95
- - Filters
96
- - free-text search
97
- - by continent, by language family
98
- - sort by: population ><, performance ><, datasets ><
99
- - Language list with details
100
- - Eval results for each task and model
101
- - Available datasets
102
- - Form field to submit more datasets and custom models
103
- -->
104
- </section>
105
- </div>
106
-
107
- <script type="module">
108
- // Import Plot using ESM
109
- import * as Plot from "https://cdn.jsdelivr.net/npm/@observablehq/[email protected]/+esm";
110
-
111
- function showSection(sectionId) {
112
- // Update nav links
113
- document.querySelectorAll('.nav-link').forEach(link => {
114
- link.classList.remove('border-indigo-500', 'text-gray-900');
115
- link.classList.add('border-transparent', 'text-gray-500');
116
- });
117
- const activeLink = document.querySelector(`[onclick="showSection('${sectionId}')"]`);
118
- activeLink.classList.remove('border-transparent', 'text-gray-500');
119
- activeLink.classList.add('border-indigo-500', 'text-gray-900');
120
-
121
- // Show/hide sections
122
- document.querySelectorAll('.section').forEach(section => {
123
- section.classList.add('hidden');
124
- });
125
- document.getElementById(sectionId).classList.remove('hidden');
126
- }
127
- window.showSection = showSection;
128
-
129
- function toggleMobileMenu() {
130
- const mobileMenu = document.getElementById('mobileMenu');
131
- mobileMenu.classList.toggle('hidden');
132
- }
133
- window.toggleMobileMenu = toggleMobileMenu;
134
-
135
- async function init() {
136
- const scoreKey = "bleu"
137
- const scoreName = "BLEU Score"
138
- const summaryChartDiv = document.getElementById('summary-chart');
139
- const languageListDiv = document.getElementById('language-list');
140
-
141
- const response = await fetch('results.json');
142
- const data = await response.json();
143
- // Format captions
144
- const formatScore = (score) => score > 0 ? score.toFixed(2) : "No benchmark available!"
145
- const formatTitle = d => (d.language_name + "\n" + parseInt(d.speakers / 1_000_00) / 10 + "M speakers\n" + scoreName + ": " + formatScore(d[scoreKey]))
146
-
147
- // Create summary plot
148
- const summaryPlot = Plot.plot({
149
- width: summaryChartDiv.clientWidth,
150
- height: 400,
151
- marginBottom: 100,
152
- x: { label: "Number of speakers", axis: null },
153
- y: { label: `${scoreName} (average across models)` },
154
- // color: { scheme: "BrBG" },
155
- marks: [
156
- Plot.rectY(data, Plot.stackX({
157
- x: "speakers",
158
- order: scoreKey,
159
- reverse: true,
160
- y2: scoreKey, // y2 to avoid stacking by y
161
- title: formatTitle,
162
- tip: true,
163
- fill: d => d[scoreKey] > 0 ? "black" : "pink"
164
- })),
165
- Plot.rectY(data, Plot.pointerX(Plot.stackX({
166
- x: "speakers",
167
- order: scoreKey,
168
- reverse: true,
169
- y2: scoreKey, // y2 to avoid stacking by y
170
- fill: "grey",
171
- }))),
172
- Plot.text(data, Plot.stackX({
173
- x: "speakers",
174
- y2: scoreKey,
175
- order: scoreKey,
176
- reverse: true,
177
- text: "language_name",
178
- frameAnchor: "bottom",
179
- textAnchor: "end",
180
- dy: 10,
181
- rotate: 270,
182
- opacity: (d) => d.speakers > 50_000_000 ? 1 : 0,
183
- }))
184
- ]
185
- });
186
-
187
- // Add summary plot to the coverage section
188
- summaryChartDiv.appendChild(summaryPlot);
189
-
190
- // Get unique languages with their speaker counts
191
- const languageMap = new Map();
192
- data.forEach(r => {
193
- if (!languageMap.has(r.language_name)) {
194
- languageMap.set(r.language_name, r.speakers);
195
- }
196
- });
197
-
198
- // Sort languages by speaker count (descending)
199
- const languages = [...languageMap.entries()]
200
- .sort((a, b) => b[1] - a[1])
201
- .map(([lang]) => lang);
202
-
203
- // Section for each language
204
- languages.forEach(language => {
205
- const headerDiv = document.createElement('div');
206
- headerDiv.className = 'language-header';
207
-
208
- const h2 = document.createElement('h2');
209
- h2.textContent = language;
210
- h2.style.marginBottom = '5px';
211
-
212
- const speakerP = document.createElement('p');
213
- speakerP.className = 'speaker-count';
214
- const speakerCount = (languageMap.get(language) / 1_000_000).toFixed(1);
215
- speakerP.textContent = `${speakerCount}M speakers`;
216
-
217
- headerDiv.appendChild(h2);
218
- headerDiv.appendChild(speakerP);
219
- languageListDiv.appendChild(headerDiv);
220
-
221
- const languageData = data.filter(r => r.language_name === language)[0]["scores"];
222
-
223
- const descriptor = code => {
224
- let [org, model] = code.split("/")
225
- return model.split("-")[0]
226
- }
227
-
228
- // Plot for how well the models perform on this language
229
- if (languageData && languageData.length > 1) {
230
- const plot = Plot.plot({
231
- width: 400,
232
- height: 200,
233
- margin: 30,
234
- y: {
235
- domain: [0, 1],
236
- label: scoreName
237
- },
238
- marks: [
239
- Plot.barY(languageData, {
240
- x: d => descriptor(d.model),
241
- y: scoreKey
242
- })
243
- ]
244
- });
245
- languageListDiv.appendChild(plot);
246
- }
247
- });
248
- }
249
-
250
- init();
251
- </script>
252
- </body>
253
-
254
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
observablehq.config.js ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // See https://observablehq.com/framework/config for documentation.
2
+ export default {
3
+ // The app’s title; used in the sidebar and webpage titles.
4
+ title: "AI Language Monitor",
5
+
6
+ // The pages and sections in the sidebar. If you don’t specify this option,
7
+ // all pages will be listed in alphabetical order. Listing pages explicitly
8
+ // lets you organize them into sections and have unlisted pages.
9
+ pages: [
10
+ { name: "Compare Languages", path: "/compare-languages" },
11
+ { name: "Compare AI Models", path: "/compare-ai-models" },
12
+ { name: "Methodology", path: "/methodology" },
13
+ ],
14
+
15
+ // Content to add to the head of the page, e.g. for a favicon:
16
+ head: '<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>🌍</text></svg>">',
17
+
18
+ // The path to the source root.
19
+ root: "src",
20
+
21
+ // Some additional configuration options and their defaults:
22
+ // theme: "default", // try "light", "dark", "slate", etc.
23
+ // header: "", // what to show in the header (HTML)
24
+ // footer: "Built with Observable.", // what to show in the footer (HTML)
25
+ // sidebar: true, // whether to show the sidebar
26
+ // toc: true, // whether to show the table of contents
27
+ // pager: true, // whether to show previous & next links in the footer
28
+ // output: "dist", // path to the output root for build
29
+ // search: true, // activate search
30
+ // linkify: true, // convert URLs in Markdown to links
31
+ // typographer: false, // smart quotes and other typographic improvements
32
+ // preserveExtension: false, // drop .html from URLs
33
+ // preserveIndex: false, // drop /index from URLs
34
+ };
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "module",
3
+ "private": true,
4
+ "scripts": {
5
+ "clean": "rimraf src/.observablehq/cache",
6
+ "build": "observable build",
7
+ "dev": "observable preview",
8
+ "deploy": "observable deploy",
9
+ "observable": "observable"
10
+ },
11
+ "dependencies": {
12
+ "@observablehq/framework": "^1.13.2",
13
+ "d3-dsv": "^3.0.1",
14
+ "d3-time-format": "^4.1.0"
15
+ },
16
+ "devDependencies": {
17
+ "rimraf": "^5.0.5"
18
+ },
19
+ "engines": {
20
+ "node": ">=18"
21
+ }
22
+ }
results.json DELETED
The diff for this file is too large to render. See raw diff
 
src/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /.observablehq/cache/
src/compare-ai-models.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ theme: dashboard
3
+ title: Compare AI models
4
+ ---
5
+
6
+ # Compare AI models
7
+
8
+ ```js
9
+ const data = FileAttachment("data/languagebench.json").json();
10
+ ```
11
+
12
+ ```js
13
+ const scoreKey = "bleu"
14
+ const scoreName = "BLEU Score"
15
+
16
+ // Get unique languages with their speaker counts
17
+ const languageMap = new Map();
18
+ data.forEach(r => {
19
+ if (!languageMap.has(r.language_name)) {
20
+ languageMap.set(r.language_name, r.speakers);
21
+ }
22
+ });
23
+
24
+ // Sort languages by speaker count (descending)
25
+ const languages = [...languageMap.entries()]
26
+ .sort((a, b) => b[1] - a[1])
27
+ .map(([lang]) => lang);
28
+
29
+ // Section for each language
30
+ languages.forEach(language => {
31
+ display(html`<h2 class="language-header">${language}</h2>`)
32
+
33
+ const speakerCount = (languageMap.get(language) / 1_000_000).toFixed(1);
34
+ display(html`${speakerCount}M speakers`);
35
+
36
+ const languageData = data.filter(r => r.language_name === language)[0]["scores"];
37
+ console.log(languageData)
38
+
39
+ const descriptor = code => {
40
+ let [org, model] = code.split("/")
41
+ return model.split("-")[0]
42
+ }
43
+
44
+ // Plot for how well the models perform on this language
45
+ if (languageData && languageData.length >= 1) {
46
+ console.log("yes")
47
+ const chart = Plot.plot({
48
+ width: 400,
49
+ height: 200,
50
+ margin: 30,
51
+ y: {
52
+ domain: [0, 1],
53
+ label: scoreName
54
+ },
55
+ marks: [
56
+ Plot.barY(languageData, {
57
+ x: d => descriptor(d.model),
58
+ y: scoreKey
59
+ })
60
+ ]
61
+ });
62
+ display(chart)
63
+ }
64
+ });
65
+ ```
src/compare-languages.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ theme: dashboard
3
+ title: Compare languages
4
+ ---
5
+
6
+ # Compare languages
7
+
8
+ ```js
9
+ import { languageChart } from "./components/language-chart.js";
10
+
11
+ const data = FileAttachment("data/languagebench.json").json();
12
+ ```
13
+
14
+ ```js
15
+ const scoreKey = "bleu"
16
+ const scoreName = "BLEU Score"
17
+
18
+ // Create summary plot
19
+ display(languageChart(data, {width: 1000, height: 400, scoreKey: scoreKey, scoreName: scoreName}))
20
+ ```
src/components/language-chart.js ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as Plot from "npm:@observablehq/plot";
2
+
3
+ export function languageChart(
4
+ languageData,
5
+ { width, height, scoreKey, scoreName } = {}
6
+ ) {
7
+ // Format captions
8
+ const formatScore = (score) =>
9
+ score > 0 ? score.toFixed(2) : "No benchmark available!";
10
+ const formatTitle = (d) =>
11
+ d.language_name +
12
+ "\n" +
13
+ parseInt(d.speakers / 1_000_00) / 10 +
14
+ "M speakers\n" +
15
+ scoreName +
16
+ ": " +
17
+ formatScore(d[scoreKey]);
18
+
19
+ return Plot.plot({
20
+ width: width,
21
+ height: height,
22
+ marginBottom: 100,
23
+ x: { label: "Number of speakers", axis: null },
24
+ y: { label: `${scoreName} (average across models)` },
25
+ // color: { scheme: "BrBG" },
26
+ marks: [
27
+ Plot.rectY(
28
+ languageData,
29
+ Plot.stackX({
30
+ x: "speakers",
31
+ order: scoreKey,
32
+ reverse: true,
33
+ y2: scoreKey, // y2 to avoid stacking by y
34
+ title: formatTitle,
35
+ tip: true,
36
+ fill: (d) => (d[scoreKey] > 0 ? "black" : "pink"),
37
+ })
38
+ ),
39
+ Plot.rectY(
40
+ languageData,
41
+ Plot.pointerX(
42
+ Plot.stackX({
43
+ x: "speakers",
44
+ order: scoreKey,
45
+ reverse: true,
46
+ y2: scoreKey, // y2 to avoid stacking by y
47
+ fill: "grey",
48
+ })
49
+ )
50
+ ),
51
+ Plot.text(
52
+ languageData,
53
+ Plot.stackX({
54
+ x: "speakers",
55
+ y2: scoreKey,
56
+ order: scoreKey,
57
+ reverse: true,
58
+ text: "language_name",
59
+ frameAnchor: "bottom",
60
+ textAnchor: "end",
61
+ dy: 10,
62
+ rotate: 270,
63
+ opacity: (d) => (d.speakers > 50_000_000 ? 1 : 0),
64
+ })
65
+ ),
66
+ ],
67
+ });
68
+ }
data.txt → src/data/data.txt RENAMED
@@ -1,4 +1,4 @@
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
2
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
3
  LanguageCodes.tab: https://www.ethnologue.com/
4
- ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
 
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
2
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
3
  LanguageCodes.tab: https://www.ethnologue.com/
4
+ ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
languagebench.py → src/data/languagebench.json.py RENAMED
@@ -1,7 +1,9 @@
1
  import asyncio
2
  import json
3
  import os
 
4
  from os import getenv
 
5
 
6
  import evaluate
7
  import pandas as pd
@@ -15,14 +17,14 @@ from transformers import NllbTokenizer
15
 
16
  # config
17
  models = [
18
- "openai/gpt-4o",
19
- "anthropic/claude-3.5-sonnet",
20
- "meta-llama/llama-3.1-405b-instruct", # lots of slow repetitions for LRLs
21
- "mistralai/mistral-large",
22
  # "google/gemini-flash-1.5", # very fast
23
- "qwen/qwen-2.5-72b-instruct", # somewhat slow
24
  ]
25
- fast_model = "anthropic/claude-3.5-sonnet"
26
  n_sentences = 30
27
 
28
  # setup
@@ -43,9 +45,9 @@ def reorder(language_name):
43
  return language_name.split(",")[1] + " " + language_name.split(",")[0]
44
  return language_name
45
 
46
-
47
  # load benchmark languages and scripts
48
- benchmark_dir = "floresp-v2.0-rc.3/dev"
 
49
  benchmark_languages = pd.DataFrame(
50
  [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
51
  columns=["language_code", "script_code"],
@@ -56,7 +58,7 @@ benchmark_languages["in_benchmark"] = True
56
 
57
  # load Ethnologue language names
58
  language_names = (
59
- pd.read_csv("LanguageCodes.tab", sep="\t")
60
  .rename(columns={"LangID": "language_code", "Name": "language_name"})[
61
  ["language_code", "language_name"]
62
  ]
@@ -65,7 +67,7 @@ language_names = (
65
 
66
  # load Wikidata speaker stats
67
  language_stats = (
68
- pd.read_csv("languages.tsv", sep="\t")
69
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
70
  ["language_code", "speakers"]
71
  ]
@@ -84,7 +86,7 @@ language_stats = language_stats[
84
  ]
85
 
86
  # load unicode script names
87
- script_names = pd.read_csv("ScriptCodes.csv").rename(
88
  columns={"Code": "script_code", "English Name": "script_name"}
89
  )[["script_code", "script_name"]]
90
 
@@ -160,13 +162,13 @@ def load_sentences(language):
160
  # evaluation!
161
  async def main():
162
  results = []
163
- for language in languages.itertuples():
164
  name = (
165
  language.language_name
166
  if not pd.isna(language.language_name)
167
  else language.language_code
168
  )
169
- print(name)
170
  scores = []
171
  if language.in_benchmark:
172
  target_sentences = load_sentences(language)[:n_sentences]
@@ -185,7 +187,7 @@ async def main():
185
  load_sentences(lang)[i]
186
  for i, lang in enumerate(_original_languages.itertuples())
187
  ]
188
- print(model)
189
  predictions = [
190
  translate(
191
  model, language.language_name, language.script_name, sentence
@@ -220,8 +222,7 @@ async def main():
220
  # "bert_score": mean([s["bert_score"] for s in scores]),
221
  }
222
  )
223
- with open("results.json", "w") as f:
224
- json.dump(results, f, indent=2, ensure_ascii=False)
225
 
226
 
227
  if __name__ == "__main__":
 
1
  import asyncio
2
  import json
3
  import os
4
+ import sys
5
  from os import getenv
6
+ from pathlib import Path
7
 
8
  import evaluate
9
  import pandas as pd
 
17
 
18
  # config
19
  models = [
20
+ "openai/gpt-4o-mini",
21
+ "anthropic/claude-3.5-haiku",
22
+ # "meta-llama/llama-3.1-405b-instruct", # lots of slow repetitions for LRLs
23
+ # "mistralai/mistral-large",
24
  # "google/gemini-flash-1.5", # very fast
25
+ # "qwen/qwen-2.5-72b-instruct", # somewhat slow
26
  ]
27
+ fast_model = "anthropic/claude-3.5-haiku"
28
  n_sentences = 30
29
 
30
  # setup
 
45
  return language_name.split(",")[1] + " " + language_name.split(",")[0]
46
  return language_name
47
 
 
48
  # load benchmark languages and scripts
49
+ data = Path("src/data")
50
+ benchmark_dir = data / "floresp-v2.0-rc.3/dev"
51
  benchmark_languages = pd.DataFrame(
52
  [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
53
  columns=["language_code", "script_code"],
 
58
 
59
  # load Ethnologue language names
60
  language_names = (
61
+ pd.read_csv(data / "LanguageCodes.tab", sep="\t")
62
  .rename(columns={"LangID": "language_code", "Name": "language_name"})[
63
  ["language_code", "language_name"]
64
  ]
 
67
 
68
  # load Wikidata speaker stats
69
  language_stats = (
70
+ pd.read_csv(data / "languages.tsv", sep="\t")
71
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
72
  ["language_code", "speakers"]
73
  ]
 
86
  ]
87
 
88
  # load unicode script names
89
+ script_names = pd.read_csv(data / "ScriptCodes.csv").rename(
90
  columns={"Code": "script_code", "English Name": "script_name"}
91
  )[["script_code", "script_name"]]
92
 
 
162
  # evaluation!
163
  async def main():
164
  results = []
165
+ for language in list(languages.itertuples())[:5]:
166
  name = (
167
  language.language_name
168
  if not pd.isna(language.language_name)
169
  else language.language_code
170
  )
171
+ print(name, file=sys.stderr)
172
  scores = []
173
  if language.in_benchmark:
174
  target_sentences = load_sentences(language)[:n_sentences]
 
187
  load_sentences(lang)[i]
188
  for i, lang in enumerate(_original_languages.itertuples())
189
  ]
190
+ print(model, file=sys.stderr)
191
  predictions = [
192
  translate(
193
  model, language.language_name, language.script_name, sentence
 
222
  # "bert_score": mean([s["bert_score"] for s in scores]),
223
  }
224
  )
225
+ print(json.dumps(results, indent=2, ensure_ascii=False))
 
226
 
227
 
228
  if __name__ == "__main__":
languages.rq → src/data/languages.rq RENAMED
File without changes
languages.tsv → src/data/languages.tsv RENAMED
File without changes
src/index.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ toc: false
3
+ ---
4
+
5
+ <div class="hero">
6
+ <h1>AI Language Monitor</h1>
7
+ <h2>Benchmarking all big AI models on all benchmarkable languages.</h2>
8
+ </div>
9
+
10
+ ```js
11
+ import { languageChart } from "./components/language-chart.js";
12
+
13
+ const data = FileAttachment("data/languagebench.json").json();
14
+ ```
15
+
16
+
17
+ <div class="grid grid-cols-2" style="grid-auto-rows: 504px;">
18
+ <div class="card">
19
+ <h2 class="hero">Compare languages</h2>
20
+ ${resize((width) => languageChart(data, {width: 1000, height: 400, scoreKey: "bleu", scoreName: "BLEU Score"}))}
21
+ </div>
22
+ <div class="card">
23
+ <h2 class="hero">Compare AI models</h2>
24
+ ...
25
+ </div>
26
+ </div>
27
+
28
+ <style>
29
+
30
+ .hero {
31
+ display: flex;
32
+ flex-direction: column;
33
+ align-items: center;
34
+ font-family: var(--sans-serif);
35
+ margin: 4rem 0 8rem;
36
+ text-wrap: balance;
37
+ text-align: center;
38
+ }
39
+
40
+ .hero h1 {
41
+ margin: 1rem 0;
42
+ padding: 1rem 0;
43
+ max-width: none;
44
+ font-size: 90px;
45
+ font-weight: 900;
46
+ line-height: 1;
47
+ background: linear-gradient(30deg, var(--theme-foreground-focus), currentColor);
48
+ -webkit-background-clip: text;
49
+ -webkit-text-fill-color: transparent;
50
+ background-clip: text;
51
+ }
52
+
53
+ </style>
src/methodology.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Methodology
3
+ ---
4
+
5
+ # Methodology
6
+
7
+ Sources:
8
+
9
+ 1. For AI models: [OpenRouter](https://openrouter.ai/)
10
+ 2. For language benchmarks: [FLORES+](https://github.com/openlanguagedata/flores)
11
+ 3. For language statistics: [Wikidata](https://gist.github.com/unhammer/3e8f2e0f79972bf5008a4c970081502d), [Ethnologue](https://www.ethnologue.com/browse/names/)
12
+