David Pomerenke committed on
Commit
6b6f157
·
1 Parent(s): 86b8b3a

Display all languages and translate from multiple languages

Browse files
Files changed (7) hide show
  1. bibliography.bib +6 -0
  2. index.html +15 -18
  3. languagebench.py +131 -75
  4. pyproject.toml +5 -0
  5. results.json +0 -0
  6. results_summary.json +0 -1202
  7. uv.lock +119 -0
bibliography.bib CHANGED
@@ -243,6 +243,12 @@
243
  file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html}
244
  }
245
 
 
 
 
 
 
 
246
  @misc{teamNoLanguageLeft2022,
247
  title = {No {{Language Left Behind}}: {{Scaling Human-Centered Machine Translation}}},
248
  shorttitle = {No {{Language Left Behind}}},
 
243
  file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html}
244
  }
245
 
246
+ @misc{Tatoeba,
247
+ title = {Tatoeba},
248
+ urldate = {2024-11-03},
249
+ file = {/Users/david/Zotero/storage/4NDTCGWG/sentences_by_language.html}
250
+ }
251
+
252
  @misc{teamNoLanguageLeft2022,
253
  title = {No {{Language Left Behind}}: {{Scaling Human-Centered Machine Translation}}},
254
  shorttitle = {No {{Language Left Behind}}},
index.html CHANGED
@@ -39,42 +39,43 @@
39
  const scoreName = "BLEU Score"
40
  const chartsDiv = document.getElementById('charts');
41
 
42
- const summary = await fetch('results_summary.json');
43
- const summaryData = await summary.json();
44
  // Format captions
45
- const formatTitle = d => (d.target_language_name + "\n" + parseInt(d.speakers / 1_000_00) / 10 + "M speakers\n" + scoreName + ": " + d[scoreKey].toFixed(1))
 
46
 
47
  // Create summary plot
48
  const summaryPlot = Plot.plot({
49
- width: 800,
50
  height: 400,
51
  marginBottom: 100,
52
  x: { label: "Number of speakers", axis: null },
53
  y: { label: `${scoreName} (average across models)` },
54
  // color: { scheme: "BrBG" },
55
  marks: [
56
- Plot.rectY(summaryData, Plot.stackX({
57
  x: "speakers",
58
  order: scoreKey,
59
  reverse: true,
60
  y2: scoreKey, // y2 to avoid stacking by y
61
  title: formatTitle,
62
  tip: true,
63
- // fill: d => -d.bleu
64
  })),
65
- Plot.rectY(summaryData, Plot.pointerX(Plot.stackX({
66
  x: "speakers",
67
  order: scoreKey,
68
  reverse: true,
69
  y2: scoreKey, // y2 to avoid stacking by y
70
  fill: "grey",
71
  }))),
72
- Plot.text(summaryData, Plot.stackX({
73
  x: "speakers",
74
  y2: scoreKey,
75
  order: scoreKey,
76
  reverse: true,
77
- text: "target_language_name",
78
  frameAnchor: "bottom",
79
  textAnchor: "end",
80
  dy: 10,
@@ -87,14 +88,11 @@
87
  // Add summary plot at the top
88
  chartsDiv.insertBefore(summaryPlot, chartsDiv.firstChild);
89
 
90
- const response = await fetch('results.json');
91
- const results = await response.json();
92
-
93
  // Get unique languages with their speaker counts
94
  const languageMap = new Map();
95
- results.forEach(r => {
96
- if (!languageMap.has(r.target_language_name)) {
97
- languageMap.set(r.target_language_name, r.speakers);
98
  }
99
  });
100
 
@@ -122,7 +120,7 @@
122
  headerDiv.appendChild(speakerP);
123
  chartsDiv.appendChild(headerDiv);
124
 
125
- const languageData = results.filter(r => r.target_language_name === language);
126
 
127
  const descriptor = code => {
128
  let [org, model] = code.split("/")
@@ -130,8 +128,7 @@
130
  }
131
 
132
  // Plot for how well the models perform on this language
133
- if (languageData.length > 2) {
134
- console.log(languageData);
135
  const plot = Plot.plot({
136
  width: 400,
137
  height: 200,
 
39
  const scoreName = "BLEU Score"
40
  const chartsDiv = document.getElementById('charts');
41
 
42
+ const response = await fetch('results.json');
43
+ const data = await response.json();
44
  // Format captions
45
+ const formatScore = (score) => score > 0 ? score.toFixed(2) : "No benchmark available!"
46
+ const formatTitle = d => (d.language_name + "\n" + parseInt(d.speakers / 1_000_00) / 10 + "M speakers\n" + scoreName + ": " + formatScore(d[scoreKey]))
47
 
48
  // Create summary plot
49
  const summaryPlot = Plot.plot({
50
+ width: chartsDiv.clientWidth,
51
  height: 400,
52
  marginBottom: 100,
53
  x: { label: "Number of speakers", axis: null },
54
  y: { label: `${scoreName} (average across models)` },
55
  // color: { scheme: "BrBG" },
56
  marks: [
57
+ Plot.rectY(data, Plot.stackX({
58
  x: "speakers",
59
  order: scoreKey,
60
  reverse: true,
61
  y2: scoreKey, // y2 to avoid stacking by y
62
  title: formatTitle,
63
  tip: true,
64
+ fill: d => d[scoreKey] > 0 ? "black" : "pink"
65
  })),
66
+ Plot.rectY(data, Plot.pointerX(Plot.stackX({
67
  x: "speakers",
68
  order: scoreKey,
69
  reverse: true,
70
  y2: scoreKey, // y2 to avoid stacking by y
71
  fill: "grey",
72
  }))),
73
+ Plot.text(data, Plot.stackX({
74
  x: "speakers",
75
  y2: scoreKey,
76
  order: scoreKey,
77
  reverse: true,
78
+ text: "language_name",
79
  frameAnchor: "bottom",
80
  textAnchor: "end",
81
  dy: 10,
 
88
  // Add summary plot at the top
89
  chartsDiv.insertBefore(summaryPlot, chartsDiv.firstChild);
90
 
 
 
 
91
  // Get unique languages with their speaker counts
92
  const languageMap = new Map();
93
+ data.forEach(r => {
94
+ if (!languageMap.has(r.language_name)) {
95
+ languageMap.set(r.language_name, r.speakers);
96
  }
97
  });
98
 
 
120
  headerDiv.appendChild(speakerP);
121
  chartsDiv.appendChild(headerDiv);
122
 
123
+ const languageData = data.filter(r => r.language_name === language)[0]["scores"];
124
 
125
  const descriptor = code => {
126
  let [org, model] = code.split("/")
 
128
  }
129
 
130
  // Plot for how well the models perform on this language
131
+ if (languageData && languageData.length > 1) {
 
132
  const plot = Plot.plot({
133
  width: 400,
134
  height: 200,
languagebench.py CHANGED
@@ -1,7 +1,6 @@
1
  import asyncio
2
  import json
3
  import os
4
- import random
5
  from os import getenv
6
 
7
  import evaluate
@@ -12,22 +11,19 @@ from dotenv import load_dotenv
12
  from joblib.memory import Memory
13
  from openai import AsyncOpenAI
14
  from tqdm.asyncio import tqdm_asyncio
 
15
 
16
  # config
17
  models = [
18
- "openai/gpt-4o-mini",
19
  "anthropic/claude-3.5-sonnet",
20
- "meta-llama/llama-3.1-70b-instruct", # lots of slow repetitions for LRLs
21
- "mistralai/mistral-nemo",
22
  # "google/gemini-flash-1.5", # very fast
23
  "qwen/qwen-2.5-72b-instruct", # somewhat slow
24
  ]
25
  fast_model = "anthropic/claude-3.5-sonnet"
26
- original_language = "eng_Latn"
27
- dataset = "floresp-v2.0-rc.3/dev"
28
- random.seed(42)
29
- target_languages = [f.split(".")[1] for f in os.listdir(dataset)]
30
- detailed_target_languages = random.choices(target_languages, k=5)
31
 
32
  # setup
33
  load_dotenv()
@@ -36,9 +32,10 @@ client = AsyncOpenAI(
36
  api_key=getenv("OPENROUTER_API_KEY"),
37
  )
38
  cache = Memory(location=".cache", verbose=0).cache
39
- bleu = evaluate.load("sacrebleu")
40
  bertscore = evaluate.load("bertscore")
41
- rate_limit = AsyncLimiter(max_rate=15, time_period=1)
 
42
 
43
 
44
  def reorder(language_name):
@@ -47,10 +44,65 @@ def reorder(language_name):
47
  return language_name
48
 
49
 
50
- language_names = pd.read_csv("LanguageCodes.tab", sep="\t")
51
- language_names["Name"] = language_names["Name"].apply(reorder).str.strip()
52
- language_stats = pd.read_csv("languages.tsv", sep="\t")
53
- script_names = pd.read_csv("ScriptCodes.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
 
56
  # utils
@@ -94,73 +146,77 @@ async def translate(model, target_language, target_script, sentence):
94
  return reply.choices[0].message.content
95
 
96
 
97
- def get_language_stats(language_code):
98
- lang, script = language_code.split("_", 1)
99
- script = script.split("_", 1)[0]
100
- stats = language_stats[language_stats["iso639_3"] == lang]
101
- if not stats.empty:
102
- stats = stats.iloc[0].to_dict()
103
- else:
104
- stats = dict()
105
- stats["script"] = script_names[script_names["Code"] == script]["English Name"].iloc[
106
- 0
107
- ]
108
- name_series = language_names[language_names["LangID"] == lang]["Name"]
109
- stats["name"] = (
110
- name_series.iloc[0]
111
- if not name_series.empty
112
- else stats.get("itemLabel_en") or stats.get("itemLabel", lang)
113
- )
114
- return stats
115
 
116
 
117
- def mean(l):
118
- return sum(l) / len(l)
 
 
119
 
120
 
121
  # evaluation!
122
  async def main():
123
- n = 30
124
  results = []
125
- original_sentences = open(f"{dataset}/dev.{original_language}").readlines()
126
- for target_language in target_languages:
127
- target_sentences = open(f"{dataset}/dev.{target_language}").readlines()
128
- for model in models:
129
- if model != fast_model and target_language not in detailed_target_languages:
130
- continue
131
- stats = get_language_stats(target_language)
132
- print(f"{model} -> {stats['name']}")
133
- predictions = [
134
- translate(model, stats["name"], stats["script"], sentence)
135
- for sentence in original_sentences[:n]
136
- ]
137
- predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
138
- metrics = bleu.compute(
139
- predictions=predictions,
140
- references=target_sentences[:n],
141
- tokenize="char",
142
- )
143
- bert_metrics = bertscore.compute(
144
- predictions=predictions,
145
- references=target_sentences[:n],
146
- model_type="distilbert-base-uncased",
147
- )
148
- results.append(
149
- {
150
- "model": model,
151
- "original_language": original_language,
152
- "target_language": target_language,
153
- "target_language_name": stats["name"],
154
- "speakers": int(stats.get("maxSpeakers", 0)),
155
- "bleu": metrics["score"],
156
- "bert_score": mean(bert_metrics["f1"]),
157
- }
158
- )
159
- with open("results.json", "w") as f:
160
- json.dump(results, f, indent=2, ensure_ascii=False)
161
- pd.DataFrame(results).groupby("target_language_name").agg(
162
- {"bleu": "mean", "bert_score": "mean", "speakers": "mean"}
163
- ).reset_index().to_json("results_summary.json", indent=2, orient="records")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
 
166
  if __name__ == "__main__":
 
1
  import asyncio
2
  import json
3
  import os
 
4
  from os import getenv
5
 
6
  import evaluate
 
11
  from joblib.memory import Memory
12
  from openai import AsyncOpenAI
13
  from tqdm.asyncio import tqdm_asyncio
14
+ from transformers import NllbTokenizer
15
 
16
  # config
17
  models = [
18
+ "openai/gpt-4o",
19
  "anthropic/claude-3.5-sonnet",
20
+ "meta-llama/llama-3.1-405b-instruct", # lots of slow repetitions for LRLs
21
+ "mistralai/mistral-large",
22
  # "google/gemini-flash-1.5", # very fast
23
  "qwen/qwen-2.5-72b-instruct", # somewhat slow
24
  ]
25
  fast_model = "anthropic/claude-3.5-sonnet"
26
+ n_sentences = 30
 
 
 
 
27
 
28
  # setup
29
  load_dotenv()
 
32
  api_key=getenv("OPENROUTER_API_KEY"),
33
  )
34
  cache = Memory(location=".cache", verbose=0).cache
35
+ bleu = evaluate.load("bleu")
36
  bertscore = evaluate.load("bertscore")
37
+ tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
38
+ rate_limit = AsyncLimiter(max_rate=20, time_period=1)
39
 
40
 
41
  def reorder(language_name):
 
44
  return language_name
45
 
46
 
47
+ # load benchmark languages and scripts
48
+ benchmark_dir = "floresp-v2.0-rc.3/dev"
49
+ benchmark_languages = pd.DataFrame(
50
+ [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
51
+ columns=["language_code", "script_code"],
52
+ )
53
+ # hack: drop additional script codes for languages with multiple scripts
54
+ benchmark_languages = benchmark_languages.groupby("language_code").head(1)
55
+ benchmark_languages["in_benchmark"] = True
56
+
57
+ # load Ethnologue language names
58
+ language_names = (
59
+ pd.read_csv("LanguageCodes.tab", sep="\t")
60
+ .rename(columns={"LangID": "language_code", "Name": "language_name"})[
61
+ ["language_code", "language_name"]
62
+ ]
63
+ .assign(language_name=lambda df: df["language_name"].apply(reorder).str.strip())
64
+ )
65
+
66
+ # load Wikidata speaker stats
67
+ language_stats = (
68
+ pd.read_csv("languages.tsv", sep="\t")
69
+ .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
70
+ ["language_code", "speakers"]
71
+ ]
72
+ .dropna(subset=["language_code"])
73
+ )
74
+ language_stats["speakers"] = pd.to_numeric(language_stats["speakers"], errors="coerce")
75
+ ignored_languages = [
76
+ "zho", # Chinese -> use Mandarin (cmn) instead
77
+ "ara", # Arabic -> use Standard Arabic (arb) instead
78
+ "pus", # Pashto -> use Northern / Central / Southern Pashto instead (pbu / pst / pbt)
79
+ "fas", # Persian -> use Iranian Persian (pes) instead
80
+ "msa", # Malay -> use Indonesian (ind) instead
81
+ ]
82
+ language_stats = language_stats[
83
+ ~language_stats["language_code"].isin(ignored_languages)
84
+ ]
85
+
86
+ # load unicode script names
87
+ script_names = pd.read_csv("ScriptCodes.csv").rename(
88
+ columns={"Code": "script_code", "English Name": "script_name"}
89
+ )[["script_code", "script_name"]]
90
+
91
+ # merge data
92
+ languages = pd.merge(language_stats, language_names, on="language_code", how="outer")
93
+ languages = pd.merge(benchmark_languages, languages, on="language_code", how="outer")
94
+ languages = pd.merge(languages, script_names, on="script_code", how="left")
95
+ languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
96
+ languages = languages.sort_values(by="speakers", ascending=False)
97
+
98
+ # sample languages to translate from
99
+ original_languages = languages[languages["in_benchmark"]].sample(
100
+ n=n_sentences, weights="speakers", replace=True, random_state=42
101
+ )
102
+ # sample languages to analyze with all models
103
+ detailed_target_languages = languages[languages["in_benchmark"]].sample(
104
+ n=25, random_state=42
105
+ )
106
 
107
 
108
  # utils
 
146
  return reply.choices[0].message.content
147
 
148
 
149
+ def mean(l):
150
+ return sum(l) / len(l) if l else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
 
153
+ def load_sentences(language):
154
+ return open(
155
+ f"{benchmark_dir}/dev.{language.language_code}_{language.script_code}"
156
+ ).readlines()
157
 
158
 
159
  # evaluation!
160
  async def main():
 
161
  results = []
162
+ for language in languages.itertuples():
163
+ name = (
164
+ language.language_name
165
+ if not pd.isna(language.language_name)
166
+ else language.language_code
167
+ )
168
+ print(name)
169
+ scores = []
170
+ if language.in_benchmark:
171
+ target_sentences = load_sentences(language)[:n_sentences]
172
+ for model in models:
173
+ if (
174
+ model != fast_model
175
+ and language.language_code
176
+ not in detailed_target_languages.language_code.values
177
+ ):
178
+ continue
179
+ original_sentences = [
180
+ load_sentences(lang)[i]
181
+ for i, lang in enumerate(original_languages.itertuples())
182
+ ]
183
+ print(model)
184
+ predictions = [
185
+ translate(
186
+ model, language.language_name, language.script_name, sentence
187
+ )
188
+ for sentence in original_sentences
189
+ ]
190
+ predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
191
+ metrics_bleu = bleu.compute(
192
+ predictions=predictions,
193
+ references=target_sentences,
194
+ tokenizer=tokenizer.tokenize,
195
+ )
196
+ # metrics_bert = bertscore.compute(
197
+ # predictions=predictions,
198
+ # references=target_sentences,
199
+ # model_type="distilbert-base-uncased",
200
+ # )
201
+ scores.append(
202
+ {
203
+ "model": model,
204
+ "bleu": metrics_bleu["bleu"],
205
+ # "bert_score": mean(metrics_bert["f1"]),
206
+ }
207
+ )
208
+ results.append(
209
+ {
210
+ "language_name": name,
211
+ "language_code": language.language_code,
212
+ "speakers": language.speakers if not pd.isna(language.speakers) else 0,
213
+ "scores": scores,
214
+ "bleu": mean([s["bleu"] for s in scores]) or -0.02,
215
+ # "bert_score": mean([s["bert_score"] for s in scores]),
216
+ }
217
+ )
218
+ with open("results.json", "w") as f:
219
+ json.dump(results, f, indent=2, ensure_ascii=False)
220
 
221
 
222
  if __name__ == "__main__":
pyproject.toml CHANGED
@@ -9,9 +9,14 @@ dependencies = [
9
  "bert-score>=0.3.13",
10
  "evaluate>=0.4.3",
11
  "joblib>=1.4.2",
 
12
  "openai>=1.52.2",
13
  "pandas>=2.2.3",
 
14
  "python-dotenv>=1.0.1",
15
  "sacrebleu>=2.4.3",
 
 
16
  "tqdm>=4.66.6",
 
17
  ]
 
9
  "bert-score>=0.3.13",
10
  "evaluate>=0.4.3",
11
  "joblib>=1.4.2",
12
+ "nltk>=3.9.1",
13
  "openai>=1.52.2",
14
  "pandas>=2.2.3",
15
+ "protobuf>=5.28.3",
16
  "python-dotenv>=1.0.1",
17
  "sacrebleu>=2.4.3",
18
+ "sentencepiece>=0.2.0",
19
+ "tiktoken>=0.8.0",
20
  "tqdm>=4.66.6",
21
+ "transformers>=4.46.1",
22
  ]
results.json CHANGED
The diff for this file is too large to render. See raw diff
 
results_summary.json DELETED
@@ -1,1202 +0,0 @@
1
- [
2
- {
3
- "target_language_name":"Aceh",
4
- "bleu":39.1659660901,
5
- "bert_score":0.8998966595,
6
- "speakers":3500032.0
7
- },
8
- {
9
- "target_language_name":"Afrikaans",
10
- "bleu":76.8900540777,
11
- "bert_score":0.9481831173,
12
- "speakers":10300000.0
13
- },
14
- {
15
- "target_language_name":"Amharic",
16
- "bleu":43.1544568697,
17
- "bert_score":0.989116921,
18
- "speakers":25000000.0
19
- },
20
- {
21
- "target_language_name":"Armenian",
22
- "bleu":64.6804400806,
23
- "bert_score":0.9550812801,
24
- "speakers":6700000.0
25
- },
26
- {
27
- "target_language_name":"Assamese",
28
- "bleu":47.0351331605,
29
- "bert_score":0.928119574,
30
- "speakers":15300000.0
31
- },
32
- {
33
- "target_language_name":"Asturian",
34
- "bleu":71.3445623493,
35
- "bert_score":0.931475842,
36
- "speakers":450000.0
37
- },
38
- {
39
- "target_language_name":"Awadhi",
40
- "bleu":46.0797144146,
41
- "bert_score":0.9333642821,
42
- "speakers":22000000.0
43
- },
44
- {
45
- "target_language_name":"Ayacucho Quechua",
46
- "bleu":45.6534927028,
47
- "bert_score":0.8731370111,
48
- "speakers":918200.0
49
- },
50
- {
51
- "target_language_name":"Bali (Indonesia)",
52
- "bleu":52.8752419159,
53
- "bert_score":0.8934772114,
54
- "speakers":4000000.0
55
- },
56
- {
57
- "target_language_name":"Bamanankan",
58
- "bleu":38.6939091408,
59
- "bert_score":0.8872043769,
60
- "speakers":2700000.0
61
- },
62
- {
63
- "target_language_name":"Banjar",
64
- "bleu":46.5453977487,
65
- "bert_score":0.91599799,
66
- "speakers":3500000.0
67
- },
68
- {
69
- "target_language_name":"Bashkort",
70
- "bleu":57.5453842927,
71
- "bert_score":0.9298217595,
72
- "speakers":1200000.0
73
- },
74
- {
75
- "target_language_name":"Basque",
76
- "bleu":65.8968721377,
77
- "bert_score":0.9192741295,
78
- "speakers":750000.0
79
- },
80
- {
81
- "target_language_name":"Belarusian",
82
- "bleu":54.5195166442,
83
- "bert_score":0.9329862595,
84
- "speakers":7900000.0
85
- },
86
- {
87
- "target_language_name":"Bemba",
88
- "bleu":47.8068548956,
89
- "bert_score":0.889907831,
90
- "speakers":3600000.0
91
- },
92
- {
93
- "target_language_name":"Bengali",
94
- "bleu":57.1417588816,
95
- "bert_score":0.9483523647,
96
- "speakers":300000000.0
97
- },
98
- {
99
- "target_language_name":"Bhojpuri",
100
- "bleu":44.5412337907,
101
- "bert_score":0.9288184981,
102
- "speakers":52200000.0
103
- },
104
- {
105
- "target_language_name":"Bokm\u00e5l",
106
- "bleu":77.4939513016,
107
- "bert_score":0.9550971886,
108
- "speakers":4000000.0
109
- },
110
- {
111
- "target_language_name":"Boro (India)",
112
- "bleu":36.1100474969,
113
- "bert_score":0.925187854,
114
- "speakers":1482929.0
115
- },
116
- {
117
- "target_language_name":"Bosnian",
118
- "bleu":72.5488027131,
119
- "bert_score":0.947693936,
120
- "speakers":3500000.0
121
- },
122
- {
123
- "target_language_name":"Bugis",
124
- "bleu":44.8388170031,
125
- "bert_score":0.8647923966,
126
- "speakers":5017800.0
127
- },
128
- {
129
- "target_language_name":"Bulgarian",
130
- "bleu":72.9695925131,
131
- "bert_score":0.9545443177,
132
- "speakers":9000000.0
133
- },
134
- {
135
- "target_language_name":"Burmese",
136
- "bleu":55.7235911677,
137
- "bert_score":0.9759751062,
138
- "speakers":32900000.0
139
- },
140
- {
141
- "target_language_name":"Catalan",
142
- "bleu":74.4595007932,
143
- "bert_score":0.9464139263,
144
- "speakers":5100000.0
145
- },
146
- {
147
- "target_language_name":"Cebuano",
148
- "bleu":69.4557958655,
149
- "bert_score":0.9321281234,
150
- "speakers":15900000.0
151
- },
152
- {
153
- "target_language_name":"Central Aymara",
154
- "bleu":42.7698436669,
155
- "bert_score":0.8625142018,
156
- "speakers":0.0
157
- },
158
- {
159
- "target_language_name":"Central Kurdish",
160
- "bleu":59.1927910692,
161
- "bert_score":0.9332568824,
162
- "speakers":7250000.0
163
- },
164
- {
165
- "target_language_name":"Central Tibetan",
166
- "bleu":51.349075274,
167
- "bert_score":0.967157503,
168
- "speakers":1200000.0
169
- },
170
- {
171
- "target_language_name":"Chhattisgarhi",
172
- "bleu":47.9797501304,
173
- "bert_score":0.9363766015,
174
- "speakers":16300000.0
175
- },
176
- {
177
- "target_language_name":"Chichewa",
178
- "bleu":59.7601680161,
179
- "bert_score":0.9069253902,
180
- "speakers":12000000.0
181
- },
182
- {
183
- "target_language_name":"Chokwe",
184
- "bleu":10.1864074161,
185
- "bert_score":0.727788798,
186
- "speakers":0.0
187
- },
188
- {
189
- "target_language_name":"Chuvash",
190
- "bleu":45.0546658723,
191
- "bert_score":0.9203916192,
192
- "speakers":1279650.0
193
- },
194
- {
195
- "target_language_name":"Crimean Tatar",
196
- "bleu":52.7050249448,
197
- "bert_score":0.8972040812,
198
- "speakers":552740.0
199
- },
200
- {
201
- "target_language_name":"Croatian",
202
- "bleu":69.5456983662,
203
- "bert_score":0.9444877982,
204
- "speakers":7000000.0
205
- },
206
- {
207
- "target_language_name":"Czech",
208
- "bleu":69.7112290599,
209
- "bert_score":0.9384464244,
210
- "speakers":10700000.0
211
- },
212
- {
213
- "target_language_name":"Danish",
214
- "bleu":78.0935433284,
215
- "bert_score":0.9506490747,
216
- "speakers":6000000.0
217
- },
218
- {
219
- "target_language_name":"Dari",
220
- "bleu":52.5539795795,
221
- "bert_score":0.9466466506,
222
- "speakers":9600000.0
223
- },
224
- {
225
- "target_language_name":"Dholuo",
226
- "bleu":46.4119479071,
227
- "bert_score":0.8803233822,
228
- "speakers":3000000.0
229
- },
230
- {
231
- "target_language_name":"Dogri",
232
- "bleu":44.9153535278,
233
- "bert_score":0.934070154,
234
- "speakers":2000000.0
235
- },
236
- {
237
- "target_language_name":"Dutch",
238
- "bleu":71.1849326315,
239
- "bert_score":0.9376831949,
240
- "speakers":23100000.0
241
- },
242
- {
243
- "target_language_name":"Dzongkha",
244
- "bleu":44.3573814017,
245
- "bert_score":0.9664796074,
246
- "speakers":237080.0
247
- },
248
- {
249
- "target_language_name":"Eastern Punjabi",
250
- "bleu":60.468441109,
251
- "bert_score":0.988244007,
252
- "speakers":125000000.0
253
- },
254
- {
255
- "target_language_name":"Eastern Yiddish",
256
- "bleu":47.5562009325,
257
- "bert_score":0.9590989411,
258
- "speakers":0.0
259
- },
260
- {
261
- "target_language_name":"Egyptian Arabic",
262
- "bleu":53.6818081038,
263
- "bert_score":0.9394114673,
264
- "speakers":100542400.0
265
- },
266
- {
267
- "target_language_name":"English",
268
- "bleu":75.3501173486,
269
- "bert_score":0.8807334363,
270
- "speakers":1132366680.0
271
- },
272
- {
273
- "target_language_name":"Esperanto",
274
- "bleu":69.6056577554,
275
- "bert_score":0.9302131255,
276
- "speakers":2000000.0
277
- },
278
- {
279
- "target_language_name":"Faroese",
280
- "bleu":65.9147902483,
281
- "bert_score":0.9332413753,
282
- "speakers":69150.0
283
- },
284
- {
285
- "target_language_name":"Fijian",
286
- "bleu":58.2892667246,
287
- "bert_score":0.9183188617,
288
- "speakers":341270.0
289
- },
290
- {
291
- "target_language_name":"Filipino",
292
- "bleu":70.1928498378,
293
- "bert_score":0.9269425154,
294
- "speakers":90000000.0
295
- },
296
- {
297
- "target_language_name":"Finnish",
298
- "bleu":70.9425029518,
299
- "bert_score":0.9320579688,
300
- "speakers":5413380.0
301
- },
302
- {
303
- "target_language_name":"Fon",
304
- "bleu":25.2797773666,
305
- "bert_score":0.8664443592,
306
- "speakers":1935500.0
307
- },
308
- {
309
- "target_language_name":"French",
310
- "bleu":79.3023871219,
311
- "bert_score":0.9554367423,
312
- "speakers":208157220.0
313
- },
314
- {
315
- "target_language_name":"Friulian",
316
- "bleu":66.5488092372,
317
- "bert_score":0.9255799611,
318
- "speakers":300000.0
319
- },
320
- {
321
- "target_language_name":"Galician",
322
- "bleu":68.7024786904,
323
- "bert_score":0.9283550183,
324
- "speakers":2500000.0
325
- },
326
- {
327
- "target_language_name":"Ganda",
328
- "bleu":45.8693322936,
329
- "bert_score":0.88344028,
330
- "speakers":4100000.0
331
- },
332
- {
333
- "target_language_name":"Georgian",
334
- "bleu":61.0166361442,
335
- "bert_score":0.9546662311,
336
- "speakers":3700000.0
337
- },
338
- {
339
- "target_language_name":"Gikuyu",
340
- "bleu":40.9288275291,
341
- "bert_score":0.8850945433,
342
- "speakers":6623000.0
343
- },
344
- {
345
- "target_language_name":"Goan Konkani",
346
- "bleu":47.1084017945,
347
- "bert_score":0.9314287245,
348
- "speakers":3633900.0
349
- },
350
- {
351
- "target_language_name":"Greek",
352
- "bleu":66.2347782153,
353
- "bert_score":0.9577525119,
354
- "speakers":15000000.0
355
- },
356
- {
357
- "target_language_name":"Gujarati",
358
- "bleu":55.5884513452,
359
- "bert_score":0.9753397226,
360
- "speakers":56400000.0
361
- },
362
- {
363
- "target_language_name":"Haitian Creole",
364
- "bleu":63.8532187591,
365
- "bert_score":0.93236112,
366
- "speakers":9600000.0
367
- },
368
- {
369
- "target_language_name":"Halh Mongolian",
370
- "bleu":58.5037789971,
371
- "bert_score":0.9380823056,
372
- "speakers":2704030.0
373
- },
374
- {
375
- "target_language_name":"Hausa",
376
- "bleu":56.3431957901,
377
- "bert_score":0.9012877802,
378
- "speakers":43900000.0
379
- },
380
- {
381
- "target_language_name":"Hebrew",
382
- "bleu":72.0702990513,
383
- "bert_score":0.964064618,
384
- "speakers":9303950.0
385
- },
386
- {
387
- "target_language_name":"Hindi",
388
- "bleu":64.9362166898,
389
- "bert_score":0.9463364283,
390
- "speakers":341000000.0
391
- },
392
- {
393
- "target_language_name":"Hungarian",
394
- "bleu":66.1301119408,
395
- "bert_score":0.9249218643,
396
- "speakers":12600000.0
397
- },
398
- {
399
- "target_language_name":"Icelandic",
400
- "bleu":54.4330055353,
401
- "bert_score":0.9120460276,
402
- "speakers":358000.0
403
- },
404
- {
405
- "target_language_name":"Igbo",
406
- "bleu":46.4017344934,
407
- "bert_score":0.9137314638,
408
- "speakers":27000000.0
409
- },
410
- {
411
- "target_language_name":"Ilocano",
412
- "bleu":62.6058864594,
413
- "bert_score":0.9115280092,
414
- "speakers":9100000.0
415
- },
416
- {
417
- "target_language_name":"Indonesian",
418
- "bleu":72.9087066262,
419
- "bert_score":0.9301403503,
420
- "speakers":198996550.0
421
- },
422
- {
423
- "target_language_name":"Iranian Persian",
424
- "bleu":57.6444169698,
425
- "bert_score":0.9476486345,
426
- "speakers":52800000.0
427
- },
428
- {
429
- "target_language_name":"Irish",
430
- "bleu":69.9725194524,
431
- "bert_score":0.9440232972,
432
- "speakers":1030000.0
433
- },
434
- {
435
- "target_language_name":"Italian",
436
- "bleu":69.1588343572,
437
- "bert_score":0.9358606537,
438
- "speakers":64819790.0
439
- },
440
- {
441
- "target_language_name":"Japanese",
442
- "bleu":49.9166135693,
443
- "bert_score":0.9425287286,
444
- "speakers":128000000.0
445
- },
446
- {
447
- "target_language_name":"Javanese",
448
- "bleu":60.440335299,
449
- "bert_score":0.9125308077,
450
- "speakers":84308740.0
451
- },
452
- {
453
- "target_language_name":"Jingpho",
454
- "bleu":43.5500581403,
455
- "bert_score":0.8727998992,
456
- "speakers":940000.0
457
- },
458
- {
459
- "target_language_name":"Jula",
460
- "bleu":29.5415180297,
461
- "bert_score":0.822332112,
462
- "speakers":2700000.0
463
- },
464
- {
465
- "target_language_name":"Kabiy\u00e8",
466
- "bleu":22.5498504655,
467
- "bert_score":0.8587520639,
468
- "speakers":1000000.0
469
- },
470
- {
471
- "target_language_name":"Kabuverdianu",
472
- "bleu":65.1106010391,
473
- "bert_score":0.9213403026,
474
- "speakers":871000.0
475
- },
476
- {
477
- "target_language_name":"Kabyle",
478
- "bleu":41.1442992587,
479
- "bert_score":0.8803219795,
480
- "speakers":5586000.0
481
- },
482
- {
483
- "target_language_name":"Kamba",
484
- "bleu":41.733489671,
485
- "bert_score":0.8780206362,
486
- "speakers":3893000.0
487
- },
488
- {
489
- "target_language_name":"Kannada",
490
- "bleu":60.0142028332,
491
- "bert_score":0.9730932295,
492
- "speakers":43600000.0
493
- },
494
- {
495
- "target_language_name":"Kashmiri",
496
- "bleu":22.3019416547,
497
- "bert_score":0.8984790143,
498
- "speakers":6900000.0
499
- },
500
- {
501
- "target_language_name":"Kazakh",
502
- "bleu":61.1251621375,
503
- "bert_score":0.9379647116,
504
- "speakers":13161980.0
505
- },
506
- {
507
- "target_language_name":"Khmer",
508
- "bleu":49.2098257043,
509
- "bert_score":0.8907732884,
510
- "speakers":16600000.0
511
- },
512
- {
513
- "target_language_name":"Kimbundu",
514
- "bleu":5.8523457224,
515
- "bert_score":0.6849321783,
516
- "speakers":0.0
517
- },
518
- {
519
- "target_language_name":"Kinyarwanda",
520
- "bleu":57.2410626756,
521
- "bert_score":0.906923449,
522
- "speakers":12100000.0
523
- },
524
- {
525
- "target_language_name":"Kituba (Democratic Republic of the Congo)",
526
- "bleu":52.8484601602,
527
- "bert_score":0.9017938395,
528
- "speakers":0.0
529
- },
530
- {
531
- "target_language_name":"Korean",
532
- "bleu":43.6872285974,
533
- "bert_score":0.9579092761,
534
- "speakers":77300000.0
535
- },
536
- {
537
- "target_language_name":"Kyrgyz",
538
- "bleu":57.0824422453,
539
- "bert_score":0.9317750076,
540
- "speakers":4568480.0
541
- },
542
- {
543
- "target_language_name":"Lao",
544
- "bleu":60.0210909677,
545
- "bert_score":0.904438438,
546
- "speakers":5225552.0
547
- },
548
- {
549
- "target_language_name":"Latgalian",
550
- "bleu":56.4843556524,
551
- "bert_score":0.9078494012,
552
- "speakers":200000.0
553
- },
554
- {
555
- "target_language_name":"Levantine Arabic",
556
- "bleu":56.0898634013,
557
- "bert_score":0.9437467565,
558
- "speakers":44000000.0
559
- },
560
- {
561
- "target_language_name":"Ligurian",
562
- "bleu":55.8530636302,
563
- "bert_score":0.9047620773,
564
- "speakers":500000.0
565
- },
566
- {
567
- "target_language_name":"Limburgish",
568
- "bleu":59.4485504982,
569
- "bert_score":0.8987095455,
570
- "speakers":1600000.0
571
- },
572
- {
573
- "target_language_name":"Lingala",
574
- "bleu":30.4322896531,
575
- "bert_score":0.8553236572,
576
- "speakers":20000000.0
577
- },
578
- {
579
- "target_language_name":"Lithuanian",
580
- "bleu":67.1625695571,
581
- "bert_score":0.9154702902,
582
- "speakers":4000000.0
583
- },
584
- {
585
- "target_language_name":"Lombard",
586
- "bleu":46.3884402674,
587
- "bert_score":0.8643471499,
588
- "speakers":3900000.0
589
- },
590
- {
591
- "target_language_name":"Luba-Kasai",
592
- "bleu":45.0655291655,
593
- "bert_score":0.8749240279,
594
- "speakers":6300000.0
595
- },
596
- {
597
- "target_language_name":"Luxembourgish",
598
- "bleu":70.8338190438,
599
- "bert_score":0.9297492107,
600
- "speakers":391200.0
601
- },
602
- {
603
- "target_language_name":"Macedonian",
604
- "bleu":72.2733471437,
605
- "bert_score":0.9558346649,
606
- "speakers":2000000.0
607
- },
608
- {
609
- "target_language_name":"Magahi",
610
- "bleu":58.5474221546,
611
- "bert_score":0.9458349566,
612
- "speakers":20700000.0
613
- },
614
- {
615
- "target_language_name":"Maithili",
616
- "bleu":54.6530071391,
617
- "bert_score":0.9433513383,
618
- "speakers":33900000.0
619
- },
620
- {
621
- "target_language_name":"Malayalam",
622
- "bleu":64.0655894091,
623
- "bert_score":0.9803075671,
624
- "speakers":37100000.0
625
- },
626
- {
627
- "target_language_name":"Maltese",
628
- "bleu":80.0866777263,
629
- "bert_score":0.9520254652,
630
- "speakers":570000.0
631
- },
632
- {
633
- "target_language_name":"Mandarin Chinese",
634
- "bleu":42.5300166785,
635
- "bert_score":0.9634857118,
636
- "speakers":1074000000.0
637
- },
638
- {
639
- "target_language_name":"Maori",
640
- "bleu":54.8319935643,
641
- "bert_score":0.9185245017,
642
- "speakers":160000.0
643
- },
644
- {
645
- "target_language_name":"Marathi",
646
- "bleu":57.4434090711,
647
- "bert_score":0.9421781262,
648
- "speakers":83100000.0
649
- },
650
- {
651
- "target_language_name":"Meadow Mari",
652
- "bleu":49.7911680582,
653
- "bert_score":0.9295116961,
654
- "speakers":482000.0
655
- },
656
- {
657
- "target_language_name":"Meitei",
658
- "bleu":41.2619945571,
659
- "bert_score":0.9528288851,
660
- "speakers":1470000.0
661
- },
662
- {
663
- "target_language_name":"Merina Malagasy",
664
- "bleu":61.0968434546,
665
- "bert_score":0.9032936792,
666
- "speakers":0.0
667
- },
668
- {
669
- "target_language_name":"Mesopotamian Arabic",
670
- "bleu":49.5184865297,
671
- "bert_score":0.9382626355,
672
- "speakers":15700000.0
673
- },
674
- {
675
- "target_language_name":"Minangkabau",
676
- "bleu":50.7407956197,
677
- "bert_score":0.9252789746,
678
- "speakers":5530000.0
679
- },
680
- {
681
- "target_language_name":"Mizo",
682
- "bleu":51.6558017488,
683
- "bert_score":0.8875152906,
684
- "speakers":500000.0
685
- },
686
- {
687
- "target_language_name":"Moore",
688
- "bleu":32.8458097983,
689
- "bert_score":0.8583020627,
690
- "speakers":7600000.0
691
- },
692
- {
693
- "target_language_name":"Moroccan Arabic",
694
- "bleu":49.3082976781,
695
- "bert_score":0.9317501009,
696
- "speakers":27500000.0
697
- },
698
- {
699
- "target_language_name":"Najdi Arabic",
700
- "bleu":46.4102430377,
701
- "bert_score":0.9332984229,
702
- "speakers":0.0
703
- },
704
- {
705
- "target_language_name":"Nepali",
706
- "bleu":55.2919347352,
707
- "bert_score":0.9358912428,
708
- "speakers":0.0
709
- },
710
- {
711
- "target_language_name":"Nigerian Fulfulde",
712
- "bleu":28.1761055913,
713
- "bert_score":0.8343587597,
714
- "speakers":14500000.0
715
- },
716
- {
717
- "target_language_name":"North Azerbaijani",
718
- "bleu":55.5265107063,
719
- "bert_score":0.9145456314,
720
- "speakers":9220610.0
721
- },
722
- {
723
- "target_language_name":"Northern Kurdish",
724
- "bleu":55.7965878227,
725
- "bert_score":0.9104436457,
726
- "speakers":14600000.0
727
- },
728
- {
729
- "target_language_name":"Northern Sotho",
730
- "bleu":62.8769401692,
731
- "bert_score":0.9261207898,
732
- "speakers":4100000.0
733
- },
734
- {
735
- "target_language_name":"Northern Uzbek",
736
- "bleu":63.205573851,
737
- "bert_score":0.9120756924,
738
- "speakers":26912410.0
739
- },
740
- {
741
- "target_language_name":"Nuer",
742
- "bleu":16.5796987951,
743
- "bert_score":0.8528214693,
744
- "speakers":900000.0
745
- },
746
- {
747
- "target_language_name":"N\u2019Ko",
748
- "bleu":32.483490799,
749
- "bert_score":0.9823745767,
750
- "speakers":0.0
751
- },
752
- {
753
- "target_language_name":"Occitan",
754
- "bleu":71.532740184,
755
- "bert_score":0.9337525626,
756
- "speakers":542000.0
757
- },
758
- {
759
- "target_language_name":"Odia",
760
- "bleu":57.3628096518,
761
- "bert_score":0.9768644154,
762
- "speakers":34500000.0
763
- },
764
- {
765
- "target_language_name":"Pangasinan",
766
- "bleu":56.0048183827,
767
- "bert_score":0.8906280657,
768
- "speakers":1100000.0
769
- },
770
- {
771
- "target_language_name":"Papiamentu",
772
- "bleu":69.7955328133,
773
- "bert_score":0.9325902323,
774
- "speakers":321300.0
775
- },
776
- {
777
- "target_language_name":"Paraguayan Guaran\u00ed",
778
- "bleu":41.7929863707,
779
- "bert_score":0.8764786462,
780
- "speakers":0.0
781
- },
782
- {
783
- "target_language_name":"Polish",
784
- "bleu":61.8768399674,
785
- "bert_score":0.9179250948,
786
- "speakers":40200000.0
787
- },
788
- {
789
- "target_language_name":"Portuguese",
790
- "bleu":77.4978074222,
791
- "bert_score":0.9494876027,
792
- "speakers":254300000.0
793
- },
794
- {
795
- "target_language_name":"Romanian",
796
- "bleu":76.4907159035,
797
- "bert_score":0.9455295324,
798
- "speakers":24300000.0
799
- },
800
- {
801
- "target_language_name":"Rundi",
802
- "bleu":48.943513629,
803
- "bert_score":0.8933652222,
804
- "speakers":10800000.0
805
- },
806
- {
807
- "target_language_name":"Russian",
808
- "bleu":71.1489441039,
809
- "bert_score":0.9518508852,
810
- "speakers":171428900.0
811
- },
812
- {
813
- "target_language_name":"Samoan",
814
- "bleu":56.7138831423,
815
- "bert_score":0.9166683555,
816
- "speakers":415720.0
817
- },
818
- {
819
- "target_language_name":"Sango",
820
- "bleu":34.8754222657,
821
- "bert_score":0.8720244229,
822
- "speakers":4600000.0
823
- },
824
- {
825
- "target_language_name":"Sanskrit",
826
- "bleu":32.7813249911,
827
- "bert_score":0.8987655501,
828
- "speakers":49736.0
829
- },
830
- {
831
- "target_language_name":"Santhali",
832
- "bleu":31.5119247269,
833
- "bert_score":0.944095705,
834
- "speakers":7200000.0
835
- },
836
- {
837
- "target_language_name":"Sardinian",
838
- "bleu":62.6903914771,
839
- "bert_score":0.9118991812,
840
- "speakers":1300000.0
841
- },
842
- {
843
- "target_language_name":"Scottish Gaelic",
844
- "bleu":62.6044371338,
845
- "bert_score":0.9264988482,
846
- "speakers":60130.0
847
- },
848
- {
849
- "target_language_name":"Serbian",
850
- "bleu":69.9691396176,
851
- "bert_score":0.9582955678,
852
- "speakers":9000000.0
853
- },
854
- {
855
- "target_language_name":"Setswana",
856
- "bleu":55.2288890228,
857
- "bert_score":0.9117900888,
858
- "speakers":4500000.0
859
- },
860
- {
861
- "target_language_name":"Shan",
862
- "bleu":29.2129948577,
863
- "bert_score":0.9378574808,
864
- "speakers":3000000.0
865
- },
866
- {
867
- "target_language_name":"Shona",
868
- "bleu":51.5592191405,
869
- "bert_score":0.8798740129,
870
- "speakers":9023000.0
871
- },
872
- {
873
- "target_language_name":"Sicilian",
874
- "bleu":58.5895359443,
875
- "bert_score":0.90428345,
876
- "speakers":4700000.0
877
- },
878
- {
879
- "target_language_name":"Silesian",
880
- "bleu":56.7836392069,
881
- "bert_score":0.9106028736,
882
- "speakers":522000.0
883
- },
884
- {
885
- "target_language_name":"Sindhi",
886
- "bleu":48.1876056648,
887
- "bert_score":0.936702015,
888
- "speakers":25000000.0
889
- },
890
- {
891
- "target_language_name":"Sinhala",
892
- "bleu":56.7567311796,
893
- "bert_score":0.9713358581,
894
- "speakers":15300000.0
895
- },
896
- {
897
- "target_language_name":"Slovak",
898
- "bleu":67.9284804086,
899
- "bert_score":0.9360236605,
900
- "speakers":6000000.0
901
- },
902
- {
903
- "target_language_name":"Slovene",
904
- "bleu":72.5691270757,
905
- "bert_score":0.9432346245,
906
- "speakers":2400000.0
907
- },
908
- {
909
- "target_language_name":"Somali",
910
- "bleu":55.3706496473,
911
- "bert_score":0.908571593,
912
- "speakers":16200000.0
913
- },
914
- {
915
- "target_language_name":"South Azerbaijani",
916
- "bleu":44.3712804302,
917
- "bert_score":0.9420697371,
918
- "speakers":15000000.0
919
- },
920
- {
921
- "target_language_name":"Southern Pashto",
922
- "bleu":38.3124819374,
923
- "bert_score":0.921268179,
924
- "speakers":10900000.0
925
- },
926
- {
927
- "target_language_name":"Southern Sotho",
928
- "bleu":56.735299554,
929
- "bert_score":0.9102749407,
930
- "speakers":6000000.0
931
- },
932
- {
933
- "target_language_name":"Southwestern Dinka",
934
- "bleu":17.5913281403,
935
- "bert_score":0.8016291638,
936
- "speakers":0.0
937
- },
938
- {
939
- "target_language_name":"Spanish",
940
- "bleu":63.8467073379,
941
- "bert_score":0.9224406302,
942
- "speakers":485000000.0
943
- },
944
- {
945
- "target_language_name":"Standard Arabic",
946
- "bleu":56.8831262708,
947
- "bert_score":0.9168330083,
948
- "speakers":0.0
949
- },
950
- {
951
- "target_language_name":"Standard Estonian",
952
- "bleu":67.4156919517,
953
- "bert_score":0.9277306815,
954
- "speakers":1164770.0
955
- },
956
- {
957
- "target_language_name":"Standard German",
958
- "bleu":77.1966515107,
959
- "bert_score":0.9468763133,
960
- "speakers":105000000.0
961
- },
962
- {
963
- "target_language_name":"Standard Latvian",
964
- "bleu":65.0833210037,
965
- "bert_score":0.9217625757,
966
- "speakers":0.0
967
- },
968
- {
969
- "target_language_name":"Standard Malay",
970
- "bleu":74.2657232798,
971
- "bert_score":0.9445500493,
972
- "speakers":0.0
973
- },
974
- {
975
- "target_language_name":"Standard Moroccan Tamazight",
976
- "bleu":35.6247648109,
977
- "bert_score":0.9847298423,
978
- "speakers":0.0
979
- },
980
- {
981
- "target_language_name":"Sunda",
982
- "bleu":56.4065999104,
983
- "bert_score":0.9077177823,
984
- "speakers":32400000.0
985
- },
986
- {
987
- "target_language_name":"Swahili",
988
- "bleu":73.5199042142,
989
- "bert_score":0.9450787365,
990
- "speakers":82300000.0
991
- },
992
- {
993
- "target_language_name":"Swati",
994
- "bleu":52.7746096439,
995
- "bert_score":0.8899940272,
996
- "speakers":2034200.0
997
- },
998
- {
999
- "target_language_name":"Swedish",
1000
- "bleu":77.421610247,
1001
- "bert_score":0.9571870168,
1002
- "speakers":9244250.0
1003
- },
1004
- {
1005
- "target_language_name":"Tajik",
1006
- "bleu":60.9783684158,
1007
- "bert_score":0.9378365338,
1008
- "speakers":14000000.0
1009
- },
1010
- {
1011
- "target_language_name":"Tamasheq",
1012
- "bleu":18.4319889721,
1013
- "bert_score":0.8427422295,
1014
- "speakers":500000.0
1015
- },
1016
- {
1017
- "target_language_name":"Tamil",
1018
- "bleu":65.7863221054,
1019
- "bert_score":0.9536473691,
1020
- "speakers":75000000.0
1021
- },
1022
- {
1023
- "target_language_name":"Tatar",
1024
- "bleu":60.3447467213,
1025
- "bert_score":0.9364115715,
1026
- "speakers":5427318.0
1027
- },
1028
- {
1029
- "target_language_name":"Ta\u2019izzi-Adeni Arabic",
1030
- "bleu":49.4139335281,
1031
- "bert_score":0.9354432185,
1032
- "speakers":10500000.0
1033
- },
1034
- {
1035
- "target_language_name":"Telugu",
1036
- "bleu":61.6352457629,
1037
- "bert_score":0.9790697515,
1038
- "speakers":82000000.0
1039
- },
1040
- {
1041
- "target_language_name":"Thai",
1042
- "bleu":62.8125360944,
1043
- "bert_score":0.9225328485,
1044
- "speakers":40000000.0
1045
- },
1046
- {
1047
- "target_language_name":"Tigrigna",
1048
- "bleu":32.8711961703,
1049
- "bert_score":0.9852415164,
1050
- "speakers":7507780.0
1051
- },
1052
- {
1053
- "target_language_name":"Tok Pisin",
1054
- "bleu":56.5407760367,
1055
- "bert_score":0.9031182428,
1056
- "speakers":4000000.0
1057
- },
1058
- {
1059
- "target_language_name":"Tosk Albanian",
1060
- "bleu":69.4218765092,
1061
- "bert_score":0.9402680953,
1062
- "speakers":3000000.0
1063
- },
1064
- {
1065
- "target_language_name":"Tsonga",
1066
- "bleu":58.3516573597,
1067
- "bert_score":0.9134832978,
1068
- "speakers":13000000.0
1069
- },
1070
- {
1071
- "target_language_name":"Tumbuka",
1072
- "bleu":44.0490017392,
1073
- "bert_score":0.8865564326,
1074
- "speakers":2680000.0
1075
- },
1076
- {
1077
- "target_language_name":"Tunisian Arabic",
1078
- "bleu":49.6714090744,
1079
- "bert_score":0.9337966998,
1080
- "speakers":11600000.0
1081
- },
1082
- {
1083
- "target_language_name":"Turkish",
1084
- "bleu":67.1600625676,
1085
- "bert_score":0.9309494158,
1086
- "speakers":82231620.0
1087
- },
1088
- {
1089
- "target_language_name":"Turkmen",
1090
- "bleu":60.5593705936,
1091
- "bert_score":0.9125106474,
1092
- "speakers":16000000.0
1093
- },
1094
- {
1095
- "target_language_name":"Twi",
1096
- "bleu":44.7976562068,
1097
- "bert_score":0.8913615406,
1098
- "speakers":3000000.0
1099
- },
1100
- {
1101
- "target_language_name":"Ukrainian",
1102
- "bleu":68.0976232544,
1103
- "bert_score":0.9468558848,
1104
- "speakers":34710100.0
1105
- },
1106
- {
1107
- "target_language_name":"Umbundu",
1108
- "bleu":21.0802775597,
1109
- "bert_score":0.8461364289,
1110
- "speakers":6000000.0
1111
- },
1112
- {
1113
- "target_language_name":"Urdu",
1114
- "bleu":61.1255457272,
1115
- "bert_score":0.953888009,
1116
- "speakers":94022900.0
1117
- },
1118
- {
1119
- "target_language_name":"Uyghur",
1120
- "bleu":53.5346877103,
1121
- "bert_score":0.9397906005,
1122
- "speakers":10400000.0
1123
- },
1124
- {
1125
- "target_language_name":"Venetian",
1126
- "bleu":60.6140876271,
1127
- "bert_score":0.9080212533,
1128
- "speakers":2000000.0
1129
- },
1130
- {
1131
- "target_language_name":"Vietnamese",
1132
- "bleu":70.3560749464,
1133
- "bert_score":0.9527418713,
1134
- "speakers":76000000.0
1135
- },
1136
- {
1137
- "target_language_name":"Waray-Waray",
1138
- "bleu":66.3850231243,
1139
- "bert_score":0.920412008,
1140
- "speakers":3100000.0
1141
- },
1142
- {
1143
- "target_language_name":"Welsh",
1144
- "bleu":83.3437724474,
1145
- "bert_score":0.9662299534,
1146
- "speakers":977366.0
1147
- },
1148
- {
1149
- "target_language_name":"West Central Oromo",
1150
- "bleu":46.9090350028,
1151
- "bert_score":0.8845542371,
1152
- "speakers":0.0
1153
- },
1154
- {
1155
- "target_language_name":"Wolof",
1156
- "bleu":42.6430127569,
1157
- "bert_score":0.8762976408,
1158
- "speakers":3700000.0
1159
- },
1160
- {
1161
- "target_language_name":"Xhosa",
1162
- "bleu":55.4688091009,
1163
- "bert_score":0.9008744816,
1164
- "speakers":11000000.0
1165
- },
1166
- {
1167
- "target_language_name":"Yerwa Kanuri",
1168
- "bleu":18.5081787556,
1169
- "bert_score":0.839997381,
1170
- "speakers":0.0
1171
- },
1172
- {
1173
- "target_language_name":"Yoruba",
1174
- "bleu":34.2642542268,
1175
- "bert_score":0.9001545012,
1176
- "speakers":40000000.0
1177
- },
1178
- {
1179
- "target_language_name":"Yue Chinese",
1180
- "bleu":34.5614651228,
1181
- "bert_score":0.9634495397,
1182
- "speakers":73100000.0
1183
- },
1184
- {
1185
- "target_language_name":"Zulu",
1186
- "bleu":59.1762078389,
1187
- "bert_score":0.9099391103,
1188
- "speakers":15700000.0
1189
- },
1190
- {
1191
- "target_language_name":"nno",
1192
- "bleu":71.8615646296,
1193
- "bert_score":0.9335320314,
1194
- "speakers":0.0
1195
- },
1196
- {
1197
- "target_language_name":"\u00c9w\u00e9",
1198
- "bleu":41.6614038791,
1199
- "bert_score":0.8829316159,
1200
- "speakers":3000000.0
1201
- }
1202
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
uv.lock CHANGED
@@ -253,6 +253,18 @@ wheels = [
253
  { url = "https://files.pythonhosted.org/packages/bf/9b/08c0432272d77b04803958a4598a51e2a4b51c06640af8b8f0f908c18bf2/charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079", size = 49446 },
254
  ]
255
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  [[package]]
257
  name = "colorama"
258
  version = "0.4.6"
@@ -767,11 +779,16 @@ dependencies = [
767
  { name = "bert-score" },
768
  { name = "evaluate" },
769
  { name = "joblib" },
 
770
  { name = "openai" },
771
  { name = "pandas" },
 
772
  { name = "python-dotenv" },
773
  { name = "sacrebleu" },
 
 
774
  { name = "tqdm" },
 
775
  ]
776
 
777
  [package.metadata]
@@ -780,11 +797,16 @@ requires-dist = [
780
  { name = "bert-score", specifier = ">=0.3.13" },
781
  { name = "evaluate", specifier = ">=0.4.3" },
782
  { name = "joblib", specifier = ">=1.4.2" },
 
783
  { name = "openai", specifier = ">=1.52.2" },
784
  { name = "pandas", specifier = ">=2.2.3" },
 
785
  { name = "python-dotenv", specifier = ">=1.0.1" },
786
  { name = "sacrebleu", specifier = ">=2.4.3" },
 
 
787
  { name = "tqdm", specifier = ">=4.66.6" },
 
788
  ]
789
 
790
  [[package]]
@@ -1083,6 +1105,21 @@ wheels = [
1083
  { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 },
1084
  ]
1085
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1086
  [[package]]
1087
  name = "numpy"
1088
  version = "2.1.2"
@@ -1491,6 +1528,20 @@ wheels = [
1491
  { url = "https://files.pythonhosted.org/packages/3d/b6/e6d98278f2d49b22b4d033c9f792eda783b9ab2094b041f013fc69bcde87/propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036", size = 11603 },
1492
  ]
1493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1494
  [[package]]
1495
  name = "pyarrow"
1496
  version = "18.0.0"
@@ -1878,6 +1929,38 @@ wheels = [
1878
  { url = "https://files.pythonhosted.org/packages/19/46/5d11dc300feaad285c2f1bd784ff3f689f5e0ab6be49aaf568f3a77019eb/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f", size = 606660 },
1879
  ]
1880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1881
  [[package]]
1882
  name = "setuptools"
1883
  version = "75.3.0"
@@ -1926,6 +2009,42 @@ wheels = [
1926
  { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 },
1927
  ]
1928
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1929
  [[package]]
1930
  name = "tokenizers"
1931
  version = "0.20.1"
 
253
  { url = "https://files.pythonhosted.org/packages/bf/9b/08c0432272d77b04803958a4598a51e2a4b51c06640af8b8f0f908c18bf2/charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079", size = 49446 },
254
  ]
255
 
256
+ [[package]]
257
+ name = "click"
258
+ version = "8.1.7"
259
+ source = { registry = "https://pypi.org/simple" }
260
+ dependencies = [
261
+ { name = "colorama", marker = "platform_system == 'Windows'" },
262
+ ]
263
+ sdist = { url = "https://files.pythonhosted.org/packages/96/d3/f04c7bfcf5c1862a2a5b845c6b2b360488cf47af55dfa79c98f6a6bf98b5/click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de", size = 336121 }
264
+ wheels = [
265
+ { url = "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", size = 97941 },
266
+ ]
267
+
268
  [[package]]
269
  name = "colorama"
270
  version = "0.4.6"
 
779
  { name = "bert-score" },
780
  { name = "evaluate" },
781
  { name = "joblib" },
782
+ { name = "nltk" },
783
  { name = "openai" },
784
  { name = "pandas" },
785
+ { name = "protobuf" },
786
  { name = "python-dotenv" },
787
  { name = "sacrebleu" },
788
+ { name = "sentencepiece" },
789
+ { name = "tiktoken" },
790
  { name = "tqdm" },
791
+ { name = "transformers" },
792
  ]
793
 
794
  [package.metadata]
 
797
  { name = "bert-score", specifier = ">=0.3.13" },
798
  { name = "evaluate", specifier = ">=0.4.3" },
799
  { name = "joblib", specifier = ">=1.4.2" },
800
+ { name = "nltk", specifier = ">=3.9.1" },
801
  { name = "openai", specifier = ">=1.52.2" },
802
  { name = "pandas", specifier = ">=2.2.3" },
803
+ { name = "protobuf", specifier = ">=5.28.3" },
804
  { name = "python-dotenv", specifier = ">=1.0.1" },
805
  { name = "sacrebleu", specifier = ">=2.4.3" },
806
+ { name = "sentencepiece", specifier = ">=0.2.0" },
807
+ { name = "tiktoken", specifier = ">=0.8.0" },
808
  { name = "tqdm", specifier = ">=4.66.6" },
809
+ { name = "transformers", specifier = ">=4.46.1" },
810
  ]
811
 
812
  [[package]]
 
1105
  { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 },
1106
  ]
1107
 
1108
+ [[package]]
1109
+ name = "nltk"
1110
+ version = "3.9.1"
1111
+ source = { registry = "https://pypi.org/simple" }
1112
+ dependencies = [
1113
+ { name = "click" },
1114
+ { name = "joblib" },
1115
+ { name = "regex" },
1116
+ { name = "tqdm" },
1117
+ ]
1118
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/87/db8be88ad32c2d042420b6fd9ffd4a149f9a0d7f0e86b3f543be2eeeedd2/nltk-3.9.1.tar.gz", hash = "sha256:87d127bd3de4bd89a4f81265e5fa59cb1b199b27440175370f7417d2bc7ae868", size = 2904691 }
1119
+ wheels = [
1120
+ { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442 },
1121
+ ]
1122
+
1123
  [[package]]
1124
  name = "numpy"
1125
  version = "2.1.2"
 
1528
  { url = "https://files.pythonhosted.org/packages/3d/b6/e6d98278f2d49b22b4d033c9f792eda783b9ab2094b041f013fc69bcde87/propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036", size = 11603 },
1529
  ]
1530
 
1531
+ [[package]]
1532
+ name = "protobuf"
1533
+ version = "5.28.3"
1534
+ source = { registry = "https://pypi.org/simple" }
1535
+ sdist = { url = "https://files.pythonhosted.org/packages/74/6e/e69eb906fddcb38f8530a12f4b410699972ab7ced4e21524ece9d546ac27/protobuf-5.28.3.tar.gz", hash = "sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b", size = 422479 }
1536
+ wheels = [
1537
+ { url = "https://files.pythonhosted.org/packages/d1/c5/05163fad52d7c43e124a545f1372d18266db36036377ad29de4271134a6a/protobuf-5.28.3-cp310-abi3-win32.whl", hash = "sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24", size = 419624 },
1538
+ { url = "https://files.pythonhosted.org/packages/9c/4c/4563ebe001ff30dca9d7ed12e471fa098d9759712980cde1fd03a3a44fb7/protobuf-5.28.3-cp310-abi3-win_amd64.whl", hash = "sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868", size = 431464 },
1539
+ { url = "https://files.pythonhosted.org/packages/1c/f2/baf397f3dd1d3e4af7e3f5a0382b868d25ac068eefe1ebde05132333436c/protobuf-5.28.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687", size = 414743 },
1540
+ { url = "https://files.pythonhosted.org/packages/85/50/cd61a358ba1601f40e7d38bcfba22e053f40ef2c50d55b55926aecc8fec7/protobuf-5.28.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584", size = 316511 },
1541
+ { url = "https://files.pythonhosted.org/packages/5d/ae/3257b09328c0b4e59535e497b0c7537d4954038bdd53a2f0d2f49d15a7c4/protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135", size = 316624 },
1542
+ { url = "https://files.pythonhosted.org/packages/ad/c3/2377c159e28ea89a91cf1ca223f827ae8deccb2c9c401e5ca233cd73002f/protobuf-5.28.3-py3-none-any.whl", hash = "sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed", size = 169511 },
1543
+ ]
1544
+
1545
  [[package]]
1546
  name = "pyarrow"
1547
  version = "18.0.0"
 
1929
  { url = "https://files.pythonhosted.org/packages/19/46/5d11dc300feaad285c2f1bd784ff3f689f5e0ab6be49aaf568f3a77019eb/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f", size = 606660 },
1930
  ]
1931
 
1932
+ [[package]]
1933
+ name = "sentencepiece"
1934
+ version = "0.2.0"
1935
+ source = { registry = "https://pypi.org/simple" }
1936
+ sdist = { url = "https://files.pythonhosted.org/packages/c9/d2/b9c7ca067c26d8ff085d252c89b5f69609ca93fb85a00ede95f4857865d4/sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843", size = 2632106 }
1937
+ wheels = [
1938
+ { url = "https://files.pythonhosted.org/packages/f6/71/98648c3b64b23edb5403f74bcc906ad21766872a6e1ada26ea3f1eb941ab/sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227", size = 2408979 },
1939
+ { url = "https://files.pythonhosted.org/packages/77/9f/7efbaa6d4c0c718a9affbecc536b03ca62f99f421bdffb531c16030e2d2b/sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452", size = 1238845 },
1940
+ { url = "https://files.pythonhosted.org/packages/1c/e4/c2541027a43ec6962ba9b601805d17ba3f86b38bdeae0e8ac65a2981e248/sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3", size = 1181472 },
1941
+ { url = "https://files.pythonhosted.org/packages/fd/46/316c1ba6c52b97de76aff7b9da678f7afbb52136afb2987c474d95630e65/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a", size = 1259151 },
1942
+ { url = "https://files.pythonhosted.org/packages/aa/5a/3c48738a0835d76dd06c62b6ac48d39c923cde78dd0f587353bdcbb99851/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e", size = 1355931 },
1943
+ { url = "https://files.pythonhosted.org/packages/a6/27/33019685023221ca8ed98e8ceb7ae5e166032686fa3662c68f1f1edf334e/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040", size = 1301537 },
1944
+ { url = "https://files.pythonhosted.org/packages/ca/e4/55f97cef14293171fef5f96e96999919ab5b4d1ce95b53547ad653d7e3bf/sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d", size = 936747 },
1945
+ { url = "https://files.pythonhosted.org/packages/85/f4/4ef1a6e0e9dbd8a60780a91df8b7452ada14cfaa0e17b3b8dfa42cecae18/sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2", size = 991525 },
1946
+ { url = "https://files.pythonhosted.org/packages/32/43/8f8885168a47a02eba1455bd3f4f169f50ad5b8cebd2402d0f5e20854d04/sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c", size = 2409036 },
1947
+ { url = "https://files.pythonhosted.org/packages/0f/35/e63ba28062af0a3d688a9f128e407a1a2608544b2f480cb49bf7f4b1cbb9/sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e", size = 1238921 },
1948
+ { url = "https://files.pythonhosted.org/packages/de/42/ae30952c4a0bd773e90c9bf2579f5533037c886dfc8ec68133d5694f4dd2/sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6", size = 1181477 },
1949
+ { url = "https://files.pythonhosted.org/packages/e3/ac/2f2ab1d60bb2d795d054eebe5e3f24b164bc21b5a9b75fba7968b3b91b5a/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb", size = 1259182 },
1950
+ { url = "https://files.pythonhosted.org/packages/45/fb/14633c6ecf262c468759ffcdb55c3a7ee38fe4eda6a70d75ee7c7d63c58b/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553", size = 1355537 },
1951
+ { url = "https://files.pythonhosted.org/packages/fb/12/2f5c8d4764b00033cf1c935b702d3bb878d10be9f0b87f0253495832d85f/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d", size = 1301464 },
1952
+ { url = "https://files.pythonhosted.org/packages/4e/b1/67afc0bde24f6dcb3acdea0dd8dcdf4b8b0db240f6bacd39378bd32d09f8/sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75", size = 936749 },
1953
+ { url = "https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36", size = 991520 },
1954
+ { url = "https://files.pythonhosted.org/packages/27/5a/141b227ed54293360a9ffbb7bf8252b4e5efc0400cdeac5809340e5d2b21/sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2", size = 2409370 },
1955
+ { url = "https://files.pythonhosted.org/packages/2e/08/a4c135ad6fc2ce26798d14ab72790d66e813efc9589fd30a5316a88ca8d5/sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c", size = 1239288 },
1956
+ { url = "https://files.pythonhosted.org/packages/49/0a/2fe387f825ac5aad5a0bfe221904882106cac58e1b693ba7818785a882b6/sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f", size = 1181597 },
1957
+ { url = "https://files.pythonhosted.org/packages/cc/38/e4698ee2293fe4835dc033c49796a39b3eebd8752098f6bd0aa53a14af1f/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08", size = 1259220 },
1958
+ { url = "https://files.pythonhosted.org/packages/12/24/fd7ef967c9dad2f6e6e5386d0cadaf65cda8b7be6e3861a9ab3121035139/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7", size = 1355962 },
1959
+ { url = "https://files.pythonhosted.org/packages/4f/d2/18246f43ca730bb81918f87b7e886531eda32d835811ad9f4657c54eee35/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109", size = 1301706 },
1960
+ { url = "https://files.pythonhosted.org/packages/8a/47/ca237b562f420044ab56ddb4c278672f7e8c866e183730a20e413b38a989/sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = "sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251", size = 936941 },
1961
+ { url = "https://files.pythonhosted.org/packages/c6/97/d159c32642306ee2b70732077632895438867b3b6df282354bd550cf2a67/sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f", size = 991994 },
1962
+ ]
1963
+
1964
  [[package]]
1965
  name = "setuptools"
1966
  version = "75.3.0"
 
2009
  { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 },
2010
  ]
2011
 
2012
+ [[package]]
2013
+ name = "tiktoken"
2014
+ version = "0.8.0"
2015
+ source = { registry = "https://pypi.org/simple" }
2016
+ dependencies = [
2017
+ { name = "regex" },
2018
+ { name = "requests" },
2019
+ ]
2020
+ sdist = { url = "https://files.pythonhosted.org/packages/37/02/576ff3a6639e755c4f70997b2d315f56d6d71e0d046f4fb64cb81a3fb099/tiktoken-0.8.0.tar.gz", hash = "sha256:9ccbb2740f24542534369c5635cfd9b2b3c2490754a78ac8831d99f89f94eeb2", size = 35107 }
2021
+ wheels = [
2022
+ { url = "https://files.pythonhosted.org/packages/c9/ba/a35fad753bbca8ba0cc1b0f3402a70256a110ced7ac332cf84ba89fc87ab/tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b07e33283463089c81ef1467180e3e00ab00d46c2c4bbcef0acab5f771d6695e", size = 1039905 },
2023
+ { url = "https://files.pythonhosted.org/packages/91/05/13dab8fd7460391c387b3e69e14bf1e51ff71fe0a202cd2933cc3ea93fb6/tiktoken-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9269348cb650726f44dd3bbb3f9110ac19a8dcc8f54949ad3ef652ca22a38e21", size = 982417 },
2024
+ { url = "https://files.pythonhosted.org/packages/e9/98/18ec4a8351a6cf4537e40cd6e19a422c10cce1ef00a2fcb716e0a96af58b/tiktoken-0.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e13f37bc4ef2d012731e93e0fef21dc3b7aea5bb9009618de9a4026844e560", size = 1144915 },
2025
+ { url = "https://files.pythonhosted.org/packages/2e/28/cf3633018cbcc6deb7805b700ccd6085c9a5a7f72b38974ee0bffd56d311/tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f13d13c981511331eac0d01a59b5df7c0d4060a8be1e378672822213da51e0a2", size = 1177221 },
2026
+ { url = "https://files.pythonhosted.org/packages/57/81/8a5be305cbd39d4e83a794f9e80c7f2c84b524587b7feb27c797b2046d51/tiktoken-0.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6b2ddbc79a22621ce8b1166afa9f9a888a664a579350dc7c09346a3b5de837d9", size = 1237398 },
2027
+ { url = "https://files.pythonhosted.org/packages/dc/da/8d1cc3089a83f5cf11c2e489332752981435280285231924557350523a59/tiktoken-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d8c2d0e5ba6453a290b86cd65fc51fedf247e1ba170191715b049dac1f628005", size = 884215 },
2028
+ { url = "https://files.pythonhosted.org/packages/f6/1e/ca48e7bfeeccaf76f3a501bd84db1fa28b3c22c9d1a1f41af9fb7579c5f6/tiktoken-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d622d8011e6d6f239297efa42a2657043aaed06c4f68833550cac9e9bc723ef1", size = 1039700 },
2029
+ { url = "https://files.pythonhosted.org/packages/8c/f8/f0101d98d661b34534769c3818f5af631e59c36ac6d07268fbfc89e539ce/tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2efaf6199717b4485031b4d6edb94075e4d79177a172f38dd934d911b588d54a", size = 982413 },
2030
+ { url = "https://files.pythonhosted.org/packages/ac/3c/2b95391d9bd520a73830469f80a96e3790e6c0a5ac2444f80f20b4b31051/tiktoken-0.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5637e425ce1fc49cf716d88df3092048359a4b3bbb7da762840426e937ada06d", size = 1144242 },
2031
+ { url = "https://files.pythonhosted.org/packages/01/c4/c4a4360de845217b6aa9709c15773484b50479f36bb50419c443204e5de9/tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fb0e352d1dbe15aba082883058b3cce9e48d33101bdaac1eccf66424feb5b47", size = 1176588 },
2032
+ { url = "https://files.pythonhosted.org/packages/f8/a3/ef984e976822cd6c2227c854f74d2e60cf4cd6fbfca46251199914746f78/tiktoken-0.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56edfefe896c8f10aba372ab5706b9e3558e78db39dd497c940b47bf228bc419", size = 1237261 },
2033
+ { url = "https://files.pythonhosted.org/packages/1e/86/eea2309dc258fb86c7d9b10db536434fc16420feaa3b6113df18b23db7c2/tiktoken-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:326624128590def898775b722ccc327e90b073714227175ea8febbc920ac0a99", size = 884537 },
2034
+ { url = "https://files.pythonhosted.org/packages/c1/22/34b2e136a6f4af186b6640cbfd6f93400783c9ef6cd550d9eab80628d9de/tiktoken-0.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:881839cfeae051b3628d9823b2e56b5cc93a9e2efb435f4cf15f17dc45f21586", size = 1039357 },
2035
+ { url = "https://files.pythonhosted.org/packages/04/d2/c793cf49c20f5855fd6ce05d080c0537d7418f22c58e71f392d5e8c8dbf7/tiktoken-0.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fe9399bdc3f29d428f16a2f86c3c8ec20be3eac5f53693ce4980371c3245729b", size = 982616 },
2036
+ { url = "https://files.pythonhosted.org/packages/b3/a1/79846e5ef911cd5d75c844de3fa496a10c91b4b5f550aad695c5df153d72/tiktoken-0.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a58deb7075d5b69237a3ff4bb51a726670419db6ea62bdcd8bd80c78497d7ab", size = 1144011 },
2037
+ { url = "https://files.pythonhosted.org/packages/26/32/e0e3a859136e95c85a572e4806dc58bf1ddf651108ae8b97d5f3ebe1a244/tiktoken-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2908c0d043a7d03ebd80347266b0e58440bdef5564f84f4d29fb235b5df3b04", size = 1175432 },
2038
+ { url = "https://files.pythonhosted.org/packages/c7/89/926b66e9025b97e9fbabeaa59048a736fe3c3e4530a204109571104f921c/tiktoken-0.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:294440d21a2a51e12d4238e68a5972095534fe9878be57d905c476017bff99fc", size = 1236576 },
2039
+ { url = "https://files.pythonhosted.org/packages/45/e2/39d4aa02a52bba73b2cd21ba4533c84425ff8786cc63c511d68c8897376e/tiktoken-0.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:d8f3192733ac4d77977432947d563d7e1b310b96497acd3c196c9bddb36ed9db", size = 883824 },
2040
+ { url = "https://files.pythonhosted.org/packages/e3/38/802e79ba0ee5fcbf240cd624143f57744e5d411d2e9d9ad2db70d8395986/tiktoken-0.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:02be1666096aff7da6cbd7cdaa8e7917bfed3467cd64b38b1f112e96d3b06a24", size = 1039648 },
2041
+ { url = "https://files.pythonhosted.org/packages/b1/da/24cdbfc302c98663fbea66f5866f7fa1048405c7564ab88483aea97c3b1a/tiktoken-0.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94ff53c5c74b535b2cbf431d907fc13c678bbd009ee633a2aca269a04389f9a", size = 982763 },
2042
+ { url = "https://files.pythonhosted.org/packages/e4/f0/0ecf79a279dfa41fc97d00adccf976ecc2556d3c08ef3e25e45eb31f665b/tiktoken-0.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b231f5e8982c245ee3065cd84a4712d64692348bc609d84467c57b4b72dcbc5", size = 1144417 },
2043
+ { url = "https://files.pythonhosted.org/packages/ab/d3/155d2d4514f3471a25dc1d6d20549ef254e2aa9bb5b1060809b1d3b03d3a/tiktoken-0.8.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4177faa809bd55f699e88c96d9bb4635d22e3f59d635ba6fd9ffedf7150b9953", size = 1175108 },
2044
+ { url = "https://files.pythonhosted.org/packages/19/eb/5989e16821ee8300ef8ee13c16effc20dfc26c777d05fbb6825e3c037b81/tiktoken-0.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5376b6f8dc4753cd81ead935c5f518fa0fbe7e133d9e25f648d8c4dabdd4bad7", size = 1236520 },
2045
+ { url = "https://files.pythonhosted.org/packages/40/59/14b20465f1d1cb89cfbc96ec27e5617b2d41c79da12b5e04e96d689be2a7/tiktoken-0.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:18228d624807d66c87acd8f25fc135665617cab220671eb65b50f5d70fa51f69", size = 883849 },
2046
+ ]
2047
+
2048
  [[package]]
2049
  name = "tokenizers"
2050
  version = "0.20.1"