David Pomerenke committed on
Commit
e6f1c56
·
1 Parent(s): 544091e

Use sacrebleu

Browse files
Files changed (4) hide show
  1. languagebench.py +5 -4
  2. pyproject.toml +1 -2
  3. results.json +55 -25
  4. uv.lock +0 -0
languagebench.py CHANGED
@@ -26,16 +26,17 @@ target_languages = [
26
  "fra_Latn",
27
  "spa_Latn",
28
  "cmn_Hans",
 
29
  ]
30
 
31
  # setup
 
32
  client = AsyncOpenAI(
33
  base_url="https://openrouter.ai/api/v1",
34
  api_key=getenv("OPENROUTER_API_KEY"),
35
  )
36
- load_dotenv()
37
  cache = Memory(location=".cache", verbose=0).cache
38
- bleu = evaluate.load("bleu")
39
 
40
 
41
  @cache
@@ -75,11 +76,11 @@ async def main():
75
  "model": model,
76
  "original_language": original_language,
77
  "target_language": target_language,
78
- "bleu": metrics["bleu"],
79
  }
80
  )
81
  with open("results.json", "w") as f:
82
- json.dump(results, f, indent=2)
83
 
84
 
85
  if __name__ == "__main__":
 
26
  "fra_Latn",
27
  "spa_Latn",
28
  "cmn_Hans",
29
+ "cmn_Hant",
30
  ]
31
 
32
  # setup
33
+ load_dotenv()
34
  client = AsyncOpenAI(
35
  base_url="https://openrouter.ai/api/v1",
36
  api_key=getenv("OPENROUTER_API_KEY"),
37
  )
 
38
  cache = Memory(location=".cache", verbose=0).cache
39
+ bleu = evaluate.load("sacrebleu")
40
 
41
 
42
  @cache
 
76
  "model": model,
77
  "original_language": original_language,
78
  "target_language": target_language,
79
+ "bleu": metrics["score"],
80
  }
81
  )
82
  with open("results.json", "w") as f:
83
+ json.dump(results, f, indent=2, ensure_ascii=False)
84
 
85
 
86
  if __name__ == "__main__":
pyproject.toml CHANGED
@@ -9,8 +9,7 @@ dependencies = [
9
  "joblib>=1.4.2",
10
  "openai>=1.52.2",
11
  "pandas>=2.2.3",
12
- "plotly>=5.24.1",
13
  "python-dotenv>=1.0.1",
14
- "streamlit>=1.39.0",
15
  "tqdm>=4.66.6",
16
  ]
 
9
  "joblib>=1.4.2",
10
  "openai>=1.52.2",
11
  "pandas>=2.2.3",
 
12
  "python-dotenv>=1.0.1",
13
+ "sacrebleu>=2.4.3",
14
  "tqdm>=4.66.6",
15
  ]
results.json CHANGED
@@ -3,150 +3,180 @@
3
  "model": "openai/gpt-4o-mini",
4
  "original_language": "eng_Latn",
5
  "target_language": "eng_Latn",
6
- "bleu": 0.9601875101934466
7
  },
8
  {
9
  "model": "google/gemini-flash-1.5",
10
  "original_language": "eng_Latn",
11
  "target_language": "eng_Latn",
12
- "bleu": 0.796483772261889
13
  },
14
  {
15
  "model": "anthropic/claude-3.5-sonnet",
16
  "original_language": "eng_Latn",
17
  "target_language": "eng_Latn",
18
- "bleu": 0.4789694173473208
19
  },
20
  {
21
  "model": "qwen/qwen-2.5-72b-instruct",
22
  "original_language": "eng_Latn",
23
  "target_language": "eng_Latn",
24
- "bleu": 0.5708253125905761
25
  },
26
  {
27
  "model": "meta-llama/llama-3.1-8b-instruct",
28
  "original_language": "eng_Latn",
29
  "target_language": "eng_Latn",
30
- "bleu": 0.7139866196167579
31
  },
32
  {
33
  "model": "openai/gpt-4o-mini",
34
  "original_language": "eng_Latn",
35
  "target_language": "deu_Latn",
36
- "bleu": 0.42769123869791453
37
  },
38
  {
39
  "model": "google/gemini-flash-1.5",
40
  "original_language": "eng_Latn",
41
  "target_language": "deu_Latn",
42
- "bleu": 0.481667025275085
43
  },
44
  {
45
  "model": "anthropic/claude-3.5-sonnet",
46
  "original_language": "eng_Latn",
47
  "target_language": "deu_Latn",
48
- "bleu": 0.47566381880734276
49
  },
50
  {
51
  "model": "qwen/qwen-2.5-72b-instruct",
52
  "original_language": "eng_Latn",
53
  "target_language": "deu_Latn",
54
- "bleu": 0.3886704151083369
55
  },
56
  {
57
  "model": "meta-llama/llama-3.1-8b-instruct",
58
  "original_language": "eng_Latn",
59
  "target_language": "deu_Latn",
60
- "bleu": 0.3229429355718441
61
  },
62
  {
63
  "model": "openai/gpt-4o-mini",
64
  "original_language": "eng_Latn",
65
  "target_language": "fra_Latn",
66
- "bleu": 0.4770220301445618
67
  },
68
  {
69
  "model": "google/gemini-flash-1.5",
70
  "original_language": "eng_Latn",
71
  "target_language": "fra_Latn",
72
- "bleu": 0.4950529382461408
73
  },
74
  {
75
  "model": "anthropic/claude-3.5-sonnet",
76
  "original_language": "eng_Latn",
77
  "target_language": "fra_Latn",
78
- "bleu": 0.505571990673057
79
  },
80
  {
81
  "model": "qwen/qwen-2.5-72b-instruct",
82
  "original_language": "eng_Latn",
83
  "target_language": "fra_Latn",
84
- "bleu": 0.4343766704709354
85
  },
86
  {
87
  "model": "meta-llama/llama-3.1-8b-instruct",
88
  "original_language": "eng_Latn",
89
  "target_language": "fra_Latn",
90
- "bleu": 0.3738013101452592
91
  },
92
  {
93
  "model": "openai/gpt-4o-mini",
94
  "original_language": "eng_Latn",
95
  "target_language": "spa_Latn",
96
- "bleu": 0.34656060748435535
97
  },
98
  {
99
  "model": "google/gemini-flash-1.5",
100
  "original_language": "eng_Latn",
101
  "target_language": "spa_Latn",
102
- "bleu": 0.3449205632717461
103
  },
104
  {
105
  "model": "anthropic/claude-3.5-sonnet",
106
  "original_language": "eng_Latn",
107
  "target_language": "spa_Latn",
108
- "bleu": 0.34586378905270954
109
  },
110
  {
111
  "model": "qwen/qwen-2.5-72b-instruct",
112
  "original_language": "eng_Latn",
113
  "target_language": "spa_Latn",
114
- "bleu": 0.3341419407814188
115
  },
116
  {
117
  "model": "meta-llama/llama-3.1-8b-instruct",
118
  "original_language": "eng_Latn",
119
  "target_language": "spa_Latn",
120
- "bleu": 0.29470460185415065
121
  },
122
  {
123
  "model": "openai/gpt-4o-mini",
124
  "original_language": "eng_Latn",
125
  "target_language": "cmn_Hans",
126
- "bleu": 0.0
127
  },
128
  {
129
  "model": "google/gemini-flash-1.5",
130
  "original_language": "eng_Latn",
131
  "target_language": "cmn_Hans",
132
- "bleu": 0.0
133
  },
134
  {
135
  "model": "anthropic/claude-3.5-sonnet",
136
  "original_language": "eng_Latn",
137
  "target_language": "cmn_Hans",
138
- "bleu": 0.0
139
  },
140
  {
141
  "model": "qwen/qwen-2.5-72b-instruct",
142
  "original_language": "eng_Latn",
143
  "target_language": "cmn_Hans",
144
- "bleu": 0.0
145
  },
146
  {
147
  "model": "meta-llama/llama-3.1-8b-instruct",
148
  "original_language": "eng_Latn",
149
  "target_language": "cmn_Hans",
150
- "bleu": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
152
  ]
 
3
  "model": "openai/gpt-4o-mini",
4
  "original_language": "eng_Latn",
5
  "target_language": "eng_Latn",
6
+ "bleu": 96.0187510193446
7
  },
8
  {
9
  "model": "google/gemini-flash-1.5",
10
  "original_language": "eng_Latn",
11
  "target_language": "eng_Latn",
12
+ "bleu": 79.64837722618887
13
  },
14
  {
15
  "model": "anthropic/claude-3.5-sonnet",
16
  "original_language": "eng_Latn",
17
  "target_language": "eng_Latn",
18
+ "bleu": 47.89694173473209
19
  },
20
  {
21
  "model": "qwen/qwen-2.5-72b-instruct",
22
  "original_language": "eng_Latn",
23
  "target_language": "eng_Latn",
24
+ "bleu": 57.08253125905762
25
  },
26
  {
27
  "model": "meta-llama/llama-3.1-8b-instruct",
28
  "original_language": "eng_Latn",
29
  "target_language": "eng_Latn",
30
+ "bleu": 71.3986619616758
31
  },
32
  {
33
  "model": "openai/gpt-4o-mini",
34
  "original_language": "eng_Latn",
35
  "target_language": "deu_Latn",
36
+ "bleu": 42.76912386979146
37
  },
38
  {
39
  "model": "google/gemini-flash-1.5",
40
  "original_language": "eng_Latn",
41
  "target_language": "deu_Latn",
42
+ "bleu": 48.166702527508484
43
  },
44
  {
45
  "model": "anthropic/claude-3.5-sonnet",
46
  "original_language": "eng_Latn",
47
  "target_language": "deu_Latn",
48
+ "bleu": 47.56638188073429
49
  },
50
  {
51
  "model": "qwen/qwen-2.5-72b-instruct",
52
  "original_language": "eng_Latn",
53
  "target_language": "deu_Latn",
54
+ "bleu": 38.8670415108337
55
  },
56
  {
57
  "model": "meta-llama/llama-3.1-8b-instruct",
58
  "original_language": "eng_Latn",
59
  "target_language": "deu_Latn",
60
+ "bleu": 32.2942935571844
61
  },
62
  {
63
  "model": "openai/gpt-4o-mini",
64
  "original_language": "eng_Latn",
65
  "target_language": "fra_Latn",
66
+ "bleu": 47.70220301445618
67
  },
68
  {
69
  "model": "google/gemini-flash-1.5",
70
  "original_language": "eng_Latn",
71
  "target_language": "fra_Latn",
72
+ "bleu": 49.50529382461407
73
  },
74
  {
75
  "model": "anthropic/claude-3.5-sonnet",
76
  "original_language": "eng_Latn",
77
  "target_language": "fra_Latn",
78
+ "bleu": 50.55719906730571
79
  },
80
  {
81
  "model": "qwen/qwen-2.5-72b-instruct",
82
  "original_language": "eng_Latn",
83
  "target_language": "fra_Latn",
84
+ "bleu": 43.43766704709355
85
  },
86
  {
87
  "model": "meta-llama/llama-3.1-8b-instruct",
88
  "original_language": "eng_Latn",
89
  "target_language": "fra_Latn",
90
+ "bleu": 37.38013101452594
91
  },
92
  {
93
  "model": "openai/gpt-4o-mini",
94
  "original_language": "eng_Latn",
95
  "target_language": "spa_Latn",
96
+ "bleu": 34.65606074843554
97
  },
98
  {
99
  "model": "google/gemini-flash-1.5",
100
  "original_language": "eng_Latn",
101
  "target_language": "spa_Latn",
102
+ "bleu": 34.49205632717459
103
  },
104
  {
105
  "model": "anthropic/claude-3.5-sonnet",
106
  "original_language": "eng_Latn",
107
  "target_language": "spa_Latn",
108
+ "bleu": 34.58637890527096
109
  },
110
  {
111
  "model": "qwen/qwen-2.5-72b-instruct",
112
  "original_language": "eng_Latn",
113
  "target_language": "spa_Latn",
114
+ "bleu": 33.41419407814188
115
  },
116
  {
117
  "model": "meta-llama/llama-3.1-8b-instruct",
118
  "original_language": "eng_Latn",
119
  "target_language": "spa_Latn",
120
+ "bleu": 29.470460185415075
121
  },
122
  {
123
  "model": "openai/gpt-4o-mini",
124
  "original_language": "eng_Latn",
125
  "target_language": "cmn_Hans",
126
+ "bleu": 0.7678283495493847
127
  },
128
  {
129
  "model": "google/gemini-flash-1.5",
130
  "original_language": "eng_Latn",
131
  "target_language": "cmn_Hans",
132
+ "bleu": 0.3178534804335777
133
  },
134
  {
135
  "model": "anthropic/claude-3.5-sonnet",
136
  "original_language": "eng_Latn",
137
  "target_language": "cmn_Hans",
138
+ "bleu": 0.8670958769249191
139
  },
140
  {
141
  "model": "qwen/qwen-2.5-72b-instruct",
142
  "original_language": "eng_Latn",
143
  "target_language": "cmn_Hans",
144
+ "bleu": 0.6796400550094367
145
  },
146
  {
147
  "model": "meta-llama/llama-3.1-8b-instruct",
148
  "original_language": "eng_Latn",
149
  "target_language": "cmn_Hans",
150
+ "bleu": 0.027154305073795664
151
+ },
152
+ {
153
+ "model": "openai/gpt-4o-mini",
154
+ "original_language": "eng_Latn",
155
+ "target_language": "cmn_Hant",
156
+ "bleu": 2.175042632198715
157
+ },
158
+ {
159
+ "model": "google/gemini-flash-1.5",
160
+ "original_language": "eng_Latn",
161
+ "target_language": "cmn_Hant",
162
+ "bleu": 0.3480387797702917
163
+ },
164
+ {
165
+ "model": "anthropic/claude-3.5-sonnet",
166
+ "original_language": "eng_Latn",
167
+ "target_language": "cmn_Hant",
168
+ "bleu": 3.8196828383724886
169
+ },
170
+ {
171
+ "model": "qwen/qwen-2.5-72b-instruct",
172
+ "original_language": "eng_Latn",
173
+ "target_language": "cmn_Hant",
174
+ "bleu": 2.1029807575075994
175
+ },
176
+ {
177
+ "model": "meta-llama/llama-3.1-8b-instruct",
178
+ "original_language": "eng_Latn",
179
+ "target_language": "cmn_Hant",
180
+ "bleu": 0.017008567925605175
181
  }
182
  ]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff