davidpomerenke commited on
Commit
34983b9
·
verified ·
1 Parent(s): 5519510

Upload from GitHub Actions: Add citations

Browse files
Files changed (1) hide show
  1. bibliography.bib +138 -4
bibliography.bib CHANGED
@@ -8,6 +8,23 @@
8
  file = {/Users/david/Zotero/storage/AU759RXC/nllb-200-high-quality-machine-translation.html}
9
  }
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  @misc{adelaniSIB200SimpleInclusive2024,
12
  title = {{{SIB-200}}: {{A Simple}}, {{Inclusive}}, and {{Big Evaluation Dataset}} for {{Topic Classification}} in 200+ {{Languages}} and {{Dialects}}},
13
  shorttitle = {{{SIB-200}}},
@@ -47,6 +64,23 @@
47
  file = {/Users/david/Zotero/storage/Q8A3WGUG/Ahuja et al. - 2024 - MEGAVERSE Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks.pdf;/Users/david/Zotero/storage/ZHA8FR3E/2311.html}
48
  }
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  @misc{bapnaBuildingMachineTranslation2022,
51
  title = {Building {{Machine Translation Systems}} for the {{Next Thousand Languages}}},
52
  author = {Bapna, Ankur and Caswell, Isaac and Kreutzer, Julia and Firat, Orhan and van Esch, Daan and Siddhant, Aditya and Niu, Mengmeng and Baljekar, Pallavi and Garcia, Xavier and Macherey, Wolfgang and Breiner, Theresa and Axelrod, Vera and Riesa, Jason and Cao, Yuan and Chen, Mia Xu and Macherey, Klaus and Krikun, Maxim and Wang, Pidong and Gutkin, Alexander and Shah, Apurva and Huang, Yanping and Chen, Zhifeng and Wu, Yonghui and Hughes, Macduff},
@@ -63,7 +97,7 @@
63
  file = {/Users/david/Zotero/storage/YCW6FWWE/Bapna et al. - 2022 - Building Machine Translation Systems for the Next Thousand Languages.pdf;/Users/david/Zotero/storage/EL7PA6YJ/2205.html}
64
  }
65
 
66
- @article{costa-jussaScalingNeuralMachine2024,
67
  title = {Scaling Neural Machine Translation to 200 Languages},
68
  author = {{Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff and {NLLB Team}},
69
  year = {2024},
@@ -75,12 +109,11 @@
75
  publisher = {Nature Publishing Group},
76
  issn = {1476-4687},
77
  doi = {10.1038/s41586-024-07335-x},
78
- urldate = {2024-11-02},
79
  abstract = {The development of neural techniques has opened up new avenues for research in machine translation. Today, neural machine translation (NMT) systems can leverage highly multilingual capacities and even perform zero-shot translation, delivering promising results in terms of language coverage and quality. However, scaling quality NMT requires large volumes of parallel bilingual data, which are not equally available for the 7,000+ languages in the world1. Focusing on improving the translation qualities of a relatively small group of high-resource languages comes at the expense of directing research attention to low-resource languages, exacerbating digital inequities in the long run. To break this pattern, here we introduce No Language Left Behind---a single massively multilingual model that leverages transfer learning across languages. We developed a conditional computational model based on the Sparsely Gated Mixture of Experts architecture2--7, which we trained on data obtained with new mining techniques tailored for low-resource languages. Furthermore, we devised multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. We evaluated the performance of our model over 40,000 translation directions using tools created specifically for this purpose---an automatic benchmark (FLORES-200), a human evaluation metric (XSTS) and a toxicity detector that covers every language in our model. Compared with the previous state-of-the-art models, our model achieves an average of 44\% improvement in translation quality as measured by BLEU. By demonstrating how to scale NMT to 200 languages and making all contributions in this effort freely available for non-commercial use, our work lays important groundwork for the development of a universal translation system.},
80
  copyright = {2024 Meta},
81
  langid = {english},
82
- keywords = {dataset,model,n=200},
83
- file = {/Users/david/Zotero/storage/R7AB5BR3/Costa-jussà et al. - 2024 - Scaling neural machine translation to 200 languages.pdf}
84
  }
85
 
86
  @book{eberhard2024ethnologue,
@@ -107,6 +140,22 @@
107
  file = {/Users/david/Zotero/storage/E286EDPU/Federmann et al. - 2022 - NTREX-128 – News Test References for MT Evaluation of 128 Languages.pdf}
108
  }
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  @misc{gurgurovLowREmRepositoryWord2024,
111
  title = {{{LowREm}}: {{A Repository}} of {{Word Embeddings}} for 87 {{Low-Resource Languages Enhanced}} with {{Multilingual Graph Knowledge}}},
112
  shorttitle = {{{LowREm}}},
@@ -154,6 +203,23 @@
154
  file = {/Users/david/Zotero/storage/SMRV9HE2/language.html}
155
  }
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  @misc{Lanfrica,
158
  title = {Lanfrica},
159
  urldate = {2024-11-02},
@@ -180,6 +246,15 @@
180
  file = {/Users/david/Zotero/storage/6BYYZ7V2/Maillard et al. - 2023 - Small Data, Big Impact Leveraging Minimal Data for Effective Machine Translation.pdf}
181
  }
182
 
 
 
 
 
 
 
 
 
 
183
  @inproceedings{nekotoParticipatoryResearchLowresourced2020,
184
  title = {Participatory {{Research}} for {{Low-resourced Machine Translation}}: {{A Case Study}} in {{African Languages}}},
185
  shorttitle = {Participatory {{Research}} for {{Low-resourced Machine Translation}}},
@@ -198,6 +273,16 @@
198
  file = {/Users/david/Zotero/storage/JJ2S8CT3/Nekoto et al. - 2020 - Participatory Research for Low-resourced Machine Translation A Case Study in African Languages.pdf}
199
  }
200
 
 
 
 
 
 
 
 
 
 
 
201
  @misc{OpenlanguagedataFlores2024,
202
  title = {Openlanguagedata/Flores},
203
  year = {2024},
@@ -243,6 +328,22 @@
243
  file = {/Users/david/Zotero/storage/BWFBTAZ9/Robinson et al. - 2023 - ChatGPT MT Competitive for High- (but Not Low-) Resource Languages.pdf}
244
  }
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  @misc{siminyuAI4DAfricanLanguage2021,
247
  title = {{{AI4D}} -- {{African Language Program}}},
248
  author = {Siminyu, Kathleen and Kalipe, Godson and Orlic, Davor and Abbott, Jade and Marivate, Vukosi and Freshia, Sackey and Sibal, Prateek and Neupane, Bhanu and Adelani, David I. and Taylor, Amelia and ALI, Jamiil Toure and Degila, Kevin and Balogoun, Momboladji and DIOP, Thierno Ibrahima and David, Davis and Fourati, Chayma and Haddad, Hatem and Naski, Malek},
@@ -259,6 +360,23 @@
259
  file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html}
260
  }
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  @misc{Tatoeba,
263
  title = {Tatoeba},
264
  urldate = {2024-11-03},
@@ -281,3 +399,19 @@
281
  keywords = {dataset,model,n=200},
282
  file = {/Users/david/Zotero/storage/GHWEGFFS/Team et al. - 2022 - No Language Left Behind Scaling Human-Centered Machine Translation.pdf;/Users/david/Zotero/storage/SZK3CP9C/2207.html}
283
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  file = {/Users/david/Zotero/storage/AU759RXC/nllb-200-high-quality-machine-translation.html}
9
  }
10
 
11
+ @misc{adelaniIrokoBenchNewBenchmark2025,
12
+ title = {{{IrokoBench}}: {{A New Benchmark}} for {{African Languages}} in the {{Age}} of {{Large Language Models}}},
13
+ shorttitle = {{{IrokoBench}}},
14
+ author = {Adelani, David Ifeoluwa and Ojo, Jessica and Azime, Israel Abebe and Zhuang, Jian Yun and Alabi, Jesujoba O. and He, Xuanli and Ochieng, Millicent and Hooker, Sara and Bukula, Andiswa and Lee, En-Shiun Annie and Chukwuneke, Chiamaka and Buzaaba, Happy and Sibanda, Blessing and Kalipe, Godson and Mukiibi, Jonathan and Kabongo, Salomon and Yuehgoh, Foutse and Setaka, Mmasibidi and Ndolela, Lolwethu and Odu, Nkiruka and Mabuya, Rooweither and Muhammad, Shamsuddeen Hassan and Osei, Salomey and Samb, Sokhar and Guge, Tadesse Kebede and Sherman, Tombekai Vangoni and Stenetorp, Pontus},
15
+ year = {2025},
16
+ month = jan,
17
+ number = {arXiv:2406.03368},
18
+ eprint = {2406.03368},
19
+ primaryclass = {cs},
20
+ publisher = {arXiv},
21
+ doi = {10.48550/arXiv.2406.03368},
22
+ urldate = {2025-05-28},
23
+ abstract = {Despite the widespread adoption of Large language models (LLMs), their remarkable capabilities remain limited to a few high-resource languages. Additionally, many low-resource languages (e.g., African languages) are often evaluated only on basic text classification tasks due to the lack of appropriate or comprehensive benchmarks outside of high-resource languages. In this paper, we introduce IrokoBench -- a human-translated benchmark dataset for 17 typologically-diverse low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based question answering (AfriMMLU). We use IrokoBench to evaluate zero-shot, few-shot, and translate-test settings (where test sets are translated into English) across 10 open and six proprietary LLMs. Our evaluation reveals a significant performance gap between high-resource languages (such as English and French) and low-resource African languages. We observe a significant performance gap between open and proprietary models, with the highest performing open model, Gemma 2 27B only at 63\% of the best-performing proprietary model GPT-4o performance. In addition, machine translating the test set to English before evaluation helped to close the gap for larger models that are English-centric, such as Gemma 2 27B and LLaMa 3.1 70B. These findings suggest that more efforts are needed to develop and adapt LLMs for African languages.},
24
+ archiveprefix = {arXiv},
25
+ file = {/Users/david/Zotero/storage/9X39YMAR/Adelani et al. - 2025 - IrokoBench A New Benchmark for African Languages in the Age of Large Language Models.pdf;/Users/david/Zotero/storage/L69AFMRS/2406.html}
26
+ }
27
+
28
  @misc{adelaniSIB200SimpleInclusive2024,
29
  title = {{{SIB-200}}: {{A Simple}}, {{Inclusive}}, and {{Big Evaluation Dataset}} for {{Topic Classification}} in 200+ {{Languages}} and {{Dialects}}},
30
  shorttitle = {{{SIB-200}}},
 
64
  file = {/Users/david/Zotero/storage/Q8A3WGUG/Ahuja et al. - 2024 - MEGAVERSE Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks.pdf;/Users/david/Zotero/storage/ZHA8FR3E/2311.html}
65
  }
66
 
67
+ @misc{aliExpandingFLORESBenchmark2024,
68
+ title = {Expanding {{FLORES}}+ {{Benchmark}} for More {{Low-Resource Settings}}: {{Portuguese-Emakhuwa Machine Translation Evaluation}}},
69
+ shorttitle = {Expanding {{FLORES}}+ {{Benchmark}} for More {{Low-Resource Settings}}},
70
+ author = {Ali, Felermino D. M. Antonio and Cardoso, Henrique Lopes and {Sousa-Silva}, Rui},
71
+ year = {2024},
72
+ month = aug,
73
+ number = {arXiv:2408.11457},
74
+ eprint = {2408.11457},
75
+ primaryclass = {cs},
76
+ publisher = {arXiv},
77
+ doi = {10.48550/arXiv.2408.11457},
78
+ urldate = {2025-05-28},
79
+ abstract = {As part of the Open Language Data Initiative shared tasks, we have expanded the FLORES+ evaluation set to include Emakhuwa, a low-resource language widely spoken in Mozambique. We translated the dev and devtest sets from Portuguese into Emakhuwa, and we detail the translation process and quality assurance measures used. Our methodology involved various quality checks, including post-editing and adequacy assessments. The resulting datasets consist of multiple reference sentences for each source. We present baseline results from training a Neural Machine Translation system and fine-tuning existing multilingual translation models. Our findings suggest that spelling inconsistencies remain a challenge in Emakhuwa. Additionally, the baseline models underperformed on this evaluation set, underscoring the necessity for further research to enhance machine translation quality for Emakhuwa. The data is publicly available at https://huggingface.co/datasets/LIACC/Emakhuwa-FLORES.},
80
+ archiveprefix = {arXiv},
81
+ file = {/Users/david/Zotero/storage/THELPPXB/Ali et al. - 2024 - Expanding FLORES+ Benchmark for more Low-Resource Settings Portuguese-Emakhuwa Machine Translation.pdf;/Users/david/Zotero/storage/NWMT7ZHL/2408.html}
82
+ }
83
+
84
  @misc{bapnaBuildingMachineTranslation2022,
85
  title = {Building {{Machine Translation Systems}} for the {{Next Thousand Languages}}},
86
  author = {Bapna, Ankur and Caswell, Isaac and Kreutzer, Julia and Firat, Orhan and van Esch, Daan and Siddhant, Aditya and Niu, Mengmeng and Baljekar, Pallavi and Garcia, Xavier and Macherey, Wolfgang and Breiner, Theresa and Axelrod, Vera and Riesa, Jason and Cao, Yuan and Chen, Mia Xu and Macherey, Klaus and Krikun, Maxim and Wang, Pidong and Gutkin, Alexander and Shah, Apurva and Huang, Yanping and Chen, Zhifeng and Wu, Yonghui and Hughes, Macduff},
 
97
  file = {/Users/david/Zotero/storage/YCW6FWWE/Bapna et al. - 2022 - Building Machine Translation Systems for the Next Thousand Languages.pdf;/Users/david/Zotero/storage/EL7PA6YJ/2205.html}
98
  }
99
 
100
+ @article{costa-jussaScalingNeuralMachine2024a,
101
  title = {Scaling Neural Machine Translation to 200 Languages},
102
  author = {{Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff and {NLLB Team}},
103
  year = {2024},
 
109
  publisher = {Nature Publishing Group},
110
  issn = {1476-4687},
111
  doi = {10.1038/s41586-024-07335-x},
112
+ urldate = {2025-05-28},
113
  abstract = {The development of neural techniques has opened up new avenues for research in machine translation. Today, neural machine translation (NMT) systems can leverage highly multilingual capacities and even perform zero-shot translation, delivering promising results in terms of language coverage and quality. However, scaling quality NMT requires large volumes of parallel bilingual data, which are not equally available for the 7,000+ languages in the world1. Focusing on improving the translation qualities of a relatively small group of high-resource languages comes at the expense of directing research attention to low-resource languages, exacerbating digital inequities in the long run. To break this pattern, here we introduce No Language Left Behind---a single massively multilingual model that leverages transfer learning across languages. We developed a conditional computational model based on the Sparsely Gated Mixture of Experts architecture2--7, which we trained on data obtained with new mining techniques tailored for low-resource languages. Furthermore, we devised multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. We evaluated the performance of our model over 40,000 translation directions using tools created specifically for this purpose---an automatic benchmark (FLORES-200), a human evaluation metric (XSTS) and a toxicity detector that covers every language in our model. Compared with the previous state-of-the-art models, our model achieves an average of 44\% improvement in translation quality as measured by BLEU. By demonstrating how to scale NMT to 200 languages and making all contributions in this effort freely available for non-commercial use, our work lays important groundwork for the development of a universal translation system.},
114
  copyright = {2024 Meta},
115
  langid = {english},
116
+ file = {/Users/david/Zotero/storage/6RWFDDH5/Costa-jussà et al. - 2024 - Scaling neural machine translation to 200 languages.pdf}
 
117
  }
118
 
119
  @book{eberhard2024ethnologue,
 
140
  file = {/Users/david/Zotero/storage/E286EDPU/Federmann et al. - 2022 - NTREX-128 – News Test References for MT Evaluation of 128 Languages.pdf}
141
  }
142
 
143
+ @inproceedings{gordeevFLORESTranslationMachine2024,
144
+ title = {{{FLORES}}+ {{Translation}} and {{Machine Translation Evaluation}} for the {{Erzya Language}}},
145
+ booktitle = {Proceedings of the {{Ninth Conference}} on {{Machine Translation}}},
146
+ author = {Gordeev, Isai and Kuldin, Sergey and Dale, David},
147
+ editor = {Haddow, Barry and Kocmi, Tom and Koehn, Philipp and Monz, Christof},
148
+ year = {2024},
149
+ month = nov,
150
+ pages = {614--623},
151
+ publisher = {Association for Computational Linguistics},
152
+ address = {Miami, Florida, USA},
153
+ doi = {10.18653/v1/2024.wmt-1.49},
154
+ urldate = {2025-05-28},
155
+ abstract = {This paper introduces a translation of the FLORES+ dataset into the endangered Erzya language, with the goal of evaluating machine translation between this language and any of the other 200 languages already included into FLORES+. This translation was carried out as a part of the Open Language Data shared task at WMT24. We also present a benchmark of existing translation models bases on this dataset and a new translation model that achieves the state-of-the-art quality of translation into Erzya from Russian and English.},
156
+ file = {/Users/david/Zotero/storage/KHZ84I94/Gordeev et al. - 2024 - FLORES+ Translation and Machine Translation Evaluation for the Erzya Language.pdf}
157
+ }
158
+
159
  @misc{gurgurovLowREmRepositoryWord2024,
160
  title = {{{LowREm}}: {{A Repository}} of {{Word Embeddings}} for 87 {{Low-Resource Languages Enhanced}} with {{Multilingual Graph Knowledge}}},
161
  shorttitle = {{{LowREm}}},
 
203
  file = {/Users/david/Zotero/storage/SMRV9HE2/language.html}
204
  }
205
 
206
+ @misc{laiOkapiInstructiontunedLarge2023,
207
+ title = {Okapi: {{Instruction-tuned Large Language Models}} in {{Multiple Languages}} with {{Reinforcement Learning}} from {{Human Feedback}}},
208
+ shorttitle = {Okapi},
209
+ author = {Lai, Viet Dac and Nguyen, Chien Van and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A. and Nguyen, Thien Huu},
210
+ year = {2023},
211
+ month = aug,
212
+ number = {arXiv:2307.16039},
213
+ eprint = {2307.16039},
214
+ primaryclass = {cs},
215
+ publisher = {arXiv},
216
+ doi = {10.48550/arXiv.2307.16039},
217
+ urldate = {2025-05-28},
218
+ abstract = {A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning characterize supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca, Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impacts and accessibility to many other languages in the world. Among a few very recent work to explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at https://github.com/nlp-uoregon/Okapi.},
219
+ archiveprefix = {arXiv},
220
+ file = {/Users/david/Zotero/storage/2GB79E4F/Lai et al. - 2023 - Okapi Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning fro.pdf;/Users/david/Zotero/storage/VZXBL7F2/2307.html}
221
+ }
222
+
223
  @misc{Lanfrica,
224
  title = {Lanfrica},
225
  urldate = {2024-11-02},
 
246
  file = {/Users/david/Zotero/storage/6BYYZ7V2/Maillard et al. - 2023 - Small Data, Big Impact Leveraging Minimal Data for Effective Machine Translation.pdf}
247
  }
248
 
249
+ @misc{MasakhaneAfrimmluDatasets2024,
250
+ title = {Masakhane/Afrimmlu {$\cdot$} {{Datasets}} at {{Hugging Face}}},
251
+ year = {2024},
252
+ month = jun,
253
+ urldate = {2025-05-28},
254
+ abstract = {We're on a journey to advance and democratize artificial intelligence through open source and open science.},
255
+ url = {https://huggingface.co/datasets/masakhane/afrimmlu}
256
+ }
257
+
258
  @inproceedings{nekotoParticipatoryResearchLowresourced2020,
259
  title = {Participatory {{Research}} for {{Low-resourced Machine Translation}}: {{A Case Study}} in {{African Languages}}},
260
  shorttitle = {Participatory {{Research}} for {{Low-resourced Machine Translation}}},
 
273
  file = {/Users/david/Zotero/storage/JJ2S8CT3/Nekoto et al. - 2020 - Participatory Research for Low-resourced Machine Translation A Case Study in African Languages.pdf}
274
  }
275
 
276
+ @misc{OpenaiMMMLUDatasets2024,
277
+ title = {Openai/{{MMMLU}} {$\cdot$} {{Datasets}} at {{Hugging Face}}},
278
+ year = {2024},
279
+ month = oct,
280
+ urldate = {2025-05-28},
281
+ abstract = {We're on a journey to advance and democratize artificial intelligence through open source and open science.},
282
+ url = {https://huggingface.co/datasets/openai/MMMLU},
283
+ file = {/Users/david/Zotero/storage/LPEN8SFL/MMMLU.html}
284
+ }
285
+
286
  @misc{OpenlanguagedataFlores2024,
287
  title = {Openlanguagedata/Flores},
288
  year = {2024},
 
328
  file = {/Users/david/Zotero/storage/BWFBTAZ9/Robinson et al. - 2023 - ChatGPT MT Competitive for High- (but Not Low-) Resource Languages.pdf}
329
  }
330
 
331
+ @misc{shiLanguageModelsAre2022,
332
+ title = {Language {{Models}} Are {{Multilingual Chain-of-Thought Reasoners}}},
333
+ author = {Shi, Freda and Suzgun, Mirac and Freitag, Markus and Wang, Xuezhi and Srivats, Suraj and Vosoughi, Soroush and Chung, Hyung Won and Tay, Yi and Ruder, Sebastian and Zhou, Denny and Das, Dipanjan and Wei, Jason},
334
+ year = {2022},
335
+ month = oct,
336
+ number = {arXiv:2210.03057},
337
+ eprint = {2210.03057},
338
+ primaryclass = {cs},
339
+ publisher = {arXiv},
340
+ doi = {10.48550/arXiv.2210.03057},
341
+ urldate = {2025-05-28},
342
+ abstract = {We evaluate the reasoning abilities of large language models in multilingual settings. We introduce the Multilingual Grade School Math (MGSM) benchmark, by manually translating 250 grade-school math problems from the GSM8K dataset (Cobbe et al., 2021) into ten typologically diverse languages. We find that the ability to solve MGSM problems via chain-of-thought prompting emerges with increasing model scale, and that models have strikingly strong multilingual reasoning abilities, even in underrepresented languages such as Bengali and Swahili. Finally, we show that the multilingual reasoning abilities of language models extend to other tasks such as commonsense reasoning and word-in-context semantic judgment. The MGSM benchmark is publicly available at https://github.com/google-research/url-nlp.},
343
+ archiveprefix = {arXiv},
344
+ file = {/Users/david/Zotero/storage/3W9ATYCI/Shi et al. - 2022 - Language Models are Multilingual Chain-of-Thought Reasoners.pdf;/Users/david/Zotero/storage/4HED3DGQ/2210.html}
345
+ }
346
+
347
  @misc{siminyuAI4DAfricanLanguage2021,
348
  title = {{{AI4D}} -- {{African Language Program}}},
349
  author = {Siminyu, Kathleen and Kalipe, Godson and Orlic, Davor and Abbott, Jade and Marivate, Vukosi and Freshia, Sackey and Sibal, Prateek and Neupane, Bhanu and Adelani, David I. and Taylor, Amelia and ALI, Jamiil Toure and Degila, Kevin and Balogoun, Momboladji and DIOP, Thierno Ibrahima and David, Davis and Fourati, Chayma and Haddad, Hatem and Naski, Malek},
 
360
  file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html}
361
  }
362
 
363
+ @misc{singhGlobalMMLUUnderstanding2025,
364
+ title = {Global {{MMLU}}: {{Understanding}} and {{Addressing Cultural}} and {{Linguistic Biases}} in {{Multilingual Evaluation}}},
365
+ shorttitle = {Global {{MMLU}}},
366
+ author = {Singh, Shivalika and Romanou, Angelika and Fourrier, Cl{\'e}mentine and Adelani, David I. and Ngui, Jian Gang and {Vila-Suero}, Daniel and Limkonchotiwat, Peerat and Marchisio, Kelly and Leong, Wei Qi and Susanto, Yosephine and Ng, Raymond and Longpre, Shayne and Ko, Wei-Yin and Ruder, Sebastian and Smith, Madeline and Bosselut, Antoine and Oh, Alice and Martins, Andre F. T. and Choshen, Leshem and Ippolito, Daphne and Ferrante, Enzo and Fadaee, Marzieh and Ermis, Beyza and Hooker, Sara},
367
+ year = {2025},
368
+ month = feb,
369
+ number = {arXiv:2412.03304},
370
+ eprint = {2412.03304},
371
+ primaryclass = {cs},
372
+ publisher = {arXiv},
373
+ doi = {10.48550/arXiv.2412.03304},
374
+ urldate = {2025-05-28},
375
+ abstract = {Cultural biases in multilingual datasets pose significant challenges for their effectiveness as global benchmarks. These biases stem not only from differences in language but also from the cultural knowledge required to interpret questions, reducing the practical utility of translated datasets like MMLU. Furthermore, translation often introduces artefacts that can distort the meaning or clarity of questions in the target language. A common practice in multilingual evaluation is to rely on machine-translated evaluation sets, but simply translating a dataset is insufficient to address these challenges. In this work, we trace the impact of both of these issues on multilingual evaluations and ensuing model performances. Our large-scale evaluation of state-of-the-art open and proprietary models illustrates that progress on MMLU depends heavily on learning Western-centric concepts, with 28\% of all questions requiring culturally sensitive knowledge. Moreover, for questions requiring geographic knowledge, an astounding 84.9\% focus on either North American or European regions. Rankings of model evaluations change depending on whether they are evaluated on the full portion or the subset of questions annotated as culturally sensitive, showing the distortion to model rankings when blindly relying on translated MMLU. We release Global MMLU, an improved MMLU with evaluation coverage across 42 languages -- with improved overall quality by engaging with compensated professional and community annotators to verify translation quality while also rigorously evaluating cultural biases present in the original dataset. This comprehensive Global MMLU set also includes designated subsets labeled as culturally sensitive and culturally agnostic to allow for more holistic, complete evaluation.},
376
+ archiveprefix = {arXiv},
377
+ file = {/Users/david/Zotero/storage/23LXUKPU/Singh et al. - 2025 - Global MMLU Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation.pdf;/Users/david/Zotero/storage/563VJ87N/2412.html}
378
+ }
379
+
380
  @misc{Tatoeba,
381
  title = {Tatoeba},
382
  urldate = {2024-11-03},
 
399
  keywords = {dataset,model,n=200},
400
  file = {/Users/david/Zotero/storage/GHWEGFFS/Team et al. - 2022 - No Language Left Behind Scaling Human-Centered Machine Translation.pdf;/Users/david/Zotero/storage/SZK3CP9C/2207.html}
401
  }
402
+
403
+ @misc{thellmannMultilingualLLMEvaluation2024,
404
+ title = {Towards {{Multilingual LLM Evaluation}} for {{European Languages}}},
405
+ author = {Thellmann, Klaudia and Stadler, Bernhard and Fromm, Michael and Buschhoff, Jasper Schulze and Jude, Alex and Barth, Fabio and Leveling, Johannes and {Flores-Herr}, Nicolas and K{\"o}hler, Joachim and J{\"a}kel, Ren{\'e} and Ali, Mehdi},
406
+ year = {2024},
407
+ month = oct,
408
+ number = {arXiv:2410.08928},
409
+ eprint = {2410.08928},
410
+ primaryclass = {cs},
411
+ publisher = {arXiv},
412
+ doi = {10.48550/arXiv.2410.08928},
413
+ urldate = {2025-05-28},
414
+ abstract = {The rise of Large Language Models (LLMs) has revolutionized natural language processing across numerous languages and tasks. However, evaluating LLM performance in a consistent and meaningful way across multiple European languages remains challenging, especially due to the scarcity of language-parallel multilingual benchmarks. We introduce a multilingual evaluation approach tailored for European languages. We employ translated versions of five widely-used benchmarks to assess the capabilities of 40 LLMs across 21 European languages. Our contributions include examining the effectiveness of translated benchmarks, assessing the impact of different translation services, and offering a multilingual evaluation framework for LLMs that includes newly created datasets: EU20-MMLU, EU20-HellaSwag, EU20-ARC, EU20-TruthfulQA, and EU20-GSM8K. The benchmarks and results are made publicly available to encourage further research in multilingual LLM evaluation.},
415
+ archiveprefix = {arXiv},
416
+ file = {/Users/david/Zotero/storage/DCEINCKD/Thellmann et al. - 2024 - Towards Multilingual LLM Evaluation for European Languages.pdf;/Users/david/Zotero/storage/EAQS33RW/2410.html}
417
+ }