|
class NllbLang:
    """One NLLB language entry, optionally paired with a Whisper language.

    Attributes:
        code: NLLB language code as written in the table (e.g. ``'eng_Latn'``).
        name: Human-readable NLLB language name.
        code_whisper: Whisper language code paired with this entry, or
            ``None`` when no pairing is given.
        name_whisper: Whisper language name paired with this entry, or
            ``None`` when no pairing is given.
    """

    def __init__(self, code, name, code_whisper=None, name_whisper=None):
        # NLLB side of the mapping.
        self.code, self.name = code, name
        # Whisper side of the mapping; both stay None when omitted.
        self.code_whisper, self.name_whisper = code_whisper, name_whisper

    def __str__(self):
        # Only the NLLB code and name are shown; the Whisper fields are omitted.
        return f"Language(code={self.code}, name={self.name})"
|
|
|
# Table of NLLB languages. Each entry is
# NllbLang(nllb_code, nllb_name[, whisper_code, whisper_name]); the two Whisper
# fields are given only for entries paired with a Whisper language.
# Several entries may share one Whisper code (e.g. 'ar', 'az', 'zh').
NLLB_LANGS = [
    NllbLang('ace_Arab', 'Acehnese (Arabic script)'),
    NllbLang('ace_Latn', 'Acehnese (Latin script)'),
    NllbLang('acm_Arab', 'Mesopotamian Arabic', 'ar', 'Arabic'),
    NllbLang('acq_Arab', 'Ta’izzi-Adeni Arabic', 'ar', 'Arabic'),
    NllbLang('aeb_Arab', 'Tunisian Arabic'),
    # Afrikaans pairs with Whisper 'af' (it was wrongly tagged 'am'/'Amharic').
    NllbLang('afr_Latn', 'Afrikaans', 'af', 'Afrikaans'),
    NllbLang('ajp_Arab', 'South Levantine Arabic', 'ar', 'Arabic'),
    NllbLang('aka_Latn', 'Akan'),
    # Amharic owns the Whisper 'am' code.
    NllbLang('amh_Ethi', 'Amharic', 'am', 'Amharic'),
    NllbLang('apc_Arab', 'North Levantine Arabic', 'ar', 'Arabic'),
    NllbLang('arb_Arab', 'Modern Standard Arabic', 'ar', 'Arabic'),
    NllbLang('arb_Latn', 'Modern Standard Arabic (Romanized)'),
    NllbLang('ars_Arab', 'Najdi Arabic', 'ar', 'Arabic'),
    NllbLang('ary_Arab', 'Moroccan Arabic', 'ar', 'Arabic'),
    NllbLang('arz_Arab', 'Egyptian Arabic', 'ar', 'Arabic'),
    NllbLang('asm_Beng', 'Assamese', 'as', 'Assamese'),
    NllbLang('ast_Latn', 'Asturian'),
    NllbLang('awa_Deva', 'Awadhi'),
    NllbLang('ayr_Latn', 'Central Aymara'),
    NllbLang('azb_Arab', 'South Azerbaijani', 'az', 'Azerbaijani'),
    NllbLang('azj_Latn', 'North Azerbaijani', 'az', 'Azerbaijani'),
    NllbLang('bak_Cyrl', 'Bashkir', 'ba', 'Bashkir'),
    NllbLang('bam_Latn', 'Bambara'),
    NllbLang('ban_Latn', 'Balinese'),
    NllbLang('bel_Cyrl', 'Belarusian', 'be', 'Belarusian'),
    NllbLang('bem_Latn', 'Bemba'),
    NllbLang('ben_Beng', 'Bengali', 'bn', 'Bengali'),
    NllbLang('bho_Deva', 'Bhojpuri'),
    NllbLang('bjn_Arab', 'Banjar (Arabic script)'),
    NllbLang('bjn_Latn', 'Banjar (Latin script)'),
    NllbLang('bod_Tibt', 'Standard Tibetan', 'bo', 'Tibetan'),
    NllbLang('bos_Latn', 'Bosnian', 'bs', 'Bosnian'),
    NllbLang('bug_Latn', 'Buginese'),
    NllbLang('bul_Cyrl', 'Bulgarian', 'bg', 'Bulgarian'),
    NllbLang('cat_Latn', 'Catalan', 'ca', 'Catalan'),
    NllbLang('ceb_Latn', 'Cebuano'),
    NllbLang('ces_Latn', 'Czech', 'cs', 'Czech'),
    NllbLang('cjk_Latn', 'Chokwe'),
    NllbLang('ckb_Arab', 'Central Kurdish'),
    NllbLang('crh_Latn', 'Crimean Tatar'),
    NllbLang('cym_Latn', 'Welsh', 'cy', 'Welsh'),
    NllbLang('dan_Latn', 'Danish', 'da', 'Danish'),
    NllbLang('deu_Latn', 'German', 'de', 'German'),
    NllbLang('dik_Latn', 'Southwestern Dinka'),
    NllbLang('dyu_Latn', 'Dyula'),
    NllbLang('dzo_Tibt', 'Dzongkha'),
    NllbLang('ell_Grek', 'Greek', 'el', 'Greek'),
    NllbLang('eng_Latn', 'English', 'en', 'English'),
    NllbLang('epo_Latn', 'Esperanto'),
    NllbLang('est_Latn', 'Estonian', 'et', 'Estonian'),
    NllbLang('eus_Latn', 'Basque', 'eu', 'Basque'),
    NllbLang('ewe_Latn', 'Ewe'),
    NllbLang('fao_Latn', 'Faroese', 'fo', 'Faroese'),
    NllbLang('fij_Latn', 'Fijian'),
    NllbLang('fin_Latn', 'Finnish', 'fi', 'Finnish'),
    NllbLang('fon_Latn', 'Fon'),
    NllbLang('fra_Latn', 'French', 'fr', 'French'),
    NllbLang('fur_Latn', 'Friulian'),
    NllbLang('fuv_Latn', 'Nigerian Fulfulde'),
    NllbLang('gla_Latn', 'Scottish Gaelic'),
    NllbLang('gle_Latn', 'Irish'),
    NllbLang('glg_Latn', 'Galician', 'gl', 'Galician'),
    NllbLang('grn_Latn', 'Guarani'),
    NllbLang('guj_Gujr', 'Gujarati', 'gu', 'Gujarati'),
    NllbLang('hat_Latn', 'Haitian Creole', 'ht', 'Haitian creole'),
    NllbLang('hau_Latn', 'Hausa', 'ha', 'Hausa'),
    NllbLang('heb_Hebr', 'Hebrew', 'he', 'Hebrew'),
    NllbLang('hin_Deva', 'Hindi', 'hi', 'Hindi'),
    NllbLang('hne_Deva', 'Chhattisgarhi'),
    NllbLang('hrv_Latn', 'Croatian', 'hr', 'Croatian'),
    NllbLang('hun_Latn', 'Hungarian', 'hu', 'Hungarian'),
    NllbLang('hye_Armn', 'Armenian', 'hy', 'Armenian'),
    NllbLang('ibo_Latn', 'Igbo'),
    NllbLang('ilo_Latn', 'Ilocano'),
    NllbLang('ind_Latn', 'Indonesian', 'id', 'Indonesian'),
    NllbLang('isl_Latn', 'Icelandic', 'is', 'Icelandic'),
    NllbLang('ita_Latn', 'Italian', 'it', 'Italian'),
    NllbLang('jav_Latn', 'Javanese', 'jw', 'Javanese'),
    NllbLang('jpn_Jpan', 'Japanese', 'ja', 'Japanese'),
    NllbLang('kab_Latn', 'Kabyle'),
    NllbLang('kac_Latn', 'Jingpho'),
    NllbLang('kam_Latn', 'Kamba'),
    NllbLang('kan_Knda', 'Kannada', 'kn', 'Kannada'),
    NllbLang('kas_Arab', 'Kashmiri (Arabic script)'),
    NllbLang('kas_Deva', 'Kashmiri (Devanagari script)'),
    NllbLang('kat_Geor', 'Georgian', 'ka', 'Georgian'),
    NllbLang('knc_Arab', 'Central Kanuri (Arabic script)'),
    NllbLang('knc_Latn', 'Central Kanuri (Latin script)'),
    NllbLang('kaz_Cyrl', 'Kazakh', 'kk', 'Kazakh'),
    NllbLang('kbp_Latn', 'Kabiyè'),
    NllbLang('kea_Latn', 'Kabuverdianu'),
    NllbLang('khm_Khmr', 'Khmer', 'km', 'Khmer'),
    NllbLang('kik_Latn', 'Kikuyu'),
    NllbLang('kin_Latn', 'Kinyarwanda'),
    NllbLang('kir_Cyrl', 'Kyrgyz'),
    NllbLang('kmb_Latn', 'Kimbundu'),
    NllbLang('kmr_Latn', 'Northern Kurdish'),
    NllbLang('kon_Latn', 'Kikongo'),
    NllbLang('kor_Hang', 'Korean', 'ko', 'Korean'),
    NllbLang('lao_Laoo', 'Lao', 'lo', 'Lao'),
    NllbLang('lij_Latn', 'Ligurian'),
    NllbLang('lim_Latn', 'Limburgish'),
    NllbLang('lin_Latn', 'Lingala', 'ln', 'Lingala'),
    NllbLang('lit_Latn', 'Lithuanian', 'lt', 'Lithuanian'),
    NllbLang('lmo_Latn', 'Lombard'),
    NllbLang('ltg_Latn', 'Latgalian'),
    NllbLang('ltz_Latn', 'Luxembourgish', 'lb', 'Luxembourgish'),
    NllbLang('lua_Latn', 'Luba-Kasai'),
    NllbLang('lug_Latn', 'Ganda'),
    NllbLang('luo_Latn', 'Luo'),
    NllbLang('lus_Latn', 'Mizo'),
    NllbLang('lvs_Latn', 'Standard Latvian', 'lv', 'Latvian'),
    NllbLang('mag_Deva', 'Magahi'),
    NllbLang('mai_Deva', 'Maithili'),
    NllbLang('mal_Mlym', 'Malayalam', 'ml', 'Malayalam'),
    NllbLang('mar_Deva', 'Marathi', 'mr', 'Marathi'),
    NllbLang('min_Arab', 'Minangkabau (Arabic script)'),
    NllbLang('min_Latn', 'Minangkabau (Latin script)'),
    NllbLang('mkd_Cyrl', 'Macedonian', 'mk', 'Macedonian'),
    NllbLang('plt_Latn', 'Plateau Malagasy', 'mg', 'Malagasy'),
    NllbLang('mlt_Latn', 'Maltese', 'mt', 'Maltese'),
    NllbLang('mni_Beng', 'Meitei (Bengali script)'),
    NllbLang('khk_Cyrl', 'Halh Mongolian', 'mn', 'Mongolian'),
    NllbLang('mos_Latn', 'Mossi'),
    NllbLang('mri_Latn', 'Maori', 'mi', 'Maori'),
    NllbLang('mya_Mymr', 'Burmese', 'my', 'Myanmar'),
    NllbLang('nld_Latn', 'Dutch', 'nl', 'Dutch'),
    NllbLang('nno_Latn', 'Norwegian Nynorsk', 'nn', 'Nynorsk'),
    NllbLang('nob_Latn', 'Norwegian Bokmål', 'no', 'Norwegian'),
    NllbLang('npi_Deva', 'Nepali', 'ne', 'Nepali'),
    NllbLang('nso_Latn', 'Northern Sotho'),
    NllbLang('nus_Latn', 'Nuer'),
    NllbLang('nya_Latn', 'Nyanja'),
    NllbLang('oci_Latn', 'Occitan', 'oc', 'Occitan'),
    NllbLang('gaz_Latn', 'West Central Oromo'),
    NllbLang('ory_Orya', 'Odia'),
    NllbLang('pag_Latn', 'Pangasinan'),
    NllbLang('pan_Guru', 'Eastern Panjabi', 'pa', 'Punjabi'),
    NllbLang('pap_Latn', 'Papiamento'),
    NllbLang('pes_Arab', 'Western Persian', 'fa', 'Persian'),
    NllbLang('pol_Latn', 'Polish', 'pl', 'Polish'),
    NllbLang('por_Latn', 'Portuguese', 'pt', 'Portuguese'),
    NllbLang('prs_Arab', 'Dari'),
    NllbLang('pbt_Arab', 'Southern Pashto', 'ps', 'Pashto'),
    NllbLang('quy_Latn', 'Ayacucho Quechua'),
    NllbLang('ron_Latn', 'Romanian', 'ro', 'Romanian'),
    NllbLang('run_Latn', 'Rundi'),
    NllbLang('rus_Cyrl', 'Russian', 'ru', 'Russian'),
    NllbLang('sag_Latn', 'Sango'),
    NllbLang('san_Deva', 'Sanskrit', 'sa', 'Sanskrit'),
    NllbLang('sat_Olck', 'Santali'),
    NllbLang('scn_Latn', 'Sicilian'),
    NllbLang('shn_Mymr', 'Shan'),
    NllbLang('sin_Sinh', 'Sinhala', 'si', 'Sinhala'),
    NllbLang('slk_Latn', 'Slovak', 'sk', 'Slovak'),
    NllbLang('slv_Latn', 'Slovenian', 'sl', 'Slovenian'),
    NllbLang('smo_Latn', 'Samoan'),
    NllbLang('sna_Latn', 'Shona', 'sn', 'Shona'),
    NllbLang('snd_Arab', 'Sindhi', 'sd', 'Sindhi'),
    NllbLang('som_Latn', 'Somali', 'so', 'Somali'),
    NllbLang('sot_Latn', 'Southern Sotho'),
    NllbLang('spa_Latn', 'Spanish', 'es', 'Spanish'),
    NllbLang('als_Latn', 'Tosk Albanian', 'sq', 'Albanian'),
    NllbLang('srd_Latn', 'Sardinian'),
    NllbLang('srp_Cyrl', 'Serbian', 'sr', 'Serbian'),
    NllbLang('ssw_Latn', 'Swati'),
    NllbLang('sun_Latn', 'Sundanese', 'su', 'Sundanese'),
    NllbLang('swe_Latn', 'Swedish', 'sv', 'Swedish'),
    NllbLang('swh_Latn', 'Swahili', 'sw', 'Swahili'),
    NllbLang('szl_Latn', 'Silesian'),
    NllbLang('tam_Taml', 'Tamil', 'ta', 'Tamil'),
    NllbLang('tat_Cyrl', 'Tatar', 'tt', 'Tatar'),
    NllbLang('tel_Telu', 'Telugu', 'te', 'Telugu'),
    NllbLang('tgk_Cyrl', 'Tajik', 'tg', 'Tajik'),
    NllbLang('tgl_Latn', 'Tagalog', 'tl', 'Tagalog'),
    NllbLang('tha_Thai', 'Thai', 'th', 'Thai'),
    NllbLang('tir_Ethi', 'Tigrinya'),
    NllbLang('taq_Latn', 'Tamasheq (Latin script)'),
    NllbLang('taq_Tfng', 'Tamasheq (Tifinagh script)'),
    NllbLang('tpi_Latn', 'Tok Pisin'),
    NllbLang('tsn_Latn', 'Tswana'),
    NllbLang('tso_Latn', 'Tsonga'),
    NllbLang('tuk_Latn', 'Turkmen', 'tk', 'Turkmen'),
    NllbLang('tum_Latn', 'Tumbuka'),
    NllbLang('tur_Latn', 'Turkish', 'tr', 'Turkish'),
    NllbLang('twi_Latn', 'Twi'),
    NllbLang('tzm_Tfng', 'Central Atlas Tamazight'),
    NllbLang('uig_Arab', 'Uyghur'),
    NllbLang('ukr_Cyrl', 'Ukrainian', 'uk', 'Ukrainian'),
    NllbLang('umb_Latn', 'Umbundu'),
    NllbLang('urd_Arab', 'Urdu', 'ur', 'Urdu'),
    NllbLang('uzn_Latn', 'Northern Uzbek', 'uz', 'Uzbek'),
    NllbLang('vec_Latn', 'Venetian'),
    NllbLang('vie_Latn', 'Vietnamese', 'vi', 'Vietnamese'),
    NllbLang('war_Latn', 'Waray'),
    NllbLang('wol_Latn', 'Wolof'),
    NllbLang('xho_Latn', 'Xhosa'),
    NllbLang('ydd_Hebr', 'Eastern Yiddish', 'yi', 'Yiddish'),
    NllbLang('yor_Latn', 'Yoruba', 'yo', 'Yoruba'),
    NllbLang('yue_Hant', 'Yue Chinese', 'zh', 'Chinese'),
    NllbLang('zho_Hans', 'Chinese (Simplified)', 'zh', 'Chinese'),
    NllbLang('zho_Hant', 'Chinese (Traditional)', 'zh', 'Chinese'),
    NllbLang('zsm_Latn', 'Standard Malay', 'ms', 'Malay'),
    NllbLang('zul_Latn', 'Zulu'),
]
|
|
|
def _index_langs(attribute):
    """Map the lowercased value of ``attribute`` to its NLLB_LANGS entry.

    Entries whose attribute is ``None`` are skipped. When several entries
    share the same lowercased value, the one appearing last in NLLB_LANGS wins.
    """
    index = {}
    for entry in NLLB_LANGS:
        value = getattr(entry, attribute)
        if value is not None:
            index[value.lower()] = entry
    return index


# Lookup tables keyed by lowercased NLLB code / NLLB name.
_TO_NLLB_LANG_CODE = _index_langs('code')
_TO_NLLB_LANG_NAME = _index_langs('name')

# Lookup tables keyed by lowercased Whisper code / Whisper name. Several NLLB
# entries share one Whisper code (e.g. 'ar', 'zh'), so each such key resolves
# to the last matching entry in NLLB_LANGS.
_TO_NLLB_LANG_WHISPER_CODE = _index_langs('code_whisper')
_TO_NLLB_LANG_WHISPER_NAME = _index_langs('name_whisper')
|
|
|
def get_nllb_lang_from_code(lang_code, default=None) -> NllbLang:
    """Return the language entry for an NLLB language code.

    The lookup is case-insensitive: the index is keyed by lowercased codes,
    so the query is lowercased as well (``'eng_Latn'`` and ``'eng_latn'``
    both match).

    Args:
        lang_code: NLLB language code such as ``'eng_Latn'``.
        default: Value returned when the code is unknown, empty or ``None``.

    Returns:
        The matching ``NllbLang`` entry, or ``default`` when nothing matches.
    """
    # Keys of _TO_NLLB_LANG_CODE are lowercased, so normalise string queries
    # the same way; non-string values are looked up unchanged.
    key = lang_code.lower() if isinstance(lang_code, str) else lang_code
    return _TO_NLLB_LANG_CODE.get(key, default)
|
|
|
def get_nllb_lang_from_name(lang_name, default=None) -> NllbLang:
    """Look up a language entry by its NLLB name, ignoring case.

    Args:
        lang_name: NLLB language name such as ``'English'``.
        default: Value returned when no entry matches.

    Returns:
        The matching ``NllbLang`` entry, or ``default`` when nothing matches.
    """
    # A falsy name (None, '') is looked up under the key None.
    key = None
    if lang_name:
        key = lang_name.lower()
    return _TO_NLLB_LANG_NAME.get(key, default)
|
|
|
def get_nllb_lang_from_code_whisper(lang_code_whisper, default=None) -> NllbLang:
    """Return the language entry for a Whisper language code.

    The lookup is case-insensitive: the index is keyed by lowercased Whisper
    codes, so the query is lowercased as well.

    Args:
        lang_code_whisper: Whisper language code such as ``'en'``.
        default: Value returned when the code is unknown, empty or ``None``.

    Returns:
        The matching ``NllbLang`` entry, or ``default`` when nothing matches.
    """
    # Keys of _TO_NLLB_LANG_WHISPER_CODE are lowercased, so normalise string
    # queries the same way; non-string values are looked up unchanged.
    if isinstance(lang_code_whisper, str):
        key = lang_code_whisper.lower()
    else:
        key = lang_code_whisper
    return _TO_NLLB_LANG_WHISPER_CODE.get(key, default)
|
|
|
def get_nllb_lang_from_name_whisper(lang_name_whisper, default=None) -> NllbLang:
    """Look up a language entry by its Whisper language name, ignoring case.

    Args:
        lang_name_whisper: Whisper language name such as ``'English'``.
        default: Value returned when no entry matches.

    Returns:
        The matching ``NllbLang`` entry, or ``default`` when nothing matches.
    """
    # A falsy name (None, '') is looked up under the key None.
    key = None
    if lang_name_whisper:
        key = lang_name_whisper.lower()
    return _TO_NLLB_LANG_WHISPER_NAME.get(key, default)
|
|
|
def get_nllb_lang_names():
    """Return the NLLB names of all entries, in NLLB_LANGS order."""
    names = []
    for entry in NLLB_LANGS:
        names.append(entry.name)
    return names
|
|
|
if __name__ == "__main__":
    # Manual smoke test: print one lookup by code, one by name, then all names.
    by_code = get_nllb_lang_from_code('eng_Latn')
    print(by_code)

    by_name = get_nllb_lang_from_name('English')
    print(by_name)

    all_names = get_nllb_lang_names()
    print(all_names)