---
language:
- myv
- ru
- fi
- de
- es
- en
- hi
- zh
- tr
- uk
- fr
- ar
tags:
- erzya
- mordovian
- translation
license: cc-by-sa-4.0
datasets:
- slone/myv_ru_2022
- yhavinga/ccmatrix
---

This is a model to translate texts into the Erzya language (`myv`, Cyrillic script) from 11 other languages: `ru, fi, de, es, en, hi, zh, tr, uk, fr, ar`. It is described in the paper "The first neural machine translation system for the Erzya language".

This model is based on [facebook/mbart-large-50](https://huggingface.co/facebook/mbart-large-50), but with an updated vocabulary and checkpoint:

- Added an extra language token `myv_XX` and 19K new BPE tokens for the Erzya language;
- Fine-tuned to translate to Erzya: first from Russian, then from all 11 languages.

The following code can be used to run translation with the model:

```Python
from transformers import MBartForConditionalGeneration, MBart50Tokenizer


def fix_tokenizer(tokenizer):
    """Add the new language token to the tokenizer vocabulary.
    This should be done each time after the tokenizer is initialized.
    """
    # The vocabulary size without the extra token (which may have been added already)
    old_len = len(tokenizer) - int('myv_XX' in tokenizer.added_tokens_encoder)
    tokenizer.lang_code_to_id['myv_XX'] = old_len - 1
    tokenizer.id_to_lang_code[old_len - 1] = 'myv_XX'
    # Keep the mask token in the last position, after all the language codes
    tokenizer.fairseq_tokens_to_ids['<mask>'] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if 'myv_XX' not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append('myv_XX')
    # Clear the added-token encoder, so that new tokens do not end up there by mistake
    tokenizer.added_tokens_encoder = {}


def translate(
    text, model, tokenizer,
    src='ru_RU', trg='myv_XX',
    max_length='auto', num_beams=3, repetition_penalty=5.0,
    train_mode=False, n_out=None, **kwargs
):
    tokenizer.src_lang = src
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    if max_length == 'auto':
        # Scale the output length limit with the input length
        max_length = int(32 + 1.5 * encoded.input_ids.shape[1])
    if train_mode:
        model.train()
    else:
        model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[trg],
        max_length=max_length,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    return out


mname = 'slone/mbart-large-51-mul-myv-v1'
model = MBartForConditionalGeneration.from_pretrained(mname)
tokenizer = MBart50Tokenizer.from_pretrained(mname)
fix_tokenizer(tokenizer)

print(translate('Привет, собака!', model, tokenizer, src='ru_RU', trg='myv_XX'))
# Шумбрат, киска!  # indeed, "киска" is how a dog is called in Erzya
print(translate('Hello, doggy!', model, tokenizer, src='en_XX', trg='myv_XX'))
# Шумбрат, киска!
```
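
The same `translate` function works for any of the 11 supported source languages. A minimal sketch (the language codes such as `fi_FI`, `de_DE`, and `uk_UA` follow the mBART-50 convention of the base model; the sample sentences are illustrative only):

```Python
# A small sketch of translating into Erzya from several source languages.
# The codes fi_FI, de_DE, uk_UA come from the base mBART-50 model;
# the sample sentences below are illustrative, not from the original card.
samples = {
    'en_XX': 'Good morning!',
    'fi_FI': 'Hyvää huomenta!',
    'de_DE': 'Guten Morgen!',
    'uk_UA': 'Доброго ранку!',
}
for lang_code, sentence in samples.items():
    print(lang_code, '->', translate(sentence, model, tokenizer, src=lang_code, trg='myv_XX'))
```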
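
`translate` reads the device from `model.device`, and its `n_out` argument returns several beam-search candidates for one input. A minimal usage sketch, assuming a CUDA device may be available:

```Python
import torch

# translate() moves the encoded inputs to model.device,
# so moving the model to a GPU is enough to accelerate inference.
if torch.cuda.is_available():
    model.cuda()

# Return 3 beam-search candidates for one input (num_beams must be >= n_out).
for candidate in translate('Hello, doggy!', model, tokenizer, src='en_XX', trg='myv_XX', n_out=3, num_beams=5):
    print(candidate)
```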