Japanese to Korean translator for FFXIV

FINAL FANTASY is a registered trademark of Square Enix Holdings Co., Ltd.

This project is described in detail in the GitHub repo.

Demo

demo.gif: Click to try the demo. demo2.gif: Check out this Windows app demo that uses the ONNX model.

Usage

Inference (PyTorch)

from transformers import(
    EncoderDecoderModel,
    PreTrainedTokenizerFast,
    BertJapaneseTokenizer,
)

import torch

# Source side: checkpoint name and the Japanese BERT tokenizer loaded from it.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)

# Target side: checkpoint name and the fast tokenizer loaded from it.
decoder_model_name = "skt/kogpt2-base-v2"
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

# Replace `./best_model` with the path to your model **directory**.
model = EncoderDecoderModel.from_pretrained("./best_model")

# Sample Japanese input passed to translate(). The literal must be genuine
# Japanese text (UTF-8); a mis-decoded copy would give the tokenizer
# characters that are not Japanese at all.
text = "ギルガメッシュ討伐戦"
# Longer alternative sentence; uncomment to use it instead.
# text = "ギルガメッシュ討伐戦に行ってきます。一緒に行きましょうか？"

def translate(text_src):
    """Translate one source string and return the decoded target text.

    The input is tokenized with ``src_tokenizer`` (PyTorch tensors, no
    attention mask, no token type ids), fed to ``model.generate`` with
    ``max_length=500``, and the generated ids are decoded by
    ``trg_tokenizer``.
    """
    encoded = src_tokenizer(
        text_src,
        return_tensors='pt',
        return_token_type_ids=False,
        return_attention_mask=False,
    )
    model_inputs = dict(encoded.items())
    generated = model.generate(**model_inputs, max_length=500)
    # Take the first sequence of the batch and drop its first and last
    # positions (presumably the start/end special tokens -- TODO confirm).
    token_ids = generated[0, 1:-1]
    return trg_tokenizer.decode(token_ids.cpu())

print(translate(text))

Inference (Optimum.OnnxRuntime)

Note that the current Optimum.OnnxRuntime still requires PyTorch as its backend. [Issue] You can use either the [ONNX] or the [quantized ONNX] model.

from transformers import BertJapaneseTokenizer,PreTrainedTokenizerFast
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from onnxruntime import SessionOptions
import torch

# Source side: checkpoint name and the Japanese BERT tokenizer loaded from it.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)

# Target side: checkpoint name and the fast tokenizer loaded from it.
decoder_model_name = "skt/kogpt2-base-v2"
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

# ONNX Runtime session options. Severity level 3 mutes warnings, including
# CleanUnusedInitializersAndNodeArgs.
sess_options = SessionOptions()
sess_options.log_severity_level = 3

# Set subfolder to "onnxq" to load the quantized model instead.
model = ORTModelForSeq2SeqLM.from_pretrained(
    "sappho192/ffxiv-ja-ko-translator",
    sess_options=sess_options,
    subfolder="onnx",
)

# Japanese sample inputs for translate(). The trailing comment on each entry
# quotes the Korean output given for it in the original example.
# The literals must be genuine Japanese text (UTF-8); mis-decoded copies would
# give the tokenizer characters that are not Japanese at all.
texts = [
    "逃げろ!",  # Should be "도망쳐!"
    "初めまして.",  # "반가워요"
    "よろしくお願いします.",  # "잘 부탁드립니다."
    "ギルガメッシュ討伐戦",  # "길가메쉬 토벌전"
    "ギルガメッシュ討伐戦に行ってきます。一緒に行きましょうか？",  # "길가메쉬 토벌전에 갑니다. 같이 가실래요?"
    "夜になりました",  # "밤이 되었습니다"
    "ご飯を食べましょう.",  # "음, 이제 식사도 해볼까요"
]


def translate(text_src):
    """Translate one source string, printing the token ids along the way.

    The input is tokenized with ``src_tokenizer`` (PyTorch tensors, no
    attention mask, no token type ids) and the source ids are printed.
    The encoding is then fed to ``model.generate`` with ``max_length=500``;
    the kept target ids are printed and decoded by ``trg_tokenizer``.
    Returns the decoded text.
    """
    embeddings = src_tokenizer(
        text_src,
        return_tensors='pt',
        return_token_type_ids=False,
        return_attention_mask=False,
    )
    print(f'Src tokens: {embeddings.data["input_ids"]}')
    model_inputs = dict(embeddings.items())

    generated = model.generate(**model_inputs, max_length=500)
    # Take the first sequence of the batch and drop its first and last
    # positions (presumably the start/end special tokens -- TODO confirm).
    output = generated[0, 1:-1]
    print(f'Trg tokens: {output}')
    return trg_tokenizer.decode(output.cpu())


# Translate every sample, leaving an empty line after each result.
for text in texts:
    translated = translate(text)
    print(translated)
    print()

Training

See training.ipynb for the training code.

Downloads last month
21
Safetensors
Model size
290M params
Tensor type
I64
·
F32
·
BOOL
·
Inference Examples
Inference API (serverless) has been turned off for this model.

Datasets used to train sappho192/ffxiv-ja-ko-translator

Space using sappho192/ffxiv-ja-ko-translator 1