import re
import sys

from sacremoses import MosesDetokenizer

# Shared detokenizers, created once at import time and reused by detokenize2().
md_en = MosesDetokenizer(lang='en')
md_zh = MosesDetokenizer(lang='zh')


def moses_detokenize(tokens, language='en'):
    """Detokenize a token list with a Moses detokenizer for the given language."""
    detokenizer = MosesDetokenizer(lang=language)
    text = detokenizer.detokenize(tokens, return_str=True)
    return text.strip()


def detokenize(tokens, mode):
    # "汉译英" (Chinese-to-English): the English hypothesis needs Moses
    # detokenization; otherwise the Chinese tokens are joined without spaces.
    if mode == "汉译英":
        text = moses_detokenize(tokens)
        # Moses leaves a space before "n't" contractions; close it up.
        text = re.sub(r" n't", "n't", text)
    else:
        text = ''.join(tokens)
    return text


def detokenize2(tokens, mode):
    # Variant that reuses the module-level detokenizers and also strips the
    # "@@ " BPE continuation markers from the detokenized text.
    if mode == "汉译英":
        answer_en_bpe = md_en.detokenize(tokens, return_str=True)
        text = re.sub(r"@@ ", "", answer_en_bpe)
    else:
        answer_zh_bpe = md_zh.detokenize(tokens, return_str=True)
        text = re.sub(r"@@ ", "", answer_zh_bpe)
    return text
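

# Minimal usage sketch: the token lists below are made-up BPE-segmented
# hypotheses for illustration, and any mode value other than "汉译英" takes
# the Chinese branch.
if __name__ == '__main__':
    en_tokens = ["The", "mo@@", "del", "does", "n't", "over@@", "fit", "."]
    print(detokenize2(en_tokens, "汉译英"))   # detokenized English with "@@ " removed
    zh_tokens = ["模型", "不会", "过拟合", "。"]
    print(detokenize(zh_tokens, "英译汉"))    # Chinese tokens joined without spaces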