import sys
import subprocess

# Install spaCy into the current interpreter's environment if it is missing
subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)

import spacy

# Download the small English pipeline used below
spacy.cli.download("en_core_web_sm")

from spacy.tokens import Doc

# Load the English model
nlp = spacy.load('en_core_web_sm')

import nltk

# Download the Punkt tokenizer data required by word_tokenize
nltk.download('punkt')

from nltk.tokenize import word_tokenize

import jieba

from sacremoses import MosesTokenizer
from subword_nmt import apply_bpe
import codecs

# Two separate jieba tokenizers: the second loads a user dictionary
# (one word per line, optionally followed by frequency and POS tag)
jieba1 = jieba.Tokenizer()
jieba2 = jieba.Tokenizer()
jieba2.load_userdict('model2_data/dict.zh.txt')

# Chinese-side initialization: Moses tokenizer plus the learned BPE codes
mt_zh = MosesTokenizer(lang='zh')
with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
    bpe_zh_f = apply_bpe.BPE(f)

# English-side initialization: Moses tokenizer plus the learned BPE codes
mt_en = MosesTokenizer(lang='en')
with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
    bpe_en_f = apply_bpe.BPE(f)

def spacy_tokenize(line):
    # Process the text with spaCy
    doc = nlp(line)
    # Collect the token texts
    words = [token.text for token in doc]
    # Join the tokens into one string, separated by single spaces
    return ' '.join(words)


def nltk_tokenize(line):
    # Tokenize with NLTK's word_tokenize
    tokens = word_tokenize(line)
    return tokens


def jieba_tokenize(line):
    # Segment with jieba; strip() removes surrounding whitespace
    tokens = list(jieba1.cut(line.strip()))
    return tokens

def tokenize(line, mode):
    # "汉译英" (Chinese-to-English): segment the Chinese input with jieba;
    # any other mode tokenizes English input with spaCy followed by NLTK
    if mode == "汉译英":
        return jieba_tokenize(line)
    else:
        return nltk_tokenize(spacy_tokenize(line))
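
# Minimal usage sketch (illustrative inputs; any mode string other than
# "汉译英" takes the English path):
#   tokenize("我喜欢机器翻译", "汉译英")              -> list of jieba word segments
#   tokenize("I like machine translation.", "英译汉")  -> list of spaCy+NLTK tokens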


def jieba_tokenize2(line):
    # Segment with the jieba instance that loads the model-2 user dictionary
    tokens = list(jieba2.cut(line.strip()))
    return tokens

def mt_bpe_zh(line):
    # Moses tokenization followed by BPE segmentation for Chinese
    zh_tok = mt_zh.tokenize(line)
    bpe_zh = bpe_zh_f.segment_tokens(zh_tok)
    print(bpe_zh)  # debug output
    return bpe_zh

def mt_bpe_en(line):
    # Moses tokenization followed by BPE segmentation for English
    en_tok = mt_en.tokenize(line)
    bpe_en = bpe_en_f.segment_tokens(en_tok)
    print(bpe_en)  # debug output
    return bpe_en

def tokenize2(line, mode):
    # Model-2 preprocessing: jieba -> Moses -> BPE for Chinese-to-English,
    # Moses -> BPE for the English direction
    if mode == "汉译英":
        return mt_bpe_zh(' '.join(jieba_tokenize2(line)))
    else:
        return mt_bpe_en(line)
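
# A minimal end-to-end sketch, assuming the model2_data files above loaded
# successfully; the sample sentences are illustrative, and any mode string
# other than "汉译英" selects the English path.
if __name__ == "__main__":
    sample_zh = "我喜欢机器翻译"
    sample_en = "I like machine translation."
    print(tokenize(sample_zh, "汉译英"))   # jieba word segments
    print(tokenize(sample_en, "英译汉"))   # spaCy + NLTK tokens
    tokenize2(sample_zh, "汉译英")         # prints BPE subword units for Chinese
    tokenize2(sample_en, "英译汉")         # prints BPE subword units for English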