Dy3257 committed on
Commit
6ea299f
1 Parent(s): cccb03c

Update tokenizer.py

Files changed (1)
  1. tokenizer.py +83 -77
tokenizer.py CHANGED
@@ -1,78 +1,84 @@
-import spacy
-from spacy.tokens import Doc
-
-# Load the English spaCy model
-nlp = spacy.load('en_core_web_sm')
-
-import nltk
-from nltk.tokenize import word_tokenize
-
-import jieba
-
-from sacremoses import MosesTokenizer
-from subword_nmt import apply_bpe
-import codecs
-
-jieba1 = jieba.Tokenizer()
-jieba2 = jieba.Tokenizer()
-jieba2.load_userdict('model2_data/dict.zh.txt')
-
-mt_zh = MosesTokenizer(lang='zh')
-with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
-    bpe_zh_f = apply_bpe.BPE(f)
-
-# English-side initialization: define the tokenizer, BPE, etc.
-mt_en = MosesTokenizer(lang='en')
-with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
-    bpe_en_f = apply_bpe.BPE(f)
-
-def spacy_tokenize(line):
-    # Process the text with spaCy
-    doc = nlp(line)
-    # Collect the word list
-    words = [token.text for token in doc]
-    # Join the words into one string, separated by single spaces
-    return ' '.join(words)
-
-
-def nltk_tokenize(line):
-    # Tokenize with NLTK's word_tokenize
-    tokens = word_tokenize(line)
-    #print(tokens)
-    return tokens
-
-
-def jieba_tokenize(line):
-    # Segment with jieba
-    tokens = list(jieba1.cut(line.strip()))  # strip removes any surrounding whitespace
-    #print(tokens)
-    return tokens
-
-def tokenize(line, mode):
-    if mode == "汉译英" :
-        return jieba_tokenize(line)
-    else :
-        return nltk_tokenize(spacy_tokenize(line))
-
-
-def jieba_tokenize2(line):
-    tokens = list(jieba2.cut(line.strip()))
-    return tokens
-
-def mt_bpe_zh(line):
-    zh_tok = mt_zh.tokenize(line)
-    bpe_zh = bpe_zh_f.segment_tokens(zh_tok)
-    print(bpe_zh)
-    return bpe_zh
-
-def mt_bpe_en(line):
-    en_tok = mt_en.tokenize(line)
-    bpe_en = bpe_en_f.segment_tokens(en_tok)
-    print(bpe_en)
-    return bpe_en
-
-def tokenize2(line, mode):
-    if mode == "汉译英" :
-        return mt_bpe_zh(' '.join(jieba_tokenize2(line)))
-    else :
+import spacy
+
+spacy.cli.download("en_core_web_sm")
+
+from spacy.tokens import Doc
+
+# Load the English spaCy model
+nlp = spacy.load('en_core_web_sm')
+
+import nltk
+
+nltk.download('punkt')
+
+from nltk.tokenize import word_tokenize
+
+import jieba
+
+from sacremoses import MosesTokenizer
+from subword_nmt import apply_bpe
+import codecs
+
+jieba1 = jieba.Tokenizer()
+jieba2 = jieba.Tokenizer()
+jieba2.load_userdict('model2_data/dict.zh.txt')
+
+mt_zh = MosesTokenizer(lang='zh')
+with codecs.open('model2_data/bpecode.zh', 'r', 'utf-8') as f:
+    bpe_zh_f = apply_bpe.BPE(f)
+
+# English-side initialization: define the tokenizer, BPE, etc.
+mt_en = MosesTokenizer(lang='en')
+with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
+    bpe_en_f = apply_bpe.BPE(f)
+
+def spacy_tokenize(line):
+    # Process the text with spaCy
+    doc = nlp(line)
+    # Collect the word list
+    words = [token.text for token in doc]
+    # Join the words into one string, separated by single spaces
+    return ' '.join(words)
+
+
+def nltk_tokenize(line):
+    # Tokenize with NLTK's word_tokenize
+    tokens = word_tokenize(line)
+    #print(tokens)
+    return tokens
+
+
+def jieba_tokenize(line):
+    # Segment with jieba
+    tokens = list(jieba1.cut(line.strip()))  # strip removes any surrounding whitespace
+    #print(tokens)
+    return tokens
+
+def tokenize(line, mode):
+    if mode == "汉译英" :
+        return jieba_tokenize(line)
+    else :
+        return nltk_tokenize(spacy_tokenize(line))
+
+
+def jieba_tokenize2(line):
+    tokens = list(jieba2.cut(line.strip()))
+    return tokens
+
+def mt_bpe_zh(line):
+    zh_tok = mt_zh.tokenize(line)
+    bpe_zh = bpe_zh_f.segment_tokens(zh_tok)
+    print(bpe_zh)
+    return bpe_zh
+
+def mt_bpe_en(line):
+    en_tok = mt_en.tokenize(line)
+    bpe_en = bpe_en_f.segment_tokens(en_tok)
+    print(bpe_en)
+    return bpe_en
+
+def tokenize2(line, mode):
+    if mode == "汉译英" :
+        return mt_bpe_zh(' '.join(jieba_tokenize2(line)))
+    else :
         return mt_bpe_en(line)
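
A minimal usage sketch for the updated module (not part of the commit). Assumptions: the file is importable as `tokenizer`, the `model2_data/` assets are present, the `spacy.cli.download` and `nltk.download('punkt')` calls at import time can reach the network, and any mode string other than "汉译英" (Chinese-to-English) selects the English branch; the "英译汉" value below is only an illustration of that.

# Usage sketch; module name `tokenizer` and the mode string "英译汉" are
# assumptions for illustration (any value other than "汉译英" takes the
# English branch).
from tokenizer import tokenize, tokenize2

# Chinese input: tokenize() segments with jieba's default dictionary;
# tokenize2() uses the custom dictionary, then Moses + Chinese BPE.
print(tokenize("今天天气很好。", "汉译英"))
print(tokenize2("今天天气很好。", "汉译英"))

# English input: tokenize() runs spaCy then NLTK word_tokenize;
# tokenize2() runs Moses + English BPE.
print(tokenize("The weather is nice today.", "英译汉"))
print(tokenize2("The weather is nice today.", "英译汉"))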