Dy3257 committed
Commit e510137
1 Parent(s): 6ea299f

Update tokenizer.py

Files changed (1)
  1. tokenizer.py +12 -10
tokenizer.py CHANGED
@@ -1,17 +1,17 @@
 import spacy
 
-spacy.cli.download("en_core_web_sm")
+#spacy.cli.download("en_core_web_sm")
 
-from spacy.tokens import Doc
+#from spacy.tokens import Doc
 
 # Load the English model
-nlp = spacy.load('en_core_web_sm')
+#nlp = spacy.load('en_core_web_sm')
 
 import nltk
 
-nltk.download('punkt')
+#nltk.download('punkt')
 
-from nltk.tokenize import word_tokenize
+#from nltk.tokenize import word_tokenize
 
 import jieba
 
@@ -34,18 +34,20 @@ with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
 
 def spacy_tokenize(line):
     # Process the text with spaCy
-    doc = nlp(line)
+    #doc = nlp(line)
     # Get the list of tokens
-    words = [token.text for token in doc]
+    #words = [token.text for token in doc]
     # Join the tokens into one string, separated by single spaces
-    return ' '.join(words)
+    #return ' '.join(words)
+    return ""
 
 
 def nltk_tokenize(line):
     # Tokenize with NLTK's word_tokenize
-    tokens = word_tokenize(line)
+    #tokens = word_tokenize(line)
     #print(tokens)
-    return tokens
+    #return tokens
+    return []
 
 
 def jieba_tokenize(line):
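
After this commit, spacy_tokenize always returns an empty string and nltk_tokenize an empty list, so only jieba_tokenize still performs real tokenization. If the English tokenizers are needed again, a lazy-initialization sketch along the following lines would keep them working while still avoiding the import-time model/data downloads that the commented-out lines triggered. This is not part of the commit; _nlp is an illustrative name, and the sketch assumes en_core_web_sm and the NLTK punkt data are already installed.

# Sketch only (not part of this commit): load resources on first use instead of at import time.
import spacy
from nltk.tokenize import word_tokenize

_nlp = None  # spaCy pipeline, created lazily on the first call

def spacy_tokenize(line):
    global _nlp
    if _nlp is None:
        _nlp = spacy.load('en_core_web_sm')  # assumes the model is already installed
    # Join token texts with single spaces, as the original function did
    return ' '.join(token.text for token in _nlp(line))

def nltk_tokenize(line):
    # Assumes the 'punkt' tokenizer data has already been downloaded
    return word_tokenize(line)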