Pradeep Kumar committed · verified

Commit 717d17b · 1 parent: dba6591

Delete tokenization_test.py

Files changed (1)
  1. tokenization_test.py +0 -156
tokenization_test.py DELETED
@@ -1,156 +0,0 @@
- # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import tempfile
-
- import six
- import tensorflow as tf, tf_keras
-
- from official.nlp.tools import tokenization
-
-
- class TokenizationTest(tf.test.TestCase):
-   """Tokenization test.
-
-   The implementation is forked from
-   https://github.com/google-research/bert/blob/master/tokenization_test.py.
-   """
-
-   def test_full_tokenizer(self):
-     vocab_tokens = [
-         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-         "##ing", ","
-     ]
-     with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
-       if six.PY2:
-         vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-       else:
-         vocab_writer.write("".join([x + "\n" for x in vocab_tokens
-                                    ]).encode("utf-8"))
-
-       vocab_file = vocab_writer.name
-
-     tokenizer = tokenization.FullTokenizer(vocab_file)
-     os.unlink(vocab_file)
-
-     tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-     self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-
-     self.assertAllEqual(
-         tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-   def test_chinese(self):
-     tokenizer = tokenization.BasicTokenizer()
-
-     self.assertAllEqual(
-         tokenizer.tokenize(u"ah\u535A\u63A8zz"),
-         [u"ah", u"\u535A", u"\u63A8", u"zz"])
-
-   def test_basic_tokenizer_lower(self):
-     tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
-
-     self.assertAllEqual(
-         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-         ["hello", "!", "how", "are", "you", "?"])
-     self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
-
-   def test_basic_tokenizer_no_lower(self):
-     tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
-
-     self.assertAllEqual(
-         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-         ["HeLLo", "!", "how", "Are", "yoU", "?"])
-
-   def test_basic_tokenizer_no_split_on_punc(self):
-     tokenizer = tokenization.BasicTokenizer(
-         do_lower_case=True, split_on_punc=False)
-
-     self.assertAllEqual(
-         tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-         ["hello!how", "are", "you?"])
-
-   def test_wordpiece_tokenizer(self):
-     vocab_tokens = [
-         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-         "##ing", "##!", "!"
-     ]
-
-     vocab = {}
-     for (i, token) in enumerate(vocab_tokens):
-       vocab[token] = i
-     tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
-
-     self.assertAllEqual(tokenizer.tokenize(""), [])
-
-     self.assertAllEqual(
-         tokenizer.tokenize("unwanted running"),
-         ["un", "##want", "##ed", "runn", "##ing"])
-
-     self.assertAllEqual(
-         tokenizer.tokenize("unwanted running !"),
-         ["un", "##want", "##ed", "runn", "##ing", "!"])
-
-     self.assertAllEqual(
-         tokenizer.tokenize("unwanted running!"),
-         ["un", "##want", "##ed", "runn", "##ing", "##!"])
-
-     self.assertAllEqual(
-         tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-   def test_convert_tokens_to_ids(self):
-     vocab_tokens = [
-         "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-         "##ing"
-     ]
-
-     vocab = {}
-     for (i, token) in enumerate(vocab_tokens):
-       vocab[token] = i
-
-     self.assertAllEqual(
-         tokenization.convert_tokens_to_ids(
-             vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
-
-   def test_is_whitespace(self):
-     self.assertTrue(tokenization._is_whitespace(u" "))
-     self.assertTrue(tokenization._is_whitespace(u"\t"))
-     self.assertTrue(tokenization._is_whitespace(u"\r"))
-     self.assertTrue(tokenization._is_whitespace(u"\n"))
-     self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
-
-     self.assertFalse(tokenization._is_whitespace(u"A"))
-     self.assertFalse(tokenization._is_whitespace(u"-"))
-
-   def test_is_control(self):
-     self.assertTrue(tokenization._is_control(u"\u0005"))
-
-     self.assertFalse(tokenization._is_control(u"A"))
-     self.assertFalse(tokenization._is_control(u" "))
-     self.assertFalse(tokenization._is_control(u"\t"))
-     self.assertFalse(tokenization._is_control(u"\r"))
-     self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
-
-   def test_is_punctuation(self):
-     self.assertTrue(tokenization._is_punctuation(u"-"))
-     self.assertTrue(tokenization._is_punctuation(u"$"))
-     self.assertTrue(tokenization._is_punctuation(u"`"))
-     self.assertTrue(tokenization._is_punctuation(u"."))
-
-     self.assertFalse(tokenization._is_punctuation(u"A"))
-     self.assertFalse(tokenization._is_punctuation(u" "))
-
-
- if __name__ == "__main__":
-   tf.test.main()
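
For reference, the sketch below illustrates the greedy longest-match-first WordPiece behaviour that the deleted test_wordpiece_tokenizer cases assert (empty input yields no tokens, a standalone "!" stays a word, punctuation glued to a word becomes "##!", and an unmatchable word collapses to [UNK]). It is a minimal standalone illustration, not the official.nlp.tools.tokenization.WordpieceTokenizer implementation; the function name wordpiece_tokenize and its signature are assumptions made for this sketch.

# Minimal sketch of greedy longest-match-first WordPiece matching; an
# illustration of the behaviour exercised above, not the library code.
def wordpiece_tokenize(text, vocab, unk_token="[UNK]"):
  output = []
  for word in text.split():  # empty input yields no words, hence [].
    start, pieces, unmatched = 0, [], False
    while start < len(word):
      end, cur = len(word), None
      # Shrink the window from the right until a vocab entry matches;
      # non-initial pieces carry the "##" continuation prefix.
      while start < end:
        piece = word[start:end] if start == 0 else "##" + word[start:end]
        if piece in vocab:
          cur = piece
          break
        end -= 1
      if cur is None:  # no sub-piece matched: the whole word maps to [UNK].
        unmatched = True
        break
      pieces.append(cur)
      start = end
    output.extend([unk_token] if unmatched else pieces)
  return output

vocab = {t: i for i, t in enumerate(
    ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
     "##ing", "##!", "!"])}
assert wordpiece_tokenize("", vocab) == []
assert wordpiece_tokenize("unwanted running", vocab) == [
    "un", "##want", "##ed", "runn", "##ing"]
assert wordpiece_tokenize("unwanted running!", vocab) == [
    "un", "##want", "##ed", "runn", "##ing", "##!"]
assert wordpiece_tokenize("unwantedX running", vocab) == [
    "[UNK]", "runn", "##ing"]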