Pradeep Kumar committed
Commit be3c9fb · verified · 1 Parent(s): 021cd97

Delete create_xlnet_pretraining_data_test.py

Files changed (1)
  1. create_xlnet_pretraining_data_test.py +0 -355
create_xlnet_pretraining_data_test.py DELETED
@@ -1,355 +0,0 @@
- # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Tests for official.nlp.data.create_xlnet_pretraining_data."""
- import os
- import tempfile
- from typing import List
-
- from absl import logging
- from absl.testing import parameterized
-
- import numpy as np
- import tensorflow as tf, tf_keras
-
- from official.nlp.data import create_xlnet_pretraining_data as cpd
-
- _VOCAB_WORDS = ["vocab_1", "vocab_2"]
-
-
- # pylint: disable=invalid-name
- def _create_files(
-     temp_dir: str, file_contents: List[List[str]]) -> List[str]:
-   """Writes arbitrary documents into files."""
-   root_dir = tempfile.mkdtemp(dir=temp_dir)
-   files = []
-
-   for i, file_content in enumerate(file_contents):
-     destination = os.path.join(root_dir, "%d.txt" % i)
-     with open(destination, "wb") as f:
-       for line in file_content:
-         f.write(line.encode("utf-8"))
-     files.append(destination)
-   return files
-
-
- def _get_mock_tokenizer():
-   """Creates a mock tokenizer."""
-
-   class MockSpieceModel:
-     """Mock Spiece model for testing."""
-
-     def __init__(self):
-       self._special_piece_to_id = {
-           "<unk>": 0,
-       }
-       for piece in set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')):
-         self._special_piece_to_id[piece] = 1
-
-     def EncodeAsPieces(self, inputs: str) -> List[str]:
-       return inputs
-
-     def SampleEncodeAsPieces(self,
-                              inputs: str,
-                              nbest_size: int,
-                              theta: float) -> List[str]:
-       del nbest_size, theta
-       return inputs
-
-     def PieceToId(self, piece: str) -> int:
-       return ord(piece[0])
-
-     def IdToPiece(self, id_: int) -> str:
-       return chr(id_) * 3
-
-   class Tokenizer:
-     """Mock Tokenizer for testing."""
-
-     def __init__(self):
-       self.sp_model = MockSpieceModel()
-
-     def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
-       return [self.sp_model.IdToPiece(id_) for id_ in ids]
-
-   return Tokenizer()
-
-
- class PreprocessDataTest(tf.test.TestCase):
-
-   def test_remove_extraneous_space(self):
-     line = " abc "
-     output = cpd._preprocess_line(line)
-     self.assertEqual(output, "abc")
-
-   def test_symbol_replacements(self):
-     self.assertEqual(cpd._preprocess_line("``abc``"), "\"abc\"")
-     self.assertEqual(cpd._preprocess_line("''abc''"), "\"abc\"")
-
-   def test_accent_replacements(self):
-     self.assertEqual(cpd._preprocess_line("åbc"), "abc")
-
-   def test_lower_case(self):
-     self.assertEqual(cpd._preprocess_line("ABC", do_lower_case=True), "abc")
-
-   def test_end_to_end(self):
-     self.assertEqual(
-         cpd._preprocess_line("HelLo ``wórLd``", do_lower_case=True),
-         "hello \"world\"")
-
-
- class PreprocessAndTokenizeFilesTest(tf.test.TestCase):
-
-   def test_basic_end_to_end(self):
-     documents = [
-         [
-             "This is sentence 1.\n",
-             "This is sentence 2.\n",
-             "Sentence 3 is what this is.\n",
-         ],
-         [
-             "This is the second document.\n",
-             "This is the second line of the second document.\n"
-         ],
-     ]
-     input_files = _create_files(temp_dir=self.get_temp_dir(),
-                                 file_contents=documents)
-     all_data = cpd.preprocess_and_tokenize_input_files(
-         input_files=input_files,
-         tokenizer=_get_mock_tokenizer(),
-         log_example_freq=1)
-
-     self.assertEqual(len(all_data), len(documents))
-     for token_ids, sentence_ids in all_data:
-       self.assertEqual(len(token_ids), len(sentence_ids))
-
-   def test_basic_correctness(self):
-     documents = [["a\n", "b\n", "c\n"]]
-     input_files = _create_files(temp_dir=self.get_temp_dir(),
-                                 file_contents=documents)
-     all_data = cpd.preprocess_and_tokenize_input_files(
-         input_files=input_files,
-         tokenizer=_get_mock_tokenizer(),
-         log_example_freq=1)
-
-     token_ids, sentence_ids = all_data[0]
-
-     self.assertAllClose(token_ids, [97, 98, 99])
-     self.assertAllClose(sentence_ids, [True, False, True])
-
-   def test_correctness_with_spaces_and_accents(self):
-     documents = [[
-         " å \n",
-         "b \n",
-         " c \n",
-     ]]
-     input_files = _create_files(temp_dir=self.get_temp_dir(),
-                                 file_contents=documents)
-     all_data = cpd.preprocess_and_tokenize_input_files(
-         input_files=input_files,
-         tokenizer=_get_mock_tokenizer(),
-         log_example_freq=1)
-
-     token_ids, sentence_ids = all_data[0]
-
-     self.assertAllClose(token_ids, [97, 98, 99])
-     self.assertAllClose(sentence_ids, [True, False, True])
-
-
- class BatchReshapeTests(tf.test.TestCase):
-
-   def test_basic_functionality(self):
-     per_host_batch_size = 3
-     mock_shape = (20,)
-
-     # Should truncate and reshape.
-     expected_result_shape = (3, 6)
-
-     tokens = np.zeros(mock_shape)
-     sentence_ids = np.zeros(mock_shape)
-
-     reshaped_data = cpd._reshape_to_batch_dimensions(
-         tokens=tokens,
-         sentence_ids=sentence_ids,
-         per_host_batch_size=per_host_batch_size)
-     for values in reshaped_data:
-       self.assertEqual(len(values.flatten()) % per_host_batch_size, 0)
-       self.assertAllClose(values.shape, expected_result_shape)
-
-
- class CreateSegmentsTest(tf.test.TestCase):
-
-   def test_basic_functionality(self):
-     data_length = 10
-     tokens = np.arange(data_length)
-     sentence_ids = np.concatenate([np.zeros(data_length // 2),
-                                    np.ones(data_length // 2)])
-     begin_index = 0
-     total_length = 8
-     a_data, b_data, label = cpd._create_a_and_b_segments(
-         tokens=tokens,
-         sentence_ids=sentence_ids,
-         begin_index=begin_index,
-         total_length=total_length,
-         no_cut_probability=0.)
-     self.assertAllClose(a_data, [0, 1, 2, 3])
-     self.assertAllClose(b_data, [5, 6, 7, 8])
-     self.assertEqual(label, 1)
-
-   def test_no_cut(self):
-     data_length = 10
-     tokens = np.arange(data_length)
-     sentence_ids = np.zeros(data_length)
-
-     begin_index = 0
-     total_length = 8
-     a_data, b_data, label = cpd._create_a_and_b_segments(
-         tokens=tokens,
-         sentence_ids=sentence_ids,
-         begin_index=begin_index,
-         total_length=total_length,
-         no_cut_probability=0.)
-     self.assertGreater(len(a_data), 0)
-     self.assertGreater(len(b_data), 0)
-     self.assertEqual(label, 0)
-
-   def test_no_cut_with_probability(self):
-     data_length = 10
-     tokens = np.arange(data_length)
-     sentence_ids = np.concatenate([np.zeros(data_length // 2),
-                                    np.ones(data_length // 2)])
-     begin_index = 0
-     total_length = 8
-     a_data, b_data, label = cpd._create_a_and_b_segments(
-         tokens=tokens,
-         sentence_ids=sentence_ids,
-         begin_index=begin_index,
-         total_length=total_length,
-         no_cut_probability=1.)
-     self.assertGreater(len(a_data), 0)
-     self.assertGreater(len(b_data), 0)
-     self.assertEqual(label, 0)
-
-
- class CreateInstancesTest(tf.test.TestCase):
-   """Tests conversions of Token/Sentence IDs to training instances."""
-
-   def test_basic(self):
-     data_length = 12
-     tokens = np.arange(data_length)
-     sentence_ids = np.zeros(data_length)
-     seq_length = 8
-     instances = cpd._convert_tokens_to_instances(
-         tokens=tokens,
-         sentence_ids=sentence_ids,
-         per_host_batch_size=2,
-         seq_length=seq_length,
-         reuse_length=4,
-         tokenizer=_get_mock_tokenizer(),
-         bi_data=False,
-         num_cores_per_host=1,
-         logging_frequency=1)
-     for instance in instances:
-       self.assertEqual(len(instance.data), seq_length)
-       self.assertEqual(len(instance.segment_ids), seq_length)
-       self.assertIsInstance(instance.label, int)
-       self.assertIsInstance(instance.boundary_indices, list)
-
-
- class TFRecordPathTests(tf.test.TestCase):
-
-   def test_basic(self):
-     base_kwargs = dict(
-         per_host_batch_size=1,
-         num_cores_per_host=1,
-         seq_length=2,
-         reuse_length=1)
-
-     config1 = dict(
-         prefix="test",
-         suffix="",
-         bi_data=True,
-         use_eod_token=False,
-         do_lower_case=True)
-     config1.update(base_kwargs)
-     expectation1 = "test_seqlen-2_reuse-1_bs-1_cores-1_uncased_bi.tfrecord"
-     self.assertEqual(cpd.get_tfrecord_name(**config1), expectation1)
-
-     config2 = dict(
-         prefix="",
-         suffix="test",
-         bi_data=False,
-         use_eod_token=False,
-         do_lower_case=False)
-     config2.update(base_kwargs)
-     expectation2 = "seqlen-2_reuse-1_bs-1_cores-1_cased_uni_test.tfrecord"
-     self.assertEqual(cpd.get_tfrecord_name(**config2), expectation2)
-
-     config3 = dict(
-         prefix="",
-         suffix="",
-         use_eod_token=True,
-         bi_data=False,
-         do_lower_case=True)
-     config3.update(base_kwargs)
-     expectation3 = "seqlen-2_reuse-1_bs-1_cores-1_uncased_eod_uni.tfrecord"
-     self.assertEqual(cpd.get_tfrecord_name(**config3), expectation3)
-
-
- class TestCreateTFRecords(parameterized.TestCase, tf.test.TestCase):
-
-   @parameterized.named_parameters(
-       ("bi_data_only", True, False, False),
-       ("eod_token_only", False, True, True),
-       ("lower_case_only", False, False, True),
-       ("all_enabled", True, True, True),
-   )
-   def test_end_to_end(self,
-                       bi_data: bool,
-                       use_eod_token: bool,
-                       do_lower_case: bool):
-     tokenizer = _get_mock_tokenizer()
-
-     num_documents = 5
-     sentences_per_document = 10
-     document_length = 50
-
-     documents = [
-         ["a " * document_length for _ in range(sentences_per_document)]
-         for _ in range(num_documents)]
-
-     save_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-     files = _create_files(temp_dir=self.get_temp_dir(), file_contents=documents)
-
-     cpd.create_tfrecords(
-         tokenizer=tokenizer,
-         input_file_or_files=",".join(files),
-         use_eod_token=use_eod_token,
-         do_lower_case=do_lower_case,
-         per_host_batch_size=8,
-         seq_length=8,
-         reuse_length=4,
-         bi_data=bi_data,
-         num_cores_per_host=2,
-         save_dir=save_dir)
-
-     self.assertTrue(any(filter(lambda x: x.endswith(".json"),
-                                os.listdir(save_dir))))
-     self.assertTrue(any(filter(lambda x: x.endswith(".tfrecord"),
-                                os.listdir(save_dir))))
-
-
- if __name__ == "__main__":
-   np.random.seed(0)
-   logging.set_verbosity(logging.INFO)
-   tf.test.main()
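A note on the expected values in test_basic_correctness and test_correctness_with_spaces_and_accents above: the mock SentencePiece model maps a piece to the ordinal of its first character (PieceToId) and maps an id back to that character repeated three times (IdToPiece), which is why the single-character documents "a", "b", "c" tokenize to [97, 98, 99]. A minimal standalone sketch of that mapping, separate from the deleted file:

# Standalone illustration of the mock tokenizer's id mapping relied on by the
# deleted tests: PieceToId(piece) == ord(piece[0]), IdToPiece(id_) == chr(id_) * 3.
pieces = ["a", "b", "c"]
token_ids = [ord(p[0]) for p in pieces]
assert token_ids == [97, 98, 99]
assert [chr(i) * 3 for i in token_ids] == ["aaa", "bbb", "ccc"]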