Pradeep Kumar committed
Delete pretrain_dynamic_dataloader_test.py
pretrain_dynamic_dataloader_test.py
DELETED
@@ -1,245 +0,0 @@
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for nlp.data.pretrain_dynamic_dataloader."""
import os

from absl import logging
from absl.testing import parameterized
import numpy as np
import orbit
import tensorflow as tf, tf_keras

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.data import pretrain_dynamic_dataloader
from official.nlp.tasks import masked_lm


def _create_fake_dataset(output_path, seq_length, num_masked_tokens,
                         max_seq_length, num_examples):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  def create_float_feature(values):
    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return f

  rng = np.random.default_rng(37)
  for _ in range(num_examples):
    features = {}
    padding = np.zeros(shape=(max_seq_length - seq_length), dtype=np.int32)
    input_ids = rng.integers(low=1, high=100, size=(seq_length))
    features['input_ids'] = create_int_feature(
        np.concatenate((input_ids, padding)))
    features['input_mask'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['segment_ids'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['position_ids'] = create_int_feature(
        np.concatenate((np.ones_like(input_ids), padding)))
    features['masked_lm_positions'] = create_int_feature(
        rng.integers(60, size=(num_masked_tokens), dtype=np.int64))
    features['masked_lm_ids'] = create_int_feature(
        rng.integers(100, size=(num_masked_tokens), dtype=np.int64))
    features['masked_lm_weights'] = create_float_feature(
        np.ones((num_masked_tokens,), dtype=np.float32))
    features['next_sentence_labels'] = create_int_feature(np.array([0]))

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()

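
# The tests below run PretrainingDynamicDataLoader (bucketized, dynamic
# padding) side by side with the fixed-length BertPretrainDataLoader and
# check both the resulting metrics and the bucketized output shapes.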

class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.cloud_tpu_strategy,
          ],
          mode='eager'))
  def test_distribution_strategy(self, distribution_strategy):
    max_seq_length = 128
    batch_size = 8
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    _create_fake_dataset(
        input_path,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_path,
        seq_bucket_lengths=[64, 128],
        global_batch_size=batch_size)
    dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config)
    distributed_ds = orbit.utils.make_distributed_dataset(
        distribution_strategy, dataloader.load)
    train_iter = iter(distributed_ds)
    with distribution_strategy.scope():
      config = masked_lm.MaskedLMConfig(
          init_checkpoint=self.get_temp_dir(),
          model=bert.PretrainerConfig(
              encoders.EncoderConfig(
                  bert=encoders.BertEncoderConfig(
                      vocab_size=30522, num_layers=1)),
              cls_heads=[
                  bert.ClsHeadConfig(
                      inner_dim=10, num_classes=2, name='next_sentence')
              ]),
          train_data=data_config)
      task = masked_lm.MaskedLMTask(config)
      model = task.build_model()
      metrics = task.build_metrics()

    @tf.function
    def step_fn(features):
      return task.validation_step(features, model, metrics=metrics)

    distributed_outputs = distribution_strategy.run(
        step_fn, args=(next(train_iter),))
    local_results = tf.nest.map_structure(
        distribution_strategy.experimental_local_results, distributed_outputs)
    logging.info('Dynamic padding: local_results= %s', str(local_results))
    dynamic_metrics = {}
    for metric in metrics:
      dynamic_metrics[metric.name] = metric.result()

    data_config = pretrain_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_path,
        seq_length=max_seq_length,
        max_predictions_per_seq=20,
        global_batch_size=batch_size)
    dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
    distributed_ds = orbit.utils.make_distributed_dataset(
        distribution_strategy, dataloader.load)
    train_iter = iter(distributed_ds)
    with distribution_strategy.scope():
      metrics = task.build_metrics()

    @tf.function
    def step_fn_b(features):
      return task.validation_step(features, model, metrics=metrics)

    distributed_outputs = distribution_strategy.run(
        step_fn_b, args=(next(train_iter),))
    local_results = tf.nest.map_structure(
        distribution_strategy.experimental_local_results, distributed_outputs)
    logging.info('Static padding: local_results= %s', str(local_results))
    static_metrics = {}
    for metric in metrics:
      static_metrics[metric.name] = metric.result()
    for key in static_metrics:
      # We still need to investigate the differences in the losses.
      if key != 'next_sentence_loss':
        self.assertEqual(dynamic_metrics[key], static_metrics[key])

  def test_load_dataset(self):
    tf.random.set_seed(0)
    max_seq_length = 128
    batch_size = 2
    input_path_1 = os.path.join(self.get_temp_dir(), 'train_1.tf_record')
    _create_fake_dataset(
        input_path_1,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_path_2 = os.path.join(self.get_temp_dir(), 'train_2.tf_record')
    _create_fake_dataset(
        input_path_2,
        seq_length=100,
        num_masked_tokens=70,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_paths = ','.join([input_path_1, input_path_2])
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_paths,
        seq_bucket_lengths=[64, 128],
        use_position_id=True,
        global_batch_size=batch_size,
        deterministic=True)
    dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config).load()
    dataset_it = iter(dataset)
    features = next(dataset_it)
    self.assertCountEqual([
        'input_word_ids',
        'input_mask',
        'input_type_ids',
        'next_sentence_labels',
        'masked_lm_positions',
        'masked_lm_ids',
        'masked_lm_weights',
        'position_ids',
    ], features.keys())
    # The sequence dimension should be bucketized and padded to 64: this batch
    # comes from the seq_length=60 file, and 64 is the smallest bucket >= 60.
    self.assertEqual(features['input_word_ids'].shape, (batch_size, 64))
    self.assertEqual(features['input_mask'].shape, (batch_size, 64))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, 64))
    self.assertEqual(features['position_ids'].shape, (batch_size, 64))
    self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 20))
    features = next(dataset_it)
    self.assertEqual(features['input_word_ids'].shape, (batch_size, 128))
    self.assertEqual(features['input_mask'].shape, (batch_size, 128))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, 128))
    self.assertEqual(features['position_ids'].shape, (batch_size, 128))
    self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 70))

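  # Examples whose masked_lm features have different lengths cannot share a
  # batch; the loader is expected to raise on mismatched mask counts, as
  # exercised below.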
  def test_load_dataset_not_same_masks(self):
    max_seq_length = 128
    batch_size = 2
    input_path_1 = os.path.join(self.get_temp_dir(), 'train_3.tf_record')
    _create_fake_dataset(
        input_path_1,
        seq_length=60,
        num_masked_tokens=20,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_path_2 = os.path.join(self.get_temp_dir(), 'train_4.tf_record')
    _create_fake_dataset(
        input_path_2,
        seq_length=60,
        num_masked_tokens=15,
        max_seq_length=max_seq_length,
        num_examples=batch_size)
    input_paths = ','.join([input_path_1, input_path_2])
    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
        is_training=False,
        input_path=input_paths,
        seq_bucket_lengths=[64, 128],
        use_position_id=True,
        global_batch_size=batch_size * 2)
    dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
        data_config).load()
    dataset_it = iter(dataset)
    with self.assertRaisesRegex(
        tf.errors.InvalidArgumentError, '.*Number of non padded mask tokens.*'):
      next(dataset_it)


if __name__ == '__main__':
  tf.test.main()
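
For reference, a minimal standalone sketch of how the dataloader exercised by the deleted tests is constructed. The config mirrors the test code above; the input path is a hypothetical placeholder, and the snippet assumes the TF Model Garden `official` package is installed.

from official.nlp.data import pretrain_dynamic_dataloader

config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
    is_training=False,
    input_path='/tmp/train.tf_record',  # hypothetical TFRecord, e.g. written by _create_fake_dataset above
    seq_bucket_lengths=[64, 128],
    global_batch_size=8)
# Each batch is padded to the smallest bucket length that fits its sequences,
# which is what the shape assertions in test_load_dataset verify.
dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
    config).load()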