File size: 4,914 Bytes
5672777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for distribution util functions."""

import sys
import tensorflow as tf, tf_keras

from official.common import distribute_utils

# True when the path of the running script (sys.argv[0]) contains 'test_tpu'.
# Used below to skip the TPU strategy test everywhere else.
TPU_TEST = 'test_tpu' in sys.argv[0]


class DistributeUtilsTest(tf.test.TestCase):
  """Tests for distribute util functions.

  Each test calls `distribute_utils.get_distribution_strategy` (or
  `get_strategy_scope`) with a particular argument combination and checks
  either the returned strategy or the `ValueError` message it raises.
  """

  def test_invalid_args(self):
    """Checks that inconsistent argument combinations raise `ValueError`."""
    with self.assertRaisesRegex(ValueError, '`num_gpus` can not be negative.'):
      _ = distribute_utils.get_distribution_strategy(num_gpus=-1)

    with self.assertRaisesRegex(ValueError,
                                '.*If you meant to pass the string .*'):
      _ = distribute_utils.get_distribution_strategy(
          distribution_strategy=False, num_gpus=0)
    with self.assertRaisesRegex(ValueError, 'When 2 GPUs are specified.*'):
      _ = distribute_utils.get_distribution_strategy(
          distribution_strategy='off', num_gpus=2)
    with self.assertRaisesRegex(ValueError,
                                '`OneDeviceStrategy` can not be used.*'):
      _ = distribute_utils.get_distribution_strategy(
          distribution_strategy='one_device', num_gpus=2)

  def test_one_device_strategy_cpu(self):
    """Checks 'one_device' with no GPUs yields one replica on a CPU device."""
    ds = distribute_utils.get_distribution_strategy('one_device', num_gpus=0)
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('CPU', ds.extended.worker_devices[0])

  def test_one_device_strategy_gpu(self):
    """Checks 'one_device' with one GPU yields one replica on a GPU device."""
    ds = distribute_utils.get_distribution_strategy('one_device', num_gpus=1)
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('GPU', ds.extended.worker_devices[0])

  def test_mirrored_strategy(self):
    """Checks the default and 'mirrored' strategies for several GPU counts.

    Also verifies that an unknown `all_reduce_alg` is rejected for 'mirrored'.
    """
    # CPU only.
    _ = distribute_utils.get_distribution_strategy(num_gpus=0)
    # 5 GPUs.
    ds = distribute_utils.get_distribution_strategy(num_gpus=5)
    self.assertEqual(ds.num_replicas_in_sync, 5)
    self.assertEqual(len(ds.extended.worker_devices), 5)
    for device in ds.extended.worker_devices:
      self.assertIn('GPU', device)

    # A supported all-reduce algorithm must be accepted without raising.
    _ = distribute_utils.get_distribution_strategy(
        distribution_strategy='mirrored',
        num_gpus=2,
        all_reduce_alg='nccl',
        num_packs=2)
    with self.assertRaisesRegex(
        ValueError,
        'When used with `mirrored`, valid values for all_reduce_alg are.*'):
      _ = distribute_utils.get_distribution_strategy(
          distribution_strategy='mirrored',
          num_gpus=2,
          all_reduce_alg='dummy',
          num_packs=2)

  def test_mwms(self):
    """Checks 'multi_worker_mirrored' creation and all_reduce_alg validation."""
    # NOTE(review): presumably sets up a single-worker cluster config before
    # the strategy is built -- confirm against `configure_cluster`.
    distribute_utils.configure_cluster(worker_hosts=None, task_index=-1)
    ds = distribute_utils.get_distribution_strategy(
        'multi_worker_mirrored', all_reduce_alg='nccl')
    self.assertIsInstance(
        ds, tf.distribute.experimental.MultiWorkerMirroredStrategy)

    with self.assertRaisesRegex(
        ValueError,
        'When used with `multi_worker_mirrored`, valid values.*'):
      _ = distribute_utils.get_distribution_strategy(
          'multi_worker_mirrored', all_reduce_alg='dummy')

  def test_no_strategy(self):
    """Checks that 'off' returns the current `tf.distribute.get_strategy()`."""
    ds = distribute_utils.get_distribution_strategy('off')
    self.assertIs(ds, tf.distribute.get_strategy())

  def test_tpu_strategy(self):
    """Checks 'tpu' needs a `tpu_address` and then yields a `TPUStrategy`.

    Skipped unless the module-level `TPU_TEST` flag is set.
    """
    if not TPU_TEST:
      self.skipTest('Only Cloud TPU VM instances can have local TPUs.')
    with self.assertRaises(ValueError):
      _ = distribute_utils.get_distribution_strategy('tpu')

    ds = distribute_utils.get_distribution_strategy('tpu', tpu_address='local')
    self.assertIsInstance(
        ds, tf.distribute.TPUStrategy)

  def test_invalid_strategy(self):
    """Checks that non-string `distribution_strategy` values are rejected."""
    # `assertRaisesRegexp` is a deprecated alias that was removed in
    # Python 3.12; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(
        ValueError,
        'distribution_strategy must be a string but got: False. If'):
      distribute_utils.get_distribution_strategy(False)
    with self.assertRaisesRegex(
        ValueError, 'distribution_strategy must be a string but got: 1'):
      distribute_utils.get_distribution_strategy(1)

  def test_get_strategy_scope(self):
    """Checks `get_strategy_scope` with a strategy and with `None`."""
    ds = distribute_utils.get_distribution_strategy('one_device', num_gpus=0)
    # Inside the scope of `ds`, it must be the current strategy.
    with distribute_utils.get_strategy_scope(ds):
      self.assertIs(tf.distribute.get_strategy(), ds)
    # With `None`, the current strategy must not be `ds`.
    with distribute_utils.get_strategy_scope(None):
      self.assertIsNot(tf.distribute.get_strategy(), ds)

# Run the tests through the TensorFlow test runner when executed as a script.
if __name__ == '__main__':
  tf.test.main()