milosvuk committed on
Commit
155b8cf
·
1 Parent(s): 61be39b

Delete submission/submit.py

Browse files
Files changed (1) hide show
  1. submission/submit.py +0 -343
submission/submit.py DELETED
@@ -1,343 +0,0 @@
1
- # Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
2
- #
3
- # This work is made available under the Nvidia Source Code License-NC.
4
- # To view a copy of this license, visit
5
- # https://nvlabs.github.io/stylegan2/license.html
6
-
7
- """Submit a function to be run either locally or in a computing cluster."""
8
-
9
- import copy
10
- import inspect
11
- import os
12
- import pathlib
13
- import pickle
14
- import platform
15
- import pprint
16
- import re
17
- import shutil
18
- import sys
19
- import time
20
- import traceback
21
-
22
- from enum import Enum
23
-
24
- from .. import util
25
- from ..util import EasyDict
26
-
27
- from . import internal
28
-
29
class SubmitTarget(Enum):
    """Enumerates the places where a submitted function can be executed.

    LOCAL: Execute the function on the local machine.
    """

    LOCAL = 1
35
-
36
-
37
class PathType(Enum):
    """Selects the style in which a path should be formatted.

    WINDOWS: Windows-style formatting.
    LINUX: Linux/Posix-style formatting.
    AUTO: Pick WINDOWS or LINUX based on the OS the code is running on.
    """

    WINDOWS = 1
    LINUX = 2
    AUTO = 3
47
-
48
-
49
class PlatformExtras:
    """Grab bag of tuning values consulted by dnnlib heuristics.

    Attributes:
        data_reader_buffer_size: Used by DataReader to size internal shared memory buffers.
        data_reader_process_count: Number of worker processes to spawn (zero for single thread operation).
    """

    def __init__(self):
        # 2**30 == 1 << 30, i.e. 1 GB.
        self.data_reader_buffer_size = 2 ** 30
        # Zero selects single-threaded operation by default.
        self.data_reader_process_count = 0
60
-
61
-
62
# Module-wide user name override. When not None, get_user_name() returns this
# value instead of querying the OS; it is assigned by set_user_name_override().
_user_name_override = None
63
-
64
class SubmitConfig(util.EasyDict):
    """Configuration dictionary describing how a run is submitted.

    Attributes:
        run_dir_root: Root path for run dirs. May contain template tags, so it must always be passed through get_path_from_template.
        run_desc: Description of the run. Becomes part of the run dir name and the task name.
        run_dir_ignore: File patterns to skip when copying files into the run dir.
        run_dir_extra_files: List of (abs_path, rel_path) tuples of extra files to copy. rel_path is rooted at the src directory inside the run dir.
        submit_target: SubmitTarget value selecting where the run is actually launched.
        num_gpus: Number of GPUs used/requested for the run.
        print_info: Whether to print debug information when submitting.
        nvprof: Undocumented in the original file; defaults to False.
        local.do_not_copy_source_files: Do not copy source files from the working directory to the run dir.
        datasets: Undocumented in the original file; defaults to an empty list.
        run_id: Filled in automatically during submit.
        run_name: Filled in automatically during submit.
        run_dir: Filled in automatically during submit.
        run_func_name: Filled in automatically during submit.
        run_func_kwargs: Filled in automatically during submit.
        user_name: Filled in automatically during submit unless the user has already set it, in which case the user's value wins.
        task_name: Filled in automatically during submit.
        host_name: Filled in automatically during submit.
        platform_extras: Filled in automatically during submit. Used by various dnnlib libraries such as the DataReader class.
    """

    def __init__(self):
        super().__init__()

        # --- run settings (caller is expected to set these) ---
        # run_dir_root must always go through get_path_from_template before use.
        self.run_dir_root = ""
        self.run_desc = ""
        self.run_dir_ignore = ["__pycache__", "*.pyproj", "*.sln", "*.suo", ".cache", ".idea", ".vs", ".vscode", "_cudacache"]
        self.run_dir_extra_files = []

        # --- submit settings (caller is expected to set these) ---
        self.submit_target = SubmitTarget.LOCAL
        self.num_gpus = 1
        self.print_info = False
        self.nvprof = False
        self.local = internal.local.TargetOptions()
        self.datasets = []

        # --- values populated automatically during submit ---
        self.run_id = None
        self.run_name = None
        self.run_dir = None
        self.run_func_name = None
        self.run_func_kwargs = None
        self.user_name = None
        self.task_name = None
        self.host_name = "localhost"
        self.platform_extras = PlatformExtras()
114
-
115
-
116
def get_path_from_template(path_template: str, path_type: PathType = PathType.AUTO) -> str:
    """Replace tags in the given path template and return a Windows- or Linux-formatted path.

    Args:
        path_template: Path that may contain the "<USERNAME>" tag.
        path_type: Desired output style. PathType.AUTO selects WINDOWS or LINUX
            from platform.system().

    Returns:
        The path with tags substituted, formatted via pathlib.PureWindowsPath or
        pathlib.PurePosixPath.

    Raises:
        RuntimeError: If path_type is AUTO and the running OS is neither Windows
            nor Linux, or if path_type is not a known PathType value.
    """
    # Automatically select the path type depending on the running OS.
    if path_type == PathType.AUTO:
        system = platform.system()
        if system == "Windows":
            path_type = PathType.WINDOWS
        elif system == "Linux":
            path_type = PathType.LINUX
        else:
            raise RuntimeError("Unknown platform")

    # Only resolve the user name when the template actually uses the tag, so a
    # failing user-name lookup cannot break templates that do not need it.
    if "<USERNAME>" in path_template:
        path_template = path_template.replace("<USERNAME>", get_user_name())

    # Return the correctly formatted path.
    if path_type == PathType.WINDOWS:
        return str(pathlib.PureWindowsPath(path_template))
    if path_type == PathType.LINUX:
        return str(pathlib.PurePosixPath(path_template))
    raise RuntimeError("Unknown platform")
136
-
137
-
138
def get_template_from_path(path: str) -> str:
    """Turn a normal path back into its template form.

    The only transformation applied is replacing every backslash with a
    forward slash.
    """
    return path.replace("\\", "/")
142
-
143
-
144
def convert_path(path: str, path_type: PathType = PathType.AUTO) -> str:
    """Convert a normal path to a template, then back to a normal path of the given path type."""
    return get_path_from_template(get_template_from_path(path), path_type)
149
-
150
-
151
def set_user_name_override(name: str) -> None:
    """Store *name* as the module-wide user name override."""
    global _user_name_override
    _user_name_override = name
155
-
156
-
157
def get_user_name() -> str:
    """Get the current user name.

    Returns:
        The module-level override if one has been set; otherwise os.getlogin()
        on Windows, or the passwd entry name of the effective uid on Linux
        ("unknown" if that lookup fails).

    Raises:
        RuntimeError: If no override is set and the platform is neither Windows
            nor Linux.
    """
    if _user_name_override is not None:
        return _user_name_override

    system = platform.system()
    if system == "Windows":
        return os.getlogin()
    if system == "Linux":
        try:
            import pwd
            return pwd.getpwuid(os.geteuid()).pw_name
        except Exception:
            # Best-effort lookup: fall back to a placeholder, but let
            # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
            return "unknown"
    raise RuntimeError("Unknown platform")
171
-
172
-
173
def make_run_dir_path(*paths):
    """Build a path/filename located under the current submit run_dir.

    Args:
        *paths: Path components to be passed to os.path.join

    Returns:
        A file/dirname rooted at submit_config.run_dir. When there is no
        submit_config, or it has no run_dir, the current working directory
        is used as the base instead.

        E.g., `os.path.join(dnnlib.submit_config.run_dir, "output.txt")`
    """
    import dnnlib

    config = dnnlib.submit_config
    if (config is not None) and (config.run_dir is not None):
        base_dir = config.run_dir
    else:
        base_dir = os.getcwd()
    return os.path.join(base_dir, *paths)
190
-
191
-
192
def _create_run_dir_local(submit_config: SubmitConfig) -> str:
    """Create a new run dir with an increasing ID number at the start of its name.

    Populates submit_config.run_id and submit_config.run_name as a side effect.

    Args:
        submit_config: Config providing run_dir_root and run_desc.

    Returns:
        Path of the newly created run dir.

    Raises:
        RuntimeError: If the run dir already exists.
    """
    run_dir_root = get_path_from_template(submit_config.run_dir_root, PathType.AUTO)

    # exist_ok avoids the check-then-create race when several submits start at once.
    os.makedirs(run_dir_root, exist_ok=True)

    submit_config.run_id = _get_next_run_id_local(run_dir_root)
    submit_config.run_name = "{0:05d}-{1}".format(submit_config.run_id, submit_config.run_desc)
    run_dir = os.path.join(run_dir_root, submit_config.run_name)

    # Let makedirs detect an existing dir atomically rather than testing
    # os.path.exists first; callers still see the same RuntimeError.
    try:
        os.makedirs(run_dir)
    except FileExistsError as err:
        raise RuntimeError("The run dir already exists! ({0})".format(run_dir)) from err

    return run_dir
209
-
210
-
211
- def _get_next_run_id_local(run_dir_root: str) -> int:
212
- """Reads all directory names in a given directory (non-recursive) and returns the next (increasing) run id. Assumes IDs are numbers at the start of the directory names."""
213
- dir_names = [d for d in os.listdir(run_dir_root) if os.path.isdir(os.path.join(run_dir_root, d))]
214
- r = re.compile("^\\d+") # match one or more digits at the start of the string
215
- run_id = 0
216
-
217
- for dir_name in dir_names:
218
- m = r.match(dir_name)
219
-
220
- if m is not None:
221
- i = int(m.group())
222
- run_id = max(run_id, i + 1)
223
-
224
- return run_id
225
-
226
-
227
def _populate_run_dir(submit_config: SubmitConfig, run_dir: str) -> None:
    """Copy all necessary files into the run dir. Assumes that the dir exists, is local, and is writable.

    Always writes submit_config.pkl and submit_config.txt into run_dir. Unless
    the target is LOCAL with local.do_not_copy_source_files set, it also copies
    the run function's package, the dnnlib package and any run_dir_extra_files
    under run_dir/src, plus dnnlib's submission/internal/run.py as run_dir/run.py.

    Args:
        submit_config: Config describing the run; run_func_name must contain a '.'.
        run_dir: Existing directory to populate.
    """
    # Context managers make sure both config files are flushed and closed;
    # the pickle handle used to be left for the garbage collector.
    with open(os.path.join(run_dir, "submit_config.pkl"), "wb") as f:
        pickle.dump(submit_config, f)
    with open(os.path.join(run_dir, "submit_config.txt"), "w") as f:
        pprint.pprint(submit_config, stream=f, indent=4, width=200, compact=False)

    if (submit_config.submit_target == SubmitTarget.LOCAL) and submit_config.local.do_not_copy_source_files:
        return

    files = []

    run_func_module_dir_path = util.get_module_dir_by_obj_name(submit_config.run_func_name)
    assert '.' in submit_config.run_func_name
    # Go up one directory for every dot beyond the first in the run function name.
    for _ in range(submit_config.run_func_name.count('.') - 1):
        run_func_module_dir_path = os.path.dirname(run_func_module_dir_path)
    files += util.list_dir_recursively_with_ignore(run_func_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=False)

    dnnlib_module_dir_path = util.get_module_dir_by_obj_name("dnnlib")
    files += util.list_dir_recursively_with_ignore(dnnlib_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=True)

    files += submit_config.run_dir_extra_files

    # Re-root every (source, relative destination) pair under <run_dir>/src.
    files = [(f[0], os.path.join(run_dir, "src", f[1])) for f in files]
    files += [(os.path.join(dnnlib_module_dir_path, "submission", "internal", "run.py"), os.path.join(run_dir, "run.py"))]

    util.copy_files_and_create_dirs(files)
253
-
254
-
255
-
256
def run_wrapper(submit_config: SubmitConfig) -> SubmitConfig:
    """Wrap the actual run function call for handling logging, exceptions, typing, etc.

    Resolves submit_config.run_func_name to a callable and invokes it with
    submit_config.run_func_kwargs, also passing submit_config itself when the
    callable declares a 'submit_config' parameter. A "_finished.txt" marker is
    always created in the run dir.

    Args:
        submit_config: Fully populated config for the run.

    Returns:
        The submit_config that was passed in.

    Raises:
        Any exception raised by the run function when running locally. For
        non-local targets the traceback is printed, the log is copied to
        "<run_name>-error.txt" under the run dir root, and the process exits
        with status 1 via sys.exit after cleanup.
    """
    is_local = submit_config.submit_target == SubmitTarget.LOCAL

    # when running locally, redirect stderr to stdout, log stdout to a file, and force flushing
    if is_local:
        logger = util.Logger(file_name=os.path.join(submit_config.run_dir, "log.txt"), file_mode="w", should_flush=True)
    else:  # when running in a cluster, redirect stderr to stdout, and just force flushing (log writing is handled by run.sh)
        logger = util.Logger(file_name=None, should_flush=True)

    import dnnlib
    dnnlib.submit_config = submit_config

    exit_with_errcode = False
    try:
        print("dnnlib: Running {0}() on {1}...".format(submit_config.run_func_name, submit_config.host_name))
        start_time = time.time()

        run_func_obj = util.get_obj_by_name(submit_config.run_func_name)
        assert callable(run_func_obj)
        sig = inspect.signature(run_func_obj)
        if 'submit_config' in sig.parameters:
            run_func_obj(submit_config=submit_config, **submit_config.run_func_kwargs)
        else:
            run_func_obj(**submit_config.run_func_kwargs)

        print("dnnlib: Finished {0}() in {1}.".format(submit_config.run_func_name, util.format_time(time.time() - start_time)))
    except BaseException:
        # Deliberately catches everything (same reach as the former bare
        # `except:`): locally the error is re-raised untouched, elsewhere it is
        # recorded and converted into a non-zero exit code below.
        if is_local:
            raise

        traceback.print_exc()

        log_src = os.path.join(submit_config.run_dir, "log.txt")
        log_dst = os.path.join(get_path_from_template(submit_config.run_dir_root), "{0}-error.txt".format(submit_config.run_name))
        shutil.copyfile(log_src, log_dst)

        # Defer sys.exit(1) to happen after we close the logs and create a _finished.txt
        exit_with_errcode = True
    finally:
        open(os.path.join(submit_config.run_dir, "_finished.txt"), "w").close()

    # NOTE: this cleanup is intentionally outside `finally`; it is skipped when
    # a local run re-raises.
    dnnlib.RunContext.get().close()
    dnnlib.submit_config = None
    logger.close()

    # If we hit an error, get out of the script now and signal the error
    # to whatever process that started this script.
    if exit_with_errcode:
        sys.exit(1)

    return submit_config
308
-
309
-
310
def submit_run(submit_config: SubmitConfig, run_func_name: str, **run_func_kwargs):
    """Create a run dir, gather files related to the run, copy files to the run dir, and launch the run in appropriate place.

    The passed-in submit_config is deep-copied first, so the caller's object is
    not modified.

    Args:
        submit_config: Template config for the run.
        run_func_name: Dotted name of the function to run.
        **run_func_kwargs: Keyword arguments stored on the config for the run function.

    Returns:
        Whatever the target's submit() call returns.

    Raises:
        RuntimeError: If num_gpus is None or zero, if the run dir already
            exists, or if the generated task name contains characters the
            task-name regex does not accept.
    """
    submit_config = copy.deepcopy(submit_config)

    submit_target = submit_config.submit_target
    farm = None
    if submit_target == SubmitTarget.LOCAL:
        farm = internal.local.Target()
    assert farm is not None  # unknown target

    # Disallow submitting jobs with zero num_gpus.
    if (submit_config.num_gpus is None) or (submit_config.num_gpus == 0):
        raise RuntimeError("submit_config.num_gpus must be set to a non-zero value")

    if submit_config.user_name is None:
        submit_config.user_name = get_user_name()

    submit_config.run_func_name = run_func_name
    submit_config.run_func_kwargs = run_func_kwargs

    #--------------------------------------------------------------------
    # Prepare submission by populating the run dir
    #--------------------------------------------------------------------
    host_run_dir = _create_run_dir_local(submit_config)

    submit_config.task_name = "{0}-{1:05d}-{2}".format(submit_config.user_name, submit_config.run_id, submit_config.run_desc)
    docker_valid_name_regex = "^[a-zA-Z0-9][a-zA-Z0-9_.-]+$"
    # fullmatch (not match): with match, `$` would also accept a task name that
    # ends in a newline.
    if not re.fullmatch(docker_valid_name_regex, submit_config.task_name):
        raise RuntimeError("Invalid task name.  Probable reason: unacceptable characters in your submit_config.run_desc.  Task name must be accepted by the following regex: " + docker_valid_name_regex + ", got " + submit_config.task_name)

    # Farm specific preparations for a submit
    farm.finalize_submit_config(submit_config, host_run_dir)
    _populate_run_dir(submit_config, host_run_dir)
    return farm.submit(submit_config, host_run_dir)