Spaces:
Sleeping
Sleeping
Delete submission/submit.py
Browse files- submission/submit.py +0 -343
submission/submit.py
DELETED
@@ -1,343 +0,0 @@
|
|
1 |
-
# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
|
2 |
-
#
|
3 |
-
# This work is made available under the Nvidia Source Code License-NC.
|
4 |
-
# To view a copy of this license, visit
|
5 |
-
# https://nvlabs.github.io/stylegan2/license.html
|
6 |
-
|
7 |
-
"""Submit a function to be run either locally or in a computing cluster."""
|
8 |
-
|
9 |
-
import copy
|
10 |
-
import inspect
|
11 |
-
import os
|
12 |
-
import pathlib
|
13 |
-
import pickle
|
14 |
-
import platform
|
15 |
-
import pprint
|
16 |
-
import re
|
17 |
-
import shutil
|
18 |
-
import sys
|
19 |
-
import time
|
20 |
-
import traceback
|
21 |
-
|
22 |
-
from enum import Enum
|
23 |
-
|
24 |
-
from .. import util
|
25 |
-
from ..util import EasyDict
|
26 |
-
|
27 |
-
from . import internal
|
28 |
-
|
29 |
-
class SubmitTarget(Enum):
    """Enumerates the places where a submitted function can be executed.

    LOCAL: Execute the function on the local machine.
    """

    LOCAL = 1
|
35 |
-
|
36 |
-
|
37 |
-
class PathType(Enum):
    """Selects the style used when a path is rendered as a string.

    WINDOWS: Render the path in Windows style.
    LINUX: Render the path in Linux/Posix style.
    AUTO: Pick WINDOWS or LINUX based on the operating system currently running.
    """

    WINDOWS = 1
    LINUX = 2
    AUTO = 3
|
47 |
-
|
48 |
-
|
49 |
-
class PlatformExtras:
    """Grab bag of tuning values consumed by dnnlib heuristics.

    Attributes:
        data_reader_buffer_size: Size used by DataReader for its internal shared memory buffers.
        data_reader_process_count: How many worker processes to spawn (zero means single thread operation).
    """

    def __init__(self):
        # 2 ** 30 bytes, i.e. the "1 GB" default buffer size.
        self.data_reader_buffer_size = 2 ** 30
        # Zero selects the single threaded default.
        self.data_reader_process_count = 0
|
60 |
-
|
61 |
-
|
62 |
-
# Module-wide user name override. While this is not None, get_user_name()
# returns it instead of querying the operating system.
_user_name_override = None
|
63 |
-
|
64 |
-
class SubmitConfig(util.EasyDict):
    """Strongly typed config dict carrying everything needed to submit a run.

    Attributes:
        run_dir_root: Path to the run dir root. May contain template tags, so it must always be passed through get_path_from_template.
        run_desc: Description of the run. Used in the run dir name and the task name.
        run_dir_ignore: List of file patterns that are skipped when files are copied to the run dir.
        run_dir_extra_files: List of (abs_path, rel_path) tuples of file paths. rel_path is rooted at the src directory inside the run dir.
        submit_target: Submit target enum value. Selects where the run is actually launched.
        num_gpus: Number of GPUs used/requested for the run.
        print_info: Whether to print debug information when submitting.
        nvprof: Defaults to False. Not read anywhere in this file; presumably a profiler toggle -- TODO confirm.
        local: Options object for the local target (internal.local.TargetOptions).
        local.do_not_copy_source_files: Do not copy source files from the working directory to the run dir.
        datasets: Defaults to an empty list. Not read anywhere in this file -- TODO confirm intended contents.
        run_id: Automatically populated value during submit.
        run_name: Automatically populated value during submit.
        run_dir: Automatically populated value during submit.
        run_func_name: Automatically populated value during submit.
        run_func_kwargs: Automatically populated value during submit.
        user_name: Automatically populated value during submit. A value set by the user overrides the automatic one.
        task_name: Automatically populated value during submit.
        host_name: Automatically populated value during submit.
        platform_extras: Automatically populated values during submit. Used by various dnnlib libraries such as the DataReader class.
    """

    def __init__(self):
        super().__init__()

        # --- run settings (meant to be set by the caller) ---
        # Always resolve through get_path_from_template before use.
        self.run_dir_root = ""
        self.run_desc = ""
        self.run_dir_ignore = [
            "__pycache__",
            "*.pyproj",
            "*.sln",
            "*.suo",
            ".cache",
            ".idea",
            ".vs",
            ".vscode",
            "_cudacache",
        ]
        self.run_dir_extra_files = []

        # --- submit settings (meant to be set by the caller) ---
        self.submit_target = SubmitTarget.LOCAL
        self.num_gpus = 1
        self.print_info = False
        self.nvprof = False
        self.local = internal.local.TargetOptions()
        self.datasets = []

        # --- filled in automatically while submitting ---
        self.run_id = None
        self.run_name = None
        self.run_dir = None
        self.run_func_name = None
        self.run_func_kwargs = None
        self.user_name = None
        self.task_name = None
        self.host_name = "localhost"
        self.platform_extras = PlatformExtras()
|
114 |
-
|
115 |
-
|
116 |
-
def get_path_from_template(path_template: str, path_type: PathType = PathType.AUTO) -> str:
    """Expand the tags in a path template and format the result for the requested platform.

    Args:
        path_template: Path that may contain the "<USERNAME>" tag.
        path_type: Output style. AUTO resolves to WINDOWS or LINUX from platform.system().

    Returns:
        The expanded path as a Windows or Posix formatted string.

    Raises:
        RuntimeError: If AUTO is requested on an OS other than Windows/Linux, or
            if path_type is none of the known PathType values.
    """
    # Resolve AUTO to a concrete style based on the OS we are running on.
    if path_type == PathType.AUTO:
        styles_by_os = {"Windows": PathType.WINDOWS, "Linux": PathType.LINUX}
        path_type = styles_by_os.get(platform.system())
        if path_type is None:
            raise RuntimeError("Unknown platform")

    expanded = path_template.replace("<USERNAME>", get_user_name())

    # Pick the pure-path class that produces the requested formatting.
    if path_type == PathType.WINDOWS:
        pure_path_cls = pathlib.PureWindowsPath
    elif path_type == PathType.LINUX:
        pure_path_cls = pathlib.PurePosixPath
    else:
        raise RuntimeError("Unknown platform")

    return str(pure_path_cls(expanded))
|
136 |
-
|
137 |
-
|
138 |
-
def get_template_from_path(path: str) -> str:
    """Turn a regular path into its template form by using forward slashes only."""
    return path.replace("\\", "/")
|
142 |
-
|
143 |
-
|
144 |
-
def convert_path(path: str, path_type: PathType = PathType.AUTO) -> str:
    """Convert a normal path to a template and then convert it back to a normal path of the given path type."""
    return get_path_from_template(get_template_from_path(path), path_type)
|
149 |
-
|
150 |
-
|
151 |
-
def set_user_name_override(name: str) -> None:
    """Store `name` as the module-wide user name override.

    Args:
        name: Value to store in the module-level `_user_name_override` variable.
    """
    global _user_name_override
    _user_name_override = name
|
155 |
-
|
156 |
-
|
157 |
-
def get_user_name():
    """Get the current user name.

    Returns:
        The override set through the module-level `_user_name_override` when it
        is not None. Otherwise the login name on Windows, or the name of the
        effective user on Linux ("unknown" if that lookup fails).

    Raises:
        RuntimeError: If no override is set and the OS is neither Windows nor Linux.
    """
    if _user_name_override is not None:
        return _user_name_override

    system = platform.system()
    if system == "Windows":
        return os.getlogin()
    if system == "Linux":
        try:
            import pwd
            return pwd.getpwuid(os.geteuid()).pw_name
        except Exception:
            # Best-effort lookup: fall back to a placeholder on ordinary
            # failures, but let KeyboardInterrupt/SystemExit propagate
            # instead of swallowing them as a bare `except:` would.
            return "unknown"
    raise RuntimeError("Unknown platform")
|
171 |
-
|
172 |
-
|
173 |
-
def make_run_dir_path(*paths):
    """Build a path/filename located under the run_dir of the current submit.

    Args:
        *paths: Path components to be passed to os.path.join

    Returns:
        A file/dirname rooted at submit_config.run_dir. When there is no
        submit_config or no run_dir, the current working directory is used
        as the base directory instead.

        E.g., `os.path.join(dnnlib.submit_config.run_dir, "output.txt")`
    """
    import dnnlib

    active_config = dnnlib.submit_config
    if active_config is not None and active_config.run_dir is not None:
        base_dir = active_config.run_dir
    else:
        base_dir = os.getcwd()
    return os.path.join(base_dir, *paths)
|
190 |
-
|
191 |
-
|
192 |
-
def _create_run_dir_local(submit_config: SubmitConfig) -> str:
    """Create a new run dir with increasing ID number at the start.

    Populates `submit_config.run_id` and `submit_config.run_name` as a side effect.

    Args:
        submit_config: Config whose `run_dir_root` and `run_desc` determine the location and name.

    Returns:
        Path of the newly created run directory.

    Raises:
        RuntimeError: If the computed run dir already exists.
    """
    run_dir_root = get_path_from_template(submit_config.run_dir_root, PathType.AUTO)

    # exist_ok avoids the check-then-create race when several submits start at once.
    os.makedirs(run_dir_root, exist_ok=True)

    submit_config.run_id = _get_next_run_id_local(run_dir_root)
    submit_config.run_name = "{0:05d}-{1}".format(submit_config.run_id, submit_config.run_desc)
    run_dir = os.path.join(run_dir_root, submit_config.run_name)

    # Let makedirs itself detect a pre-existing directory, so a concurrent
    # submit that picked the same ID is reported with the documented error.
    try:
        os.makedirs(run_dir)
    except FileExistsError as err:
        raise RuntimeError("The run dir already exists! ({0})".format(run_dir)) from err

    return run_dir
|
209 |
-
|
210 |
-
|
211 |
-
def _get_next_run_id_local(run_dir_root: str) -> int:
|
212 |
-
"""Reads all directory names in a given directory (non-recursive) and returns the next (increasing) run id. Assumes IDs are numbers at the start of the directory names."""
|
213 |
-
dir_names = [d for d in os.listdir(run_dir_root) if os.path.isdir(os.path.join(run_dir_root, d))]
|
214 |
-
r = re.compile("^\\d+") # match one or more digits at the start of the string
|
215 |
-
run_id = 0
|
216 |
-
|
217 |
-
for dir_name in dir_names:
|
218 |
-
m = r.match(dir_name)
|
219 |
-
|
220 |
-
if m is not None:
|
221 |
-
i = int(m.group())
|
222 |
-
run_id = max(run_id, i + 1)
|
223 |
-
|
224 |
-
return run_id
|
225 |
-
|
226 |
-
|
227 |
-
def _populate_run_dir(submit_config: SubmitConfig, run_dir: str) -> None:
    """Copy all necessary files into the run dir. Assumes that the dir exists, is local, and is writable.

    Always writes `submit_config.pkl` and `submit_config.txt` into `run_dir`.
    Unless the local target asks to skip source copying, it also copies the
    run function's source tree, the dnnlib package and any extra files into
    `run_dir/src`, plus dnnlib's `run.py` into `run_dir`.

    Args:
        submit_config: Fully populated submit config (`run_func_name` must be a dotted name).
        run_dir: Existing, writable run directory.
    """
    # Use a context manager so the pickle file handle is closed deterministically.
    with open(os.path.join(run_dir, "submit_config.pkl"), "wb") as pkl_file:
        pickle.dump(submit_config, pkl_file)
    with open(os.path.join(run_dir, "submit_config.txt"), "w") as f:
        pprint.pprint(submit_config, stream=f, indent=4, width=200, compact=False)

    if (submit_config.submit_target == SubmitTarget.LOCAL) and submit_config.local.do_not_copy_source_files:
        return

    files = []

    # Start from the module dir of the run function and step up one directory
    # for every dot in the name beyond the first one.
    run_func_module_dir_path = util.get_module_dir_by_obj_name(submit_config.run_func_name)
    assert '.' in submit_config.run_func_name
    for _idx in range(submit_config.run_func_name.count('.') - 1):
        run_func_module_dir_path = os.path.dirname(run_func_module_dir_path)
    files += util.list_dir_recursively_with_ignore(run_func_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=False)

    dnnlib_module_dir_path = util.get_module_dir_by_obj_name("dnnlib")
    files += util.list_dir_recursively_with_ignore(dnnlib_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=True)

    files += submit_config.run_dir_extra_files

    # Everything gathered so far lands under <run_dir>/src; run.py goes to the run dir root.
    files = [(f[0], os.path.join(run_dir, "src", f[1])) for f in files]
    files += [(os.path.join(dnnlib_module_dir_path, "submission", "internal", "run.py"), os.path.join(run_dir, "run.py"))]

    util.copy_files_and_create_dirs(files)
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
def run_wrapper(submit_config: SubmitConfig) -> SubmitConfig:
    """Wrap the actual run function call for handling logging, exceptions, typing, etc.

    Args:
        submit_config: Populated submit config; `run_func_name`, `run_func_kwargs`
            and `run_dir` must be set.

    Returns:
        The same `submit_config` object that was passed in.

    Raises:
        BaseException: When running locally, any exception raised by the run
            function is re-raised after cleanup.
        SystemExit: With exit code 1 when a non-local run fails.
    """
    is_local = submit_config.submit_target == SubmitTarget.LOCAL

    # when running locally, redirect stderr to stdout, log stdout to a file, and force flushing
    if is_local:
        logger = util.Logger(file_name=os.path.join(submit_config.run_dir, "log.txt"), file_mode="w", should_flush=True)
    else:  # when running in a cluster, redirect stderr to stdout, and just force flushing (log writing is handled by run.sh)
        logger = util.Logger(file_name=None, should_flush=True)

    import dnnlib
    dnnlib.submit_config = submit_config

    exit_with_errcode = False
    try:
        print("dnnlib: Running {0}() on {1}...".format(submit_config.run_func_name, submit_config.host_name))
        start_time = time.time()

        run_func_obj = util.get_obj_by_name(submit_config.run_func_name)
        assert callable(run_func_obj)
        # Pass submit_config along only if the run function declares a parameter for it.
        sig = inspect.signature(run_func_obj)
        if 'submit_config' in sig.parameters:
            run_func_obj(submit_config=submit_config, **submit_config.run_func_kwargs)
        else:
            run_func_obj(**submit_config.run_func_kwargs)

        print("dnnlib: Finished {0}() in {1}.".format(submit_config.run_func_name, util.format_time(time.time() - start_time)))
    except BaseException:
        # Deliberately catches everything (same reach as a bare `except:`):
        # local runs re-raise, non-local runs are turned into exit code 1.
        if is_local:
            raise
        else:
            traceback.print_exc()

            log_src = os.path.join(submit_config.run_dir, "log.txt")
            log_dst = os.path.join(get_path_from_template(submit_config.run_dir_root), "{0}-error.txt".format(submit_config.run_name))
            shutil.copyfile(log_src, log_dst)

            # Defer sys.exit(1) to happen after we close the logs and create a _finished.txt
            exit_with_errcode = True
    finally:
        # Create (or truncate) the empty marker file.
        with open(os.path.join(submit_config.run_dir, "_finished.txt"), "w"):
            pass

        dnnlib.RunContext.get().close()
        dnnlib.submit_config = None
        logger.close()

    # If we hit an error, get out of the script now and signal the error
    # to whatever process that started this script.
    if exit_with_errcode:
        sys.exit(1)

    return submit_config
|
308 |
-
|
309 |
-
|
310 |
-
def submit_run(submit_config: SubmitConfig, run_func_name: str, **run_func_kwargs) -> None:
    """Create a run dir, gather the files related to the run, copy them to the run dir, and launch the run.

    The caller's config is never modified; all work happens on a deep copy.

    Args:
        submit_config: Template config for the run.
        run_func_name: Dotted name of the function to run.
        **run_func_kwargs: Keyword arguments stored on the config for the run function.

    Returns:
        Whatever the target's `submit` call returns.
        NOTE(review): the signature is annotated `-> None`, yet the result of
        `farm.submit(...)` is forwarded -- confirm the intended return type.

    Raises:
        RuntimeError: If `num_gpus` is None or zero, or if the generated task name is invalid.
    """
    submit_config = copy.deepcopy(submit_config)

    # Pick the backend for the requested target; only LOCAL is known here.
    farm = internal.local.Target() if submit_config.submit_target == SubmitTarget.LOCAL else None
    assert farm is not None  # unknown target

    # Disallow submitting jobs with zero num_gpus.
    gpu_count = submit_config.num_gpus
    if gpu_count is None or gpu_count == 0:
        raise RuntimeError("submit_config.num_gpus must be set to a non-zero value")

    # A user name already present on the config wins over the detected one.
    if submit_config.user_name is None:
        submit_config.user_name = get_user_name()

    submit_config.run_func_name = run_func_name
    submit_config.run_func_kwargs = run_func_kwargs

    #--------------------------------------------------------------------
    # Prepare submission by populating the run dir
    #--------------------------------------------------------------------
    host_run_dir = _create_run_dir_local(submit_config)

    submit_config.task_name = "{0}-{1:05d}-{2}".format(
        submit_config.user_name,
        submit_config.run_id,
        submit_config.run_desc,
    )
    docker_valid_name_regex = "^[a-zA-Z0-9][a-zA-Z0-9_.-]+$"
    if re.match(docker_valid_name_regex, submit_config.task_name) is None:
        raise RuntimeError(
            "Invalid task name. Probable reason: unacceptable characters in your submit_config.run_desc. Task name must be accepted by the following regex: "
            + docker_valid_name_regex
            + ", got "
            + submit_config.task_name
        )

    # Farm specific preparations for a submit
    farm.finalize_submit_config(submit_config, host_run_dir)
    _populate_run_dir(submit_config, host_run_dir)
    return farm.submit(submit_config, host_run_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|