Himanshu Goyal committed
Commit 3dd0091 · 1 Parent(s): a84fcb7

feat: mlflow logging integration (#1773)

docs/mlflow_integration.md ADDED
@@ -0,0 +1,65 @@
## MLflow Integration
YOLOX now supports MLflow integration. MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It is designed to work with any ML library, algorithm, deployment tool, or language, and can be used to track experiments, metrics, and parameters, and to log and visualize model artifacts. \
For more information, please refer to the [MLflow documentation](https://www.mlflow.org/docs/latest/index.html).

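If you are new to MLflow, the short sketch below illustrates the basic tracking calls this integration builds on; the tracking URI, experiment name, and metric names are illustrative placeholders, not values required by YOLOX.

```python
import mlflow

# Illustrative values; point these at your own tracking server and experiment.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("yolox-demo")  # created automatically if it does not exist

with mlflow.start_run(run_name="smoke-test"):
    mlflow.log_params({"max_epoch": 300, "input_size": "(640, 640)"})
    mlflow.log_metric("val/COCOAP50", 0.45, step=1)
```
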
## Follow these steps to start logging your experiments to MLflow

### Step-1: Install MLflow via pip
```bash
pip install mlflow python-dotenv
```

### Step-2: Set up an MLflow tracking server
Start or connect to an MLflow tracking server (local, self-hosted, or a managed service such as Databricks). You can start a local tracking server by running the following command:
```bash
mlflow server --host 127.0.0.1 --port 8080
```
Read more about setting up an MLflow tracking server [here](https://mlflow.org/docs/latest/tracking/server.html#mlflow-tracking-server).

### Step-3: Set up MLflow environment variables
Set the following environment variables in your `.env` file:
```bash
MLFLOW_TRACKING_URI="http://127.0.0.1:8080"  # set to your mlflow server URI
MLFLOW_EXPERIMENT_NAME="/path/to/experiment"  # set to your experiment name
MLFLOW_TAGS='{"release.candidate": "DEV1", "release.version": "0.0.0"}'
# config related to logging the model to mlflow as pyfunc
YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS="True"  # whether to log the model (best or historical) or not
YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS=30  # log the model only after every n epochs
YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS="False"  # whether to log the per-epoch model along with best_ckpt or not
YOLOX_MLFLOW_RUN_NAME=""  # give a custom name to your run, otherwise mlflow assigns a random name
YOLOX_MLFLOW_FLATTEN_PARAMS="True"  # flatten nested dict params into simple key/value pairs before logging

MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=True  # log system metrics such as GPU usage
MLFLOW_NESTED_RUN="False"  # whether to run as a nested run of the given run_id
MLFLOW_RUN_ID=""  # continue training from a given run_id
```
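For reference, the `MlflowLogger` added in this commit reads these variables with python-dotenv roughly as sketched below (values shown are examples; `TRUE_VALUES` is a local helper for the sketch, not part of the YOLOX API):

```python
import json
import os

from dotenv import find_dotenv, load_dotenv

# Load the nearest .env file, as yolox/utils/mlflow_logger.py does during setup.
load_dotenv(find_dotenv())

TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
log_artifacts = os.getenv("YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS", "False").upper() in TRUE_VALUES
log_every_n_epochs = int(os.getenv("YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS", 30))
tags = json.loads(os.getenv("MLFLOW_TAGS", "{}"))  # MLFLOW_TAGS must therefore be valid JSON

print(log_artifacts, log_every_n_epochs, tags)
```
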
### Step-4: Provide --logger "mlflow" to the training script
```bash
python tools/train.py -l mlflow -f exps/path/to/exp.py -d 1 -b 8 --fp16 -o -c pre_trained_model/<model>.pth
# note the -l mlflow flag
# one working example:
python tools/train.py -l mlflow -f exps/example/custom/yolox_s.py -d 1 -b 8 --fp16 -o -c pre_trained_model/yolox_s.pth
```

### Step-5 (optional): Start the MLflow UI and track your experiments
If you log runs to a local `mlruns` directory, run the following command in the directory above it, then open http://127.0.0.1:5000 in your browser.

```bash
mlflow ui --port 5000
```
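Besides the UI, logged runs can be queried programmatically; a minimal sketch (the experiment name is a placeholder, and `mlflow.search_runs` returns a pandas DataFrame):

```python
import mlflow

# Match the tracking URI your runs were logged to (server URL, or local ./mlruns).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
runs = mlflow.search_runs(experiment_names=["yolox-demo"])
print(runs[["run_id", "status", "start_time"]].head())
```
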

## Optional Databricks Integration

### Step-1: Install the Databricks SDK
```bash
pip install databricks-sdk
```

### Step-2: Set up Databricks environment variables
Set the following environment variables in your `.env` file:
```bash
MLFLOW_TRACKING_URI="databricks"  # set to databricks
MLFLOW_EXPERIMENT_NAME="/Users/<user>/<experiment_name>/"
DATABRICKS_HOST="https://dbc-1234567890123456.cloud.databricks.com"  # set to your workspace URL
DATABRICKS_TOKEN="dapixxxxxxxxxxxxx"
```
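Before launching a long training run against Databricks, a quick connectivity check along these lines can save time (the experiment path is the placeholder from above; assumes a recent MLflow where `set_experiment` returns the experiment object):

```python
import mlflow

# Uses DATABRICKS_HOST and DATABRICKS_TOKEN from the environment / .env file.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment("/Users/<user>/<experiment_name>/")
print(experiment.experiment_id)
```
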
tools/train.py CHANGED
@@ -85,7 +85,7 @@ def make_parser():
         "--logger",
         type=str,
         help="Logger to be used for metrics. \
-        Implemented loggers include `tensorboard` and `wandb`.",
+        Implemented loggers include `tensorboard`, `mlflow` and `wandb`.",
         default="tensorboard"
     )
     parser.add_argument(
yolox/core/trainer.py CHANGED
@@ -14,6 +14,7 @@ from yolox.data import DataPrefetcher
 from yolox.exp import Exp
 from yolox.utils import (
     MeterBuffer,
+    MlflowLogger,
     ModelEMA,
     WandbLogger,
     adjust_status,
@@ -74,7 +75,8 @@ class Trainer:
         self.before_train()
         try:
             self.train_in_epoch()
-        except Exception:
+        except Exception as e:
+            logger.error("Exception in training: {}", e)
             raise
         finally:
             self.after_train()
@@ -185,8 +187,11 @@ class Trainer:
                     self.exp,
                     self.evaluator.dataloader.dataset
                 )
+            elif self.args.logger == "mlflow":
+                self.mlflow_logger = MlflowLogger()
+                self.mlflow_logger.setup(args=self.args, exp=self.exp)
             else:
-                raise ValueError("logger must be either 'tensorboard' or 'wandb'")
+                raise ValueError("logger must be either 'tensorboard', 'mlflow' or 'wandb'")

         logger.info("Training start...")
         logger.info("\n{}".format(model))
@@ -198,6 +203,16 @@ class Trainer:
         if self.rank == 0:
             if self.args.logger == "wandb":
                 self.wandb_logger.finish()
+            elif self.args.logger == "mlflow":
+                metadata = {
+                    "epoch": self.epoch + 1,
+                    "input_size": self.input_size,
+                    'start_ckpt': self.args.ckpt,
+                    'exp_file': self.args.exp_file,
+                    "best_ap": float(self.best_ap)
+                }
+                self.mlflow_logger.on_train_end(self.args, file_name=self.file_name,
+                                                metadata=metadata)

     def before_epoch(self):
         logger.info("---> start train epoch{}".format(self.epoch + 1))
@@ -276,6 +291,10 @@ class Trainer:
                         "train/lr": self.meter["lr"].latest
                     })
                     self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
+                if self.args.logger == 'mlflow':
+                    logs = {"train/" + k: v.latest for k, v in loss_meter.items()}
+                    logs.update({"train/lr": self.meter["lr"].latest})
+                    self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)

             self.meter.clear_meters()

@@ -351,6 +370,14 @@ class Trainer:
                     "train/epoch": self.epoch + 1,
                 })
                 self.wandb_logger.log_images(predictions)
+            if self.args.logger == "mlflow":
+                logs = {
+                    "val/COCOAP50": ap50,
+                    "val/COCOAP50_95": ap50_95,
+                    "val/best_ap": round(self.best_ap, 3),
+                    "train/epoch": self.epoch + 1,
+                }
+                self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)
             logger.info("\n" + summary)
         synchronize()

@@ -358,6 +385,17 @@ class Trainer:
         if self.save_history_ckpt:
             self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)

+        if self.args.logger == "mlflow":
+            metadata = {
+                "epoch": self.epoch + 1,
+                "input_size": self.input_size,
+                'start_ckpt': self.args.ckpt,
+                'exp_file': self.args.exp_file,
+                "best_ap": float(self.best_ap)
+            }
+            self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch,
+                                                metadata, update_best_ckpt)
+
     def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
         if self.rank == 0:
             save_model = self.ema_model.ema if self.use_model_ema else self.model
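The trainer now dispatches to the MLflow logger through four hooks: `setup`, `on_log`, `save_checkpoints`, and `on_train_end`. For anyone wiring up a different backend, a minimal illustrative stub with the same call signatures (not part of YOLOX) could look like this:

```python
from loguru import logger


class NoOpExperimentLogger:
    """Illustrative stand-in exposing the hooks Trainer calls on the mlflow logger."""

    def setup(self, args, exp):
        logger.info("setting up logging for experiment {}", args.experiment_name)

    def on_log(self, args, exp, step, logs):
        logger.info("step {}: {}", step, logs)

    def save_checkpoints(self, args, exp, file_name, epoch, metadata, update_best_ckpt):
        logger.info("epoch {} checkpoints under {} (best updated: {})",
                    epoch + 1, file_name, update_best_ckpt)

    def on_train_end(self, args, file_name, metadata):
        logger.info("training finished, metadata: {}", metadata)
```
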
yolox/utils/__init__.py CHANGED
@@ -11,6 +11,7 @@ from .ema import *
 from .logger import WandbLogger, setup_logger
 from .lr_scheduler import LRScheduler
 from .metric import *
+from .mlflow_logger import MlflowLogger
 from .model_utils import *
 from .setup_env import *
 from .visualize import *
yolox/utils/mlflow_logger.py ADDED
@@ -0,0 +1,420 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Megvii Inc. All rights reserved.
3
+ # Please read docs/mlflow_integration.md for more details.
4
+ """
5
+ Logging training runs with hyperparameters, datasets and trained models to MLflow.
6
+ MLflow supports model tracking, experiment tracking, and a model registry.
7
+ It can be hosted on-premises, on any major cloud provider, or on Databricks.
8
+ Please read docs/mlflow_integration.md for more details.
9
+
10
+ To change the default logging behaviour, you can set MLflow environment variables:
11
+ https://mlflow.org/docs/latest/python_api/mlflow.environment_variables.html
12
+
13
+ For more information, please refer to:
14
+ https://mlflow.org/docs/latest/introduction/index.html
15
+ """
16
+ import importlib.metadata
17
+ import importlib.util
18
+ import json
19
+ import os
20
+ from collections.abc import MutableMapping
21
+ import packaging.version
22
+ from loguru import logger
23
+
24
+ import torch
25
+
26
+ from yolox.utils import is_main_process
27
+
28
+
29
+ class MlflowLogger:
30
+ """
31
+ Main MLflow logging class to log hyperparameters, metrics, and models to MLflow.
32
+ """
33
+ def __init__(self):
34
+ if not self.is_required_library_available():
35
+ raise RuntimeError(
36
+ "MLflow Logging requires mlflow and python-dotenv to be installed. "
37
+ "Run `pip install mlflow python-dotenv`.")
38
+
39
+ import mlflow
40
+ from dotenv import find_dotenv, load_dotenv
41
+ load_dotenv(find_dotenv())
42
+ self.ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
43
+ self._MAX_PARAM_VAL_LENGTH = mlflow.utils.validation.MAX_PARAM_VAL_LENGTH
44
+ self._MAX_PARAMS_TAGS_PER_BATCH = mlflow.utils.validation.MAX_PARAMS_TAGS_PER_BATCH
45
+ self._initialized = False
46
+ self._auto_end_run = False
47
+ self.best_ckpt_upload_pending = False
48
+ self._tracking_uri = None
49
+ self._experiment_name = None
50
+ self._mlflow_log_artifacts = None
51
+ self._mlflow_log_model_per_n_epochs = None
52
+ self._mlflow_log_nth_epoch_models = None
53
+ self.run_name = None
54
+ self._flatten_params = None
55
+ self._nested_run = None
56
+ self._run_id = None
57
+ self._async_log = None
58
+ self._ml_flow = mlflow
59
+
60
+ def is_required_library_available(self):
61
+ """
62
+ Check whether the required libraries (mlflow, python-dotenv) are available.
63
+
64
+ Args: None
65
+
66
+ Returns:
67
+ bool: True if required libraries are available, False otherwise.
68
+ """
69
+ dotenv_available = importlib.util.find_spec("dotenv") is not None
70
+ mlflow_available = importlib.util.find_spec("mlflow") is not None
71
+ return dotenv_available and mlflow_available
72
+
73
+ def flatten_dict(self, d: MutableMapping, parent_key: str = "", delimiter: str = "."):
74
+ """
75
+ Flatten a nested dict into a single level dict.
76
+
77
+ Args:
78
+ d(MutableMapping): nested dictionary
79
+ parent_key(str): parent key
80
+ delimiter(str): delimiter to use
81
+
82
+ Returns:
83
+ flattened_dict(dict): flattened dictionary
84
+
85
+ """
86
+
87
+ def _flatten_dict(d, parent_key="", delimiter="."):
88
+ for k, v in d.items():
89
+ key = str(parent_key) + delimiter + str(k) if parent_key else k
90
+ if v and isinstance(v, MutableMapping):
91
+ yield from self.flatten_dict(v, key, delimiter=delimiter).items()
92
+ else:
93
+ yield key, v
94
+
95
+ return dict(_flatten_dict(d, parent_key, delimiter))
96
+
97
+ def setup(self, args, exp):
98
+ """
99
+ Set up the optional MLflow integration.
100
+
101
+ Args:
102
+ args(dict): training args dictionary
103
+ exp(dict): Experiment related hyperparameters
104
+
105
+ Returns:
106
+ None
107
+
108
+ Environment:
109
+ - **YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS** (`str`, *optional*, defaults to `False`):
110
+ Whether to use MLflow `.log_artifact()` facility to log artifacts. This only makes
111
+ sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or *1*,
112
+ will copy each saved checkpoint from the experiment output dir to the
113
+ local or remote artifact storage. Using it without a remote storage will just copy the
114
+ files to your artifact location.
115
+ - **YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS** (`int`, *optional*, defaults to 30):
116
+ If ``YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS`` is enabled then Log model checkpoints after
117
+ every n epochs. Default is 30. ``best_ckpt.pth`` will be updated after `n` epochs if
118
+ it has been updated during last `n` epochs.
119
+ - **YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS** (`str`, *optional*, defaults to `False`):
120
+ Whether to log the ``epoch_n_ckpt.pth`` models along with best_ckpt.pth model after
121
+ every `n` epoch as per YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS.
122
+ If set to `True` or *1*, will log ``epoch_n_ckpt.pth`` along with
123
+ ``best_ckpt.pth`` and as mlflow artifacts in different folders.
124
+ - **YOLOX_MLFLOW_RUN_NAME** (`str`, *optional*, defaults to random name):
125
+ Name of new run. Used only when ``run_id`` is unspecified. If a new run is
126
+ created and ``run_name`` is not specified, a random name will be generated for the run.
127
+ - **YOLOX_MLFLOW_FLATTEN_PARAMS** (`str`, *optional*, defaults to `False`):
128
+ Whether to flatten the parameters dictionary before logging.
129
+ - **MLFLOW_TRACKING_URI** (`str`, *optional*):
130
+ Whether to store runs at a specific path or remote server. Unset by default, which
131
+ skips setting the tracking URI entirely.
132
+ - **MLFLOW_EXPERIMENT_NAME** (`str`, *optional*, defaults to `None`):
133
+ Whether to use an MLflow experiment_name under which to launch the run. Default to
134
+ `None` which will point to the `Default` experiment in MLflow. Otherwise, it is a
135
+ case-sensitive name of the experiment to be activated. If an experiment with this
136
+ name does not exist, a new experiment with this name is created.
137
+ - **MLFLOW_TAGS** (`str`, *optional*):
138
+ A string dump of a dictionary of key/value pair to be added to the MLflow run as tags.
139
+ Example: `os.environ['MLFLOW_TAGS']=
140
+ '{"release.candidate": "RC1", "release.version": "2.2.0"}'`.
141
+ - **MLFLOW_NESTED_RUN** (`str`, *optional*):
142
+ Whether to use MLflow nested runs. If set to `True` or *1*, will create a nested run
143
+ inside the current run.
144
+ - **MLFLOW_RUN_ID** (`str`, *optional*):
145
+ Allow to reattach to an existing run which can be useful when resuming training from a
146
+ checkpoint. When `MLFLOW_RUN_ID` environment variable is set, `start_run` attempts
147
+ to resume a run with the specified run ID and other parameters are ignored.
148
+ - Other MLflow environment variables: For changing default logging Behaviour refer mlflow
149
+ environment variables:
150
+ https://mlflow.org/docs/latest/python_api/mlflow.environment_variables.html
151
+ - Setup ``Databricks`` integration with MLflow: Provide these two environment variables:
152
+ DATABRICKS_HOST="https://adb-4273978218682429.9.azuredatabricks.net"
153
+ DATABRICKS_TOKEN="dapixxxxxxxxxxxxx"
154
+ """
155
+ self._tracking_uri = os.getenv("MLFLOW_TRACKING_URI", None)
156
+ self._experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", None)
157
+ self._mlflow_log_artifacts = os.getenv("YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS",
158
+ "False").upper() in self.ENV_VARS_TRUE_VALUES
159
+ self._mlflow_log_model_per_n_epochs = int(os.getenv(
160
+ "YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS", 30))
161
+
162
+ self._mlflow_log_nth_epoch_models = os.getenv("YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS",
163
+ "False").upper() in self.ENV_VARS_TRUE_VALUES
164
+ self.run_name = os.getenv("YOLOX_MLFLOW_RUN_NAME", None)
165
+ self.run_name = None if len(self.run_name.strip()) == 0 else self.run_name
166
+ self._flatten_params = os.getenv("YOLOX_MLFLOW_FLATTEN_PARAMS",
167
+ "FALSE").upper() in self.ENV_VARS_TRUE_VALUES
168
+ self._nested_run = os.getenv("MLFLOW_NESTED_RUN",
169
+ "FALSE").upper() in self.ENV_VARS_TRUE_VALUES
170
+ self._run_id = os.getenv("MLFLOW_RUN_ID", None)
171
+
172
+ # "synchronous" flag is only available with mlflow version >= 2.8.0
173
+ # https://github.com/mlflow/mlflow/pull/9705
174
+ # https://github.com/mlflow/mlflow/releases/tag/v2.8.0
175
+ self._async_log = packaging.version.parse(
176
+ self._ml_flow.__version__) >= packaging.version.parse("2.8.0")
177
+
178
+ logger.debug(
179
+ f"MLflow experiment_name={self._experiment_name}, run_name={self.run_name}, "
180
+ f"nested={self._nested_run}, tags={self._nested_run}, tracking_uri={self._tracking_uri}"
181
+ )
182
+ if is_main_process():
183
+ if not self._ml_flow.is_tracking_uri_set():
184
+ if self._tracking_uri:
185
+ self._ml_flow.set_tracking_uri(self._tracking_uri)
186
+ logger.debug(f"MLflow tracking URI is set to {self._tracking_uri}")
187
+ else:
188
+ logger.debug(
189
+ "Environment variable `MLFLOW_TRACKING_URI` is not provided and therefore"
190
+ " will not be explicitly set."
191
+ )
192
+ else:
193
+ logger.debug(f"MLflow tracking URI is set to {self._ml_flow.get_tracking_uri()}")
194
+
195
+ if self._ml_flow.active_run() is None or self._nested_run or self._run_id:
196
+ if self._experiment_name:
197
+ # set_experiment() ensures the experiment is created if it does not exist
198
+ self._ml_flow.set_experiment(self._experiment_name)
199
+ self._ml_flow.start_run(run_name=self.run_name, nested=self._nested_run)
200
+ logger.debug(
201
+ f"MLflow run started with run_id={self._ml_flow.active_run().info.run_id}")
202
+ self._auto_end_run = True
203
+ self._initialized = True
204
+ # keep only these keys from args as logged params
205
+ keys = ['experiment_name', 'batch_size', 'exp_file', 'resume', 'ckpt', 'start_epoch',
206
+ 'num_machines', 'fp16', 'logger']
207
+ combined_dict = {k: v for k, v in vars(args).items() if k in keys}
208
+ if exp is not None:
209
+ exp_dict = self.convert_exp_todict(exp)
210
+ combined_dict = {**exp_dict, **combined_dict}
211
+ self.log_params_mlflow(combined_dict)
212
+ mlflow_tags = os.getenv("MLFLOW_TAGS", None)
213
+ if mlflow_tags:
214
+ mlflow_tags = json.loads(mlflow_tags)
215
+ self._ml_flow.set_tags(mlflow_tags)
216
+
217
+ def log_params_mlflow(self, params_dict):
218
+ """
219
+ Log hyperparameters to MLflow.
220
+ MLflow's log_param() only accepts values no longer than 250 characters.
221
+ No overwriting of existing parameters is allowed by default from mlflow.
222
+
223
+ Args:
224
+ params_dict(dict): dict of hyperparameters
225
+
226
+ Returns:
227
+ None
228
+ """
229
+ if is_main_process():
230
+ params_dict = self.flatten_dict(params_dict) if self._flatten_params else params_dict
231
+ # remove params that are too long for MLflow
232
+ for name, value in list(params_dict.items()):
233
+ # internally, all values are converted to str in MLflow
234
+ if len(str(value)) > self._MAX_PARAM_VAL_LENGTH:
235
+ logger.warning(
236
+ f'Trainer is attempting to log a value of "{value}" for key "{name}" as a '
237
+ f'parameter. MLflow\'s log_param() only accepts values no longer than 250 '
238
+ f'characters so we dropped this attribute. You can use '
239
+ f'`YOLOX_MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters '
240
+ f'and avoid this message.'
241
+ )
242
+ del params_dict[name]
243
+ # MLflow cannot log more than 100 values in one go, so we have to split it
244
+ combined_dict_items = list(params_dict.items())
245
+ for i in range(0, len(combined_dict_items), self._MAX_PARAMS_TAGS_PER_BATCH):
246
+ if self._async_log:
247
+ self._ml_flow.log_params(
248
+ dict(combined_dict_items[i: i + self._MAX_PARAMS_TAGS_PER_BATCH]),
249
+ synchronous=False
250
+ )
251
+ else:
252
+ self._ml_flow.log_params(
253
+ dict(combined_dict_items[i: i + self._MAX_PARAMS_TAGS_PER_BATCH])
254
+ )
255
+
256
+ def convert_exp_todict(self, exp):
257
+ """
258
+ Convert the experiment object to a dict containing only the required parameters.
259
+
260
+ Args:
261
+ exp(dict): Experiment object
262
+
263
+ Returns:
264
+ exp_dict(dict): dict of experiment parameters
265
+
266
+ """
267
+ filter_keys = ['max_epoch', 'num_classes', 'input_size', 'output_dir',
268
+ 'data_dir', 'train_ann', 'val_ann', 'test_ann',
269
+ 'test_conf', 'nmsthre']
270
+ exp_dict = {k: v for k, v in exp.__dict__.items()
271
+ if not k.startswith("__") and k in filter_keys}
272
+ return exp_dict
273
+
274
+ def on_log(self, args, exp, step, logs):
275
+ """
276
+ Log metrics to MLflow.
277
+
278
+ Args:
279
+ args(dict): training args dictionary
280
+ exp(dict): Experiment related hyperparameters
281
+ step(int): current training step
282
+ logs(dict): dictionary of logs to be logged
283
+
284
+ Returns:
285
+ None
286
+ """
287
+ # step = trainer.progress_in_iter
288
+ if not self._initialized:
289
+ self.setup(args, exp)
290
+ if is_main_process(): # master thread only
291
+ metrics = {}
292
+ for k, v in logs.items():
293
+ if isinstance(v, (int, float)):
294
+ metrics[k] = v
295
+ elif isinstance(v, torch.Tensor) and v.numel() == 1:
296
+ metrics[k] = v.item()
297
+ else:
298
+ logger.warning(
299
+ f'Trainer is attempting to log a value of "{v}" of type {type(v)} for key '
300
+ f'"{k}" as a metric. MLflow log_metric() only accepts float and int types '
301
+ f'so we dropped this attribute.'
302
+ )
303
+
304
+ if self._async_log:
305
+ self._ml_flow.log_metrics(metrics=metrics, step=step, synchronous=False)
306
+ else:
307
+ self._ml_flow.log_metrics(metrics=metrics, step=step)
308
+
309
+ def on_train_end(self, args, file_name, metadata):
310
+ """
311
+ MLflow logging actions to take when training ends:
312
+ 1. log the training log file
313
+ 2. log the latest best model as a pyfunc artifact if model logging is enabled
314
+ 3. close the mlflow run
315
+
316
+ Args:
317
+ args(dict): training args dictionary
318
+ file_name(str): output directory
319
+ metadata(dict): model related metadata
320
+
321
+ Returns:
322
+ None
323
+ """
324
+ if is_main_process() and self._initialized:
325
+ self.save_log_file(args, file_name)
326
+ if self.best_ckpt_upload_pending:
327
+ model_file_name = "best_ckpt"
328
+ mlflow_out_dir = f"{args.experiment_name}/{model_file_name}"
329
+ artifact_path = os.path.join(file_name, f"{model_file_name}.pth")
330
+ self.mlflow_save_pyfunc_model(metadata, artifact_path, mlflow_out_dir)
331
+ if self._auto_end_run and self._ml_flow.active_run():
332
+ self._ml_flow.end_run()
333
+
334
+ def save_log_file(self, args, file_name):
335
+ """
336
+ Save the training log file to mlflow artifact path
337
+ Args:
338
+ args(dict): training args dictionary
339
+ file_name(str): output directory
340
+
341
+ Returns:
342
+ None
343
+ """
344
+ log_file_path = os.path.join(file_name, "train_log.txt")
345
+ mlflow_out_dir = f"{args.experiment_name}"
346
+ logger.info(f"Logging logfile: {log_file_path} in mlflow artifact path: {mlflow_out_dir}.")
347
+ self._ml_flow.log_artifact(log_file_path, mlflow_out_dir)
348
+
349
+ def save_checkpoints(self, args, exp, file_name, epoch, metadata, update_best_ckpt):
350
+ """
351
+ Save the model checkpoints to mlflow artifact path
352
+ (per-epoch checkpoints are also logged if save_history_ckpt and YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS are enabled)
353
+
354
+ Args:
355
+ args(dict): training args dictionary
356
+ exp(dict): Experiment related hyperparameters
357
+ file_name(str): output directory
358
+ epoch(int): current epoch
359
+ metadata(dict): model related metadata
360
+ update_best_ckpt(bool): bool to show if best_ckpt was updated
361
+
362
+ Returns:
363
+ None
364
+ """
365
+ if is_main_process() and self._mlflow_log_artifacts:
366
+ if update_best_ckpt:
367
+ self.best_ckpt_upload_pending = True
368
+ if ((epoch + 1) % self._mlflow_log_model_per_n_epochs) == 0:
369
+ self.save_log_file(args, file_name)
370
+ if self.best_ckpt_upload_pending:
371
+ model_file_name = "best_ckpt"
372
+ mlflow_out_dir = f"{args.experiment_name}/{model_file_name}"
373
+ artifact_path = os.path.join(file_name, f"{model_file_name}.pth")
374
+ self.mlflow_save_pyfunc_model(metadata, artifact_path, mlflow_out_dir)
375
+ self.best_ckpt_upload_pending = False
376
+ if self._mlflow_log_nth_epoch_models and exp.save_history_ckpt:
377
+ model_file_name = f"epoch_{epoch + 1}_ckpt"
378
+ mlflow_out_dir = f"{args.experiment_name}/hist_epochs/{model_file_name}"
379
+ artifact_path = os.path.join(file_name, f"{model_file_name}.pth")
380
+ self.mlflow_save_pyfunc_model(metadata, artifact_path, mlflow_out_dir)
381
+
382
+ def mlflow_save_pyfunc_model(self, metadata, artifact_path, mlflow_out_dir):
383
+ """
384
+ This sends the given model to the mlflow server if YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS is true
385
+ - optionally publish to model registry if allowed in config file
386
+
387
+ Args:
388
+ metadata(dict): model related metadata
389
+ artifact_path(str): model checkpoint path
390
+ mlflow_out_dir(str): mlflow artifact path
391
+
392
+ Returns:
393
+ None
394
+ """
395
+ if is_main_process() and self._initialized and self._mlflow_log_artifacts:
396
+ logger.info(
397
+ f"Logging checkpoint {artifact_path} artifacts in mlflow artifact path: "
398
+ f"{mlflow_out_dir}. This may take time.")
399
+ if os.path.exists(artifact_path):
400
+ self._ml_flow.pyfunc.log_model(
401
+ mlflow_out_dir,
402
+ artifacts={"model_path": artifact_path},
403
+ python_model=self._ml_flow.pyfunc.PythonModel(),
404
+ metadata=metadata
405
+ )
406
+
407
+ def __del__(self):
408
+ """
409
+ If the previous run was not terminated correctly, the fluent API will
410
+ not let you start a new run until the previous one is ended.
411
+
412
+ Args: None
413
+ Return: None
414
+ """
415
+ if (
416
+ self._auto_end_run
417
+ and callable(getattr(self._ml_flow, "active_run", None))
418
+ and self._ml_flow.active_run() is not None
419
+ ):
420
+ self._ml_flow.end_run()
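Driven outside the Trainer, the class above would be exercised roughly as follows. This is a sketch under assumptions: `exps/default/yolox_s.py` exists, the MLflow environment variables from docs/mlflow_integration.md are set and reachable, and the `Namespace` below only mimics the subset of tools/train.py args that `MlflowLogger` reads.

```python
from argparse import Namespace

from yolox.exp import get_exp
from yolox.utils import MlflowLogger

# Mimics the fields of the tools/train.py args that MlflowLogger.setup() inspects.
args = Namespace(
    experiment_name="yolox_s_mlflow", batch_size=8, exp_file="exps/default/yolox_s.py",
    resume=False, ckpt=None, start_epoch=None, num_machines=1, fp16=True, logger="mlflow",
)
exp = get_exp(args.exp_file, None)

mlflow_logger = MlflowLogger()
mlflow_logger.setup(args=args, exp=exp)  # loads .env, starts an MLflow run, logs params/tags
mlflow_logger.on_log(args, exp, step=1, logs={"train/total_loss": 7.5})
# on_train_end() additionally uploads <output_dir>/train_log.txt, so it expects a real output dir.
```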