Himanshu Goyal
committed on
Commit
·
3dd0091
1
Parent(s):
a84fcb7
feat: mlflow logging integration (#1773)
Browse files- docs/mlflow_integration.md +65 -0
- tools/train.py +1 -1
- yolox/core/trainer.py +40 -2
- yolox/utils/__init__.py +1 -0
- yolox/utils/mlflow_logger.py +420 -0
docs/mlflow_integration.md
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## MLFlow Integration
|
2 |
+
YOLOX now supports MLFlow integration. MLFlow is an open-source platform for managing the end-to-end machine learning lifecycle. It is designed to work with any ML library, algorithm, deployment tool, or language. MLFlow can be used to track experiments, metrics, and parameters, and to log and visualize model artifacts. \
|
3 |
+
For more information, please refer to: [MLFlow Documentation](https://www.mlflow.org/docs/latest/index.html)
|
4 |
+
|
5 |
+
## Follow these steps to start logging your experiments to MLFlow:
|
6 |
+
### Step-1: Install MLFlow via pip
|
7 |
+
```bash
|
8 |
+
pip install mlflow python-dotenv
|
9 |
+
```
|
10 |
+
|
11 |
+
### Step-2: Set up MLFlow Tracking Server
|
12 |
+
Start or connect to an MLFlow tracking server such as Databricks. You can start a local tracking server by running the following command:
|
13 |
+
```bash
|
14 |
+
mlflow server --host 127.0.0.1 --port 8080
|
15 |
+
```
|
16 |
+
Read more about setting up MLFlow tracking server [here](https://mlflow.org/docs/latest/tracking/server.html#mlflow-tracking-server)
|
17 |
+
|
18 |
+
### Step-3: Set up MLFlow Environment Variables
|
19 |
+
Set the following environment variables in your `.env` file:
|
20 |
+
```bash
|
21 |
+
MLFLOW_TRACKING_URI="http://127.0.0.1:8080" # set to your mlflow server URI (this one matches the local server started in Step-2)
|
22 |
+
MLFLOW_EXPERIMENT_NAME="/path/to/experiment" # set to your experiment name
|
23 |
+
MLFLOW_TAGS='{"release.candidate": "DEV1", "release.version": "0.0.0"}' # JSON string of tags to attach to the run
|
24 |
+
# config related to logging model to mlflow as pyfunc
|
25 |
+
YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS="True" # whether to log model (best or historical) or not
|
26 |
+
YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS=30 # try logging model only after every n epochs
|
27 |
+
YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS="False" # whether to log step model along with best_model or not
|
28 |
+
YOLOX_MLFLOW_RUN_NAME="" # give a custom name to your run, otherwise a random name is assigned by mlflow
|
29 |
+
YOLOX_MLFLOW_FLATTEN_PARAMS="True" # flatten any sub sub params of dict to be logged as simple key value pair
|
30 |
+
|
31 |
+
|
32 |
+
MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=True # log system GPU usage and other metrics
|
33 |
+
MLFLOW_NESTED_RUN="False" # whether to run as a nested run of the given run_id
|
34 |
+
MLFLOW_RUN_ID="" # continue training from a given run_id
|
35 |
+
```
|
36 |
+
### Step-4: Provide --logger "mlflow" to the training script
|
37 |
+
```bash
|
38 |
+
python tools/train.py -l mlflow -f exps/path/to/exp.py -d 1 -b 8 --fp16 -o -c \
|
39 |
+
pre_trained_model/<model>.pth
|
40 |
+
# note the -l mlflow flag
|
41 |
+
# one working example is this
|
42 |
+
python tools/train.py -l mlflow -f exps/example/custom/yolox_s.py -d 1 -b 8 --fp16 -o -c pre_trained_model/yolox_s.pth
|
43 |
+
```
|
44 |
+
### Step-5: optional; start the mlflow ui and track your experiments
|
45 |
+
If you log runs to a local mlruns directory, run the following command in the directory above it, then access http://127.0.0.1:5000 in your browser.
|
46 |
+
|
47 |
+
```bash
|
48 |
+
mlflow ui --port 5000
|
49 |
+
```
|
50 |
+
|
51 |
+
## Optional Databricks Integration
|
52 |
+
|
53 |
+
### Step-1: Install Databricks sdk
|
54 |
+
```bash
|
55 |
+
pip install databricks-sdk
|
56 |
+
```
|
57 |
+
|
58 |
+
### Step-2: Set up Databricks Environment Variables
|
59 |
+
Set the following environment variables in your `.env` file:
|
60 |
+
```bash
|
61 |
+
MLFLOW_TRACKING_URI="databricks" # set to databricks
|
62 |
+
MLFLOW_EXPERIMENT_NAME="/Users/<user>/<experiment_name>/"
|
63 |
+
DATABRICKS_HOST = "https://dbc-1234567890123456.cloud.databricks.com" # set to your server URI
|
64 |
+
DATABRICKS_TOKEN = "dapixxxxxxxxxxxxx"
|
65 |
+
```
|
tools/train.py
CHANGED
@@ -85,7 +85,7 @@ def make_parser():
|
|
85 |
"--logger",
|
86 |
type=str,
|
87 |
help="Logger to be used for metrics. \
|
88 |
-
|
89 |
default="tensorboard"
|
90 |
)
|
91 |
parser.add_argument(
|
|
|
85 |
"--logger",
|
86 |
type=str,
|
87 |
help="Logger to be used for metrics. \
|
88 |
+
Implemented loggers include `tensorboard`, `mlflow` and `wandb`.",
|
89 |
default="tensorboard"
|
90 |
)
|
91 |
parser.add_argument(
|
yolox/core/trainer.py
CHANGED
@@ -14,6 +14,7 @@ from yolox.data import DataPrefetcher
|
|
14 |
from yolox.exp import Exp
|
15 |
from yolox.utils import (
|
16 |
MeterBuffer,
|
|
|
17 |
ModelEMA,
|
18 |
WandbLogger,
|
19 |
adjust_status,
|
@@ -74,7 +75,8 @@ class Trainer:
|
|
74 |
self.before_train()
|
75 |
try:
|
76 |
self.train_in_epoch()
|
77 |
-
except Exception:
|
|
|
78 |
raise
|
79 |
finally:
|
80 |
self.after_train()
|
@@ -185,8 +187,11 @@ class Trainer:
|
|
185 |
self.exp,
|
186 |
self.evaluator.dataloader.dataset
|
187 |
)
|
|
|
|
|
|
|
188 |
else:
|
189 |
-
raise ValueError("logger must be either 'tensorboard' or 'wandb'")
|
190 |
|
191 |
logger.info("Training start...")
|
192 |
logger.info("\n{}".format(model))
|
@@ -198,6 +203,16 @@ class Trainer:
|
|
198 |
if self.rank == 0:
|
199 |
if self.args.logger == "wandb":
|
200 |
self.wandb_logger.finish()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
def before_epoch(self):
|
203 |
logger.info("---> start train epoch{}".format(self.epoch + 1))
|
@@ -276,6 +291,10 @@ class Trainer:
|
|
276 |
"train/lr": self.meter["lr"].latest
|
277 |
})
|
278 |
self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
|
|
|
|
|
|
|
|
|
279 |
|
280 |
self.meter.clear_meters()
|
281 |
|
@@ -351,6 +370,14 @@ class Trainer:
|
|
351 |
"train/epoch": self.epoch + 1,
|
352 |
})
|
353 |
self.wandb_logger.log_images(predictions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
logger.info("\n" + summary)
|
355 |
synchronize()
|
356 |
|
@@ -358,6 +385,17 @@ class Trainer:
|
|
358 |
if self.save_history_ckpt:
|
359 |
self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
|
360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
|
362 |
if self.rank == 0:
|
363 |
save_model = self.ema_model.ema if self.use_model_ema else self.model
|
|
|
14 |
from yolox.exp import Exp
|
15 |
from yolox.utils import (
|
16 |
MeterBuffer,
|
17 |
+
MlflowLogger,
|
18 |
ModelEMA,
|
19 |
WandbLogger,
|
20 |
adjust_status,
|
|
|
75 |
self.before_train()
|
76 |
try:
|
77 |
self.train_in_epoch()
|
78 |
+
except Exception as e:
|
79 |
+
logger.error("Exception in training: ", e)
|
80 |
raise
|
81 |
finally:
|
82 |
self.after_train()
|
|
|
187 |
self.exp,
|
188 |
self.evaluator.dataloader.dataset
|
189 |
)
|
190 |
+
elif self.args.logger == "mlflow":
|
191 |
+
self.mlflow_logger = MlflowLogger()
|
192 |
+
self.mlflow_logger.setup(args=self.args, exp=self.exp)
|
193 |
else:
|
194 |
+
raise ValueError("logger must be either 'tensorboard', 'mlflow' or 'wandb'")
|
195 |
|
196 |
logger.info("Training start...")
|
197 |
logger.info("\n{}".format(model))
|
|
|
203 |
if self.rank == 0:
|
204 |
if self.args.logger == "wandb":
|
205 |
self.wandb_logger.finish()
|
206 |
+
elif self.args.logger == "mlflow":
|
207 |
+
metadata = {
|
208 |
+
"epoch": self.epoch + 1,
|
209 |
+
"input_size": self.input_size,
|
210 |
+
'start_ckpt': self.args.ckpt,
|
211 |
+
'exp_file': self.args.exp_file,
|
212 |
+
"best_ap": float(self.best_ap)
|
213 |
+
}
|
214 |
+
self.mlflow_logger.on_train_end(self.args, file_name=self.file_name,
|
215 |
+
metadata=metadata)
|
216 |
|
217 |
def before_epoch(self):
|
218 |
logger.info("---> start train epoch{}".format(self.epoch + 1))
|
|
|
291 |
"train/lr": self.meter["lr"].latest
|
292 |
})
|
293 |
self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
|
294 |
+
if self.args.logger == 'mlflow':
|
295 |
+
logs = {"train/" + k: v.latest for k, v in loss_meter.items()}
|
296 |
+
logs.update({"train/lr": self.meter["lr"].latest})
|
297 |
+
self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)
|
298 |
|
299 |
self.meter.clear_meters()
|
300 |
|
|
|
370 |
"train/epoch": self.epoch + 1,
|
371 |
})
|
372 |
self.wandb_logger.log_images(predictions)
|
373 |
+
if self.args.logger == "mlflow":
|
374 |
+
logs = {
|
375 |
+
"val/COCOAP50": ap50,
|
376 |
+
"val/COCOAP50_95": ap50_95,
|
377 |
+
"val/best_ap": round(self.best_ap, 3),
|
378 |
+
"train/epoch": self.epoch + 1,
|
379 |
+
}
|
380 |
+
self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)
|
381 |
logger.info("\n" + summary)
|
382 |
synchronize()
|
383 |
|
|
|
385 |
if self.save_history_ckpt:
|
386 |
self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
|
387 |
|
388 |
+
if self.args.logger == "mlflow":
|
389 |
+
metadata = {
|
390 |
+
"epoch": self.epoch + 1,
|
391 |
+
"input_size": self.input_size,
|
392 |
+
'start_ckpt': self.args.ckpt,
|
393 |
+
'exp_file': self.args.exp_file,
|
394 |
+
"best_ap": float(self.best_ap)
|
395 |
+
}
|
396 |
+
self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch,
|
397 |
+
metadata, update_best_ckpt)
|
398 |
+
|
399 |
def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
|
400 |
if self.rank == 0:
|
401 |
save_model = self.ema_model.ema if self.use_model_ema else self.model
|
yolox/utils/__init__.py
CHANGED
@@ -11,6 +11,7 @@ from .ema import *
|
|
11 |
from .logger import WandbLogger, setup_logger
|
12 |
from .lr_scheduler import LRScheduler
|
13 |
from .metric import *
|
|
|
14 |
from .model_utils import *
|
15 |
from .setup_env import *
|
16 |
from .visualize import *
|
|
|
11 |
from .logger import WandbLogger, setup_logger
|
12 |
from .lr_scheduler import LRScheduler
|
13 |
from .metric import *
|
14 |
+
from .mlflow_logger import MlflowLogger
|
15 |
from .model_utils import *
|
16 |
from .setup_env import *
|
17 |
from .visualize import *
|
yolox/utils/mlflow_logger.py
ADDED
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright (c) Megvii Inc. All rights reserved.
|
3 |
+
# Please read docs/mlflow_integration.md for more details.
|
4 |
+
"""
|
5 |
+
Logging training runs with hyperparameter, datasets and trained models to MlFlow.
|
6 |
+
Mlflow supports Model Tracking, Experiment Tracking, and Model Registry.
|
7 |
+
It can be hosted on-premises, on any of the major cloud providers, or on Databricks.
|
8 |
+
Please read docs/mlflow_integration.md for more details.
|
9 |
+
|
10 |
+
For changing default logging Behaviour you can change mlflow environment variables:
|
11 |
+
https://mlflow.org/docs/latest/python_api/mlflow.environment_variables.html
|
12 |
+
|
13 |
+
For more information, please refer to:
|
14 |
+
https://mlflow.org/docs/latest/introduction/index.html
|
15 |
+
"""
|
16 |
+
import importlib.metadata
|
17 |
+
import importlib.util
|
18 |
+
import json
|
19 |
+
import os
|
20 |
+
from collections.abc import MutableMapping
|
21 |
+
import packaging.version
|
22 |
+
from loguru import logger
|
23 |
+
|
24 |
+
import torch
|
25 |
+
|
26 |
+
from yolox.utils import is_main_process
|
27 |
+
|
28 |
+
|
29 |
+
class MlflowLogger:
|
30 |
+
"""
|
31 |
+
Main Mlflow logging class to log hyperparameters, metrics, and models to Mlflow.
|
32 |
+
"""
|
33 |
+
def __init__(self):
    """
    Check the optional dependencies, load ``.env`` settings and reset the logger state.

    Every configuration attribute starts as ``None``; the values are read from the
    environment later, in ``setup()``.

    Raises:
        RuntimeError: if ``is_required_library_available()`` returns False.
    """
    libraries_present = self.is_required_library_available()
    if not libraries_present:
        raise RuntimeError(
            "MLflow Logging requires mlflow and python-dotenv to be installed. "
            "Run `pip install mlflow python-dotenv`.")

    # Imported lazily so the module can be imported without these optional packages.
    import mlflow
    from dotenv import find_dotenv, load_dotenv

    # Pull variables from the .env file located by find_dotenv() into os.environ.
    dotenv_file = find_dotenv()
    load_dotenv(dotenv_file)

    # Upper-cased strings that count as "true" when parsing boolean env variables.
    self.ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}

    # Limits published by mlflow's own validation module.
    validation = mlflow.utils.validation
    self._MAX_PARAM_VAL_LENGTH = validation.MAX_PARAM_VAL_LENGTH
    self._MAX_PARAMS_TAGS_PER_BATCH = validation.MAX_PARAMS_TAGS_PER_BATCH

    # Run-state flags.
    for flag_name in ("_initialized", "_auto_end_run", "best_ckpt_upload_pending"):
        setattr(self, flag_name, False)

    # Configuration placeholders, filled in by setup().
    for option_name in (
        "_tracking_uri",
        "_experiment_name",
        "_mlflow_log_artifacts",
        "_mlflow_log_model_per_n_epochs",
        "_mlflow_log_nth_epoch_models",
        "run_name",
        "_flatten_params",
        "_nested_run",
        "_run_id",
        "_async_log",
    ):
        setattr(self, option_name, None)

    # Keep a handle on the module so other methods need not import it again.
    self._ml_flow = mlflow
|
59 |
+
|
60 |
+
def is_required_library_available(self):
    """
    Report whether the optional dependencies can be located.

    Args: None

    Returns:
        bool: True when an import spec is found for both ``dotenv`` and ``mlflow``,
        False otherwise.
    """
    # Look both packages up before combining, so neither lookup is skipped.
    found = [
        importlib.util.find_spec(package) is not None
        for package in ("dotenv", "mlflow")
    ]
    return all(found)
|
72 |
+
|
73 |
+
def flatten_dict(self, d: MutableMapping, parent_key: str = "", delimiter: str = "."):
    """
    Collapse a nested mapping into a single-level dict.

    Nested keys are joined as ``<parent><delimiter><child>``. Top-level keys are kept
    as they are when ``parent_key`` is empty. Empty nested mappings are kept as values
    rather than expanded.

    Args:
        d(MutableMapping): nested dictionary
        parent_key(str): prefix prepended to every key of ``d``
        delimiter(str): separator placed between key segments

    Returns:
        flattened_dict(dict): flattened dictionary

    """
    flattened = {}
    for name, value in d.items():
        if parent_key:
            full_key = str(parent_key) + delimiter + str(name)
        else:
            full_key = name

        if value and isinstance(value, MutableMapping):
            # Recurse with the composed key as the new prefix.
            flattened.update(self.flatten_dict(value, full_key, delimiter=delimiter))
        else:
            flattened[full_key] = value
    return flattened
|
96 |
+
|
97 |
+
def setup(self, args, exp):
    """
    Set up the optional MLflow integration.

    Reads the configuration from environment variables. On the main process it then
    configures the tracking URI, starts a run when needed, and logs the selected
    training arguments, experiment attributes and tags.

    Args:
        args: training arguments namespace (read through ``vars(args)``)
        exp: experiment object holding the hyperparameters, or None

    Returns:
        None

    Environment:
    - **YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS** (`str`, *optional*, defaults to `False`):
        Whether to log model checkpoints to MLflow as artifacts. This mostly makes sense
        when logging to a remote server, e.g. s3 or GCS. Without remote storage the files
        are simply copied to your local artifact location.
    - **YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS** (`int`, *optional*, defaults to 30):
        If ``YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS`` is enabled, log model checkpoints after
        every n epochs. ``best_ckpt.pth`` is uploaded after `n` epochs only if it has
        been updated during the last `n` epochs.
    - **YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS** (`str`, *optional*, defaults to `False`):
        Whether to log the ``epoch_n_ckpt.pth`` models along with the ``best_ckpt.pth``
        model after every `n` epochs as per YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS.
        If set to `True` or *1*, ``epoch_n_ckpt.pth`` is logged along with
        ``best_ckpt.pth`` as mlflow artifacts in different folders.
    - **YOLOX_MLFLOW_RUN_NAME** (`str`, *optional*, defaults to random name):
        Name of the new run. Used only when ``run_id`` is unspecified. If unset or blank,
        no name is passed and mlflow generates a random one.
    - **YOLOX_MLFLOW_FLATTEN_PARAMS** (`str`, *optional*, defaults to `False`):
        Whether to flatten the parameters dictionary before logging.
    - **MLFLOW_TRACKING_URI** (`str`, *optional*):
        Path or remote server at which runs are stored. Unset by default, which
        skips setting the tracking URI entirely.
    - **MLFLOW_EXPERIMENT_NAME** (`str`, *optional*, defaults to `None`):
        MLflow experiment name under which to launch the run. `None` points to the
        `Default` experiment in MLflow. Otherwise, it is a case-sensitive name of the
        experiment to be activated. If an experiment with this name does not exist,
        a new experiment with this name is created.
    - **MLFLOW_TAGS** (`str`, *optional*):
        A JSON string dump of a dictionary of key/value pairs to add to the run as tags.
        Example: `os.environ['MLFLOW_TAGS']=
        '{"release.candidate": "RC1", "release.version": "2.2.0"}'`.
    - **MLFLOW_NESTED_RUN** (`str`, *optional*):
        Whether to use MLflow nested runs. If set to `True` or *1*, will create a nested
        run inside the current run.
    - **MLFLOW_RUN_ID** (`str`, *optional*):
        Allows reattaching to an existing run, which can be useful when resuming training
        from a checkpoint. When the `MLFLOW_RUN_ID` environment variable is set,
        `start_run` attempts to resume a run with the specified run ID and other
        parameters are ignored.
    - Other MLflow environment variables: for changing default logging behaviour refer to
        the mlflow environment variables:
        https://mlflow.org/docs/latest/python_api/mlflow.environment_variables.html
    - Setup ``Databricks`` integration with MLflow: provide these two environment variables:
        DATABRICKS_HOST="https://adb-4273978218682429.9.azuredatabricks.net"
        DATABRICKS_TOKEN="dapixxxxxxxxxxxxx"
    """
    true_values = self.ENV_VARS_TRUE_VALUES

    self._tracking_uri = os.getenv("MLFLOW_TRACKING_URI", None)
    self._experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", None)
    self._mlflow_log_artifacts = os.getenv(
        "YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS", "False").upper() in true_values
    self._mlflow_log_model_per_n_epochs = int(os.getenv(
        "YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS", 30))
    self._mlflow_log_nth_epoch_models = os.getenv(
        "YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS", "False").upper() in true_values

    # os.getenv returns None when the variable is unset, so guard before strip();
    # an unset or blank name leaves the naming to mlflow.
    run_name = os.getenv("YOLOX_MLFLOW_RUN_NAME", None)
    self.run_name = run_name if run_name and run_name.strip() else None

    self._flatten_params = os.getenv(
        "YOLOX_MLFLOW_FLATTEN_PARAMS", "FALSE").upper() in true_values
    self._nested_run = os.getenv("MLFLOW_NESTED_RUN", "FALSE").upper() in true_values
    self._run_id = os.getenv("MLFLOW_RUN_ID", None)
    mlflow_tags = os.getenv("MLFLOW_TAGS", None)

    # "synchronous" flag is only available with mlflow version >= 2.8.0
    # https://github.com/mlflow/mlflow/pull/9705
    # https://github.com/mlflow/mlflow/releases/tag/v2.8.0
    self._async_log = packaging.version.parse(
        self._ml_flow.__version__) >= packaging.version.parse("2.8.0")

    logger.debug(
        f"MLflow experiment_name={self._experiment_name}, run_name={self.run_name}, "
        f"nested={self._nested_run}, tags={mlflow_tags}, tracking_uri={self._tracking_uri}"
    )
    if not is_main_process():
        return

    if not self._ml_flow.is_tracking_uri_set():
        if self._tracking_uri:
            self._ml_flow.set_tracking_uri(self._tracking_uri)
            logger.debug(f"MLflow tracking URI is set to {self._tracking_uri}")
        else:
            logger.debug(
                "Environment variable `MLFLOW_TRACKING_URI` is not provided and therefore"
                " will not be explicitly set."
            )
    else:
        logger.debug(f"MLflow tracking URI is set to {self._ml_flow.get_tracking_uri()}")

    if self._ml_flow.active_run() is None or self._nested_run or self._run_id:
        if self._experiment_name:
            # Use of set_experiment() ensures that the experiment is created if missing
            self._ml_flow.set_experiment(self._experiment_name)
        self._ml_flow.start_run(run_name=self.run_name, nested=self._nested_run)
        logger.debug(
            f"MLflow run started with run_id={self._ml_flow.active_run().info.run_id}")
        # Only runs started here are ended by this logger.
        self._auto_end_run = True
    # Mark the logger ready even when it attached to an already-active run.
    self._initialized = True

    # filters these params from args
    keys = ['experiment_name', 'batch_size', 'exp_file', 'resume', 'ckpt', 'start_epoch',
            'num_machines', 'fp16', 'logger']
    combined_dict = {k: v for k, v in vars(args).items() if k in keys}
    if exp is not None:
        exp_dict = self.convert_exp_todict(exp)
        # args values win over experiment attributes of the same name
        combined_dict = {**exp_dict, **combined_dict}
    self.log_params_mlflow(combined_dict)

    if mlflow_tags:
        self._ml_flow.set_tags(json.loads(mlflow_tags))
|
216 |
+
|
217 |
+
def log_params_mlflow(self, params_dict):
    """
    Log hyperparameters to MLflow.

    Values whose string form is longer than ``self._MAX_PARAM_VAL_LENGTH`` are dropped
    with a warning. The remaining parameters are sent in batches of at most
    ``self._MAX_PARAMS_TAGS_PER_BATCH`` entries. The caller's dictionary is not modified.
    No overwriting of existing parameters is allowed by default from mlflow.

    Args:
        params_dict(dict): dict of hyperparameters

    Returns:
        None
    """
    if not is_main_process():
        return

    if self._flatten_params:
        params_dict = self.flatten_dict(params_dict)

    # Build a filtered copy instead of deleting from the dict the caller passed in.
    loggable = {}
    for name, value in params_dict.items():
        # internally, all values are converted to str in MLflow
        if len(str(value)) > self._MAX_PARAM_VAL_LENGTH:
            logger.warning(
                f'Trainer is attempting to log a value of "{value}" for key "{name}" as a '
                f'parameter. MLflow\'s log_param() only accepts values no longer than '
                f'{self._MAX_PARAM_VAL_LENGTH} characters so we dropped this attribute. '
                f'You can use the `YOLOX_MLFLOW_FLATTEN_PARAMS` environment variable to '
                f'flatten the parameters and avoid this message.'
            )
            continue
        loggable[name] = value

    # MLflow limits how many values can be logged in one call, so send them in batches
    items = list(loggable.items())
    batch_size = self._MAX_PARAMS_TAGS_PER_BATCH
    # "synchronous" is only passed when setup() found an mlflow version that accepts it
    extra_kwargs = {"synchronous": False} if self._async_log else {}
    for start in range(0, len(items), batch_size):
        self._ml_flow.log_params(dict(items[start: start + batch_size]), **extra_kwargs)
|
255 |
+
|
256 |
+
def convert_exp_todict(self, exp):
    """
    Pick the experiment attributes that should be logged as parameters.

    Args:
        exp: experiment object; only its instance ``__dict__`` is inspected

    Returns:
        exp_dict(dict): the whitelisted attributes, in the order they appear
        in ``exp.__dict__``

    """
    wanted = ['max_epoch', 'num_classes', 'input_size', 'output_dir',
              'data_dir', 'train_ann', 'val_ann', 'test_ann',
              'test_conf', 'nmsthre']
    selected = {}
    for attr_name, attr_value in exp.__dict__.items():
        if attr_name.startswith("__"):
            continue
        if attr_name in wanted:
            selected[attr_name] = attr_value
    return selected
|
273 |
+
|
274 |
+
def on_log(self, args, exp, step, logs):
    """
    Send a dictionary of values to MLflow as metrics.

    ``setup()`` is called first if the logger is not yet initialized. Only ints, floats
    and single-element tensors are forwarded; anything else is dropped with a warning.

    Args:
        args: training args, forwarded to ``setup()`` when needed
        exp: experiment object, forwarded to ``setup()`` when needed
        step(int): step value recorded with the metrics
        logs(dict): mapping of metric name to value

    Returns:
        None
    """
    if not self._initialized:
        self.setup(args, exp)

    if not is_main_process():  # master process only
        return

    metrics = {}
    for k, v in logs.items():
        if isinstance(v, (int, float)):
            metrics[k] = v
            continue
        if isinstance(v, torch.Tensor) and v.numel() == 1:
            # unwrap one-element tensors into plain Python numbers
            metrics[k] = v.item()
            continue
        logger.warning(
            f'Trainer is attempting to log a value of "{v}" of type {type(v)} for key '
            f'"{k}" as a metric. MLflow log_metric() only accepts float and int types '
            f'so we dropped this attribute.'
        )

    call_kwargs = {"metrics": metrics, "step": step}
    if self._async_log:
        call_kwargs["synchronous"] = False
    self._ml_flow.log_metrics(**call_kwargs)
|
308 |
+
|
309 |
+
def on_train_end(self, args, file_name, metadata):
    """
    Final MLflow actions once training has finished:
        1. upload the training log file
        2. upload ``best_ckpt.pth`` if an upload is still pending
        3. end the mlflow run if it was started by this logger

    Args:
        args: training args; ``args.experiment_name`` is used in the artifact path
        file_name(str): output directory
        metadata(dict): model related metadata

    Returns:
        None
    """
    ready = is_main_process() and self._initialized
    if not ready:
        return

    self.save_log_file(args, file_name)

    if self.best_ckpt_upload_pending:
        ckpt_stem = "best_ckpt"
        remote_dir = f"{args.experiment_name}/{ckpt_stem}"
        local_ckpt = os.path.join(file_name, f"{ckpt_stem}.pth")
        self.mlflow_save_pyfunc_model(metadata, local_ckpt, remote_dir)

    if self._auto_end_run and self._ml_flow.active_run():
        self._ml_flow.end_run()
|
333 |
+
|
334 |
+
def save_log_file(self, args, file_name):
    """
    Upload ``train_log.txt`` from the output directory as an mlflow artifact.

    Args:
        args: training args; ``args.experiment_name`` is the artifact folder
        file_name(str): output directory

    Returns:
        None
    """
    local_log = os.path.join(file_name, "train_log.txt")
    remote_dir = "{}".format(args.experiment_name)
    logger.info(f"Logging logfile: {local_log} in mlflow artifact path: {remote_dir}.")
    self._ml_flow.log_artifact(local_log, remote_dir)
|
348 |
+
|
349 |
+
def save_checkpoints(self, args, exp, file_name, epoch, metadata, update_best_ckpt):
    """
    Upload model checkpoints to the mlflow artifact path every n-th epoch.

    Does nothing unless this is the main process and artifact logging is enabled.
    A best-checkpoint update is remembered as pending. When ``epoch + 1`` is a multiple
    of the configured interval, the training log is uploaded, then the pending best
    checkpoint, then (if enabled and ``exp.save_history_ckpt`` is set) the
    ``epoch_<n>_ckpt`` checkpoint.

    Args:
        args: training args; ``args.experiment_name`` is used in artifact paths
        exp: experiment object; ``exp.save_history_ckpt`` is read
        file_name(str): output directory
        epoch(int): current epoch
        metadata(dict): model related metadata
        update_best_ckpt(bool): bool to show if best_ckpt was updated

    Returns:
        None
    """
    if not (is_main_process() and self._mlflow_log_artifacts):
        return

    if update_best_ckpt:
        self.best_ckpt_upload_pending = True

    interval = self._mlflow_log_model_per_n_epochs
    if ((epoch + 1) % interval) != 0:
        return

    self.save_log_file(args, file_name)

    if self.best_ckpt_upload_pending:
        ckpt_stem = "best_ckpt"
        remote_dir = f"{args.experiment_name}/{ckpt_stem}"
        local_ckpt = os.path.join(file_name, f"{ckpt_stem}.pth")
        self.mlflow_save_pyfunc_model(metadata, local_ckpt, remote_dir)
        self.best_ckpt_upload_pending = False

    if self._mlflow_log_nth_epoch_models and exp.save_history_ckpt:
        ckpt_stem = f"epoch_{epoch + 1}_ckpt"
        remote_dir = f"{args.experiment_name}/hist_epochs/{ckpt_stem}"
        local_ckpt = os.path.join(file_name, f"{ckpt_stem}.pth")
        self.mlflow_save_pyfunc_model(metadata, local_ckpt, remote_dir)
|
381 |
+
|
382 |
+
def mlflow_save_pyfunc_model(self, metadata, artifact_path, mlflow_out_dir):
    """
    Log a checkpoint file to mlflow as a pyfunc model.

    Nothing is sent unless this is the main process, the logger is initialized and
    artifact logging is enabled (``YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS``). A checkpoint
    path that does not exist is reported with a warning instead of being skipped
    silently.

    Args:
        metadata(dict): model related metadata
        artifact_path(str): model checkpoint path
        mlflow_out_dir(str): mlflow artifact path

    Returns:
        None
    """
    if not (is_main_process() and self._initialized and self._mlflow_log_artifacts):
        return

    if not os.path.exists(artifact_path):
        logger.warning(
            f"Checkpoint {artifact_path} does not exist, so nothing was logged in mlflow "
            f"artifact path: {mlflow_out_dir}.")
        return

    logger.info(
        f"Logging checkpoint {artifact_path} artifacts in mlflow artifact path: "
        f"{mlflow_out_dir}. This may take time.")
    self._ml_flow.pyfunc.log_model(
        mlflow_out_dir,
        artifacts={"model_path": artifact_path},
        python_model=self._ml_flow.pyfunc.PythonModel(),
        metadata=metadata
    )
|
406 |
+
|
407 |
+
def __del__(self):
    """
    End the mlflow run started by this logger, if one is still active.

    If the previous run is not terminated correctly, the fluent API will
    not let you start a new run before the previous one is killed.

    Attributes are read with ``getattr`` because ``__init__`` can raise before they
    are assigned, and the finalizer still runs on such a partially built object.

    Args: None
    Return: None
    """
    if not getattr(self, "_auto_end_run", False):
        return

    ml_flow = getattr(self, "_ml_flow", None)
    active_run = getattr(ml_flow, "active_run", None)
    if callable(active_run) and active_run() is not None:
        ml_flow.end_run()
|