Spaces:

xiang-wuu
/

yolov5

Runtime error

Charles Frye commited on May 21, 2021

Commit

19100ba

unverified ·

1 Parent(s): dd7f0b7

Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults

* adds entity handling, 'last' tag

* fixes bug causing finished runs to resume

* removes redundant "last" tag for wandb artifact

Files changed (2) hide show

train.py +1 -1
utils/wandb_logging/wandb_utils.py +24 -9

train.py CHANGED Viewed

@@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None):
         if wandb_logger.wandb and not opt.evolve:  # Log the stripped model
             wandb_logger.wandb.log_artifact(str(final), type='model',
                                             name='run_' + wandb_logger.wandb_run.id + '_model',
-                                            aliases=['last', 'best', 'stripped'])
         wandb_logger.finish_run()
     else:
         dist.destroy_process_group()

         if wandb_logger.wandb and not opt.evolve:  # Log the stripped model
             wandb_logger.wandb.log_artifact(str(final), type='model',
                                             name='run_' + wandb_logger.wandb_run.id + '_model',
+                                            aliases=['latest', 'best', 'stripped'])
         wandb_logger.finish_run()
     else:
         dist.destroy_process_group()

utils/wandb_logging/wandb_utils.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import json
 import sys
 from pathlib import Path
@@ -35,8 +36,9 @@ def get_run_info(run_path):
     run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
     run_id = run_path.stem
     project = run_path.parent.stem
     model_artifact_name = 'run_' + run_id + '_model'
-    return run_id, project, model_artifact_name
 def check_wandb_resume(opt):
@@ -44,9 +46,9 @@ def check_wandb_resume(opt):
     if isinstance(opt.resume, str):
         if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
             if opt.global_rank not in [-1, 0]:  # For resuming DDP runs
-                run_id, project, model_artifact_name = get_run_info(opt.resume)
                 api = wandb.Api()
-                artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
                 modeldir = artifact.download()
                 opt.weights = str(Path(modeldir) / "last.pt")
             return True
@@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt):
 class WandbLogger():
     def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
         # Pre-training routine --
         self.job_type = job_type
@@ -85,16 +99,17 @@ class WandbLogger():
         # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
         if isinstance(opt.resume, str):  # checks resume from artifact
             if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
-                run_id, project, model_artifact_name = get_run_info(opt.resume)
                 model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
                 assert wandb, 'install wandb to resume wandb runs'
                 # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
-                self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
                 opt.resume = model_artifact_name
         elif self.wandb:
             self.wandb_run = wandb.init(config=opt,
                                         resume="allow",
                                         project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
                                         name=name,
                                         job_type=job_type,
                                         id=run_id) if not wandb.run else wandb.run
@@ -172,8 +187,8 @@ class WandbLogger():
             modeldir = model_artifact.download()
             epochs_trained = model_artifact.metadata.get('epochs_trained')
             total_epochs = model_artifact.metadata.get('total_epochs')
-            assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
-                total_epochs)
             return modeldir, model_artifact
         return None, None
@@ -188,7 +203,7 @@ class WandbLogger():
         })
         model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
         wandb.log_artifact(model_artifact,
-                           aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
         print("Saving model artifact on epoch ", epoch + 1)
     def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
@@ -291,7 +306,7 @@ class WandbLogger():
             if self.result_artifact:
                 train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
                 self.result_artifact.add(train_results, 'result')
-                wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch),
                                                                   ('best' if best_result else '')])
                 self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
                 self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")

+"""Utilities and tools for tracking runs with Weights & Biases."""
 import json
 import sys
 from pathlib import Path
     run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
     run_id = run_path.stem
     project = run_path.parent.stem
+    entity = run_path.parent.parent.stem
     model_artifact_name = 'run_' + run_id + '_model'
+    return entity, project, run_id, model_artifact_name
 def check_wandb_resume(opt):
     if isinstance(opt.resume, str):
         if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
             if opt.global_rank not in [-1, 0]:  # For resuming DDP runs
+                entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
                 api = wandb.Api()
+                artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
                 modeldir = artifact.download()
                 opt.weights = str(Path(modeldir) / "last.pt")
             return True
 class WandbLogger():
+    """Log training runs, datasets, models, and predictions to Weights & Biases.
+    This logger sends information to W&B at wandb.ai. By default, this information
+    includes hyperparameters, system configuration and metrics, model metrics,
+    and basic data metrics and analyses.
+    By providing additional command line arguments to train.py, datasets,
+    models and predictions can also be logged.
+    For more on how this logger is used, see the Weights & Biases documentation:
+    https://docs.wandb.com/guides/integrations/yolov5
+    """
     def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
         # Pre-training routine --
         self.job_type = job_type
         # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
         if isinstance(opt.resume, str):  # checks resume from artifact
             if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
+                entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
                 model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
                 assert wandb, 'install wandb to resume wandb runs'
                 # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
+                self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
                 opt.resume = model_artifact_name
         elif self.wandb:
             self.wandb_run = wandb.init(config=opt,
                                         resume="allow",
                                         project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
+                                        entity=opt.entity,
                                         name=name,
                                         job_type=job_type,
                                         id=run_id) if not wandb.run else wandb.run
             modeldir = model_artifact.download()
             epochs_trained = model_artifact.metadata.get('epochs_trained')
             total_epochs = model_artifact.metadata.get('total_epochs')
+            is_finished = total_epochs is None
+            assert not is_finished, 'training is finished, can only resume incomplete runs.'
             return modeldir, model_artifact
         return None, None
         })
         model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
         wandb.log_artifact(model_artifact,
+                           aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
         print("Saving model artifact on epoch ", epoch + 1)
     def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
             if self.result_artifact:
                 train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
                 self.result_artifact.add(train_results, 'result')
+                wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
                                                                   ('best' if best_result else '')])
                 self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
                 self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")