henry000 committed on
Commit
c3b133e
·
1 Parent(s): cc70c05

💬 [Update] logging, add avg loss and ap table

Browse files
yolo/tools/data_loader.py CHANGED
@@ -199,7 +199,7 @@ class YoloDataLoader(DataLoader):
199
  batch_images = torch.stack(batch_images)
200
  batch_reverse = torch.stack(batch_reverse)
201
 
202
- return batch_images, batch_targets, batch_reverse, batch_path
203
 
204
 
205
  def create_dataloader(data_cfg: DataConfig, dataset_cfg: DatasetConfig, task: str = "train", use_ddp: bool = False):
 
199
  batch_images = torch.stack(batch_images)
200
  batch_reverse = torch.stack(batch_reverse)
201
 
202
+ return batch_size, batch_images, batch_targets, batch_reverse, batch_path
203
 
204
 
205
  def create_dataloader(data_cfg: DataConfig, dataset_cfg: DatasetConfig, task: str = "train", use_ddp: bool = False):
yolo/tools/solver.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import os
3
  import sys
4
  import time
 
5
 
6
  import torch
7
  from loguru import logger
@@ -72,22 +73,28 @@ class ModelTrainer:
72
  self.scaler.step(self.optimizer)
73
  self.scaler.update()
74
 
75
- return loss.item(), loss_item
76
 
77
  def train_one_epoch(self, dataloader):
78
  self.model.train()
79
- total_loss = 0
 
80
 
81
- for images, targets, *_ in dataloader:
82
- loss, loss_each = self.train_one_batch(images, targets)
83
 
84
- total_loss += loss
 
 
85
  self.progress.one_batch(loss_each)
86
 
 
 
 
87
  if self.scheduler:
88
  self.scheduler.step()
89
 
90
- return total_loss / len(dataloader)
91
 
92
  def save_checkpoint(self, epoch: int, filename="checkpoint.pt"):
93
  checkpoint = {
@@ -110,10 +117,9 @@ class ModelTrainer:
110
  if self.use_ddp:
111
  dataloader.sampler.set_epoch(epoch)
112
 
113
- self.progress.start_one_epoch(len(dataloader), self.optimizer, epoch)
114
- # TODO: calculate epoch loss
115
  epoch_loss = self.train_one_epoch(dataloader)
116
- self.progress.finish_one_epoch()
117
 
118
  self.validator.solve(self.validation_dataloader, epoch_idx=epoch)
119
 
@@ -199,21 +205,21 @@ class ModelValidator:
199
  # logger.info("🧪 Start Validation!")
200
  self.model.eval()
201
  mAPs, predict_json = [], []
202
- self.progress.start_one_epoch(len(dataloader))
203
- for images, targets, rev_tensor, img_paths in dataloader:
204
  images, targets, rev_tensor = images.to(self.device), targets.to(self.device), rev_tensor.to(self.device)
205
  with torch.no_grad():
206
  predicts = self.model(images)
207
  predicts = self.post_proccess(predicts)
208
  for idx, predict in enumerate(predicts):
209
  mAPs.append(calculate_map(predict, targets[idx]))
210
- self.progress.one_batch(mAP=Tensor(mAPs))
211
 
212
  predict_json.extend(predicts_to_json(img_paths, predicts, rev_tensor))
213
- self.progress.finish_one_epoch()
214
  with open(self.json_path, "w") as f:
215
  json.dump(predict_json, f)
216
 
217
- self.progress.run_coco()
218
  result = calculate_ap(self.coco_gt, predict_json)
219
- self.progress.finish_coco(result, epoch_idx)
 
2
  import os
3
  import sys
4
  import time
5
+ from collections import defaultdict
6
 
7
  import torch
8
  from loguru import logger
 
73
  self.scaler.step(self.optimizer)
74
  self.scaler.update()
75
 
76
+ return loss_item
77
 
78
  def train_one_epoch(self, dataloader):
79
  self.model.train()
80
+ total_loss = defaultdict(lambda: torch.tensor(0.0, device=self.device))
81
+ total_samples = 0
82
 
83
+ for batch_size, images, targets, *_ in dataloader:
84
+ loss_each = self.train_one_batch(images, targets)
85
 
86
+ for loss_name, loss_val in loss_each.items():
87
+ total_loss[loss_name] += loss_val * batch_size
88
+ total_samples += batch_size
89
  self.progress.one_batch(loss_each)
90
 
91
+ for loss_val in total_loss.values():
92
+ loss_val /= total_samples
93
+
94
  if self.scheduler:
95
  self.scheduler.step()
96
 
97
+ return total_loss
98
 
99
  def save_checkpoint(self, epoch: int, filename="checkpoint.pt"):
100
  checkpoint = {
 
117
  if self.use_ddp:
118
  dataloader.sampler.set_epoch(epoch)
119
 
120
+ self.progress.start_one_epoch(len(dataloader), "Train", self.optimizer, epoch)
 
121
  epoch_loss = self.train_one_epoch(dataloader)
122
+ self.progress.finish_one_epoch(epoch_loss, epoch)
123
 
124
  self.validator.solve(self.validation_dataloader, epoch_idx=epoch)
125
 
 
205
  # logger.info("🧪 Start Validation!")
206
  self.model.eval()
207
  mAPs, predict_json = [], []
208
+ self.progress.start_one_epoch(len(dataloader), task="Validate")
209
+ for batch_size, images, targets, rev_tensor, img_paths in dataloader:
210
  images, targets, rev_tensor = images.to(self.device), targets.to(self.device), rev_tensor.to(self.device)
211
  with torch.no_grad():
212
  predicts = self.model(images)
213
  predicts = self.post_proccess(predicts)
214
  for idx, predict in enumerate(predicts):
215
  mAPs.append(calculate_map(predict, targets[idx]))
216
+ self.progress.one_batch(Tensor(mAPs))
217
 
218
  predict_json.extend(predicts_to_json(img_paths, predicts, rev_tensor))
219
+ self.progress.finish_one_epoch(Tensor(mAPs), epoch_idx=epoch_idx)
220
  with open(self.json_path, "w") as f:
221
  json.dump(predict_json, f)
222
 
223
+ self.progress.start_pycocotools()
224
  result = calculate_ap(self.coco_gt, predict_json)
225
+ self.progress.finish_pycocotools(result, epoch_idx)
yolo/utils/logging_utils.py CHANGED
@@ -14,11 +14,12 @@ Example:
14
  import os
15
  import sys
16
  from collections import deque
17
- from typing import Dict, List
18
 
19
  import wandb
20
  import wandb.errors.term
21
  from loguru import logger
 
22
  from rich.console import Console, Group
23
  from rich.progress import (
24
  BarColumn,
@@ -72,58 +73,72 @@ class ProgressLogger(Progress):
72
  project="YOLO", resume="allow", mode="online", dir=self.save_path, id=None, name=exp_name
73
  )
74
 
75
- def update_ap_table(self, ap_list, epoch_idx=-1):
76
- ap_table, ap_main = make_ap_table(ap_list, self.ap_past_list, epoch_idx)
77
- self.ap_past_list.append((epoch_idx, ap_main))
78
- self.ap_table = ap_table
79
-
80
- if self.use_wandb:
81
- self.wandb.log({f"mAP/AP @ .5:.95": ap_main[1], f"mAP/AP @ .5": ap_main[3]})
82
-
83
  def get_renderable(self):
84
- return Group(*self.get_renderables(), self.ap_table)
 
85
 
86
  def start_train(self, num_epochs: int):
87
- self.task_epoch = self.add_task("[cyan]Epochs [white]| Loss | Box | DFL | BCE |", total=num_epochs)
88
 
89
- def start_one_epoch(self, num_batches: int, optimizer: Optimizer = None, epoch_idx: int = None):
 
 
90
  self.num_batches = num_batches
 
 
 
 
91
  if self.use_wandb and optimizer is not None:
92
  lr_values = [params["lr"] for params in optimizer.param_groups]
93
- lr_names = ["bias", "norm", "conv"]
94
  for lr_name, lr_value in zip(lr_names, lr_values):
95
- self.wandb.log({f"Learning Rate/{lr_name}": lr_value}, step=epoch_idx)
96
- self.batch_task = self.add_task("[green]Batches", total=num_batches)
97
-
98
- def one_batch(self, loss_dict: Dict[str, Tensor] = None, mAP: Tensor = None):
99
- if loss_dict is None:
100
- # refactor this block & class
101
- mAP_50, mAP_50_95 = mAP.mean(0)
102
- self.update(self.batch_task, advance=1, description=f"[green]Validating {mAP_50: .2f} {mAP_50_95: .2f}")
103
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  if self.use_wandb:
105
- for loss_name, loss_value in loss_dict.items():
106
- self.wandb.log({f"Loss/{loss_name}": loss_value})
107
-
108
- loss_str = "| -.-- |"
109
- for loss_name, loss_val in loss_dict.items():
110
- loss_str += f" {loss_val:2.2f} |"
111
 
112
- self.update(self.batch_task, advance=1, description=f"[green]Batches [white]{loss_str}")
113
- self.update(self.task_epoch, advance=1 / self.num_batches)
114
 
115
- def run_coco(self):
116
- self.batch_task = self.add_task("[green]Run COCO", total=1)
 
 
117
 
118
- def finish_coco(self, result, epoch_idx):
119
- self.update_ap_table(result, epoch_idx)
120
  self.update(self.batch_task, advance=1)
121
  self.refresh()
122
  self.remove_task(self.batch_task)
123
 
124
- def finish_one_epoch(self):
125
- self.remove_task(self.batch_task)
126
-
127
  def finish_train(self):
128
  self.wandb.finish()
129
 
@@ -149,7 +164,11 @@ def log_model_structure(model: List[YOLOLayer]):
149
  layer_param = sum(x.numel() for x in layer.parameters()) # number parameters
150
  in_channels, out_channels = getattr(layer, "in_c", None), getattr(layer, "out_c", None)
151
  if in_channels and out_channels:
152
- channels = f"{in_channels:4} -> {out_channels:4}"
 
 
 
 
153
  else:
154
  channels = "-"
155
  table.add_row(str(idx), layer.layer_type, layer.tags, f"{layer_param:,}", channels)
 
14
  import os
15
  import sys
16
  from collections import deque
17
+ from typing import Any, Dict, List
18
 
19
  import wandb
20
  import wandb.errors.term
21
  from loguru import logger
22
+ from omegaconf import ListConfig
23
  from rich.console import Console, Group
24
  from rich.progress import (
25
  BarColumn,
 
73
  project="YOLO", resume="allow", mode="online", dir=self.save_path, id=None, name=exp_name
74
  )
75
 
 
 
 
 
 
 
 
 
76
  def get_renderable(self):
77
+ renderable = Group(*self.get_renderables(), self.ap_table)
78
+ return renderable
79
 
80
  def start_train(self, num_epochs: int):
81
+ self.task_epoch = self.add_task(f"[cyan]Start Training {num_epochs} epochs", total=num_epochs)
82
 
83
+ def start_one_epoch(
84
+ self, num_batches: int, task: str = "Train", optimizer: Optimizer = None, epoch_idx: int = None
85
+ ):
86
  self.num_batches = num_batches
87
+ self.task = task
88
+ if hasattr(self, "task_epoch"):
89
+ self.update(self.task_epoch, description=f"[cyan] Preparing Data")
90
+
91
  if self.use_wandb and optimizer is not None:
92
  lr_values = [params["lr"] for params in optimizer.param_groups]
93
+ lr_names = ["Learning Rate/bias", "Learning Rate/norm", "Learning Rate/conv"]
94
  for lr_name, lr_value in zip(lr_names, lr_values):
95
+ self.wandb.log({lr_name: lr_value}, step=epoch_idx)
96
+ self.batch_task = self.add_task(f"[green] Phase: {task}", total=num_batches)
97
+
98
+ def one_batch(self, batch_info: Dict[str, Tensor] = None):
99
+ epoch_descript = "[cyan]" + self.task + "[white] |"
100
+ batch_descript = "|"
101
+ if self.task == "Train":
102
+ self.update(self.task_epoch, advance=1 / self.num_batches)
103
+ elif self.task == "Validate":
104
+ batch_info = {
105
+ "mAP.5": batch_info.mean(dim=0)[0],
106
+ "mAP.5:.95": batch_info.mean(dim=0)[1],
107
+ }
108
+ for info_name, info_val in batch_info.items():
109
+ epoch_descript += f"{info_name: ^9}|"
110
+ batch_descript += f" {info_val:2.2f} |"
111
+ self.update(self.batch_task, advance=1, description=f"[green]{self.task} [white]{batch_descript}")
112
+ if hasattr(self, "task_epoch"):
113
+ self.update(self.task_epoch, description=epoch_descript)
114
+
115
+ def finish_one_epoch(self, batch_info: Dict[str, Any] = None, epoch_idx: int = -1):
116
+ if self.task == "Train":
117
+ for loss_name in batch_info.keys():
118
+ batch_info["Loss/" + loss_name] = batch_info.pop(loss_name)
119
+ elif self.task == "Validate":
120
+ batch_info = {
121
+ "Metrics/mAP.5": batch_info.mean(dim=0)[0],
122
+ "Metrics/mAP.5:.95": batch_info.mean(dim=0)[1],
123
+ }
124
  if self.use_wandb:
125
+ self.wandb.log(batch_info, step=epoch_idx)
126
+ self.remove_task(self.batch_task)
 
 
 
 
127
 
128
+ def start_pycocotools(self):
129
+ self.batch_task = self.add_task("[green] run pycocotools", total=1)
130
 
131
+ def finish_pycocotools(self, result, epoch_idx=-1):
132
+ ap_table, ap_main = make_ap_table(result, self.ap_past_list, epoch_idx)
133
+ self.ap_past_list.append((epoch_idx, ap_main))
134
+ self.ap_table = ap_table
135
 
136
+ if self.use_wandb:
137
+ self.wandb.log({"PyCOCO/AP @ .5:.95": ap_main[1], "PyCOCO/AP @ .5": ap_main[3]})
138
  self.update(self.batch_task, advance=1)
139
  self.refresh()
140
  self.remove_task(self.batch_task)
141
 
 
 
 
142
  def finish_train(self):
143
  self.wandb.finish()
144
 
 
164
  layer_param = sum(x.numel() for x in layer.parameters()) # number parameters
165
  in_channels, out_channels = getattr(layer, "in_c", None), getattr(layer, "out_c", None)
166
  if in_channels and out_channels:
167
+ if isinstance(in_channels, (list, ListConfig)):
168
+ in_channels = "M"
169
+ if isinstance(out_channels, (list, ListConfig)):
170
+ out_channels = "M"
171
+ channels = f"{str(in_channels): >4} -> {str(out_channels): >4}"
172
  else:
173
  channels = "-"
174
  table.add_row(str(idx), layer.layer_type, layer.tags, f"{layer_param:,}", channels)