#!/usr/bin/env python3
# Scene Text Recognition Model Hub
# Copyright 2022 Darwin Bautista
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import os
import shutil
from pathlib import Path

import hydra
import numpy as np
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, open_dict

from ray import air, train, tune
from ray.tune import CLIReporter
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
from ray.tune.schedulers import MedianStoppingRule
from ray.tune.search.ax import AxSearch

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from strhub.data.module import SceneTextDataModule
from strhub.models.base import BaseSystem

log = logging.getLogger(__name__)


class MetricTracker(tune.Stopper):
"""Tracks the trend of the metric. Stops downward/stagnant trials. Assumes metric is being maximized."""

    def __init__(self, metric, max_t, patience: int = 3, window: int = 3) -> None:
super().__init__()
self.metric = metric
self.trial_history = {}
self.max_t = max_t
self.training_iteration = 0
        self.eps = 0.01  # sensitivity: gradients below this threshold count as stagnant
        self.patience = patience  # number of consecutive downward/stagnant samples required to trigger early stopping
        self.kernel = self.gaussian_pdf(np.arange(window) - window // 2, sigma=0.6)
        # Extra samples to keep so the middle `patience` samples get valid moving averages and gradients:
        # 'valid' convolution drops len(kernel) // 2 samples at each end, and np.gradient's two edge values are discarded.
        self.buffer = 2 * (len(self.kernel) // 2) + 2

    @staticmethod
def gaussian_pdf(x, sigma=1.0):
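        """Gaussian probability density with standard deviation `sigma`, evaluated elementwise at `x`."""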
return np.exp(-((x / sigma) ** 2) / 2) / (sigma * np.sqrt(2 * np.pi))

    @staticmethod
def moving_average(x, k):
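        """Kernel-weighted moving average of `x`; 'valid' mode drops len(k) // 2 samples at each end."""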
return np.convolve(x, k, 'valid') / k.sum()

    def __call__(self, trial_id, result):
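        """Decide whether to stop `trial_id`: on NaN loss, at `max_t`, or when the smoothed metric stops improving."""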
self.training_iteration = result['training_iteration']
if np.isnan(result['loss']) or self.training_iteration >= self.max_t:
            self.trial_history.pop(trial_id, None)
return True
history = self.trial_history.get(trial_id, [])
# FIFO queue of metric values.
history = history[-(self.patience + self.buffer - 1) :] + [result[self.metric]]
# Only start checking once we have enough data. At least one non-zero sample is required.
if len(history) == self.patience + self.buffer and sum(history) > 0:
smooth_grad = np.gradient(self.moving_average(history, self.kernel))[1:-1] # discard edge values.
# Check if trend is downward or stagnant
if (smooth_grad < self.eps).all():
log.info(f'Stopping trial = {trial_id}, hist = {history}, grad = {smooth_grad}')
                self.trial_history.pop(trial_id, None)
return True
self.trial_history[trial_id] = history
return False

    def stop_all(self):
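        """Never stop the whole experiment; stopping decisions are made per-trial in __call__()."""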
return False


class TuneReportCheckpointPruneCallback(TuneReportCheckpointCallback):
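    """Reports metrics and checkpoints like the parent callback, then deletes all but the newest checkpoint."""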

    def _handle(self, trainer: Trainer, pl_module: LightningModule):
super()._handle(trainer, pl_module)
# Prune older checkpoints
trial_dir = train.get_context().get_trial_dir()
for old in sorted(Path(trial_dir).glob('checkpoint_epoch=*-step=*'), key=os.path.getmtime)[:-1]:
log.info(f'Deleting old checkpoint: {old}')
shutil.rmtree(old)


def trainable(hparams, config):
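    """Per-trial training function for Ray Tune: applies the sampled hyperparameters, then runs `trainer.fit`."""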
with open_dict(config):
config.model.lr = hparams['lr']
# config.model.weight_decay = hparams['wd']

    model: BaseSystem = hydra.utils.instantiate(config.model)
datamodule: SceneTextDataModule = hydra.utils.instantiate(config.data)
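
    # Report validation metrics to Tune (under the given aliases) and save a checkpoint after every validation pass.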
tune_callback = TuneReportCheckpointPruneCallback({
'loss': 'val_loss',
'NED': 'val_NED',
'accuracy': 'val_accuracy',
})

    trainer: Trainer = hydra.utils.instantiate(
        config.trainer,
        enable_progress_bar=False,
        enable_checkpointing=False,
        logger=TensorBoardLogger(save_dir=train.get_context().get_trial_dir(), name='', version='.'),
        callbacks=[tune_callback],
    )
    if checkpoint := train.get_checkpoint():
        # Resume inside the context: as_directory() may yield a temporary directory that is
        # cleaned up on exit, so the checkpoint path must be consumed before the block ends.
        with checkpoint.as_directory() as checkpoint_dir:
            trainer.fit(model, datamodule=datamodule, ckpt_path=os.path.join(checkpoint_dir, 'checkpoint'))
    else:
        trainer.fit(model, datamodule=datamodule)


@hydra.main(config_path='configs', config_name='tune', version_base='1.2')
def main(config: DictConfig):
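    """Run the learning-rate search with Ray Tune + AxSearch.

    Illustrative invocation (exact overrides are defined by configs/tune.yaml):
        python tune.py tune.num_samples=20 tune.gpus_per_trial=1
    """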
    # Special handling for PARSeq
if config.model.get('perm_mirrored', False):
assert config.model.perm_num % 2 == 0, 'perm_num should be even if perm_mirrored = True'
# Modify config
with open_dict(config):
# Use mixed-precision training
if config.trainer.get('gpus', 0):
config.trainer.precision = 16
# Resolve absolute path to data.root_dir
config.data.root_dir = hydra.utils.to_absolute_path(config.data.root_dir)

    hparams = {
'lr': tune.loguniform(config.tune.lr.min, config.tune.lr.max),
# 'wd': tune.loguniform(config.tune.wd.min, config.tune.wd.max),
}
steps_per_epoch = len(hydra.utils.instantiate(config.data).train_dataloader())
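    # The Tune callback reports once per validation pass, so one Ray `training_iteration` equals one
    # validation check (this assumes val_check_interval is given in training steps, not as a fraction).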
val_steps = steps_per_epoch * config.trainer.max_epochs / config.trainer.val_check_interval
max_t = round(0.75 * val_steps)
warmup_t = round(config.model.warmup_pct * val_steps)
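    # Cap each trial at 75% of the full schedule; the median stopping rule stays inactive during LR warmup.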
scheduler = MedianStoppingRule(time_attr='training_iteration', grace_period=warmup_t)
    # Always start by evenly dividing the lr range in log scale (roughly one point per decade).
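    # e.g., for an lr range of [1e-5, 1e-3]: np.logspace(-5, -3, 3) -> [1e-5, 1e-4, 1e-3], evaluated
    # largest-first before AxSearch takes over.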
lr = hparams['lr']
start = np.log10(lr.lower)
stop = np.log10(lr.upper)
num = math.ceil(stop - start) + 1
initial_points = [{'lr': np.clip(x, lr.lower, lr.upper).item()} for x in reversed(np.logspace(start, stop, num))]
search_alg = AxSearch(points_to_evaluate=initial_points)
reporter = CLIReporter(parameter_columns=['lr'], metric_columns=['loss', 'accuracy', 'training_iteration'])
out_dir = Path(HydraConfig.get().runtime.output_dir if config.tune.resume_dir is None else config.tune.resume_dir)
resources_per_trial = {
'cpu': 1,
'gpu': config.tune.gpus_per_trial,
}
wrapped_trainable = tune.with_parameters(tune.with_resources(trainable, resources_per_trial), config=config)
if config.tune.resume_dir is None:
tuner = tune.Tuner(
wrapped_trainable,
param_space=hparams,
tune_config=tune.TuneConfig(
mode='max',
metric='NED',
search_alg=search_alg,
scheduler=scheduler,
num_samples=config.tune.num_samples,
),
run_config=air.RunConfig(
name=out_dir.name,
stop=MetricTracker('NED', max_t),
progress_reporter=reporter,
local_dir=str(out_dir.parent.absolute()),
),
)
else:
tuner = tune.Tuner.restore(config.tune.resume_dir, wrapped_trainable)
results = tuner.fit()
best_result = results.get_best_result()
print('Best hyperparameters found were:', best_result.config)
print('with result:\n', best_result)


if __name__ == '__main__':
main()