from __future__ import print_function
try:
    import tensorflow as tf
    from tensorflow.python.ops import nn
    relu = nn.relu
    slim = tf.contrib.slim
    sigmoid = nn.sigmoid
    softmax = nn.softmax
except ImportError:
    print("tensorflow is not installed; util.tf cannot be used.")
def is_gpu_available(cuda_only=True):
    """
    code from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/test.py
    Returns whether TensorFlow can access a GPU.
    Args:
        cuda_only: limit the search to CUDA GPUs.
    Returns:
        True iff a GPU device of the requested kind is available.
    """
    from tensorflow.python.client import device_lib as _device_lib
    if cuda_only:
        return any((x.device_type == 'GPU')
                   for x in _device_lib.list_local_devices())
    else:
        return any((x.device_type == 'GPU' or x.device_type == 'SYCL')
                   for x in _device_lib.list_local_devices())
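# Example (sketch): pick a device string based on GPU availability.
#   device = '/gpu:0' if is_gpu_available() else '/cpu:0'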
def get_available_gpus(num_gpus = None):
    """
    Modified from http://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow
    However, the original code occupies all available GPU memory when listing devices.
    The modified code takes a parameter, num_gpus: if given, it does nothing but
    return the device names. This works well for single-machine training, but I
    don't know whether it works well on a cluster.
    """
    if num_gpus is None:
        from tensorflow.python.client import device_lib as _device_lib
        local_device_protos = _device_lib.list_local_devices()
        return [x.name for x in local_device_protos if x.device_type == 'GPU']
    else:
        return ['/gpu:%d' % (idx) for idx in range(num_gpus)]
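# Example (sketch): get_available_gpus(2) simply returns ['/gpu:0', '/gpu:1']
# without touching GPU memory, whereas get_available_gpus() queries
# device_lib and may initialize every visible GPU.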
def get_latest_ckpt(path):
    # a more tolerant variant of tf.train.latest_checkpoint: also accepts a
    # direct path to a checkpoint file, not just a training directory
    import util
    path = util.io.get_absolute_path(path)
    if util.io.is_dir(path):
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt is not None:
            ckpt_path = ckpt.model_checkpoint_path
        else:
            ckpt_path = None
    else:
        ckpt_path = path
    return ckpt_path
def get_all_ckpts(path):
    ckpt = tf.train.get_checkpoint_state(path)
    all_ckpts = ckpt.all_model_checkpoint_paths
    ckpts = [str(c) for c in all_ckpts]
    return ckpts
def get_iter(ckpt):
    import util
    iter_ = int(util.str.find_all(ckpt, r'.ckpt-\d+')[0].split('-')[-1])
    return iter_
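# Example (sketch): for a checkpoint path such as
# '/tmp/train/model.ckpt-12345', get_iter returns the integer 12345,
# i.e. the global step at which the checkpoint was written.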
def get_init_fn(checkpoint_path, train_dir, ignore_missing_vars = False,
                checkpoint_exclude_scopes = None, model_name = None, checkpoint_model_scope = None):
    """
    code from github/SSD-tensorflow/tf_utils.py
    Returns a function run by the chief worker to warm-start the training.
    Note that the init_fn is only run when initializing the model during the very
    first global step.
    Args:
        checkpoint_path: the checkpoint to be restored from
        train_dir: the directory where checkpoints are stored during training
        ignore_missing_vars: if False and there are variables in the model but not
            in the checkpoint, an error will be raised
        checkpoint_model_scope and model_name: if the root scope of the checkpoint
            and of the model in the session differ (but all sub-scopes are the
            same), specify them explicitly
        checkpoint_exclude_scopes: scopes of variables to be excluded when
            restoring from checkpoint_path
    Returns:
        An init function run by the supervisor.
    """
    import util
    if util.str.is_none_or_empty(checkpoint_path):
        return None
    # Warn the user if a checkpoint exists in the train_dir. Then ignore it.
    if tf.train.latest_checkpoint(train_dir):
        tf.logging.info(
            'Ignoring --checkpoint_path because a checkpoint already exists in %s'
            % train_dir)
        return None

    exclusions = []
    if checkpoint_exclude_scopes:
        exclusions = [scope.strip()
                      for scope in checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
        excluded = False
        for exclusion in exclusions:
            if var.op.name.startswith(exclusion):
                excluded = True
                break
        if not excluded:
            variables_to_restore.append(var)

    # Change the model scope if necessary: map checkpoint variable names to model variables.
    if checkpoint_model_scope is not None:
        variables_to_restore = {checkpoint_model_scope + '/' + var.op.name: var
                                for var in variables_to_restore}

    tf.logging.info('variables_to_restore: %r' % (variables_to_restore,))
    checkpoint_path = get_latest_ckpt(checkpoint_path)
    tf.logging.info('Fine-tuning from %s. Ignoring missing vars: %s'
                    % (checkpoint_path, ignore_missing_vars))
    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=ignore_missing_vars)
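# Example usage (sketch, assuming a slim-based training script; FLAGS is a
# hypothetical flags object providing these values):
#   init_fn = get_init_fn(FLAGS.checkpoint_path, FLAGS.train_dir,
#                         ignore_missing_vars = True,
#                         checkpoint_exclude_scopes = 'logits')
#   slim.learning.train(train_op, FLAGS.train_dir, init_fn = init_fn)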
def get_variables_to_train(flags = None):
    """code from github/SSD-tensorflow/tf_utils.py
    Returns a list of variables to train.
    Returns:
        A list of variables to train by the optimizer.
    """
    if flags is None or flags.trainable_scopes is None:
        return tf.trainable_variables()
    else:
        scopes = [scope.strip() for scope in flags.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        variables_to_train.extend(variables)
    return variables_to_train
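# Example (sketch): with flags.trainable_scopes = 'vgg_16/fc6,vgg_16/fc7',
# only variables created under those two scopes are returned, so the
# optimizer leaves the rest of the network frozen.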
def Print(tensor, data, msg = '', file = None, mode = 'w'):
    from tensorflow.python.ops import control_flow_ops
    import util
    def np_print(*args):
        # if msg contains format specifiers, use it as the format string;
        # otherwise append one '%s' per argument
        if util.str.contains(msg, '%'):
            message = msg % tuple(args)
        else:
            message = msg + ' %s' * len(args) % tuple(args)
        if file is not None:
            file_path = util.io.get_absolute_path(file)
            print('writing message to file(%s):' % (file_path), message)
            with open(file_path, mode) as f:
                print(message, file = f)
        else:
            print(message)
    return control_flow_ops.with_dependencies([tf.py_func(np_print, data, [])], tensor)
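# Example usage (sketch): print a tensor's value at run time without breaking
# the graph; `loss` here stands for any scalar tensor in your graph.
#   loss = Print(loss, [loss], msg = 'current loss:')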
def get_variable_names_in_checkpoint(path, return_shapes = False, return_reader = False):
    """
    Args:
        path: the path to a training directory containing checkpoints,
            or the path to a checkpoint
    Returns:
        a list of variable names in the checkpoint
    """
    import util
    ckpt = get_latest_ckpt(path)
    ckpt_reader = tf.train.NewCheckpointReader(ckpt)
    ckpt_vars = ckpt_reader.get_variable_to_shape_map()
    names = [var for var in ckpt_vars]
    if return_shapes:
        return names, ckpt_vars
    def get(name):
        return ckpt_reader.get_tensor(name)
    if return_reader:
        return names, get
    return names
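# Example (sketch): inspect a checkpoint before restoring from it.
#   names, get = get_variable_names_in_checkpoint('/tmp/train', return_reader = True)
#   for name in names:
#       print(name, get(name).shape)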
def min_area_rect(xs, ys):
    import util
    rects = tf.py_func(util.img.min_area_rect, [xs, ys], xs.dtype)
    rects.set_shape([None, 5])
    return rects
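# Note: the output shape [None, 5] suggests each rectangle is encoded by five
# numbers (presumably center, size, and rotation angle, as with
# cv2.minAreaRect); the exact layout is defined by util.img.min_area_rect.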
def gpu_config(config = None, allow_growth = None, gpu_memory_fraction = None):
    if config is None:
        config = tf.ConfigProto()
    if allow_growth is not None:
        config.gpu_options.allow_growth = allow_growth
    if gpu_memory_fraction is not None:
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
    return config
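# Example usage (sketch): cap TensorFlow at half of each GPU's memory and
# allocate it lazily.
#   config = gpu_config(allow_growth = True, gpu_memory_fraction = 0.5)
#   with tf.Session(config = config) as sess:
#       ...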
def wait_for_checkpoint(path):
    from tensorflow.contrib.training.python.training import evaluation
    return evaluation.checkpoints_iterator(path)
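# Example usage (sketch): evaluate each new checkpoint as it appears;
# evaluate() stands for a user-defined evaluation function.
#   for ckpt in wait_for_checkpoint('/tmp/train'):
#       evaluate(ckpt)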
def focal_loss(labels, logits, gamma = 2.0, alpha = 0.75, normalize = True):
    labels = tf.where(labels > 0, tf.ones_like(labels), tf.zeros_like(labels))
    labels = tf.cast(labels, tf.float32)
    probs = tf.sigmoid(logits)
    CE = tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits)
    alpha_t = tf.ones_like(logits) * alpha
    alpha_t = tf.where(labels > 0, alpha_t, 1.0 - alpha_t)
    probs_t = tf.where(labels > 0, probs, 1.0 - probs)
    focal_matrix = alpha_t * tf.pow((1.0 - probs_t), gamma)
    fl = focal_matrix * CE
    fl = tf.reduce_sum(fl)
    if normalize:
        #n_pos = tf.reduce_sum(labels)
        #fl = fl / tf.cast(n_pos, tf.float32)
        total_weights = tf.stop_gradient(tf.reduce_sum(focal_matrix))
        fl = fl / total_weights
    return fl
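# The above implements the focal loss of Lin et al. (RetinaNet):
#   FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t),
# where p_t is the predicted probability of the true class. The normalization
# here divides by the total focal weight rather than the usual number of
# positives (the commented-out variant above).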
def focal_loss_layer_initializer(sigma = 0.01, pi = 0.01):
    import numpy as np
    b0 = - np.log((1 - pi) / pi)
    return tf.random_normal_initializer(stddev = sigma), \
           tf.constant_initializer(b0)
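# Example usage (sketch): initialize the final classification layer so that
# the initial foreground probability is roughly pi, as in the RetinaNet paper;
# `features` and `num_anchors` stand for your own tensors/values.
#   w_init, b_init = focal_loss_layer_initializer(pi = 0.01)
#   logits = slim.conv2d(features, num_anchors, [3, 3],
#                        weights_initializer = w_init,
#                        biases_initializer = b_init,
#                        activation_fn = None)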
def sum_gradients(clone_grads, do_summary = False):
    averaged_grads = []
    for grad_and_vars in zip(*clone_grads):
        # grad_and_vars holds one (gradient, variable) pair per clone,
        # all referring to the same variable
        grads = []
        var = grad_and_vars[0][1]
        for g, v in grad_and_vars:
            assert v == var, 'all clones must provide gradients for the same variable'
            grads.append(g)
        grad = tf.add_n(grads, name = var.op.name + '_summed_gradients')
        averaged_grads.append((grad, var))
        if do_summary:
            tf.summary.histogram("variables_and_gradients_" + grad.op.name, grad)
            tf.summary.histogram("variables_and_gradients_" + var.op.name, var)
            tf.summary.scalar("variables_and_gradients_" + grad.op.name +
                              '_mean/var_mean', tf.reduce_mean(grad) / tf.reduce_mean(var))
            tf.summary.scalar("variables_and_gradients_" + var.op.name + '_mean', tf.reduce_mean(var))
    return averaged_grads
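# Example usage (sketch): combine per-GPU gradients in multi-GPU training;
# tower_losses is a hypothetical list with one loss tensor per clone.
#   tower_grads = [opt.compute_gradients(loss) for loss in tower_losses]
#   apply_op = opt.apply_gradients(sum_gradients(tower_grads),
#                                  global_step = global_step)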
def get_update_op():
    """
    Returns an op grouping everything in the UPDATE_OPS collection, e.g. the
    moving-average updates of BatchNorm. Running it alongside the train op is
    extremely important when batch normalization is used; otherwise the moving
    statistics are never updated.
    """
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        return tf.group(*update_ops)
    return None
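# Example usage (sketch): make sure BatchNorm statistics are updated on every
# training step.
#   update_op = get_update_op()
#   if update_op is not None:
#       train_op = tf.group(train_op, update_op)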