"tensor is not a Tensor" error after using tf.IndexedSlices
I am trying to implement gradient compression. My plan is to use tf.IndexedSlices so that only the non-zero gradients are sent to Horovod's allreduce(). In my code I first compute the gradients with compute_gradients(), then compute a threshold (a percentile of the gradient values), and set every gradient value smaller than the threshold to zero. After that I wrap the result in a tf.IndexedSlices object.
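Conceptually, this is what I want to do for each gradient tensor (an illustrative TF 1.x sketch only, not the optimizer code further down; the helper name to_indexed_slices is made up, and I assume row-level sparsity on a gradient with static rank >= 2):

import tensorflow as tf

def to_indexed_slices(grad, threshold):
    # Keep the rows of `grad` that contain at least one entry above the threshold.
    row_mask = tf.reduce_any(tf.abs(grad) > threshold, axis=list(range(1, grad.shape.ndims)))
    indices = tf.squeeze(tf.where(row_mask), axis=1)    # int64 row indices, shape [k]
    values = tf.gather(grad, indices)                   # the surviving rows, shape [k, ...]
    dense_shape = tf.cast(tf.shape(grad), tf.int64)     # full dense shape as a 1-D int64 tensor
    return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape)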
I assume that CustomOptimizer's compute_gradients() ends up invoking Horovod's DistributedOptimizer.compute_gradients() (via self._optimizer), which in turn calls allreduce(). Is that correct?
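In code terms, the delegation order I am assuming (a sketch based on how the optimizers are nested in create_optimizer below) is:

# CustomOptimizer wraps hvd.DistributedOptimizer, which wraps the base optimizer, so:
# optimizer.compute_gradients(loss, tvars)               # CustomOptimizer.compute_gradients
#   -> self._optimizer.compute_gradients(loss, tvars)    # hvd.DistributedOptimizer.compute_gradients -> allreduce()
#      -> base optimizer's compute_gradients(loss, tvars)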
Second, I get an error saying that the tensor is not a Tensor. How can I allreduce() gradients stored as IndexedSlices?
I am adding my CustomOptimizer to the BERT model from NVIDIA's repository: https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT
I am running the fine-tuning steps on GPUs.
# Calling code (optimization.py)
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, hvd=None, use_fp16=False, amp=False):
    # ... code that creates the base optimizer and computes global_step, learning_rate, etc. ...
    if hvd is not None:
        from horovod.tensorflow.compression import Compression
        optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none)
    # Wrap with the CustomOptimizer class shown below.
    from CustomOptimizer import CustomOptimizer
    optimizer = CustomOptimizer(optimizer)
    tvars = tf.trainable_variables()
    # This calls CustomOptimizer.compute_gradients.
    # I assume that calling the custom optimizer's compute_gradients ends up calling
    # Horovod's compute_gradients, where allreduce() is called. Is this correct??
    grads_and_vars = optimizer.compute_gradients(loss, tvars)
    grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
    grads, tvars = list(zip(*grads_and_vars))
    all_are_finite = tf.reduce_all(
        [tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 or amp else tf.constant(True, dtype=tf.bool)
    # This is how the model was pre-trained.
    # ensure global norm is a finite number
    # to prevent clip_by_global_norm from having a hizzy fit.
    (clipped_grads, _) = tf.clip_by_global_norm(
        grads, clip_norm=1.0,
        use_norm=tf.cond(
            all_are_finite,
            lambda: tf.global_norm(grads),
            lambda: tf.constant(1.0)))
    train_op = optimizer.apply_gradients(
        list(zip(clipped_grads, tvars)), global_step=global_step)
    new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
    new_global_step = tf.identity(new_global_step, name='step_update')
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
# end of create_optimizer
# New class: CustomOptimizer (CustomOptimizer.py)
class CustomOptimizer(tf.train.Optimizer):
    def __init__(self, optimizer, name="CustomOptimizer", use_locking=False):
        # The wrapper keeps a reference to the optimizer it delegates to.
        super(CustomOptimizer, self).__init__(use_locking, name)
        self._optimizer = optimizer

    def compute_gradients(self, *args, **kwargs):
        grads_and_vars = self._optimizer.compute_gradients(*args, **kwargs)
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        newgrads = []
        # Threshold: 80th percentile, taken from the first (grad, var) pair.
        threshold = tf.contrib.distributions.percentile(grads_and_vars[0], 80.0, interpolation='higher')
        for grad, var in grads_and_vars:
            # Zero out every entry whose magnitude is below the threshold.
            bool_mask = tf.math.greater(abs(grad), threshold)
            float_mask = tf.cast(bool_mask, grad.dtype)
            grad = tf.multiply(grad, float_mask)
            # Wrap the masked gradient in an IndexedSlices object.
            indexed_slices = tf.IndexedSlices(grad, grad, dense_shape=grad.shape)
            newgrads.append(indexed_slices)
        return [(grad, gradvar[1]) for grad, gradvar in zip(newgrads, grads_and_vars)]

    def apply_gradients(self, *args, **kwargs):
        # Simply delegates to the wrapped optimizer (line 89 of CustomOptimizer.py in the traceback).
        return self._optimizer.apply_gradients(*args, **kwargs)
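Here is the full traceback. The failure happens in the wrapped optimizer's apply_gradients() (optimization.py, line 158 in the trace), where the masked gradient, now a tf.IndexedSlices rather than a dense Tensor, reaches tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad):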
Traceback (most recent call last):
File "run_squad.py", line 1411, in <module> tf.app.run()
File "/usr/local/lib/python3.5/dist packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "run_squad.py", line 1318, in main
estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2457, in train rendezvous.raise_errors()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_errors six.reraise(typ, value, traceback)
File "/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise raise value
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2452, in train saving_listeners=saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist packages/tensorflow_estimator/python/estimator/estimator.py", line 1154, in _train_model_default features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2251, in _call_model_fn
config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2534, in _model_fn features, labels, is_export_mode=is_export_mode)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1323, in call_without_tpu
return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1593, in _call_model_fn
estimator_spec = self._model_fn(features=features, **kwargs)
File "run_squad.py", line 703, in model_fn
total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16)
File "/workspace/bert/optimization.py", line 102, in create_optimizer
list(zip(clipped_grads, tvars)), global_step=global_step)
File "/workspace/bert/CustomOptimizer.py", line 89, in apply_gradients
return self._optimizer.apply_gradients(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/horovod-0.16.0-py3.5-linux-x86_64.egg/horovod/tensorflow/__init__.py", line 237, in apply_gradients
return self._optimizer.apply_gradients(*args, **kwargs)
File "/workspace/bert/optimization.py", line 158, in apply_gradients
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
return target(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 248, in multiply
return gen_math_ops.mul(x, y, name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 5860, in mul
"Mul", x=x, y=y, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 520, in _apply_op_helper
repr(values), type(values).__name__))
TypeError: Expected float32 passed to parameter 'y' of op 'Mul', got <tensorflow.python.framework.ops.IndexedSlices object at 0x7fe07a3165f8> of type 'IndexedSlices' instead.
INFO:tensorflow:Error recorded from training_loop: Expected float32 passed to parameter 'y' of op 'Mul', got <tensorflow.python.framework.ops.IndexedSlices object at 0x7f96eaa0be48> of type 'IndexedSlices' instead.
INFO:tensorflow:training_loop marked as finished
WARNING:tensorflow:Reraising captured error
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 511, in _apply_op_helper
preferred_dtype=default_dtype)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1175, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py", line 100, in _IndexedSlicesToTensor
dense_shape_value = tensor_util.constant_value(value.dense_shape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 848, in constant_value
ret = _ConstantValue(tensor, partial)
File "/usr/local/lib/python3.5/dist-`enter code here`packages/tensorflow/python/framework/tensor_util.py", line 711, in _ConstantValue
raise TypeError("tensor is not a Tensor")
TypeError: tensor is not a Tensor