Тензор не является ошибкой тензора после использования tf.IndexedSlices

Я пытаюсь реализовать сжатие градиента. Я планирую использовать tf.IndexedSlices, чтобы отправлять только ненулевые градиенты в horovod allreduce(). У меня есть код, где я вычисляю градиенты, используя compute_gradients, затем я вычисляю порог (медиану), и все значения в градиентах, меньших, чем порог, устанавливаются в ноль. После этого я создаю объект tf.IndexedSlices.

Я предполагаю, что после возврата из метода compute_gradients() CustomOptimizer, код переходит в horovod метод compute_gradients() DistributedOptimizer, который, в свою очередь, вызывает allreduce()? Это правильно?

Во-вторых, я получаю ошибку, что тензор не является тензором. Как все я allreduce (), используя IndexedSlices?

Я добавляю свой CustomOptimizer в модель BERT из репозитория Nvidia. https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT

Я выполняю шаги точной настройки на GPU



#Calling class
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, hvd=None, use_fp16=False, amp=False):
#code to find global_step, learning_rate etc...
if hvd is not None:
      from horovod.tensorflow.compression import Compression
      optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none)

  #calls the code from class CustomOptimizer below    
  from CustomOptimizer import CustomOptimizer
  optimizer = CustomOptimizer(optimizer)

  tvars = tf.trainable_variables()

  #this is calling CustomOptimizer.compute_gradients
  # I assume after calling custom optimizers computer_gradients, it called horovod's compute gradients, where allreduce() is being called. Is this correct ??
  grads_and_vars = optimizer.compute_gradients(loss, tvars) 
  grads_and_vars = [(g,v) for g,v in grads_and_vars if g is not None]
  grads, tvars = list(zip(*grads_and_vars))
  all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 or amp else tf.constant(True, dtype=tf.bool)

# This is how the model was pre-trained.
  # ensure global norm is a finite number 
  # to prevent clip_by_global_norm from having a hizzy fit.
  (clipped_grads, _) = tf.clip_by_global_norm(
        grads, clip_norm=1.0,
        use_norm=tf.cond(
            all_are_finite,
            lambda: tf.global_norm(grads),
            lambda: tf.constant(1.0)))

  train_op = optimizer.apply_gradients(
      list(zip(clipped_grads, tvars)), global_step=global_step)
  new_global_step = tf.cond(all_are_finite, lambda: global_step+1, lambda: global_step)
  new_global_step = tf.identity(new_global_step, name='step_update')
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op

#end of create_optimizer

#new class Custom Optimizer

class CustomOptimizer(tf.train.Optimizer):

    def compute_gradients(self, *args, **kwargs):
        grads_and_vars = self._optimizer.compute_gradients(*args, **kwargs)
        grads_and_vars = [(g,v) for g,v in grads_and_vars if g is not None]
        newgrads = []
        threshold = tf.contrib.distributions.percentile(grads_and_vars[0],80.0,interpolation='higher')
        for grad, var in grads_and_vars:
            bool_mask = tf.math.greater(abs(grad), threshold)
            float_mask = tf.cast(bool_mask, grad.dtype)
            grad = tf.multiply(grad, float_mask)
            indexed_sclices =tf.IndexedSlices(grad,grad,dense_shape=grad.shape)
            newgrads.append(indexed_sclices)

        return [(grad, gradvar[1]) for grad, gradvar in zip(newgrads, grads_and_vars)]



Traceback (most recent call last):
    File "run_squad.py", line 1411, in <module> tf.app.run()
    File "/usr/local/lib/python3.5/dist packages/tensorflow/python/platform/app.py", line 125, in run
    _sys.exit(main(argv))
    File "run_squad.py", line 1318, in main
    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps)
    File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2457, in train rendezvous.raise_errors()
    File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_errors six.reraise(typ, value, traceback)
     File "/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise raise value
     File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2452, in train saving_listeners=saving_listeners)
     File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train loss = self._train_model(input_fn, hooks, saving_listeners)
     File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners)
     File "/usr/local/lib/python3.5/dist packages/tensorflow_estimator/python/estimator/estimator.py", line 1154, in _train_model_default features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2251, in _call_model_fn
    config)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1112, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2534, in _model_fn features, labels, is_export_mode=is_export_mode)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1323, in call_without_tpu
    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1593, in _call_model_fn
    estimator_spec = self._model_fn(features=features, **kwargs)
  File "run_squad.py", line 703, in model_fn
    total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16)
  File "/workspace/bert/optimization.py", line 102, in create_optimizer
    list(zip(clipped_grads, tvars)), global_step=global_step)
  File "/workspace/bert/CustomOptimizer.py", line 89, in apply_gradients
    return self._optimizer.apply_gradients(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/horovod-0.16.0-py3.5-linux-x86_64.egg/horovod/tensorflow/__init__.py", line 237, in apply_gradients
    return self._optimizer.apply_gradients(*args, **kwargs)
  File "/workspace/bert/optimization.py", line 158, in apply_gradients
    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 248, in multiply
    return gen_math_ops.mul(x, y, name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 5860, in mul
    "Mul", x=x, y=y, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 520, in _apply_op_helper
    repr(values), type(values).__name__))
TypeError: Expected float32 passed to parameter 'y' of op 'Mul', got <tensorflow.python.framework.ops.IndexedSlices object at 0x7fe07a3165f8> of type 'IndexedSlices' instead.
INFO:tensorflow:Error recorded from training_loop: Expected float32 passed to parameter 'y' of op 'Mul', got <tensorflow.python.framework.ops.IndexedSlices object at 0x7f96eaa0be48> of type 'IndexedSlices' instead.
INFO:tensorflow:training_loop marked as finished
WARNING:tensorflow:Reraising captured error
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 511, in _apply_op_helper
    preferred_dtype=default_dtype)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1175, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py", line 100, in _IndexedSlicesToTensor
    dense_shape_value = tensor_util.constant_value(value.dense_shape)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 848, in constant_value
    ret = _ConstantValue(tensor, partial)
    File "/usr/local/lib/python3.5/dist-`enter code here`packages/tensorflow/python/framework/tensor_util.py", line 711, in _ConstantValue
    raise TypeError("tensor is not a Tensor")
TypeError: tensor is not a Tensor

0 ответов

Другие вопросы по тегам