ResourceExhaustedError - недостаточно памяти в тензорном потоке
Ниже приведен мой фрагмент кода для сети, подобной alexnet (здесь ввод 224*224 оттенков серого). Я получаю сообщение об ошибке исчерпания ресурсов, опубликованное ниже, и я не уверен, почему
import tensorflow as tf
from IPython import embed
import pre_process_data as dataProc
import numpy as np
#Converting RGB to Gray
from skimage import color
#Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 4
display_step = 20
#Network Parameters
n_input = [224,224,3] # (img 224,224,3)
n_classes = 7 # (0-7)
dropout = 0.8 # Dropout
def ReturnOneHotVector(labels):
labels = labels.astype(np.int32)
sparse_labels = tf.reshape(labels, [-1, 1])
derived_size = tf.shape(sparse_labels)[0]
indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
concated = tf.concat( [indices, sparse_labels],axis=1)
outshape = tf.concat([tf.reshape(derived_size, [1]), tf.reshape(n_classes, [1])],axis=0)
one_hot_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
return one_hot_labels
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1,shape=shape)
return initial
def conv2d(x,W):
return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
#Build Graph
def deepnn(x_image):
'''x - input [none, 224*224*3]
returns y([none, 10], scalar keep_prob'''
with tf.name_scope('reshape'):
x_image = tf.reshape(x_image,[-1,224,224,1])
with tf.name_scope('conv1'):
W_conv1 = weight_variable([5,5,1,32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image,W_conv1)+ b_conv1)
# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
h_pool1 = max_pool_2x2(h_conv1)
with tf.name_scope('conv2'):
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
# Second pooling layer.
with tf.name_scope('pool2'):
h_pool2 = max_pool_2x2(h_conv2)
#FC layer
with tf.name_scope('fc1'):
W_fc1 = weight_variable([56 * 56 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat=tf.reshape(h_pool2,[-1,56*56*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)
#Dropout
with tf.name_scope('dropout'):
keep_prob=tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
#FC2
with tf.name_scope('fc2'):
W_fc2 = weight_variable([1024,7])
b_fc2 = bias_variable([7])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv, keep_prob
def main():
#Read input
joints,imgs,labels = dataProc.pre_proc_data()
labels = dataProc.ConvertLabels(labels)
labels = ReturnOneHotVector(labels) #return labels as one-hot-vector
imgs = color.rgb2gray(imgs)
imgs = imgs.reshape([-1, imgs.shape[1]*imgs.shape[2]])
embed()
#-----------------NETWORK 1 -------------------------#
x = tf.placeholder(tf.float32,[None,imgs.shape[1]])
y_ = tf. placeholder(tf.int8,[None, n_classes])
'''sess = tf.Session()
with sess.as_default():
embed()
'''
#Build convnet
y_conv, keep_prob = deepnn(x)
with tf.name_scope('loss'):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits = y_conv)
cross_entropy = tf.reduce_mean(cross_entropy)
with tf.name_scope('adam_optimizer'):
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
correct_prediction = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_prediction)
graph_location = '/home/dhiraj/Desktop/Sample Dataset/record' #tempfile.mkdtemp()
print('Saving graph to: %s' % graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())
'''config = tf.ConfigProto()
config.gpu_options.allocator_type='BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.90
sess = tf.Session(config = config)
'''
'''config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
session = tf.Session(config=config)
'''
with tf.Session() as sess:
#tf.Session() as sess:
print "STARTED TENSORLFOW SESSION"
sess.run(tf.initialize_all_variables())
step = 1
count = 0
labels = labels.eval()
while step*batch_size < training_iters:
batch_xs, batch_ys = imgs[count:count + batch_size],labels[count:count+ batch_size]
count = count + batch_size
sess.run(train_step, feed_dict={x:batch_xs, y_:batch_ys, keep_prob:dropout})
if step % display_step == 0:
acc = sess.run(accuracy, feed_dict={x:batch_xs, y_:batch_ys, keep_prob:1.})
loss = sess.run(cross_entropy, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 1.})
print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss) + ", Training Accuracy= " + "{:.5f}".format(acc)
step = step+1
print "Optimization Finished!"
main()
Я получаю следующую ошибку:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:01:00.0
Total memory: 11.90GiB
Free memory: 11.23GiB
2017-09-16 16:16:14.018792: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0
2017-09-16 16:16:14.018797: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0: Y
2017-09-16 16:16:14.018809: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:01:00.0)
STARTED TENSORLFOW SESSION
WARNING:tensorflow:From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:175: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Iter 80, Minibatch Loss= 60.153133, Training Accuracy= 0.00000
Iter 160, Minibatch Loss= 40.757141, Training Accuracy= 0.00000
Iter 240, Minibatch Loss= 24.546730, Training Accuracy= 0.00000
Iter 320, Minibatch Loss= 7.033617, Training Accuracy= 0.25000
Iter 400, Minibatch Loss= 9.611395, Training Accuracy= 0.00000
Iter 480, Minibatch Loss= 17.850220, Training Accuracy= 0.50000
Iter 560, Minibatch Loss= 9.400875, Training Accuracy= 0.00000
Iter 640, Minibatch Loss= 7.338353, Training Accuracy= 0.25000
Iter 720, Minibatch Loss= 3.645672, Training Accuracy= 0.25000
Iter 800, Minibatch Loss= 1.157605, Training Accuracy= 0.25000
2017-09-16 16:16:31.545341: E tensorflow/core/common_runtime/bfc_allocator.cc:244] tried to allocate 0 bytes
2017-09-16 16:16:31.545369: W tensorflow/core/common_runtime/allocator_retry.cc:32] Request to allocate 0 bytes
2017-09-16 16:16:31.545375: E tensorflow/core/common_runtime/bfc_allocator.cc:244] tried to allocate 0 bytes
2017-09-16 16:16:31.545379: W tensorflow/core/common_runtime/allocator_retry.cc:32] Request to allocate 0 bytes
2017-09-16 16:16:31.545433: E tensorflow/core/common_runtime/bfc_allocator.cc:378] tried to deallocate nullptr
2017-09-16 16:16:31.545482: E tensorflow/core/common_runtime/bfc_allocator.cc:378] tried to deallocate nullptr
-----------------------------------------------------------
[SOME TEXT HERE ]
ResourceExhaustedError: Ran out of GPU memory when allocating 0 bytes for
[[Node: loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](loss/Reshape, loss/Reshape_1)]]
Caused by op u'loss/SoftmaxCrossEntropyWithLogits', defined at:
File "tensorflow_version_rgb.py", line 162, in <module>
main()
File "tensorflow_version_rgb.py", line 117, in main
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits = y_conv)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 1597, in softmax_cross_entropy_with_logits
precise_logits, labels, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 2385, in _softmax_cross_entropy_with_logits
features=features, labels=labels, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): Ran out of GPU memory when allocating 0 bytes for
[[Node: loss/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](loss/Reshape, loss/Reshape_1)]]
Итерация моего кода зависит от размера пакета: если я уменьшу его до 1, он выполнит еще пару итераций, а затем выдаст мне ту же ошибку. Я проверил размер моего imgs.nbytes
что составляет 334774272 байта. Я запустил другой учебный код по тензорному потоку, и он, кажется, работает нормально.
Мои входные данные 'imgs' имеют размер (834, 224,224) серой шкалы, а метки - (834,7) матрицы.
Во время выполнения кода выше, он говорит:
Total memory: 11.90GiB
Free memory: 11.23GiB