TensorFlow 2.0 - newbie implementing a simple CNN
I just finished the Deep Learning specialization on Coursera and am trying to implement a CNN with TensorFlow 2.0 and my own data. I followed the guide and documentation on tensorflow.org and was able to set up a pipeline to load my images. However, when I run the model, I keep running into memory/resource problems.
My model should perform multi-label classification with 30 categories. Below is my code:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
#Import helper modules
import os
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import pandas as pd
import IPython.display as display
from PIL import Image
AUTOTUNE = tf.data.experimental.AUTOTUNE
#Define path to directories
train_dir = pathlib.Path.cwd() / 'train'
validation_dir = pathlib.Path.cwd() / 'validation'
test_dir = pathlib.Path.cwd() / 'test'
#Read csv file containing filename and label
train_csv = pd.read_csv(pathlib.Path.cwd() / 'train.csv')
validation_csv = pd.read_csv(pathlib.Path.cwd() / 'validation.csv')
#Define the total number of training and validation samples
total_train = train_csv.shape[0]
total_val = validation_csv.shape[0]
print(f'Total training images: {total_train}')
print(f'Total validation images: {total_val}')
The code below is taken from https://www.tensorflow.org/tutorials/load_data/images:
CLASS_NAMES = np.array([item.name for item in train_dir.glob('*')])
#set up variables
BATCH_SIZE = 128
TRAIN_STEPS_PER_EPOCH = int(np.ceil(total_train / BATCH_SIZE))  # steps must be integers; np.ceil returns a float
VAL_STEPS_PER_EPOCH = int(np.ceil(total_val / BATCH_SIZE))
IMG_HEIGHT = 150
IMG_WIDTH = 150
#using tf.data.Dataset
train_list_ds = tf.data.Dataset.list_files(str(train_dir/'*/*'))
valid_list_ds = tf.data.Dataset.list_files(str(validation_dir/'*/*'))
def get_label(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The second to last is the class-directory
    return parts[-2] == CLASS_NAMES

def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # Use `convert_image_dtype` to convert to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size.
    return tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])

def process_path(file_path):
    label = get_label(file_path)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label

def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10, 10))
    for n in range(25):
        ax = plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n])
        plt.title(CLASS_NAMES[label_batch[n] == 1][0].title())
        plt.axis('off')
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
valid_labeled_ds = valid_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    # use `.cache(filename)` to cache preprocessing work for datasets
    # that don't fit in memory.
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # Repeat forever
    ds = ds.repeat()
    ds = ds.batch(BATCH_SIZE)
    # `prefetch` lets the dataset fetch batches in the background
    # while the model is training.
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
train_ds = prepare_for_training(train_labeled_ds)
x_train, y_train = next(iter(train_ds))
valid_ds = prepare_for_training(valid_labeled_ds)
The model is taken from the tutorial: https://www.tensorflow.org/tutorials/images/classification
model = Sequential([
    Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    MaxPooling2D(),
    Conv2D(32, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Conv2D(64, 3, padding='same', activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(30, activation='softmax')
])
model.compile(optimizer='adam',
              # the final layer already applies softmax, so the loss gets
              # probabilities, not logits
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
model.summary()
model.fit(train_ds.repeat(),
          epochs=15, batch_size=BATCH_SIZE,
          validation_data=valid_ds.repeat(),
          steps_per_epoch=TRAIN_STEPS_PER_EPOCH,
          validation_steps=VAL_STEPS_PER_EPOCH)
At first I ran into an "Input ran out of data" error, so I changed my input to train_ds.repeat() instead of x=x_train, y=y_train.
The next problem I ran into is this:
100/741 [===>..........................] - ETA: 18:07 - loss: 3.7188 - accuracy: 0.04942020-06-23 14:52:29.232604: E tensorflow/core/lib/jpeg/jpeg_mem.cc:323] Premature end of JPEG data. Stopped at line 910/1000
Traceback (most recent call last):
File "product_detection.py", line 123, in <module>
model.fit(train_ds.repeat(), epochs=15, batch_size= BATCH_SIZE, validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH, validation_steps=VAL_STEPS_PER_EPOCH)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1665, in _filtered_call
self.captured_inputs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1746, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 598, in call
ctx=ctx)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
(1) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1113]
Function call stack:
train_function -> train_function
2020-06-23 14:52:29.411288: W tensorflow/core/kernels/data/cache_dataset_ops.cc:794] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
At this point I am completely lost. I suspect the prepare_for_training function I copied is not suitable for my use case, but I don't understand it well enough to change it. It explicitly says it is meant for a dataset of 1000+ images, while I am working with a 90k training set and a 10k validation set. I tried changing batch_size, but the problem persists.
I am using tensorflow-gpu with a GTX 1050 Ti. Could I get some pointers on how to approach this? Thanks in advance.
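One variation I have not tried yet: prepare_for_training accepts a cache filename, so the preprocessed images could be cached on disk instead of in RAM. A minimal sketch of that variant (the .tfcache paths are placeholders, any writable path should work):
# cache the preprocessed 150x150 images on disk rather than in memory
train_ds = prepare_for_training(train_labeled_ds, cache='./train.tfcache')
valid_ds = prepare_for_training(valid_labeled_ds, cache='./valid.tfcache')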
Edit 1: I changed my batch_size to 10 and got this error:
3537/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.02602020-06-23 16:10:13.148245: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
2020-06-23 16:10:13.159005: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[678,678,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
Resou3538/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.0260rce exhausted: OOM when allocating tensor with shape[956,956,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[IteratorGetNext/_2]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
1 Answer
Judging by this section of the output:
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
(1) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
it looks like some of your data is corrupted. Check your dataset and remove the broken records; this is a common error, which you can read more about HERE and THERE.
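A minimal sketch of such a check, assuming the same train/validation directory layout as in the question; it eagerly decodes every file with tf.io.decode_jpeg and reports the ones that fail:
import pathlib
import tensorflow as tf

def find_corrupted_jpegs(root):
    # Try to fully decode every image under root/<class>/<file>;
    # truncated or invalid JPEGs raise InvalidArgumentError, as in training.
    bad_files = []
    for path in pathlib.Path(root).glob('*/*'):
        try:
            tf.io.decode_jpeg(tf.io.read_file(str(path)))
        except tf.errors.InvalidArgumentError:
            bad_files.append(path)
            print(f'Corrupted: {path}')
    return bad_files

bad = find_corrupted_jpegs('train') + find_corrupted_jpegs('validation')
Inspect the list before deleting anything. Alternatively, records that fail to decode can be dropped inside the pipeline itself with ds.apply(tf.data.experimental.ignore_errors()), though that hides the bad files rather than removing them.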