Модель CTC: результат декодера ctc пуст

Я новичок в глубоком обучении и работаю над распознаванием текста. Когда я запускаю свою модель, декодер CTC возвращает пустой результат. Может ли кто-нибудь мне помочь?

Это мой полный код:

    char_list = string.ascii_letters + string.digits
    print("Список символов: ", char_list)

    # Function to encode the text into indices of the char list
    def encode_to_labels(text):
        """Encode a word as a list of indices into ``char_list``.

        Characters that are not present in ``char_list`` are reported on
        stdout and left out of the result, so the returned list may be
        shorter than ``text``.

        Args:
            text: The word to encode.

        Returns:
            A list of integer positions in ``char_list``, one per
            recognised character, in the original order.
        """
        digit_list = []
        for character in text:
            try:
                digit_list.append(char_list.index(character))
            except ValueError:
                # index() raises ValueError for characters outside the alphabet;
                # catching only that keeps real errors (and Ctrl-C) visible.
                print("Error in finding index for character ", character)
        return digit_list
    # Quick sanity check of the encoder on a sample word
    a = encode_to_labels('hola')
    print(a)

    # Preprocess the data: read the word images (IAM dataset, according to the
    # original comment) and build the training / validation sets.
    # Each file name is expected to look like "<prefix>_<text>.<ext>"; the part
    # after the first underscore is used as the ground-truth text.
    image_dir = '/home/yosra/Desktop/imagetest'
    image_files = os.listdir(image_dir)
    n_samples = len(image_files)

    # lists for the training dataset
    training_img = []
    training_txt = []
    train_input_length = []
    train_label_length = []
    orig_txt = []

    # lists for the validation dataset
    valid_img = []
    valid_txt = []
    valid_input_length = []
    valid_label_length = []
    valid_orig_txt = []

    max_label_len = 0

    # Counts accepted samples; every 10th accepted sample goes to validation
    k = 1

    for pic in image_files:
        # Read image as grayscale
        img = cv2.imread(os.path.join(image_dir, pic), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for unreadable or non-image files
            print("Could not read image ", pic)
            continue

        # img.shape is (rows, cols), i.e. (height, width)
        height, width = img.shape

        # Skip images that do not fit into the (32, 128) network input
        if width > 128 or height > 32:
            continue

        # Pad with white (255) at the bottom and on the right up to (32, 128)
        if height < 32:
            pad = np.ones((32 - height, width)) * 255
            img = np.concatenate((img, pad))
        if width < 128:
            pad = np.ones((32, 128 - width)) * 255
            img = np.concatenate((img, pad), axis=1)

        # Add the channel axis -> (32, 128, 1)
        img = np.expand_dims(img, axis=2)

        # Normalise the image
        img = img / 255.

        # Get the text for the image; splitext copes with any extension length
        pic_target = os.path.splitext(pic)[0]
        txt = pic_target.split('_')[1]
        encoded_txt = encode_to_labels(txt)

        # compute maximum length of the text
        if len(txt) > max_label_len:
            max_label_len = len(txt)

        # The label length must count the labels actually produced: the encoder
        # drops unknown characters, and counting them would make the CTC loss
        # read padding values as if they were real labels.
        if k % 10 == 0:
            valid_orig_txt.append(txt)
            valid_label_length.append(len(encoded_txt))
            valid_input_length.append(31)
            valid_img.append(img)
            valid_txt.append(encoded_txt)
        else:
            orig_txt.append(txt)
            train_label_length.append(len(encoded_txt))
            train_input_length.append(31)
            training_img.append(img)
            training_txt.append(encoded_txt)
        k += 1

    # pad each output label to maximum text length, using the index just past
    # the last real character as the filler value
    train_padded_txt = pad_sequences(training_txt, maxlen=max_label_len, padding='post', value=len(char_list))
    valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value=len(char_list))



    # Inference network: a stack of convolution / pooling layers, followed by
    # two bidirectional LSTM layers and a per-timestep softmax classifier.

    # input with shape of height=32 and width=128 (single channel)
    inputs = Input(shape=(32,128,1))

    # convolution layer with kernel size (3,3)
    conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
    # pooling layer with kernel size (2,2)
    pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)

    conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
    pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)

    conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)

    conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
    # pooling layer with kernel size (2,1): pools over the height axis only
    pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)

    conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
    # Batch normalization layer
    batch_norm_5 = BatchNormalization()(conv_5)

    conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
    batch_norm_6 = BatchNormalization()(conv_6)
    pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)

    # (2,2) convolution without padding='same'
    # NOTE(review): presumably this leaves a feature map of height 1 and width 31,
    # matching the input length of 31 used for CTC - confirm with act_model.summary()
    conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)

    # Drop axis 1 so the recurrent layers receive a sequence of feature vectors
    squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)

    # bidirectional LSTM layers with units=128
    blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
    blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)

    # One softmax output per character of char_list plus one extra class
    # (presumably the CTC blank - confirm against the Keras CTC documentation)
    outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

    # model to be used at test time
    act_model = Model(inputs, outputs)

    act_model.summary()


    # Training-time wiring: the CTC loss is computed inside the graph by a Lambda
    # layer, so the training model takes the labels and both length vectors as
    # additional inputs and outputs the loss value itself.
    labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')


    def ctc_lambda_func(args):
        """Return the batch CTC cost for ``[y_pred, labels, input_length, label_length]``."""
        predictions, true_labels, prediction_lengths, true_lengths = args
        # ctc_batch_cost expects the labels first and the predictions second
        return K.ctc_batch_cost(true_labels, predictions, prediction_lengths, true_lengths)


    loss_out = Lambda(
        ctc_lambda_func,
        output_shape=(1,),
        name='ctc',
    )([outputs, labels, input_length, label_length])

    # model to be used at training time
    model = Model(
        inputs=[inputs, labels, input_length, label_length],
        outputs=loss_out,
    )

    # The 'ctc' layer already outputs the loss, so the Keras loss just passes it through
    model.compile(
        loss={'ctc': lambda y_true, y_pred: y_pred},
        optimizer='adam',
    )

    # Train the model.

    # Convert the collected lists into numpy arrays before passing them to fit()
    training_img = np.array(training_img)
    train_input_length = np.array(train_input_length)
    train_label_length = np.array(train_label_length)

    valid_img = np.array(valid_img)
    valid_input_length = np.array(valid_input_length)
    valid_label_length = np.array(valid_label_length)

    # Save the training model to `filepath` whenever val_loss improves
    filepath = "/home/yosra/Downloads/best_model.hdf5"
    checkpoint = ModelCheckpoint(
        filepath=filepath,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='auto',
    )
    callbacks_list = [checkpoint]

    batch_size = 256
    epochs = 10

    # The model output is the loss itself, so the targets are dummy zeros
    model.fit(
        x=[training_img, train_padded_txt, train_input_length, train_label_length],
        y=np.zeros(len(training_img)),
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(
            [valid_img, valid_padded_txt, valid_input_length, valid_label_length],
            [np.zeros(len(valid_img))],
        ),
        verbose=1,
        callbacks=callbacks_list,
    )

    # NOTE(review): this writes to the same path as the checkpoint above, so the
    # file ends up holding the last-epoch inference model rather than the best
    # checkpoint - confirm this is intended.
    act_model.save(filepath)

    # Test the model.

    # Load the weights stored at `filepath` into the inference model
    act_model.load_weights(filepath)

    # Predict outputs on the first 10 validation images. valid_img must remain
    # the image array here: rebinding it to its .shape would hand predict() a
    # tuple of ints instead of images.
    prediction = act_model.predict(valid_img[:10])

    # Greedy CTC decoding; every sample uses all timesteps of the prediction
    out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1],
                             greedy=True)[0][0])

    # See the results. Entries equal to -1 are skipped, so an empty predicted
    # text means the decoder produced no character at all for that sample.
    for original, decoded in zip(valid_orig_txt, out):
        print("original_text =  ", original)
        predicted = ''.join(char_list[int(p)] for p in decoded if int(p) != -1)
        print("predicted text = " + predicted, end='\n\n')

Это мой результат: original_text = ... predicted text = (пусто)

0 ответов

Другие вопросы по тегам