Paddle: with the MNIST model, if the optimizer is fluid.optimizer.DecayedAdagradOptimizer and memory optimization is enabled, the training loss values become abnormal

5kgi1eie · posted on 2021-11-30 in Java

Without memory optimization, the loss values of the first 6 batches are:
[2.3050585, 4.64196, 2.0815804, 2.0276387, 2.0014627, 1.6691642]
After adding fluid.memory_optimize(fluid.default_main_program()), they become:
[2.3050585, 2.3096275, 5.387212, 35.37569, 33.65272, 29.050358]
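
For context, the pass is enabled with a single extra call in the training script; a minimal sketch (the exact placement is an assumption, but the pass is typically invoked after minimize() so it also sees the backward and optimizer ops, and before the program is executed):

# ... build the network, cost, and optimizer as in the script below ...
opt.minimize(avg_cost)
# Enable the in-place variable memory reuse pass on the main program.
fluid.memory_optimize(fluid.default_main_program())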


hs1rzwqc1#

The problem could not be reproduced with Paddle release/0.15.0 and the code below; please @kolinwei provide a minimal reproduction.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.fluid as fluid
from functools import reduce  # reduce is a builtin only on Python 2; needed on Python 3

def cnn_model(data, is_simple=False):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=data if is_simple else conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale, seed=1)))
    return predict

def run_benchmark():
    # Input data
    fluid.default_startup_program().random_seed = 2
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = cnn_model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Optimization
    opt = fluid.optimizer.DecayedAdagradOptimizer(
        learning_rate=0.001, decay=0.95, epsilon=1.0e-6)
    opt.minimize(avg_cost)

    # fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=8192)
    pe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=avg_cost.name,  # needed so per-device gradients are averaged correctly
        main_program=fluid.default_main_program())

    for batch_id, data in enumerate(train_reader()):
        img_data = np.array(
                [x[0].reshape([1, 28, 28]) for x in data]).astype('float32')
        y_data = np.array([x[1] for x in data]).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        loss = pe.run(
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost.name]
            )  # returns one numpy array per variable in fetch_list
        print(loss)

if __name__ == '__main__':
    run_benchmark()
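
One note on reading the printed values: pe.run returns one numpy array per entry in fetch_list, with the results from all devices merged along the first dimension, so on a multi-GPU machine each printed loss holds one value per device. A minimal reduction to a single scalar (assuming the loop above):

loss_v = np.mean(np.array(loss[0]))  # average avg_cost across devices
print(batch_id, loss_v)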

txu3uszq2#

I tried it: the problem cannot be reproduced on a single machine, but it does show up in multi-machine (distributed) training.
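
For reference, a hedged sketch of the multi-machine setup using the old transpiler API (the endpoints, role, and trainer count are placeholder assumptions; real jobs usually take them from environment variables):

role = "TRAINER"                      # or "PSERVER"
pserver_endpoints = "127.0.0.1:6174"  # comma-separated list for several pservers
current_endpoint = "127.0.0.1:6174"   # this node's own endpoint (pserver role)
trainer_id = 0
trainers = 2

t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if role == "PSERVER":
    pserver_prog = t.get_pserver_program(current_endpoint)
    startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    exe.run(pserver_prog)
else:
    trainer_prog = t.get_trainer_program()
    fluid.memory_optimize(trainer_prog)  # the pass under suspicion
    # ... run trainer_prog with the same feed/fetch loop as above ...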
