Paddle reshape op在反向过程中报错

xpcnnkqh  于 2021-11-30  发布在  Java
关注(0)|答案(1)|浏览(478)
  • 标题:reshape op 反向过程中报错: The Tensor in the reshape2_grad Op's Input Variable Out@GRAD(slice_34.tmp_0@GRAD) is not initialized
  • 版本、环境信息:

   1)cpu版本 1.7.1,Mac OS 10.14,Python 3.7.0
   2)gpu版本 1.6.2.post107,Python 2.7.13

  • 训练信息

   1)gpu 单机单卡、单机多卡都报错
   2)cpu单线程、多线程都报错
   3)Operator信息
Traceback (most recent call last):
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/tasks/bml_renshou/run_with_json.py", line 114, in <module>
run_trainer(_params)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/tasks/bml_renshou/run_with_json.py", line 97, in run_trainer
trainer.train_and_eval()
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/training/custom_trainer.py", line 105, in train_and_eval
raise e
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/training/custom_trainer.py", line 53, in train_and_eval
self.run(InstanceName.TRAINING, need_fetch=False)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/common/controler.py", line 434, in run
self.train_exe.run(fetch_list=[])
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/parallel_executor.py", line 311, in run
return_numpy=return_numpy)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/executor.py", line 783, in run
six.reraise(*sys.exc_info())
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/six.py", line 693, in reraise
raise value
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/executor.py", line 778, in run
use_program_cache=use_program_cache)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/executor.py", line 843, in _run_impl
return_numpy=return_numpy)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/executor.py", line 677, in _run_parallel
tensors = exe.run(fetch_var_names)._move_to_list()
paddle.fluid.core_avx.EnforceNotMet:

C++ Call Stacks (More useful to developers):

0 std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator > paddle::platform::GetTraceBackString<std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator > const&>(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator > const&&&, char const*, int)
1 paddle::framework::OperatorWithKernel::ParseInputDataType(paddle::framework::ExecutionContext const&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator > const&, paddle::framework::proto::VarType_Type*) const
2 paddle::framework::OperatorWithKernel::IndicateVarDataType(paddle::framework::ExecutionContext const&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator > const&) const
3 paddle::operators::Reshape2GradOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const
4 paddle::framework::OperatorWithKernel::ChooseKernel(paddle::framework::RuntimeContext const&, paddle::framework::Scope const&, paddle::platform::Place const&) const
5 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
6 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
7 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
8 paddle::framework::details::ComputationOpHandle::RunImpl()
9 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
10 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::__1::shared_ptr<paddle::framework::BlockingQueue > const&, unsigned long*)
11 std::__1::__packaged_task_func<std::__1::__bind<paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpAsync(std::__1::unordered_map<paddle::framework::details::OpHandleBase*, std::__1::atomic, std::__1::hashpaddle::framework::details::OpHandleBase*, std::__1::equal_topaddle::framework::details::OpHandleBase*, std::__1::allocator<std::__1::pair<paddle::framework::details::OpHandleBase* const, std::__1::atomic > > >, paddle::framework::details::OpHandleBase, std::__1::shared_ptr<paddle::framework::BlockingQueue > const&)::$_0>, std::__1::allocator<std::__1::__bind<paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpAsync(std::__1::unordered_map<paddle::framework::details::OpHandleBase*, std::__1::atomic, std::__1::hashpaddle::framework::details::OpHandleBase*, std::__1::equal_topaddle::framework::details::OpHandleBase*, std::__1::allocator<std::__1::pair<paddle::framework::details::OpHandleBase* const, std::__1::atomic > > >, paddle::framework::details::OpHandleBase, std::__1::shared_ptr<paddle::framework::BlockingQueue > const&)::$_0> >, void ()>::operator()()
12 std::__1::packaged_task<void ()>::operator()()
13 ThreadPool::ThreadPool(unsigned long)::'lambda'()::operator()() const
14 std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_deletestd::__1::__thread_struct >, ThreadPool::ThreadPool(unsigned long)::'lambda'()> >(void*, void*)

Python Call Stacks (More useful to users):

File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/framework.py", line 2525, in append_op
attrs=kwargs.get("attrs", None))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/paddle/fluid/layers/nn.py", line 5676, in reshape
"XShape": x_shape})
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/models/bml_renshou_multi_label.py", line 88, in forward
real_label = fluid.layers.reshape(index_label, shape=[-1, 1], inplace=True)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/common/controler.py", line 355, in init_train_net
self.forward_train_output = self.model_class.forward(fields_dict, phase=InstanceName.TRAINING)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/common/controler.py", line 338, in init_net
self.init_train_net()
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/common/controler.py", line 127, in __init__
self.init_net()
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/textone/training/custom_trainer.py", line 30, in __init__
BaseTrainer.__init__(self, params, data_set_reader, model_class)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/tasks/bml_renshou/run_with_json.py", line 74, in build_trainer
trainer = trainer_class(params=params_dict, data_set_reader=dataset_reader, model_class=model)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/tasks/bml_renshou/run_with_json.py", line 95, in run_trainer
trainer = build_trainer(trainer_params_dict, dataset_reader, model, num_train_examples)
File "/Users/baidu/Desktop/icode_test/baidu/nlp/textone/tasks/bml_renshou/run_with_json.py", line 114, in <module>
run_trainer(_params)

Error Message Summary:

InvalidArgumentError: The Tensor in the reshape2_grad Op's Input Variable Out@GRAD(slice_34.tmp_0@GRAD) is not initialized.
[Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (/home/teamcity/work/ef54dc8a5b211854/paddle/fluid/framework/operator.cc:1264)
[operator < reshape2_grad > error]

Process finished with exit code 1

omhiaaxx

omhiaaxx1#

`class BmlRenShouMultiLabelClassification(Model):
"""BmlRenShouMultiLabelClassification
"""

def __init__(self, model_params):
    """Initialise the base ``Model`` and cache the label configuration.

    :param model_params: parameter mapping forwarded to ``Model.__init__``;
        its optional ``"label_info"`` entry is read afterwards through
        ``self.model_params``.
    """
    Model.__init__(self, model_params)
    # A missing "label_info" entry falls back to an empty list, which makes
    # label_nums zero.
    label_info = self.model_params.get("label_info", [])
    self.label_info = label_info
    self.label_nums = len(label_info)

def forward(self, fields_dict, phase):
    """Build the forward network, including the loss computation.

    The text ids are unpadded, embedded, passed through three
    ``sequence_conv_pool`` branches (window sizes 1, 2 and 3) whose outputs
    are concatenated, and then through a shared fc layer.  On top of that,
    one softmax fc head is created per entry of ``self.label_info``; each
    head is trained with cross entropy against the matching column of the
    label tensor.

    :param fields_dict: serialized ids; the "text_a" and "label" entries
        are read
    :param phase: current phase (e.g. training, inference-model saving);
        the returned dict differs for ``InstanceName.SAVE_INFERENCE``
    :return: for ``InstanceName.SAVE_INFERENCE`` a dict with
        ``TARGET_FEED_NAMES`` and ``TARGET_PREDICTS``; otherwise a dict with
        ``PREDICT_RESULT``, ``"predict_index"``, ``LABEL`` and ``LOSS``
    """
    fields_dict = self.fields_process(fields_dict, phase)
    instance_text_a = fields_dict["text_a"]
    record_id_text_a = instance_text_a[InstanceName.RECORD_ID]
    text_a = record_id_text_a[InstanceName.SRC_IDS]
    text_a_lens = record_id_text_a[InstanceName.SEQ_LENS]

    # NOTE(review): the label field is read and the loss ops are built in
    # every phase, including SAVE_INFERENCE -- confirm the reader provides
    # "label" in that phase.
    instance_label = fields_dict["label"]
    record_id_label = instance_label[InstanceName.RECORD_ID]
    label = record_id_label[InstanceName.SRC_IDS]

    dict_dim = self.model_params.get('vocab_size', 17964)
    emb_dim = self.model_params.get('emb_dim', 128)
    hid_dim = self.model_params.get('hid_dim', 128)
    hid_dim2 = self.model_params.get('hid_dim2', 96)
    win_sizes = [1, 2, 3]

    unpad_data = fluid.layers.sequence_unpad(text_a, length=text_a_lens)
    # embedding layer
    emb = fluid.layers.embedding(input=unpad_data, size=[dict_dim, emb_dim])

    # Dropout on the embedding is currently disabled:
    # emb = fluid.layers.dropout(
    #     x=emb,
    #     dropout_prob=0.1,
    #     dropout_implementation="upscale_in_train")

    # convolution layer: one conv + max-pool branch per window size
    convs = []
    for win_size in win_sizes:
        conv_h = fluid.nets.sequence_conv_pool(
            input=emb,
            num_filters=hid_dim,
            filter_size=win_size,
            act="tanh",
            pool_type="max")
        convs.append(conv_h)
    convs_out = fluid.layers.concat(input=convs, axis=1)

    fc_1 = fluid.layers.fc(input=[convs_out], size=hid_dim2, act="tanh")

    loss_array = fluid.layers.create_array(dtype='float64')
    predictions_array = fluid.layers.create_array(dtype='float64')

    # Write position inside the two tensor arrays; incremented once per head.
    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)

    for index in range(self.label_nums):
        predictions = fluid.layers.fc(input=[fc_1], size=self.label_info[index], act='softmax')

        axes = [1]
        starts = [index]
        ends = [1 + index]

        # Column `index` of the label tensor is the target of this head.
        index_label = fluid.layers.slice(label, axes=axes, starts=starts, ends=ends)
        index_label.stop_gradient = True

        real_label = fluid.layers.reshape(index_label, shape=[-1, 1], inplace=True)
        # stop_gradient must be enabled on the label: cross_entropy does not
        # compute a gradient for its label input, so otherwise the backward
        # pass still contains a reshape2_grad op whose Out@GRAD input is
        # never produced, and execution fails with
        # "Out@GRAD(...) is not initialized".
        real_label.stop_gradient = True

        cost = fluid.layers.cross_entropy(input=predictions, label=real_label)
        avg_cost = fluid.layers.mean(x=cost)

        idx = fluid.layers.cast(i, 'int64')
        idx.stop_gradient = True

        fluid.layers.array_write(avg_cost, idx, loss_array)
        fluid.layers.array_write(predictions, idx, predictions_array)

        i = fluid.layers.increment(x=i, value=1, in_place=True)

    if phase == InstanceName.SAVE_INFERENCE:
        # Argument needed when saving the model: names and order of the
        # variables that must be fed at prediction time.
        target_feed_name_list = [text_a.name, text_a_lens.name]
        # Argument needed when saving the model: the final outputs produced
        # at prediction time.
        probs, probs_index = fluid.layers.tensor_array_to_tensor(input=predictions_array)
        target_predict_list = [probs, probs_index]
        forward_return_dict = {
            InstanceName.TARGET_FEED_NAMES: target_feed_name_list,
            InstanceName.TARGET_PREDICTS: target_predict_list
        }
        return forward_return_dict

    # Only the concatenated losses are needed; the index output is unused.
    total_loss, _ = fluid.layers.tensor_array_to_tensor(input=loss_array, axis=-1)

    loss = fluid.layers.reduce_mean(total_loss)

    probs, probs_index = fluid.layers.tensor_array_to_tensor(input=predictions_array)

    # PREDICT_RESULT, LABEL and LOSS are required keys: they must be set
    # and returned.
    forward_return_dict = {
        InstanceName.PREDICT_RESULT: probs,
        "predict_index": probs_index,
        InstanceName.LABEL: label,
        InstanceName.LOSS: loss,
    }
    return forward_return_dict

def fields_process(self, fields_dict, phase):
    """Hook for optional post-processing of the serialized field ids.

    This implementation is a pass-through and does not use ``phase``.

    :param fields_dict: serialized ids
    :param phase: current phase
    :return: the very same ``fields_dict`` object that was passed in
    """
    processed_fields = fields_dict
    return processed_fields

def make_embedding(self, fields, phase):
    """Hook for building embeddings; call it only when needed.

    This implementation builds nothing and ignores both arguments.

    :param fields: field data
    :param phase: current phase
    :return: the embedding dict, which is always ``None`` here
    """
    embedding_dict = None
    return embedding_dict

def optimizer(self, loss, is_fleet=False):
    """Attach an Adam optimizer with L2 weight decay to ``loss``.

    The learning rate comes from the ``'learning_rate'`` entry of
    ``self.model_params['optimization']`` and falls back to 2e-5 when the
    section or the entry is missing.

    :param loss: loss variable handed to ``minimize``
    :param is_fleet: accepted but not used by this implementation
    :return: an empty ``collections.OrderedDict``
    """
    default_lr = 2e-5
    opt_cfg = self.model_params.get('optimization', None)
    learning_rate = opt_cfg.get('learning_rate', default_lr) if opt_cfg else default_lr

    weight_decay = fluid.regularizer.L2Decay(regularization_coeff=1e-3)
    adam = fluid.optimizer.Adam(learning_rate=learning_rate,
                                regularization=weight_decay)
    adam.minimize(loss)
    return collections.OrderedDict()

`

相关问题