paddle 1.5.0, cuda 8.0, cudnn v7, k40 单卡训练
报错:Cannot find fetch variable in scope, fetch_var_name is tmp_12 at xxx fetch_op.cc:37
部分代码:
def net_config(image, label, model, args):
model_list = [m for m in dir(models) if "__" not in m]
assert args.model in model_list, "{} is not lists: {}".format(args.model, model_list)
model_name = args.model
if "Ft_Net" in model_name:
x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc, x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc, x3_g_max_fc, x4_g_max_fc, x4_p_max_fc = model.net(input=image)
cost_1, pred_1 = calc_loss(x3_g_pool_fc, label)
avg_cost_1 = fluid.layers.mean(x=cost_1)
。。。。。。
total_cost = (cost_1 + cost_2 + cost_3 + cost_4 + cost_5 + cost_6 + cost_7 + cost_8 + cost_9) / 9.0
acc_1 = fluid.layers.accuracy(input=pred_1, label=label, k=1)
acc_2 = fluid.layers.accuracy(input=pred_2, label=label, k=1)
acc_3 = fluid.layers.accuracy(input=pred_3, label=label, k=1)
acc_4 = fluid.layers.accuracy(input=pred_4, label=label, k=1)
acc_5 = fluid.layers.accuracy(input=pred_5, label=label, k=1)
acc_6 = fluid.layers.accuracy(input=pred_6, label=label, k=1)
acc_7 = fluid.layers.accuracy(input=pred_7, label=label, k=1)
acc_8 = fluid.layers.accuracy(input=pred_8, label=label, k=1)
acc_9 = fluid.layers.accuracy(input=pred_9, label=label, k=1)
return total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9
def train(args):
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
num_instances = args.num_instances
startup_prog = fluid.Program()
train_prog = fluid.Program()
train_py_reader, total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr = \
build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)
train_fetch_vars = [total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr]
train_fetch_list = []
for var in train_fetch_vars:
var.persistable=True
train_fetch_list.append(var.name)
if with_memory_optimization:
fluid.memory_optimize(train_prog)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
。。。。。。
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = args.with_inplace
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_num
if num_trainers > 1 and args.use_gpu:
dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
exec_strategy.num_threads = 1
train_exe = fluid.ParallelExecutor(
main_program=train_prog,
use_cuda=bool(args.use_gpu),
loss_name=total_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
for pass_id in range(args.num_epochs):
train_py_reader.start()
train_info = [[], [], [], [], [], [], [], [], [], []]
train_time = []
batch_id = 0
try:
while True:
t1 = time.time()
total_loss, tmp_acc_1, tmp_acc_2, tmp_acc_3, tmp_acc_4, tmp_acc_5, tmp_acc_6, tmp_acc_7, tmp_acc_8, tmp_acc_9, lr = exe.run(
fetch_list=train_fetch_list)
4条答案
按热度 | 按时间

iugsix8n1 · 1#
看起来是部分fetch的值不在网络中,可以检查一下total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr这些项
jtoj6r0c2#
我的代码基本仿照PaddleCv/ImageClassification中的写法
def calc_loss(logit, label, class_dim=751, use_label_smoothing=True, epsilon=0.1):
    """Softmax cross-entropy loss with optional label smoothing.

    Args:
        logit: pre-softmax class scores — presumably shape (batch, class_dim);
            confirm against the model outputs at the caller.
        label: int64 ground-truth class ids, shape (batch, 1).
        class_dim: number of classes for the one-hot expansion.
        use_label_smoothing: when True, smooth the one-hot target by `epsilon`.
        epsilon: label-smoothing strength.

    Returns:
        (avg_loss, softmax_out): scalar mean loss and the per-sample class
        probabilities used as `pred` by callers (`cost, pred = calc_loss(...)`).
    """
    softmax_out = fluid.layers.softmax(logit)
    if use_label_smoothing:
        label_one_hot = fluid.layers.one_hot(input=label, depth=class_dim)
        smooth_label = fluid.layers.label_smooth(
            label=label_one_hot, epsilon=epsilon, dtype="float32")
        # With soft_label=True the target must have the same rank as the
        # softmax output — hence the one-hot/smoothed tensor, not raw ids.
        loss = fluid.layers.cross_entropy(
            input=softmax_out, label=smooth_label, soft_label=True)
    else:
        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
    # Fix: the pasted snippet computed `loss` but never returned it, while
    # every call site unpacks two values. Reduce to a scalar mean (as the
    # thread's own answer suggests) so per-branch losses can be summed.
    avg_loss = fluid.layers.reduce_mean(loss)
    return avg_loss, softmax_out
会报如下错误
Traceback (most recent call last):
File "train.py", line 366, in
main()
File "train.py", line 362, in main
train(args)
File "train.py", line 227, in train
build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)
File "train.py", line 199, in build_program
optimizer.minimize(total_cost)
File "</home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/decorator.pyc:decorator-gen-20>", line 2, in minimize
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__
    return wrapped_func(*args, **kwargs)
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/dygraph/base.py", line 87, in __impl__
    return func(*args, **kwargs)
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 594, in minimize
no_grad_set=no_grad_set)
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 493, in backward
no_grad_set, callbacks)
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 578, in append_backward
File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 392, in append_backward_vars
op_desc.infer_shape(block.desc)
paddle.fluid.core_avx.EnforceNotMet: Enforce failed. Expected dy_dims.size() == rank, but received dy_dims.size():1 != rank:2.
Input(Y@Grad) and Input(X) should have the same rank. at [/home/vis/wangjian33/code/Paddle/paddle/fluid/operators/cross_entropy_op.cc:96]
如果在返回loss之前先做reduce_mean:
loss = fluid.layers.reduce_mean(loss)
则上述错误能避免,但是会报Cannot find fetch variable in scope错误
zd287kbt3#
看起来是shape没对上,用一下label=smooth_label.reshape(label.shape) 看看
dly7yett4#
之前的shape问题解决了,是计算总loss的时候分支loss的变量名用错了。但是Cannot find fetch variable in scope的问题依然存在,而且应该就是loss处的问题,fetch_list中只写loss,也还是有错。
现在关键代码如下:
def net_config(image, label, model, args):
model_list = [m for m in dir(models) if "__" not in m]
assert args.model in model_list, "{} is not lists: {}".format(args.model, model_list)
model_name = args.model
def build_program(is_train, main_prog, startup_prog, args):
    """Construct the Ft_Net training program inside `main_prog`.

    Args:
        is_train: build the optimizer/backward pass when True.
        main_prog: fluid.Program receiving the network ops.
        startup_prog: fluid.Program receiving parameter initializers.
        args: parsed CLI arguments (model name, shapes, lr schedule, ...).

    Returns:
        For is_train=True:
        (py_reader, total_cost, acc_1..acc_9, global_lr) — same flat
        12-tuple the original returned.

    Raises:
        ValueError: if `args.model` is not an Ft_Net variant.
    """
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    # NOTE(review): restored `__dict__` — the issue paste lost the double
    # underscores to markdown formatting (`models.dict` is not valid here).
    model = models.__dict__[model_name](
        layers=args.layers,
        class_num=args.class_dim,
        num_bottleneck=args.num_features,
        is_train=True)
    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1] + image_shape, [-1, 1]],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, label = fluid.layers.read_file(py_reader)
            if "Ft_Net" in model_name:
                print('This is Ft_Net')
                # (total_cost, acc_1, ..., acc_9)
                fetch_vars = net_config(image, label, model, args)
                total_cost = fetch_vars[0]
            else:
                # The original only printed 'model error!' and then crashed
                # later on an undefined `total_cost`; fail loudly instead.
                print('model error!')
                raise ValueError(
                    "unsupported model '{}': expected an Ft_Net variant".format(
                        model_name))
            # Mark every fetched variable persistable so graph-optimization
            # passes (memory_optimize / enable_inplace) cannot prune them —
            # the cause of "Cannot find fetch variable in scope" at run time.
            for var in fetch_vars:
                var.persistable = True
            if is_train:
                params = {}
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"] = {}
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate
                optimizer = optimizer_setting(params)
                optimizer.minimize(total_cost)
                global_lr = optimizer._global_learning_rate()
                # The learning rate is fetched every step too; without this
                # it is also eligible for pruning by the optimization passes.
                global_lr.persistable = True
    if is_train:
        return (py_reader,) + tuple(fetch_vars) + (global_lr,)
def train(args):
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
num_instances = args.num_instances