The model code is as follows:
```python
#!/usr/bin/python
# coding=utf-8
import paddle
import paddle.fluid as fluid
from functools import reduce
from itertools import starmap
import numpy as np
import math


def DNN(args, feat_list):
    # ---- input layers ----
    dense_feature_dim = reduce(lambda x, y: x + y, [1 for _, feat in feat_list.items() if feat.value_type == 0])
    dense_feature = fluid.layers.data(name="dense_input", shape=[dense_feature_dim], dtype='float32')
    sparse_feature = [fluid.layers.data(name=feat.prefix, shape=[1], lod_level=1, dtype='int64')
                      for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_weight_feature = [fluid.layers.data(name=feat.prefix + '@index', shape=[1], lod_level=1, dtype='int64')
                             for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_value = [fluid.layers.data(name=feat.prefix + '@value', shape=[1], lod_level=1, dtype='float32')
                           for _, feat in feat_list.items() if feat.value_type == 2]
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    duration = fluid.layers.data(name='duration', shape=[1], dtype='float32')
    data_list = [dense_feature] + sparse_feature + sparse_weight_feature + sparse_weight_value + [label] + [duration]
    if args.is_infer:
        data_list = [dense_feature] + sparse_feature + sparse_weight_feature + sparse_weight_value

    sparse_feature_names = [feat.prefix for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_feature_size = [feat.dimension for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_feature_embsize = [feat.emb_size for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_weight_feature_names = [feat.prefix for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_feature_size = [feat.dimension for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_feature_embsize = [feat.emb_size for _, feat in feat_list.items() if feat.value_type == 2]

    def embedding_layer(input, attr_name, input_size, emb_size):
        """Embedding + max sequence pooling for a plain sparse feature."""
        param_name = attr_name + '_table'
        sparse_field_param_attr = fluid.param_attr.ParamAttr(
            name=param_name,
            initializer=fluid.initializer.Normal(scale=1 / math.sqrt(input_size)))
        emb = fluid.layers.embedding(input=input, dtype='float32', size=[input_size, emb_size],
                                     param_attr=sparse_field_param_attr, is_sparse=True)
        return fluid.layers.sequence_pool(input=emb, pool_type='Max')

    def mmoe_block(input, expert_num, task_num):
        """Two-layer shared experts plus one softmax gate per task."""
        input_dim = input.shape[1]
        experts_input_list = [input] * expert_num
        experts_input = fluid.layers.stack(experts_input_list, axis=1)
        temp_experts_input = fluid.layers.unsqueeze(input=experts_input, axes=[2])
        experts_input_matrix = fluid.layers.transpose(temp_experts_input, perm=[2, 1, 0, 3])  # [1, expert_num, batch_size, input_dim]
        w = fluid.layers.create_parameter(
            shape=[expert_num, input_dim, input_dim // 2], dtype='float32',
            attr=fluid.param_attr.ParamAttr(name='mmoe_first_layer',
                                            gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
            default_initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (input_dim ** 0.5)))
        w1 = fluid.layers.unsqueeze(input=w, axes=[0])
        # w1 = fluid.layers.stack([w] * batch_size, axis=0)
        hidden1 = fluid.layers.matmul(experts_input_matrix, w1)
        # hidden1_activation = fluid.layers.relu(hidden1)  # [1, expert_num, batch_size, input_dim//2]
        hidden1_flatten = fluid.layers.reshape(
            fluid.layers.transpose(fluid.layers.squeeze(hidden1, axes=[0]), perm=[1, 0, 2]),
            shape=[-1, expert_num * (input_dim // 2)])
        hidden1_batchnorm = fluid.layers.batch_norm(input=hidden1_flatten, act='relu', is_test=not args.is_train,
                                                    name='mmoe_hidden1_batchnorm')  # [batch_size, expert_num*input_dim//2]
        hidden1_output = fluid.layers.unsqueeze(
            fluid.layers.transpose(fluid.layers.reshape(hidden1_batchnorm, shape=[-1, expert_num, input_dim // 2]),
                                   perm=[1, 0, 2]),
            axes=[0])  # [1, expert_num, batch_size, input_dim//2]
        v = fluid.layers.create_parameter(
            shape=[expert_num, input_dim // 2, input_dim // 4], dtype='float32',
            attr=fluid.param_attr.ParamAttr(name='mmoe_second_layer',
                                            gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
            default_initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / ((input_dim // 2) ** 0.5)))
        v1 = fluid.layers.unsqueeze(input=v, axes=[0])
        # v1 = fluid.layers.stack([v] * batch_size, axis=0)
        hidden2 = fluid.layers.matmul(hidden1_output, v1)
        # hidden2_activation = fluid.layers.relu(hidden2)  # [1, expert_num, batch_size, output_dim]
        hidden2_flatten = fluid.layers.reshape(
            fluid.layers.transpose(fluid.layers.squeeze(hidden2, axes=[0]), perm=[1, 0, 2]),
            shape=[-1, expert_num * (input_dim // 4)])
        hidden2_batchnorm = fluid.layers.batch_norm(input=hidden2_flatten, act='relu', is_test=not args.is_train,
                                                    name='mmoe_hidden2_batchnorm')  # [batch_size, expert_num*input_dim//4]
        hidden2_output = fluid.layers.reshape(hidden2_batchnorm, shape=[-1, expert_num, input_dim // 4])
        # output = fluid.layers.squeeze(hidden2_transpose, axes=[2])  # [batch_size, expert_num, output_dim]
        # gated_kernels = []
        final_output = []
        for i in range(task_num):
            fc = fluid.layers.fc(name='gate_kernel_task_fc_{}'.format(i), input=input, size=expert_num)
            gated_softmax_output = fluid.layers.softmax(input=fc, name='gate_kernel_task_softmax_{}'.format(i), axis=1)  # [batch_size, expert_num]
            # fluid.layers.Print(gated_softmax_output)
            weighted_output = fluid.layers.elementwise_mul(hidden2_output, gated_softmax_output, axis=0)  # [batch_size, expert_num, output_dim]
            agregate_output = fluid.layers.reduce_sum(weighted_output, dim=1, name='gate_kernel_task_agregate_{}'.format(i))
            final_output.append(agregate_output)
        return final_output

    # ---- embeddings for weighted sparse features ----
    sparse_weight_embedding_list = []
    for raw_sparse_weight_feat, raw_sparse_weight_val, feature_name, feature_dimension, feature_embsize in zip(
            sparse_weight_feature, sparse_weight_value, sparse_weight_feature_names,
            sparse_weight_feature_size, sparse_weight_feature_embsize):
        param_name = feature_name + '_table'
        sparse_weight_param_attr = fluid.param_attr.ParamAttr(
            name=param_name,
            initializer=fluid.initializer.Normal(scale=1 / math.sqrt(feature_dimension)),
            gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
        # sparse_weight_val = fluid.layers.unsqueeze(input=raw_sparse_weight_val, axes=[2])
        emb = fluid.layers.embedding(input=raw_sparse_weight_feat, dtype='float32',
                                     size=[feature_dimension, feature_embsize],
                                     param_attr=sparse_weight_param_attr, is_sparse=True)
        weighted_sparse_weight_embedding = emb * raw_sparse_weight_val
        sparse_weight_embedding = fluid.layers.sequence_pool(input=weighted_sparse_weight_embedding, pool_type='Max')
        sparse_weight_embedding_list.append(sparse_weight_embedding)

    sparse_embedding_list = list(starmap(embedding_layer, zip(sparse_feature, sparse_feature_names,
                                                              sparse_feature_size, sparse_feature_embsize)))
    concated = fluid.layers.concat(sparse_embedding_list + sparse_weight_embedding_list + [dense_feature], axis=1)

    # ---- shared bottom FC, then MMoE, then the two task towers ----
    united_fc = fluid.layers.fc(
        input=concated, size=512, act=None,
        param_attr=fluid.ParamAttr(learning_rate=1.0,
                                   initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (512 ** 0.5)),
                                   gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                  initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (512 ** 0.5)),
                                  gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        name='united_fc')
    united_bn = fluid.layers.batch_norm(input=united_fc, act='relu', is_test=not args.is_train, name='united_bn')
    fluid.layers.Print(united_fc, summarize=512)
    fluid.layers.Print(united_bn, summarize=512)
    ctr_fc, duration_fc = mmoe_block(united_bn, args.expert_num, 2)
    fluid.layers.Print(ctr_fc, summarize=128)

    ctr_fc1 = fluid.layers.fc(
        input=ctr_fc, size=64, act=None,
        param_attr=fluid.ParamAttr(learning_rate=1.0,
                                   initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                   gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                  initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                  gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        name='ctr_fc1')
    ctr_bn1 = fluid.layers.batch_norm(input=ctr_fc1, act='relu', is_test=not args.is_train, name='ctr_bn1')
    duration_fc1 = fluid.layers.fc(
        input=duration_fc, size=64, act=None,
        param_attr=fluid.ParamAttr(learning_rate=1.0,
                                   initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                   gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                  initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                  gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        name='duration_fc1')
    duration_bn1 = fluid.layers.batch_norm(input=duration_fc1, act='relu', is_test=not args.is_train, name='duration_bn1')
    fluid.layers.Print(ctr_fc1, summarize=64)
    fluid.layers.Print(ctr_bn1, summarize=64)

    ctr_predict = fluid.layers.fc(
        input=ctr_bn1, size=2, act="softmax",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(ctr_bn1.shape[1])),
                                   gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        name='ctr_predict')
    fluid.layers.Print(ctr_predict)
    dura_predict = fluid.layers.fc(
        input=duration_bn1, size=1, act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(duration_bn1.shape[1])),
                                   gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
        name='dura_predict')
    # fluid.layers.Print(dura_predict)

    # ---- losses and metrics ----
    if args.is_train:
        ctr_cost = fluid.layers.cross_entropy(input=ctr_predict, label=data_list[-2])
        avg_ctr_cost = fluid.layers.reduce_sum(ctr_cost)
        dura_cost = fluid.layers.square_error_cost(input=dura_predict, label=data_list[-1])
        avg_dura_cost = fluid.layers.reduce_sum(dura_cost)
        total_cost = avg_ctr_cost + 0.05 * avg_dura_cost
        accuracy = fluid.layers.accuracy(input=ctr_predict, label=data_list[-2])
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=ctr_predict, label=data_list[-2],
                                                              num_thresholds=2 ** 12, slide_steps=20)
        return total_cost, avg_ctr_cost, avg_dura_cost, auc_var, batch_auc_var, accuracy, ctr_predict, dura_predict, data_list
    elif args.is_infer:
        return ctr_predict, dura_predict, data_list
    else:
        ctr_cost = fluid.layers.cross_entropy(input=ctr_predict, label=data_list[-2])
        avg_ctr_cost = fluid.layers.reduce_sum(ctr_cost)
        dura_cost = fluid.layers.square_error_cost(input=dura_predict, label=data_list[-1])
        avg_dura_cost = fluid.layers.reduce_sum(dura_cost)
        total_cost = avg_ctr_cost + 0.05 * avg_dura_cost
        accuracy = fluid.layers.accuracy(input=ctr_predict, label=data_list[-2])
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=ctr_predict, label=data_list[-2],
                                                              num_thresholds=2 ** 12, slide_steps=20)
        return total_cost, avg_ctr_cost, avg_dura_cost, auc_var, batch_auc_var, accuracy, ctr_predict, dura_predict, data_list
```
Training was tested with both the SGD and Adam optimizers, with the learning rate set to 1e-6. In this setup it keeps failing with:

```
PaddleCheckError: Operator clip_by_norm output Tensor clip_by_norm_63.tmp_0 contains NAN at [/paddle/paddle/fluid/framework/operator.cc:848]
[operator < clip_by_norm > error]
```
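For reference, a minimal sketch of what the optimizer setup described above might look like around `DNN()`; the `args.optimizer` flag and the surrounding program construction are assumptions, not part of the original post:

```python
# Sketch of the training-program setup (assumed, not from the original post):
# build the DNN graph defined above, then attach SGD or Adam with lr=1e-6.
import paddle.fluid as fluid

def build_train_program(args, feat_list):
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        (total_cost, avg_ctr_cost, avg_dura_cost, auc_var, batch_auc_var,
         accuracy, ctr_predict, dura_predict, data_list) = DNN(args, feat_list)
        if args.optimizer == 'sgd':  # hypothetical flag
            optimizer = fluid.optimizer.SGD(learning_rate=1e-6)
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=1e-6)
        optimizer.minimize(total_cost)
    return train_prog, startup_prog, total_cost, data_list
```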
8 answers

b1uwtaje 1#
Could you try removing the clip?
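A minimal sketch of what that suggestion amounts to, using the weighted-embedding branch from the model code above (the helper function name is illustrative, not from the original post):

```python
import math
import paddle.fluid as fluid

def unclipped_weighted_embedding(ids, feature_name, feature_dimension, feature_embsize):
    """Same embedding as in the model code, but without the per-parameter gradient_clip."""
    # gradient_clip is intentionally omitted here, so no clip_by_norm op is created
    # for this embedding table.
    param_attr = fluid.param_attr.ParamAttr(
        name=feature_name + '_table',
        initializer=fluid.initializer.Normal(scale=1 / math.sqrt(feature_dimension)))
    emb = fluid.layers.embedding(input=ids, dtype='float32',
                                 size=[feature_dimension, feature_embsize],
                                 param_attr=param_attr, is_sparse=True)
    return fluid.layers.sequence_pool(input=emb, pool_type='Max')
```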
jw5wzhpr 2#
The clip was added precisely because the embedding layer had already been producing INF values; those frequently turned the predictions into NaN and caused errors.
zkure5ic 3#
Does single-machine training also produce inf/nan?
gojuced7 4#
Single-machine training runs normally. With MPI, NaN occasionally appears when using only part of the data (about 30%), and always appears with the full data.
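One way to narrow this down (not from the original thread) is to fetch a few intermediate tensors every step and stop at the first non-finite value; `exe`, `train_prog`, `feeder`, and `data_iter` are assumed to exist in the caller's training loop:

```python
# Debugging sketch (assumed setup): check fetched tensors for NaN/Inf each step.
import numpy as np

def run_with_nan_check(exe, train_prog, feeder, data_iter, watch_vars):
    for step, batch in enumerate(data_iter()):
        fetched = exe.run(program=train_prog,
                          feed=feeder.feed(batch),
                          fetch_list=watch_vars)
        for var, value in zip(watch_vars, fetched):
            arr = np.array(value)
            if not np.isfinite(arr).all():
                name = var if isinstance(var, str) else var.name
                raise RuntimeError('non-finite value in {} at step {}'.format(name, step))
```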
hrysbysz 5#
Does single-machine training still produce NaN after the clip is added?
vof42yt1 6#
Single-machine training did not produce NaN before the clip was added, and it is still normal with the clip, though that may simply be because the data volume is small.
eanckbw9 7#
Additional details:
- MPI training without gradient_clip on the embedding layer fails with: `Operator adam output Tensor match&seccate#bayes_ctr_table_moment2_0 contains Inf at [/paddle/paddle/fluid/framework/operator.cc:846]`
- With gradient_clip added, it fails with: `Operator clip_by_norm output Tensor clip_by_norm_52.tmp_0 contains NAN at [/paddle/paddle/fluid/framework/operator.cc:848]`
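Since the Inf shows up in an Adam moment tensor of an embedding table, one possible check (an assumption, not something done in the original thread) is to read that tensor from the scope between steps and see when it blows up; the variable name below is copied from the error message, everything else is hypothetical, and in a parameter-server setup this tensor may only exist on the server side:

```python
# Debugging sketch (assumed): inspect a parameter / optimizer-state tensor by name.
import numpy as np
import paddle.fluid as fluid

def check_scope_var(var_name):
    var = fluid.global_scope().find_var(var_name)
    if var is None:
        return  # tensor not present on this node (e.g. it lives on the pserver)
    arr = np.array(var.get_tensor())
    if not np.isfinite(arr).all():
        print('non-finite values found in {}'.format(var_name))

# e.g. called after each exe.run(...):
# check_scope_var('match&seccate#bayes_ctr_table_moment2_0')
```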
r55awzrz 8#
One more note: with MPI, single-node training is normal, while multi-node training always reproduces the errors above.