paddle version:1.5
train.py
# !/usr/bin/python
# -*- coding: UTF-8 -*-
# Configure the neural network.
import paddle
import paddle.fluid as fluid
import numpy
import os
from paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler import fleet
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
def net(x, y):
fc_0 = fluid.layers.fc(input=x, size=1, act='tanh')
fc_1 = fluid.layers.fc(input=x, size=1, act='tanh')
fc_2 = fluid.layers.fc(input=[fc_0, fc_1], size=1, act='tanh')
y_predict = fluid.layers.fc(input=fc_0, size=2, act="softmax")
cost = fluid.layers.cross_entropy(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
auc_var, auc_batch_var, auc_states = fluid.layers.auc(input=y_predict, label=y, slide_steps=20)
return y_predict, avg_cost, auc_var, auc_batch_var
def fake_reader():
def reader():
for i in range(1000):
x = numpy.random.random((1, 13)).astype('float32')
y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
yield x,y
return reader
def train():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
y_predict, avg_cost, auc, auc_batch = net(x, y)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
training_role = os.environ["PADDLE_TRAINING_ROLE"]
training_role = role_maker.Role.WORKER if training_role == "TRAINER" else role_maker.Role.SERVER
print(type(training_role))
ports = os.getenv("PADDLE_PSERVER_PORTS")
pserver_ip = os.getenv("PADDLE_PSERVER_IP", "")
pserver_endpoints = []
for port in ports.split(","):
pserver_endpoints.append(':'.join([pserver_ip, port]))
role = role_maker.UserDefinedRoleMaker(current_id=trainer_id, role=training_role, worker_num=trainers, server_endpoints=pserver_endpoints)
config = DistributeTranspilerConfig()
config.sync_mode = True
# 加入 fleet init 初始化环境
fleet.init(role)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
# 加入 fleet distributed_optimizer 加入分布式策略配置及多机优化
optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)
# 启动server
if fleet.is_server():
fleet.init_server()
fleet.run_server()
# 启动worker
if fleet.is_worker():
# 初始化worker配置
fleet.init_worker()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
train_reader = paddle.batch(fake_reader(), batch_size=24)
exe.run(fleet.startup_program)
print fleet.main_program
PASS_NUM = 10
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
avg_loss_value, auc_value, auc_batch_value = exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[avg_cost, auc, auc_batch])
print("Pass %d, total avg cost = %f, auc = %f, batch_auc = %f" % (pass_id, avg_loss_value, auc_value, auc_batch_value))
#fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name], target_vars=[y], main_program=fleet.main_program, export_for_deployment=True)
fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name, y.name], target_vars=[y_predict], export_for_deployment=True)
fleet.save_persistables(exe, "./persistables", main_program=None)
# 通知server,当前节点训练结束
fleet.stop_worker()
if __name__ == "__main__":
train()
train.sh
# !/bin/bash
export PADDLE_TRAINERS=1
export PADDLE_TRAINER_ID=0
export PADDLE_PSERVER_PORTS=36001
export PADDLE_PSERVER_IP=127.0.0.1
if [ "$1" = "ps" ]
then
export PADDLE_TRAINING_ROLE=PSERVER
export GLOG_v=0
export GLOG_logtostderr=1
echo "PADDLE WILL START PSERVER ..."
stdbuf -oL python train.py &> pserver.0.log &
fi
if [ "$1" = "tr" ]
then
export PADDLE_TRAINING_ROLE=TRAINER
export GLOG_v=0
export GLOG_logtostderr=1
echo "PADDLE WILL START TRAINER ..."
stdbuf -oL python train.py &> trainer.0.log &
fi
执行命令:
bash train.sh ps
bash train.sh tr
报错如下:
Pass 9, total avg cost = 0.667034, auc = 0.494549, batch_auc = 0.485349
Pass 9, total avg cost = 0.723714, auc = 0.494146, batch_auc = 0.471183
Pass 9, total avg cost = 0.715571, auc = 0.493919, batch_auc = 0.464442
Pass 9, total avg cost = 0.697153, auc = 0.493904, batch_auc = 0.460331
Pass 9, total avg cost = 0.675562, auc = 0.494208, batch_auc = 0.462572
Pass 9, total avg cost = 0.703915, auc = 0.494083, batch_auc = 0.459323
Pass 9, total avg cost = 0.701939, auc = 0.494001, batch_auc = 0.459589
Pass 9, total avg cost = 0.683494, auc = 0.494195, batch_auc = 0.476225
Pass 9, total avg cost = 0.689812, auc = 0.494278, batch_auc = 0.479758
Pass 9, total avg cost = 0.698757, auc = 0.494270, batch_auc = 0.483297
Pass 9, total avg cost = 0.699067, auc = 0.494255, batch_auc = 0.485961
Pass 9, total avg cost = 0.689187, auc = 0.494375, batch_auc = 0.484674
Pass 9, total avg cost = 0.687636, auc = 0.494524, batch_auc = 0.491892
Pass 9, total avg cost = 0.700927, auc = 0.494461, batch_auc = 0.490608
Pass 9, total avg cost = 0.681764, auc = 0.494693, batch_auc = 0.496995
Pass 9, total avg cost = 0.695704, auc = 0.494697, batch_auc = 0.503726
Pass 9, total avg cost = 0.692039, auc = 0.494755, batch_auc = 0.508039
Pass 9, total avg cost = 0.683566, auc = 0.494957, batch_auc = 0.513368
Pass 9, total avg cost = 0.706128, auc = 0.494837, batch_auc = 0.509732
Pass 9, total avg cost = 0.697720, auc = 0.494861, batch_auc = 0.513518
Pass 9, total avg cost = 0.685556, auc = 0.495013, batch_auc = 0.505540
Pass 9, total avg cost = 0.690064, auc = 0.495078, batch_auc = 0.517566
Traceback (most recent call last):
File "train.py", line 91, in <module>
train()
File "train.py", line 85, in train
fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name, y.name], target_vars=[y_predict], export_for_deployment=True)
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py", line 157, in save_inference_model
model_only=True)
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/io.py", line 1064, in save_inference_model
prepend_feed_ops(main_program, feeded_var_names)
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/io.py", line 878, in prepend_feed_ops
out = global_block.var(name)
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/framework.py", line 1497, in var
raise ValueError("var %s not in this block" % name)
ValueError: var y not in this block
1条答案
按热度按时间tnkciper1#
看起来在save的时候,y被裁剪掉了,是否给用户个提示,而不是直接报错,这样更好点