Paddle 【1.5】save_inference_model 接口,在feeded_var_names时候传入被裁剪的var,会报错

kxeu7u2r  于 2021-11-30  发布在  Java
关注(0)|答案(1)|浏览(376)

paddle version:1.5
train.py


# !/usr/bin/python

# -*- coding: UTF-8 -*-

# Configure the neural network.

import paddle
import paddle.fluid as fluid
import numpy
import os
from paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler import fleet
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig

def net(x, y):
    fc_0 = fluid.layers.fc(input=x, size=1, act='tanh')
    fc_1 = fluid.layers.fc(input=x, size=1, act='tanh')
    fc_2 = fluid.layers.fc(input=[fc_0, fc_1], size=1, act='tanh')
    y_predict = fluid.layers.fc(input=fc_0, size=2, act="softmax")
    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    auc_var, auc_batch_var, auc_states = fluid.layers.auc(input=y_predict, label=y, slide_steps=20)
    return y_predict, avg_cost, auc_var, auc_batch_var

def fake_reader():
    def reader():
        for i in range(1000):
            x = numpy.random.random((1, 13)).astype('float32')
            y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
            yield x,y
    return reader

def train():
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    y_predict, avg_cost, auc, auc_batch = net(x, y)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
    trainers = int(os.environ["PADDLE_TRAINERS"])
    training_role = os.environ["PADDLE_TRAINING_ROLE"]
    training_role = role_maker.Role.WORKER if training_role == "TRAINER" else role_maker.Role.SERVER
    print(type(training_role)) 
    ports = os.getenv("PADDLE_PSERVER_PORTS")
    pserver_ip = os.getenv("PADDLE_PSERVER_IP", "")
    pserver_endpoints = []
    for port in ports.split(","):
        pserver_endpoints.append(':'.join([pserver_ip, port]))

    role = role_maker.UserDefinedRoleMaker(current_id=trainer_id, role=training_role, worker_num=trainers, server_endpoints=pserver_endpoints)
    config = DistributeTranspilerConfig()
    config.sync_mode = True

    # 加入 fleet init 初始化环境
    fleet.init(role)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    # 加入 fleet distributed_optimizer 加入分布式策略配置及多机优化
    optimizer = fleet.distributed_optimizer(optimizer, config)
    optimizer.minimize(avg_cost)

    # 启动server
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    # 启动worker
    if fleet.is_worker():
        # 初始化worker配置
        fleet.init_worker()

        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
        train_reader = paddle.batch(fake_reader(), batch_size=24)

        exe.run(fleet.startup_program)

        print fleet.main_program
        PASS_NUM = 10
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                avg_loss_value, auc_value, auc_batch_value = exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[avg_cost, auc, auc_batch])
                print("Pass %d, total avg cost = %f, auc = %f, batch_auc = %f" % (pass_id, avg_loss_value, auc_value, auc_batch_value))

        #fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name], target_vars=[y], main_program=fleet.main_program, export_for_deployment=True)
        fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name, y.name], target_vars=[y_predict], export_for_deployment=True)
        fleet.save_persistables(exe, "./persistables", main_program=None)
        # 通知server,当前节点训练结束
        fleet.stop_worker()

if __name__ == "__main__":
    train()

train.sh


# !/bin/bash

export PADDLE_TRAINERS=1
export PADDLE_TRAINER_ID=0
export PADDLE_PSERVER_PORTS=36001
export PADDLE_PSERVER_IP=127.0.0.1

if [ "$1" = "ps" ]
then
    export PADDLE_TRAINING_ROLE=PSERVER

    export GLOG_v=0
    export GLOG_logtostderr=1

    echo "PADDLE WILL START PSERVER ..."
    stdbuf -oL python train.py &> pserver.0.log &
fi

if [ "$1" = "tr" ]
then
    export PADDLE_TRAINING_ROLE=TRAINER

    export GLOG_v=0
    export GLOG_logtostderr=1

    echo "PADDLE WILL START TRAINER ..."
    stdbuf -oL python train.py &> trainer.0.log &
fi

执行命令:

bash train.sh ps
bash train.sh tr

报错如下:

Pass 9, total avg cost = 0.667034, auc = 0.494549, batch_auc = 0.485349
Pass 9, total avg cost = 0.723714, auc = 0.494146, batch_auc = 0.471183
Pass 9, total avg cost = 0.715571, auc = 0.493919, batch_auc = 0.464442
Pass 9, total avg cost = 0.697153, auc = 0.493904, batch_auc = 0.460331
Pass 9, total avg cost = 0.675562, auc = 0.494208, batch_auc = 0.462572
Pass 9, total avg cost = 0.703915, auc = 0.494083, batch_auc = 0.459323
Pass 9, total avg cost = 0.701939, auc = 0.494001, batch_auc = 0.459589
Pass 9, total avg cost = 0.683494, auc = 0.494195, batch_auc = 0.476225
Pass 9, total avg cost = 0.689812, auc = 0.494278, batch_auc = 0.479758
Pass 9, total avg cost = 0.698757, auc = 0.494270, batch_auc = 0.483297
Pass 9, total avg cost = 0.699067, auc = 0.494255, batch_auc = 0.485961
Pass 9, total avg cost = 0.689187, auc = 0.494375, batch_auc = 0.484674
Pass 9, total avg cost = 0.687636, auc = 0.494524, batch_auc = 0.491892
Pass 9, total avg cost = 0.700927, auc = 0.494461, batch_auc = 0.490608
Pass 9, total avg cost = 0.681764, auc = 0.494693, batch_auc = 0.496995
Pass 9, total avg cost = 0.695704, auc = 0.494697, batch_auc = 0.503726
Pass 9, total avg cost = 0.692039, auc = 0.494755, batch_auc = 0.508039
Pass 9, total avg cost = 0.683566, auc = 0.494957, batch_auc = 0.513368
Pass 9, total avg cost = 0.706128, auc = 0.494837, batch_auc = 0.509732
Pass 9, total avg cost = 0.697720, auc = 0.494861, batch_auc = 0.513518
Pass 9, total avg cost = 0.685556, auc = 0.495013, batch_auc = 0.505540
Pass 9, total avg cost = 0.690064, auc = 0.495078, batch_auc = 0.517566
Traceback (most recent call last):
  File "train.py", line 91, in <module>
    train()
  File "train.py", line 85, in train
    fleet.save_inference_model(exe, "./inference_model", feeded_var_names=[x.name, y.name], target_vars=[y_predict], export_for_deployment=True)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/incubate/fleet/parameter_server/distributed_transpiler/__init__.py", line 157, in save_inference_model
    model_only=True)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/io.py", line 1064, in save_inference_model
    prepend_feed_ops(main_program, feeded_var_names)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/io.py", line 878, in prepend_feed_ops
    out = global_block.var(name)
  File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/framework.py", line 1497, in var
    raise ValueError("var %s not in this block" % name)
ValueError: var y not in this block
tnkciper

tnkciper1#

看起来在save的时候,y被裁剪掉了,是否给用户个提示,而不是直接报错,这样更好点

相关问题