Paddle 【飞桨论文复现赛-第六期】自定义外部算子动态图预测正常,导出静态图后推理异常

lg40wkob  于 2022-04-21  发布在  Java

bug描述 Describe the Bug


环境:aistudio gpu 至尊 paddlepaddle==2.2.2







其他补充信息 Additional Supplementary Information


环境:aistudio gpu 至尊 paddlepaddle==2.2.2







外部算子 C++ 代码,psamask.cc 代码如下:

# include "paddle/extension.h"

# include <vector>

// Fails via PD_CHECK with a descriptive message unless tensor `x` is placed on the CPU.
// The left operand of the comparison was missing; `x.place()` is restored here.
# define CHECK_INPUT(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")

// Fallback min(a,b), defined only when no `min` macro already exists.
// Function-like macro: the selected argument is evaluated twice, so do not
// pass expressions with side effects.
# ifndef min

# define min(a,b) (((a) < (b)) ? (a) : (b))

# endif

// Fallback max(a,b), defined only when no `max` macro already exists.
// Same double-evaluation caveat as min(a,b).
# ifndef max

# define max(a,b) (((a) > (b)) ? (a) : (b))

# endif

/// CPU forward pass of the PSA mask operator in "collect" mode.
///
/// Zero-fills the whole output, then for every feature position (h, w) copies
/// each mask entry whose target position lies inside the feature map.
///
/// @param x_data   input, indexed as [n][hidx * mask_W_ + widx][h][w]
/// @param out_data output, indexed as [n][target_h * feature_W_ + target_w][h][w],
///                 where target = (hidx + h - half_mask_H_, widx + w - half_mask_W_)
/// @param num_     batch size
/// @param feature_H_,feature_W_     feature-map height / width
/// @param mask_H_,mask_W_           mask height / width
/// @param half_mask_H_,half_mask_W_ offset of the mask centre
template <typename data_t>
void psamask_collect_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  // 64-bit sizes: the output has num_ * (H*W)^2 elements, which overflows int quickly.
  const int64_t plane = static_cast<int64_t>(feature_H_) * feature_W_;
  const int64_t out_numel = num_ * plane * plane;
  for (int64_t i = 0; i < out_numel; ++i) {
    out_data[i] = 0;
  }

  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < feature_H_; h++) {
      for (int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
        const int hstart = (half_mask_H_ - h > 0) ? (half_mask_H_ - h) : 0;
        const int hend = (mask_H_ < feature_H_ + half_mask_H_ - h) ? mask_H_ : (feature_H_ + half_mask_H_ - h);
        const int wstart = (half_mask_W_ - w > 0) ? (half_mask_W_ - w) : 0;
        const int wend = (mask_W_ < feature_W_ + half_mask_W_ - w) ? mask_W_ : (feature_W_ + half_mask_W_ - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            const int64_t target = static_cast<int64_t>(hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
            const int64_t src = ((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w;
            out_data[(n * plane + target) * plane + h * feature_W_ + w] = x_data[src];
          }
        }
      }
    }
  }
}

/// CPU forward pass of the PSA mask operator in "distribute" mode.
///
/// Zero-fills the whole output, then for every feature position (h, w) copies
/// each mask entry whose target position lies inside the feature map. Compared
/// with the collect variant, the two output position axes are swapped.
///
/// @param x_data   input, indexed as [n][hidx * mask_W_ + widx][h][w]
/// @param out_data output, indexed as [n][h * feature_W_ + w][target_h][target_w],
///                 where target = (hidx + h - half_mask_H_, widx + w - half_mask_W_)
/// @param num_     batch size
/// @param feature_H_,feature_W_     feature-map height / width
/// @param mask_H_,mask_W_           mask height / width
/// @param half_mask_H_,half_mask_W_ offset of the mask centre
template <typename data_t>
void psamask_distribute_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  // 64-bit sizes: the output has num_ * (H*W)^2 elements, which overflows int quickly.
  const int64_t plane = static_cast<int64_t>(feature_H_) * feature_W_;
  const int64_t out_numel = num_ * plane * plane;
  for (int64_t i = 0; i < out_numel; ++i) {
    out_data[i] = 0;
  }

  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < feature_H_; h++) {
      for (int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
        const int hstart = (half_mask_H_ - h > 0) ? (half_mask_H_ - h) : 0;
        const int hend = (mask_H_ < feature_H_ + half_mask_H_ - h) ? mask_H_ : (feature_H_ + half_mask_H_ - h);
        const int wstart = (half_mask_W_ - w > 0) ? (half_mask_W_ - w) : 0;
        const int wend = (mask_W_ < feature_W_ + half_mask_W_ - w) ? mask_W_ : (feature_W_ + half_mask_W_ - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            const int64_t target = static_cast<int64_t>(hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
            const int64_t src = ((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w;
            out_data[(n * plane + h * feature_W_ + w) * plane + target] = x_data[src];
          }
        }
      }
    }
  }
}

/// CPU backward pass of the PSA mask operator in "collect" mode.
///
/// Zero-fills the input gradient, then gathers from `grad_out_data` using the
/// same index mapping as the collect forward pass (source and destination swapped).
///
/// @param grad_out_data gradient w.r.t. the output, indexed as
///                      [n][target_h * feature_W_ + target_w][h][w]
/// @param grad_x_data   gradient w.r.t. the input, indexed as
///                      [n][hidx * mask_W_ + widx][h][w]; mask entries whose
///                      target falls outside the feature map stay 0
/// @param num_          batch size
/// @param feature_H_,feature_W_     feature-map height / width
/// @param mask_H_,mask_W_           mask height / width
/// @param half_mask_H_,half_mask_W_ offset of the mask centre
template <typename data_t>
void psamask_collect_backward_kernel(const data_t* grad_out_data,
                            data_t* grad_x_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  // 64-bit sizes so that large feature maps do not overflow int index arithmetic.
  const int64_t plane = static_cast<int64_t>(feature_H_) * feature_W_;
  const int64_t grad_x_numel = static_cast<int64_t>(num_) * mask_H_ * mask_W_ * plane;
  for (int64_t i = 0; i < grad_x_numel; ++i) {
    grad_x_data[i] = 0;
  }

  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < feature_H_; h++) {
      for (int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
        const int hstart = (half_mask_H_ - h > 0) ? (half_mask_H_ - h) : 0;
        const int hend = (mask_H_ < feature_H_ + half_mask_H_ - h) ? mask_H_ : (feature_H_ + half_mask_H_ - h);
        const int wstart = (half_mask_W_ - w > 0) ? (half_mask_W_ - w) : 0;
        const int wend = (mask_W_ < feature_W_ + half_mask_W_ - w) ? mask_W_ : (feature_W_ + half_mask_W_ - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            const int64_t target = static_cast<int64_t>(hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
            const int64_t dst = ((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w;
            grad_x_data[dst] = grad_out_data[(n * plane + target) * plane + h * feature_W_ + w];
          }
        }
      }
    }
  }
}

/// CPU backward pass of the PSA mask operator in "distribute" mode.
///
/// Zero-fills the input gradient, then gathers from `grad_out_data` using the
/// same index mapping as the distribute forward pass (source and destination swapped).
///
/// @param grad_out_data gradient w.r.t. the output, indexed as
///                      [n][h * feature_W_ + w][target_h][target_w]
/// @param grad_x_data   gradient w.r.t. the input, indexed as
///                      [n][hidx * mask_W_ + widx][h][w]; mask entries whose
///                      target falls outside the feature map stay 0
/// @param num_          batch size
/// @param feature_H_,feature_W_     feature-map height / width
/// @param mask_H_,mask_W_           mask height / width
/// @param half_mask_H_,half_mask_W_ offset of the mask centre
template <typename data_t>
void psamask_distribute_backward_kernel(const data_t* grad_out_data,
                            data_t* grad_x_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  // 64-bit sizes so that large feature maps do not overflow int index arithmetic.
  const int64_t plane = static_cast<int64_t>(feature_H_) * feature_W_;
  const int64_t grad_x_numel = static_cast<int64_t>(num_) * mask_H_ * mask_W_ * plane;
  for (int64_t i = 0; i < grad_x_numel; ++i) {
    grad_x_data[i] = 0;
  }

  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < feature_H_; h++) {
      for (int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
        const int hstart = (half_mask_H_ - h > 0) ? (half_mask_H_ - h) : 0;
        const int hend = (mask_H_ < feature_H_ + half_mask_H_ - h) ? mask_H_ : (feature_H_ + half_mask_H_ - h);
        const int wstart = (half_mask_W_ - w > 0) ? (half_mask_W_ - w) : 0;
        const int wend = (mask_W_ < feature_W_ + half_mask_W_ - w) ? mask_W_ : (feature_W_ + half_mask_W_ - w);
        // (hidx,                    widx                   ) with mask-indexed
        // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            const int64_t target = static_cast<int64_t>(hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
            const int64_t dst = ((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w;
            grad_x_data[dst] = grad_out_data[(n * plane + h * feature_W_ + w) * plane + target];
          }
        }
      }
    }
  }
}

// CPU entry point of the forward pass.
//
// Allocates a CPU output tensor of shape
// {num_, feature_H_ * feature_W_, feature_H_, feature_W_} and fills it with the
// collect kernel when psa_type == 0, otherwise with the distribute kernel.
// Returns a one-element vector holding that tensor.
//
// NOTE(review): the dispatch macro opener, the `x.data` / `x.place()` tokens,
// the `else` branch and all closing braces were missing from this block; they
// are reconstructed here following the pattern of PSAMaskCPUBackward — confirm
// against the original psamask.cc.
std::vector<paddle::Tensor> PSAMaskCPUForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {

  auto out = paddle::Tensor(paddle::PlaceType::kCPU, std::vector<int64_t>{num_, feature_H_ * feature_W_, feature_H_, feature_W_});

  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_collect_forward_kernel", ([&] {
          psamask_collect_forward_kernel<data_t>(
              x.data<data_t>(), out.mutable_data<data_t>(x.place()), num_, feature_H_, feature_W_,
              mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  } else {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_distribute_forward_kernel", ([&] {
          psamask_distribute_forward_kernel<data_t>(
              x.data<data_t>(), out.mutable_data<data_t>(x.place()), num_, feature_H_, feature_W_,
              mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  }

  return {out};
}

// CPU entry point of the backward pass.
//
// Allocates a CPU gradient tensor with the same shape as `x` and fills it with
// the collect backward kernel when psa_type == 0, otherwise with the
// distribute backward kernel. Returns a one-element vector holding it.
//
// NOTE(review): the kernel-call lines inside both dispatch lambdas and all
// closing braces were missing from this block; the calls are reconstructed
// from the kernel signatures (grad_out in, grad_x out) — confirm against the
// original psamask.cc.
std::vector<paddle::Tensor> PSAMaskCPUBackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {

  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());

  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_collect_backward_kernel", ([&] {
      psamask_collect_backward_kernel<data_t>(
          grad_out.data<data_t>(), grad_x.mutable_data<data_t>(x.place()),
          num_, feature_H_, feature_W_,
          mask_H_, mask_W_, half_mask_H_, half_mask_W_);
    }));
  } else {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_distribute_backward_kernel", ([&] {
      psamask_distribute_backward_kernel<data_t>(
          grad_out.data<data_t>(), grad_x.mutable_data<data_t>(x.place()),
          num_, feature_H_, feature_W_,
          mask_H_, mask_W_, half_mask_H_, half_mask_W_);
    }));
  }

  return {grad_x};
}

// NOTE: If your custom operator may be compiled in an environment with CUDA,
// or it may be compiled in an environment without CUDA, in order to adapt the
// compilation environment, you can use the PADDLE_WITH_CUDA macro control
// the CUDA related code.


// Declarations of the GPU entry points, compiled only when Paddle is built
// with CUDA. The opening guard was missing (the closing `# endif` below had
// no partner); PADDLE_WITH_CUDA is the macro named in the note above.
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> PSAMaskCUDAForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_);
std::vector<paddle::Tensor> PSAMaskCUDABackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_);

# endif

// Forward kernel entry: dispatches on the device that holds `x`.
// CPU tensors go to PSAMaskCPUForward; GPU tensors go to PSAMaskCUDAForward
// when compiled with CUDA; any other placement raises via PD_THROW.
//
// The `x.place()` operands, the PADDLE_WITH_CUDA guard opener and the closing
// braces were missing and are restored here. The error message previously
// named the "custom relu" operator (copy-paste leftover) and now names psamask.
std::vector<paddle::Tensor> PSAMaskForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  if (x.place() == paddle::PlaceType::kCPU) {
    return PSAMaskCPUForward(x, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#ifdef PADDLE_WITH_CUDA
  } else if (x.place() == paddle::PlaceType::kGPU) {
    return PSAMaskCUDAForward(x, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

# endif

  } else {
    PD_THROW("Unsupported device type for forward function of custom psamask operator.");
  }
}

// Backward kernel entry: dispatches on the device that holds `x`.
// CPU tensors go to PSAMaskCPUBackward; GPU tensors go to PSAMaskCUDABackward
// when compiled with CUDA; any other placement raises via PD_THROW.
//
// The `x.place()` operands, the PADDLE_WITH_CUDA guard opener and the closing
// braces were missing and are restored here. The error message previously
// named the "custom relu" operator (copy-paste leftover) and now names psamask.
std::vector<paddle::Tensor> PSAMaskBackward(const paddle::Tensor& x,
                                         const paddle::Tensor& out,
                                         const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  if (x.place() == paddle::PlaceType::kCPU) {
    return PSAMaskCPUBackward(x, out, grad_out, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#ifdef PADDLE_WITH_CUDA
  } else if (x.place() == paddle::PlaceType::kGPU) {
    return PSAMaskCUDABackward(x, out, grad_out, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

# endif

  } else {
    PD_THROW("Unsupported device type for backward function of custom psamask operator.");
  }
}

// Shape inference: maps an input shape {N, C, H, W} to the single output
// shape {N, H * W, H, W}. Reads x_shape[0], [2] and [3], so the input shape
// must have at least four dimensions. (The closing brace was missing.)
std::vector<std::vector<int64_t>> PSAMaskInferShape(const std::vector<int64_t> x_shape) {
    return {std::vector<int64_t>{x_shape[0], x_shape[2] * x_shape[3], x_shape[2], x_shape[3]}};
}

// Dtype inference: the single output has the same data type as the input.
// (The closing brace was missing.)
std::vector<paddle::DataType> PSAMaskInferDtype(paddle::DataType x_dtype) {
  return {x_dtype};
}

        // NOTE(review): remnant of the forward-operator registration. The
        // builder macro, its input/output declarations and the opener of this
        // attribute list are not visible here — presumably lost in
        // extraction; confirm against the original psamask.cc.
        // The eight attributes below match, in order, the trailing int
        // parameters of PSAMaskForward.
        "psa_type: int",
        "num_: int",
        "feature_H_: int",
        "feature_W_: int",
        "mask_H_: int",
        "mask_W_: int",
        "half_mask_H_: int",
        "half_mask_W_: int"})

    // NOTE(review): remnant of the gradient-operator registration. The
    // builder macro before `.Inputs`, the opener of the attribute list and the
    // trailing calls/terminator are not visible — confirm against the original.
    // Inputs match the tensor parameters of PSAMaskBackward: x, out, grad_out.
    .Inputs({"X", "Out", paddle::Grad("Out")})
        "psa_type: int",
        "num_: int",
        "feature_H_: int",
        "feature_W_: int",
        "mask_H_: int",
        "mask_W_: int",
        "half_mask_H_: int",
        "half_mask_W_: int"})


# include "paddle/extension.h"

# include <vector>

// Fails via PD_CHECK with a descriptive message unless tensor `x` is placed on the GPU.
// The left operand of the comparison was missing; `x.place()` is restored here.
# define CHECK_GPU_INPUT(x) PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")

// Fallback min(a,b), defined only when no `min` macro already exists.
// Function-like macro: the selected argument is evaluated twice, so do not
// pass expressions with side effects.
# ifndef min

# define min(a,b) (((a) < (b)) ? (a) : (b))

# endif

// Fallback max(a,b), defined only when no `max` macro already exists.
// Same double-evaluation caveat as min(a,b).
# ifndef max

# define max(a,b) (((a) > (b)) ? (a) : (b))

# endif

// CUDA forward kernel of the PSA mask operator in "collect" mode.
//
// Each loop iteration handles one (n, h, w) feature position and writes
//   out[n][target_h * feature_W_ + target_w][h][w] = x[n][hidx * mask_W_ + widx][h][w]
// for every mask entry whose target (hidx + h - half_mask_H_, widx + w - half_mask_W_)
// lies inside the feature map; all other output elements are set to 0.
//
// Race fix: the previous version zero-filled the whole output with a loop
// spread over every thread of the grid and then scattered values in the same
// launch. Nothing orders those two phases across blocks, so one thread could
// zero an element another thread had already written. Here each position
// zeroes only the elements it alone writes (out[n][*][h][w]) before filling them.
//
// nthreads: number of (n, h, w) positions to scatter; positions at or beyond
//           nthreads are still zero-filled, as before.
template <typename data_t>
__global__ void psamask_collect_cuda_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    const int plane = feature_H_ * feature_W_;
    const int positions = num_ * plane;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < positions; index += blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        // First element of the strided column out[n][*][h][w]; stride is `plane`.
        data_t* out_col = out_data + static_cast<int64_t>(n) * plane * plane + h * feature_W_ + w;
        for (int p = 0; p < plane; ++p) {
            out_col[static_cast<int64_t>(p) * plane] = 0;
        }
        if (index >= nthreads) {
            continue;
        }
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx) {
            for (int widx{wstart}; widx < wend; ++widx) {
                const int target = (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
                out_col[static_cast<int64_t>(target) * plane] =
                    x_data[((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
            }
        }
    }
}

// CUDA forward kernel of the PSA mask operator in "distribute" mode.
//
// Each loop iteration handles one (n, h, w) feature position and writes
//   out[n][h * feature_W_ + w][target_h][target_w] = x[n][hidx * mask_W_ + widx][h][w]
// for every mask entry whose target (hidx + h - half_mask_H_, widx + w - half_mask_W_)
// lies inside the feature map; all other output elements are set to 0.
//
// Race fix: the previous version zero-filled the whole output with a loop
// spread over every thread of the grid and then scattered values in the same
// launch. Nothing orders those two phases across blocks, so one thread could
// zero an element another thread had already written. Here each position
// zeroes only the row it alone writes (out[n][h * W + w][*]) before filling it.
//
// nthreads: number of (n, h, w) positions to scatter; positions at or beyond
//           nthreads are still zero-filled, as before.
template <typename data_t>
__global__ void psamask_distribute_cuda_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    const int plane = feature_H_ * feature_W_;
    const int positions = num_ * plane;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < positions; index += blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        // index == n * plane + h * feature_W_ + w, so this is the row out[n][h * W + w][*].
        data_t* out_row = out_data + static_cast<int64_t>(index) * plane;
        for (int p = 0; p < plane; ++p) {
            out_row[p] = 0;
        }
        if (index >= nthreads) {
            continue;
        }
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx) {
            for (int widx{wstart}; widx < wend; ++widx) {
                const int target = (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
                out_row[target] =
                    x_data[((static_cast<int64_t>(n) * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
            }
        }
    }
}

// CUDA backward kernel of the PSA mask operator in "collect" mode.
//
// Each loop iteration handles one (n, h, w) feature position and writes
//   grad_x[n][hidx * mask_W_ + widx][h][w] = grad_out[n][target_h * feature_W_ + target_w][h][w]
// for every mask entry whose target lies inside the feature map; all other
// grad_x elements are set to 0. The `out` parameter is not read.
//
// Race fix: the previous version zero-filled all of grad_x with a loop spread
// over every thread of the grid and then wrote values in the same launch.
// Nothing orders those two phases across blocks, so one thread could zero an
// element another thread had already written. Here each position zeroes only
// the elements it alone writes (grad_x[n][*][h][w]) before filling them.
//
// nthreads: number of (n, h, w) positions to process; positions at or beyond
//           nthreads are still zero-filled, as before.
template <typename data_t>
__global__ void psamask_collect_cuda_backward_kernel(const data_t* grad_out_data, const data_t* out,
                            data_t* grad_x_data,
                            const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    const int plane = feature_H_ * feature_W_;
    const int mask_size = mask_H_ * mask_W_;
    const int positions = num_ * plane;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < positions; index += blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        // First element of the strided column grad_x[n][*][h][w]; stride is `plane`.
        data_t* grad_x_col = grad_x_data + static_cast<int64_t>(n) * mask_size * plane + h * feature_W_ + w;
        for (int m = 0; m < mask_size; ++m) {
            grad_x_col[static_cast<int64_t>(m) * plane] = 0;
        }
        if (index >= nthreads) {
            continue;
        }
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx) {
            for (int widx{wstart}; widx < wend; ++widx) {
                const int target = (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
                grad_x_col[static_cast<int64_t>(hidx * mask_W_ + widx) * plane] =
                    grad_out_data[(static_cast<int64_t>(n) * plane + target) * plane + h * feature_W_ + w];
            }
        }
    }
}

// CUDA backward kernel of the PSA mask operator in "distribute" mode.
//
// Each loop iteration handles one (n, h, w) feature position and writes
//   grad_x[n][hidx * mask_W_ + widx][h][w] = grad_out[n][h * feature_W_ + w][target_h][target_w]
// for every mask entry whose target lies inside the feature map; all other
// grad_x elements are set to 0. The `out` parameter is not read.
//
// Race fix: the previous version zero-filled all of grad_x with a loop spread
// over every thread of the grid and then wrote values in the same launch.
// Nothing orders those two phases across blocks, so one thread could zero an
// element another thread had already written. Here each position zeroes only
// the elements it alone writes (grad_x[n][*][h][w]) before filling them.
//
// nthreads: number of (n, h, w) positions to process; positions at or beyond
//           nthreads are still zero-filled, as before.
template <typename data_t>
__global__ void psamask_distribute_cuda_backward_kernel(const data_t* grad_out_data, const data_t* out,
                            data_t* grad_x_data,
                            const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    const int plane = feature_H_ * feature_W_;
    const int mask_size = mask_H_ * mask_W_;
    const int positions = num_ * plane;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < positions; index += blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        // First element of the strided column grad_x[n][*][h][w]; stride is `plane`.
        data_t* grad_x_col = grad_x_data + static_cast<int64_t>(n) * mask_size * plane + h * feature_W_ + w;
        for (int m = 0; m < mask_size; ++m) {
            grad_x_col[static_cast<int64_t>(m) * plane] = 0;
        }
        if (index >= nthreads) {
            continue;
        }
        // index == n * plane + h * feature_W_ + w, so this is the row grad_out[n][h * W + w][*].
        const data_t* grad_out_row = grad_out_data + static_cast<int64_t>(index) * plane;
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx) {
            for (int widx{wstart}; widx < wend; ++widx) {
                const int target = (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_);
                grad_x_col[static_cast<int64_t>(hidx * mask_W_ + widx) * plane] = grad_out_row[target];
            }
        }
    }
}

// GPU entry point of the forward pass.
//
// Allocates a GPU output tensor of shape
// {num_, feature_H_ * feature_W_, feature_H_, feature_W_} and launches the
// collect kernel when psa_type == 0, otherwise the distribute kernel, on the
// stream of `x`. Returns a one-element vector holding that tensor.
//
// NOTE(review): the dispatch macro opener, the `x.data` / `x.place()` /
// stream tokens, the `else` branch and all closers were missing from this
// block; they are reconstructed following the pattern of PSAMaskCUDABackward —
// confirm against the original source.
std::vector<paddle::Tensor> PSAMaskCUDAForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {

  auto out = paddle::Tensor(paddle::PlaceType::kGPU, std::vector<int64_t>{num_, feature_H_ * feature_W_, feature_H_, feature_W_});
  int nthreads = num_ * feature_H_ * feature_W_;
  int block = 512;
  // One thread per (n, h, w) position; the kernels loop with a grid-sized
  // stride, so rounding up is enough (previously `nthreads` whole blocks were launched).
  int grid = (nthreads + block - 1) / block;
  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_collect_cuda_forward_kernel", ([&] {
          psamask_collect_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
              x.data<data_t>(), out.mutable_data<data_t>(x.place()), nthreads, num_, feature_H_, feature_W_,
              mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  } else {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_distribute_cuda_forward_kernel", ([&] {
          psamask_distribute_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
              x.data<data_t>(), out.mutable_data<data_t>(x.place()), nthreads, num_, feature_H_, feature_W_,
              mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  }

  return {out};
}

// GPU entry point of the backward pass.
//
// Allocates a GPU gradient tensor with the same shape as `x` and launches the
// collect backward kernel when psa_type == 0, otherwise the distribute
// backward kernel, on the stream of `x`. Returns a one-element vector holding it.
//
// NOTE(review): the stream argument, the leading kernel arguments and all
// closers were missing from this block; they are reconstructed from the
// kernel signatures (grad_out, out, grad_x, nthreads, ...) — confirm against
// the original source.
std::vector<paddle::Tensor> PSAMaskCUDABackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {

  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
  int nthreads = num_ * feature_H_ * feature_W_;
  int block = 512;
  // One thread per (n, h, w) position; the kernels loop with a grid-sized
  // stride, so rounding up is enough (previously `nthreads` whole blocks were launched).
  int grid = (nthreads + block - 1) / block;
  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_collect_cuda_backward_kernel", ([&] {
      psamask_collect_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
          grad_out.data<data_t>(), out.data<data_t>(), grad_x.mutable_data<data_t>(x.place()), nthreads,
          num_, feature_H_, feature_W_,
          mask_H_, mask_W_, half_mask_H_, half_mask_W_);
    }));
  } else {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_distribute_cuda_backward_kernel", ([&] {
      psamask_distribute_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
          grad_out.data<data_t>(), out.data<data_t>(), grad_x.mutable_data<data_t>(x.place()), nthreads,
          num_, feature_H_, feature_W_,
          mask_H_, mask_W_, half_mask_H_, half_mask_W_);
    }));
  }

  return {grad_x};
}



Hi! We've received your issue; please be patient while waiting for a response. We will arrange for technicians to answer your questions as soon as possible. Please make sure that you have posted enough information to demonstrate your request. You may also check out the API, FAQ, GitHub Issues and AI community to find an answer. Have a nice day!
