Paddle [PaddlePaddle Paper Reproduction Challenge, Round 6] Custom external operator predicts correctly in dygraph mode, but inference is abnormal after export to a static graph

lg40wkob · posted on 2022-04-21

Describe the Bug

Static-graph inference with the custom external operator is abnormal: the custom external operator predicts correctly in dygraph mode, but after exporting to a static graph the inference results are wrong.

Environment: AI Studio GPU (premium tier), paddlepaddle==2.2.2

Custom operator network definition: https://github.com/justld/PSANet_paddle/blob/2f709cc39aa5519b37c4e914708dbf37cea79d8e/paddleseg/models/psanet.py#L189

Test result: prediction is abnormal after the model with the custom operator is exported to a static graph.

Test method (a minimal comparison sketch follows this list):
1. Replace the custom operator with nn.Conv2D: the dygraph prediction results and the static-graph inference results match.
With Conv2D substituted for the custom operator (nothing else changed; the added Conv2D parameters use default initialization), the dygraph prediction results are:

After exporting the dygraph model to a static graph, the inference results match the dygraph results:

2. With the custom operator, the dygraph prediction results are correct, but static-graph inference is wrong (inconsistent with dygraph).
Dygraph prediction results:

Static-graph inference results after export (inconsistent with dygraph):
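For reference, here is a minimal sketch of the dygraph-vs-static-graph comparison described above. The tiny stand-in Conv2D network, input shape, and save path are illustrative assumptions; in the actual test the PSANet model from the linked repo (with either nn.Conv2D or the custom psamask operator in place of self.conv) is used instead.

import numpy as np
import paddle
import paddle.nn as nn

# Stand-in network; illustrative only. In the real test the PSANet model takes its place.
class StandIn(nn.Layer):
    def __init__(self):
        super().__init__()
        # default-initialized Conv2D, matching test 1 above
        self.conv = nn.Conv2D(3, 8, kernel_size=3, padding=1)

    def forward(self, x):
        return self.conv(x)

model = StandIn()
model.eval()
x = paddle.randn([1, 3, 64, 64], dtype='float32')

# 1) dygraph prediction
dygraph_out = model(x).numpy()

# 2) export the dygraph model to a static graph, then load it and run inference
paddle.jit.save(
    model, './output/model',
    input_spec=[paddle.static.InputSpec(shape=[None, 3, 64, 64], dtype='float32')])
loaded = paddle.jit.load('./output/model')
loaded.eval()
static_out = loaded(x).numpy()

# With nn.Conv2D the two outputs match; with the custom psamask op they reportedly differ.
print('max abs diff:', np.abs(dygraph_out - static_out).max())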

Additional Supplementary Information

Problem description: the custom external operator psamask predicts correctly in dygraph mode, but inference is abnormal after exporting to a static graph (the results do not match the dygraph predictions). The environment, network definition, and test method are the same as described above.


The external operator's C++ sources are psamask.cc and psamask.cu. psamask.cc is as follows:


#include "paddle/extension.h"
#include <vector>

#define CHECK_INPUT(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

template <typename data_t>
void psamask_collect_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  for(int i{0}; i<num_*feature_H_*feature_H_*feature_W_*feature_W_; ++i) {
      out_data[i] = 0;
  }
  for(int n = 0; n < num_; n++) {
    for(int h = 0; h < feature_H_; h++) {
        for(int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
            const int hstart = max(0, half_mask_H_ - h);
            const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
            const int wstart = max(0, half_mask_W_ - w);
            const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
            // (hidx,                    widx                   ) with mask-indexed
            // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
            for (int hidx = hstart; hidx < hend; hidx++) {
                for (int widx = wstart; widx < wend; widx++) {
                    out_data[(n * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)) * feature_H_ * feature_W_ + h * feature_W_ + w] = x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
                }
            }
        }
    }
  }
}

template <typename data_t>
void psamask_distribute_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  for(int i{0}; i<num_*feature_H_*feature_H_*feature_W_*feature_W_; ++i) {
      out_data[i] = 0;
  }
  for(int n = 0; n < num_; n++) {
    for(int h = 0; h < feature_H_; h++) {
        for(int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
            const int hstart = max(0, half_mask_H_ - h);
            const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
            const int wstart = max(0, half_mask_W_ - w);
            const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
            // (hidx,                    widx                   ) with mask-indexed
            // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
            for (int hidx = hstart; hidx < hend; hidx++) {
                for (int widx = wstart; widx < wend; widx++) {
                    out_data[(n * feature_H_ * feature_W_ + h * feature_W_ + w) * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)] = x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
                }
            }
        }
    }
  }
}

template <typename data_t>
void psamask_collect_backward_kernel(const data_t* grad_out_data,
                            data_t* grad_x_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  for (int i{0}; i < num_ * mask_H_ * mask_W_ * feature_H_ * feature_W_; ++i){
      grad_x_data[i] = 0;
  }
  for(int n = 0; n < num_; n++) {
    for(int h = 0; h < feature_H_; h++) {
        for(int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
            const int hstart = max(0, half_mask_H_ - h);
            const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
            const int wstart = max(0, half_mask_W_ - w);
            const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
            // (hidx,                    widx                   ) with mask-indexed
            // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
            for (int hidx = hstart; hidx < hend; hidx++) {
                for (int widx = wstart; widx < wend; widx++) {
                    grad_x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w] = grad_out_data[(n * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)) * feature_H_ * feature_W_ + h * feature_W_ + w];
                }
            }
        }
    }
  }
}

template <typename data_t>
void psamask_distribute_backward_kernel(const data_t* grad_out_data,
                            data_t* grad_x_data,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  for (int i{0}; i < num_ * mask_H_ * mask_W_ * feature_H_ * feature_W_; ++i){
      grad_x_data[i] = 0;
  }
  for(int n = 0; n < num_; n++) {
    for(int h = 0; h < feature_H_; h++) {
        for(int w = 0; w < feature_W_; w++) {
        // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
            const int hstart = max(0, half_mask_H_ - h);
            const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
            const int wstart = max(0, half_mask_W_ - w);
            const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
            // (hidx,                    widx                   ) with mask-indexed
            // (hidx + h - half_mask_H_, widx + w - half_mask_W_) with feature-indexed
            for (int hidx = hstart; hidx < hend; hidx++) {
                for (int widx = wstart; widx < wend; widx++) {
                    grad_x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w] = grad_out_data[(n * feature_H_ * feature_W_ + h * feature_W_ + w) * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)];
                }
            }
        }
    }
  }
}

std::vector<paddle::Tensor> PSAMaskCPUForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  CHECK_INPUT(x);

  auto out = paddle::Tensor(paddle::PlaceType::kCPU, std::vector<int64_t>{num_, feature_H_ * feature_W_, feature_H_, feature_W_});

  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_collect_forward_kernel", ([&] {
            psamask_collect_forward_kernel<data_t>(
                x.data<data_t>(), out.mutable_data<data_t>(x.place()), num_, feature_H_, feature_W_,
                            mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  }
  else{
      PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_distribute_forward_kernel", ([&] {
            psamask_distribute_forward_kernel<data_t>(
                x.data<data_t>(),  out.mutable_data<data_t>(x.place()), num_, feature_H_, feature_W_,
                            mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  }

  return {out};
}

std::vector<paddle::Tensor> PSAMaskCPUBackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  CHECK_INPUT(x);
  CHECK_INPUT(out);
  CHECK_INPUT(grad_out);

  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());

  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_collect_backward_kernel", ([&] {
                                psamask_collect_backward_kernel<data_t>(
                                    grad_out.data<data_t>(),
                                    grad_x.mutable_data<data_t>(x.place()),
                                    num_, feature_H_, feature_W_,
                             mask_H_, mask_W_, half_mask_H_, half_mask_W_);
                                }));
  } else {
      PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_distribute_backward_kernel", ([&] {
                                psamask_distribute_backward_kernel<data_t>(
                                    grad_out.data<data_t>(),
                                    grad_x.mutable_data<data_t>(x.place()),
                                    num_, feature_H_, feature_W_,
                             mask_H_, mask_W_, half_mask_H_, half_mask_W_);
                                }));
  }

  return {grad_x};
}

// NOTE: If your custom operator may be compiled in an environment with CUDA,
// or it may be compiled in an environment without CUDA, in order to adapt to the
// compilation environment, you can use the PADDLE_WITH_CUDA macro to control
// the CUDA-related code.

#ifdef PADDLE_WITH_CUDA

std::vector<paddle::Tensor> PSAMaskCUDAForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_);
std::vector<paddle::Tensor> PSAMaskCUDABackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_);

#endif

std::vector<paddle::Tensor> PSAMaskForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  if (x.place() == paddle::PlaceType::kCPU) {
    return PSAMaskCPUForward(x, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#ifdef PADDLE_WITH_CUDA

  } else if (x.place() == paddle::PlaceType::kGPU) {
    return PSAMaskCUDAForward(x, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#endif

  } else {
    PD_THROW("Unsupported device type for forward function of custom relu operator.");
  }
}

std::vector<paddle::Tensor> PSAMaskBackward(const paddle::Tensor& x,
                                         const paddle::Tensor& out,
                                         const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  if (x.place() == paddle::PlaceType::kCPU) {
    return PSAMaskCPUBackward(x, out, grad_out, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#ifdef PADDLE_WITH_CUDA

  } else if (x.place() == paddle::PlaceType::kGPU) {
    return PSAMaskCUDABackward(x, out, grad_out, psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_);

#endif

  } else {
    PD_THROW("Unsupported device type for backward function of custom relu operator.");
  }
}

// Shape inference
std::vector<std::vector<int64_t>> PSAMaskInferShape(const std::vector<int64_t> x_shape) {
    return {std::vector<int64_t>{x_shape[0], x_shape[2] * x_shape[3], x_shape[2], x_shape[3]}};
}

// Dtype inference
std::vector<paddle::DataType> PSAMaskInferDtype(paddle::DataType x_dtype) {
  return {x_dtype};
}

PD_BUILD_OP(psamask)
    .Inputs({"X"})
    .Outputs({"Out"})
    .Attrs({
        "psa_type: int",
        "num_: int",
        "feature_H_: int",
        "feature_W_: int",
        "mask_H_: int",
        "mask_W_: int",
        "half_mask_H_: int",
        "half_mask_W_: int"})
    .SetKernelFn(PD_KERNEL(PSAMaskForward))
    .SetInferShapeFn(PD_INFER_SHAPE(PSAMaskInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(PSAMaskInferDtype));

PD_BUILD_GRAD_OP(psamask)
    .Inputs({"X", "Out", paddle::Grad("Out")})
    .Outputs({paddle::Grad("X")})
    .Attrs({
        "psa_type: int",
        "num_: int",
        "feature_H_: int",
        "feature_W_: int",
        "mask_H_: int",
        "mask_W_: int",
        "half_mask_H_: int",
        "half_mask_W_: int"})
    .SetKernelFn(PD_KERNEL(PSAMaskBackward));
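
For context, once the sources are JIT-compiled, the registration above exposes psamask as a Python API whose attributes follow the order declared in PD_BUILD_OP. Below is a hedged sketch of how the op might be built and invoked; the module name, file paths, shapes, and attribute values are illustrative assumptions, not taken from the repo's build scripts.

import paddle
from paddle.utils.cpp_extension import load

# JIT-compile the custom op sources; module name and paths are assumptions.
custom_ops = load(
    name='psamask_op',
    sources=['psamask.cc', 'psamask.cu'],
    verbose=True)

n, h, w = 2, 8, 8
mask_h, mask_w = 2 * h - 1, 2 * w - 1   # PSA mask covers (2H-1) x (2W-1) relative positions
x = paddle.randn([n, mask_h * mask_w, h, w], dtype='float32')

# Attributes follow the order declared in PD_BUILD_OP:
# psa_type, num_, feature_H_, feature_W_, mask_H_, mask_W_, half_mask_H_, half_mask_W_
out = custom_ops.psamask(
    x, 0, n, h, w, mask_h, mask_w, (mask_h - 1) // 2, (mask_w - 1) // 2)
print(out.shape)  # expected [n, h * w, h, w], per PSAMaskInferShape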

psamask.cu is as follows:


#include "paddle/extension.h"
#include <vector>

#define CHECK_GPU_INPUT(x) PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

template <typename data_t>
__global__ void psamask_collect_cuda_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = gid; i < num_ * feature_H_ * feature_W_ * feature_H_ * feature_W_; i += blockDim.x * gridDim.x) {
        out_data[i] = 0;
    }
    for (int index{blockIdx.x * blockDim.x + threadIdx.x}; index< nthreads; index+=blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx){
            for (int widx{wstart}; widx < wend; ++widx) {
                out_data[(n * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)) * feature_H_ * feature_W_ + h * feature_W_ + w] = x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
            }
        }
    }
}

template <typename data_t>
__global__ void psamask_distribute_cuda_forward_kernel(const data_t* x_data,
                             data_t* out_data,
                             const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = gid; i < num_ * feature_H_ * feature_W_ * feature_H_ * feature_W_; i += blockDim.x * gridDim.x) {
        out_data[i] = 0;
    } 
    for (int index{blockIdx.x * blockDim.x + threadIdx.x}; index< nthreads; index+=blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx){
            for (int widx{wstart}; widx < wend; ++widx) {
                out_data[(n * feature_H_ * feature_W_ + h * feature_W_ + w) * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)] = x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w];
            }
        }
    }
}

template <typename data_t>
__global__ void psamask_collect_cuda_backward_kernel(const data_t* grad_out_data, const data_t* out,
                            data_t* grad_x_data,
                            const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = gid; i < num_ * mask_H_ * mask_W_ * feature_H_ * feature_W_; i += blockDim.x * gridDim.x) {
        grad_x_data[i] = 0;
    }                      
     for (int index{blockIdx.x * blockDim.x + threadIdx.x}; index < nthreads; index+=blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx){
            for (int widx{wstart}; widx < wend; ++widx) {
                grad_x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w] = grad_out_data[(n * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)) * feature_H_ * feature_W_ + h * feature_W_ + w];
            }
        }
    }
}

template <typename data_t>
__global__ void psamask_distribute_cuda_backward_kernel(const data_t* grad_out_data, const data_t* out,
                            data_t* grad_x_data,
                            const int nthreads,
                             const int num_, const int feature_H_, const int feature_W_,
                             const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
     int gid = blockIdx.x * blockDim.x + threadIdx.x;
     for (int i = gid; i < num_ * mask_H_ * mask_W_ * feature_H_ * feature_W_; i += blockDim.x * gridDim.x) {
         grad_x_data[i] = 0;
     }        
     for (int index{blockIdx.x * blockDim.x + threadIdx.x}; index< nthreads; index+=blockDim.x * gridDim.x) {
        const int w{index % feature_W_};
        const int h{index / feature_W_ % feature_H_};
        const int n{index / feature_W_ / feature_H_};
        const int hstart = max(0, half_mask_H_ - h);
        const int hend = min(mask_H_, feature_H_ + half_mask_H_ - h);
        const int wstart = max(0, half_mask_W_ - w);
        const int wend = min(mask_W_, feature_W_ + half_mask_W_ - w);
        for (int hidx{hstart}; hidx < hend; ++hidx){
            for (int widx{wstart}; widx < wend; ++widx) {
                grad_x_data[((n * mask_H_ * mask_W_ + hidx * mask_W_ + widx) * feature_H_ + h) * feature_W_ + w] = grad_out_data[(n * feature_H_ * feature_W_ + h * feature_W_ + w) * feature_H_ * feature_W_ + (hidx + h - half_mask_H_) * feature_W_ + (widx + w - half_mask_W_)];
            }
        }
    }
}

std::vector<paddle::Tensor> PSAMaskCUDAForward(const paddle::Tensor& x,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  CHECK_GPU_INPUT(x);

  auto out = paddle::Tensor(paddle::PlaceType::kGPU, std::vector<int64_t>{num_, feature_H_ * feature_W_, feature_H_, feature_W_});
  int numel = out.size();
  int nthreads = num_ * feature_H_ * feature_W_;
  int block = 512;
  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_collect_cuda_forward_kernel", ([&] {psamask_collect_cuda_forward_kernel<data_t><<<nthreads, block, 0, x.stream()>>>(x.data<data_t>(), out.mutable_data<data_t>(x.place()), nthreads, num_, feature_H_, feature_W_,mask_H_, mask_W_, half_mask_H_, half_mask_W_);}));
  }
  else{
      PD_DISPATCH_FLOATING_TYPES(
        x.type(), "psamask_distribute_cuda_forward_kernel", ([&] {
            psamask_distribute_cuda_forward_kernel<data_t><<<nthreads, block, 0, x.stream()>>>(
                x.data<data_t>(),  out.mutable_data<data_t>(x.place()), nthreads, num_, feature_H_, feature_W_,
                            mask_H_, mask_W_, half_mask_H_, half_mask_W_);
        }));
  }

  return {out};
}

std::vector<paddle::Tensor> PSAMaskCUDABackward(const paddle::Tensor& x,
                                            const paddle::Tensor& out,
                                            const paddle::Tensor& grad_out,
    const int psa_type, const int num_, const int feature_H_, const int feature_W_, 
    const int mask_H_, const int mask_W_, const int half_mask_H_, const int half_mask_W_) {
  CHECK_GPU_INPUT(x);
  CHECK_GPU_INPUT(out);
  CHECK_GPU_INPUT(grad_out);

  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
  int numel = x.size();
  int nthreads = num_ * feature_H_ * feature_W_;
  int block = 512;
  if (psa_type == 0) {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_collect_cuda_backward_kernel", ([&] {
                                psamask_collect_cuda_backward_kernel<data_t><<<nthreads,block, 0, x.stream()>>>(
                                    grad_out.data<data_t>(),
                                    out.data<data_t>(),
                                    grad_x.mutable_data<data_t>(x.place()),
                                    nthreads,
                                    num_, feature_H_, feature_W_,
                             mask_H_, mask_W_, half_mask_H_, half_mask_W_);
                                }));
  } else {
    PD_DISPATCH_FLOATING_TYPES(out.type(), "psamask_distribute_cuda_backward_kernel", ([&] {
                                psamask_distribute_cuda_backward_kernel<data_t><<<nthreads, block, 0, x.stream()>>>(
                                    grad_out.data<data_t>(),
                                    out.data<data_t>(),
                                    grad_x.mutable_data<data_t>(x.place()),
                                    nthreads,
                                    num_, feature_H_, feature_W_,
                             mask_H_, mask_W_, half_mask_H_, half_mask_W_);
                                }));
  }

  return {grad_x};
}
Answer 1 (7ivaypg9):

Hi! We've received your issue; please be patient while we respond. We will arrange for technicians to answer your questions as soon as possible. Please make sure that you have posted enough information to demonstrate your request. You may also check out the API docs, FAQ, GitHub issues, and the AI community to get an answer. Have a nice day!
