c++ 将共享内存curand状态返回给global会导致奇怪的随机数

wz8daaqr  于 2023-04-01  发布在  其他
关注(0)|答案(1)|浏览(63)

我尝试在CUDA C++中使用curand编写代码。我几乎得到了想要的结果，唯一的问题是：如果我把本地状态写回全局curand状态，就会得到奇怪的输出；如果删除该更新，一切都按预期工作（例如，我得到的数字都在0.0和1.0之间）。如果包含该更新（第22行开始的内核中的写回），我会看到负数、一堆0，甚至一些极端的数字，如2e+31。我也看不出我的代码与CUDA手册（https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example）中的示例有什么区别。我非常肯定这只是一个愚蠢的疏忽——任何帮助都将不胜感激！谢谢。
下面是我的代码:

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <iostream>

#define ITER 32

// Initializes one curandState per thread: same seed (1234), distinct
// subsequence (idx), zero offset, so each thread gets an independent stream.
__global__ void setup_kernel(curandState* state) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // BUG (discussed in the answer below): no `idx < n` guard here, yet the
    // launch uses 1024 threads while main() allocates only ONE curandState.
    // Every thread with idx >= 1 writes out of bounds.
    curand_init(1234, idx, 0, &state[idx]);
}

// Draws one uniform float per thread (for idx < n) into result_float and
// writes the advanced RNG state back to global memory for later reuse.
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // BUG (discussed in the answer below): this read is OUTSIDE the thread
    // check, so all 1024 launched threads read curand_state[idx] even though
    // main() allocated space for only one state -- out of bounds for idx >= 1.
    curandState localState = curand_state[idx];

    if (idx < n) {
        // curand_uniform returns a float in (0.0, 1.0].
        float myrandf = curand_uniform(&localState);
        result_float[idx] = myrandf;
        // Persist the advanced state; this store is also OOB for idx >= 1
        // given the single-state allocation (the "line 22 update" the asker
        // refers to -- removing it merely hides the underlying OOB accesses).
        curand_state[idx] = localState;
    }

}

// Host driver: allocates device buffers, seeds the RNG states, generates
// ITER uniform floats on the GPU, and prints them.
int main() {

    curandState* d_state;
    // BUG (discussed in the answer below): this allocates room for exactly
    // ONE curandState, but both kernels index d_state[idx] for up to 1024
    // threads. Should be ITER * sizeof(curandState) (one state per thread).
    cudaMalloc(&d_state, sizeof(curandState));
    float* d_result_float, * h_result_float;
    cudaMalloc(&d_result_float, ITER * sizeof(float));
    h_result_float = (float*)malloc(ITER * sizeof(float));

    // NOTE(review): no CUDA error checking anywhere (cudaMalloc/launches/
    // cudaMemcpy), so the OOB corruption above is silent; compute-sanitizer
    // exposes it, as the answer shows.
    int BLOCK_SIZE = 1024;
    // Ceil-division: with ITER=32 this launches 1 block of 1024 threads.
    int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::cout << "BLOCK_SIZE: " << BLOCK_SIZE << "; GRID_SIZE: " << GRID_SIZE << "\n";
    setup_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state);
    generate_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER, d_result_float);

    cudaDeviceSynchronize();

    // Blocking copy of the generated floats back to the host, then print.
    cudaMemcpy(h_result_float, d_result_float, ITER * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < ITER; i++)
        std::cout << h_result_float[i] << "\n";

    // NOTE(review): d_state, d_result_float and h_result_float are never
    // freed; harmless here since the process exits, but worth noting.
    return 0;
}

输出:

BLOCK_SIZE: 1024; GRID_SIZE: 1
0
0.820181
0
0
4.6068e-09
-1.56062e+09
-0.758724
[...]
0
0
4.6068e-09
-3.77124e-23
2.8262e+33
-3.31968e+19
njthzxwz

njthzxwz1#

当ITER为32时,这:

int BLOCK_SIZE = 1024;
int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;

将导致您启动一个包含1024个线程的块。(这没有什么错。)
在你的内核中,你似乎理解了什么是线程检查:

if (idx < n) {

但是在两个内核中，下面这些读写操作的索引（基于1024个线程）都远远超出了你实际分配的范围。由于不受任何线程检查的约束，这些索引会一直取到1023：

__global__ void setup_kernel(curandState* state) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    curand_init(1234, idx, 0, &state[idx]);
                                     ^^^

以及:

__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    curandState localState = curand_state[idx];
                                          ^^^

这一点:

cudaMalloc(&d_state, sizeof(curandState));

只为单个curand状态分配了空间。在正常使用中，每个线程需要一个独立的状态。而即便只分配了一个状态，你的代码仍然按照"每个线程各有一个状态"的方式进行索引：

curand_state[idx] = localState;
                 ^^^

如果使用compute-sanitizer运行代码,您将能够看到这两个编码错误。
当我进行更改以解决这些项目时,我似乎得到了合理的结果:

$ cat t2232.cu
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <iostream>

#define ITER 32

// Fixed version: initializes one curandState per thread (seed 1234,
// subsequence idx), now guarded so only threads with idx < n touch state[].
__global__ void setup_kernel(curandState* state, const unsigned int n) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Fix #1: bounds check keeps the extra threads of the 1024-thread block
    // from writing past the n states allocated in main().
    if (idx < n)
      curand_init(1234, idx, 0, &state[idx]);
}

// Fixed version: draws one uniform float per thread into result_float and
// writes the advanced state back, with ALL state accesses inside the guard.
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;

    if (idx < n) {
        // Fix #2: the state load moved inside the thread check, so only the
        // n in-bounds threads read/write curand_state[idx].
        curandState localState = curand_state[idx];
        // curand_uniform returns a float in (0.0, 1.0].
        float myrandf = curand_uniform(&localState);
        result_float[idx] = myrandf;
        // Persist the advanced state so a later launch continues the stream.
        curand_state[idx] = localState;
    }

}

// Fixed host driver: allocates ONE curandState PER THREAD, seeds them,
// generates ITER uniform floats on the GPU, and prints them.
int main() {

    curandState* d_state;
    // Fix #3: allocate ITER states (one per thread) instead of a single one.
    cudaMalloc(&d_state, ITER*sizeof(curandState));
    float* d_result_float, * h_result_float;
    cudaMalloc(&d_result_float, ITER * sizeof(float));
    h_result_float = (float*)malloc(ITER * sizeof(float));

    int BLOCK_SIZE = 1024;
    // Ceil-division: with ITER=32 this is 1 block of 1024 threads; the
    // in-kernel guards make the oversized block harmless.
    int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
    std::cout << "BLOCK_SIZE: " << BLOCK_SIZE << "; GRID_SIZE: " << GRID_SIZE << "\n";
    setup_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER);
    generate_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER, d_result_float);

    cudaDeviceSynchronize();

    // Blocking copy of the generated floats back to the host, then print.
    // NOTE(review): error checking (CUDA_CHECK-style) and cudaFree/free are
    // still omitted here, as in the original question code.
    cudaMemcpy(h_result_float, d_result_float, ITER * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < ITER; i++)
        std::cout << h_result_float[i] << "\n";

    return 0;
}
$ nvcc -o t2232 t2232.cu -lcurand
$ compute-sanitizer ./t2232
========= COMPUTE-SANITIZER
BLOCK_SIZE: 1024; GRID_SIZE: 1
0.145468
0.820181
0.550399
0.29483
0.914733
0.868979
0.321921
0.782857
0.0113023
0.28545
0.781606
0.23384
0.679064
0.282442
0.629903
0.121223
0.433255
0.383079
0.513567
0.298722
0.416607
0.0344908
0.0493946
0.0466557
0.616587
0.648044
0.868518
0.401159
0.063146
0.49717
0.680894
0.935035
========= ERROR SUMMARY: 0 errors
$

顺便说一句，你所说的"共享内存"其实并不是CUDA意义上的共享内存（shared memory），而是线程私有的本地存储，这也正是该局部变量被命名为 localState 的原因。

相关问题