Shared memory accessing garbage values in CUDA

ybzsozfc · posted 2023-05-06 · in: Other

I am trying to implement a 2D Navier-Stokes solver using CUDA. I am solving the system of difference equations with the Jacobi method. I divided the domain into 4x4 blocks, each consisting of 16x16 threads. Since every interior point of my 64x64 matrix needs its top, bottom, left and right neighbours to compute its new value, I create an 18x18 shared-memory matrix for each block. I read all the values into that matrix in the following way: the thread with index (0, 0) writes its value to element (1, 1) of the shared matrix, and will also try to read the element above it and the element to its left, if those accesses are not out of bounds. Once the reads are done, I update the values of all the interior points and write them back to global memory.
I end up with garbage values in the matrix pn, even though all the values are initialized correctly. I really cannot see where I am going wrong. Could someone help me?
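
For reference, the interior update in the kernel below is the standard five-point Jacobi step for the Poisson equation of the streamfunction-vorticity formulation (∇²ψ = -ω) with grid spacing h:

pn[i][j] = 0.25 * (po[i+1][j] + po[i-1][j] + po[i][j+1] + po[i][j-1] + h * h * oo[i][j])

The line that follows it in the kernel applies a relaxation factor of 1.0, which leaves the Jacobi result unchanged.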
My kernel:

__global__ void red_psi (float *psi_o, float *psi_n, float *e, float *omega, float l1)
{
    // m = n = 64
    int i1 = blockIdx.x;
    int j1 = blockIdx.y;
    int i2 = threadIdx.x;
    int j2 = threadIdx.y;
    int i = (i1 * blockDim.x) + i2; // Actual row of the element
    int j = (j1 * blockDim.y) + j2; // Actual column of the element
    int l = i * n + j;

    // e_XX --> variables refer to the expanded shared memory locations, in order to accommodate halo elements
    //Current Local ID with radius offset.
    int e_li = i2 + 1;
    int e_lj = j2 + 1;

    // Variable pointing at top and bottom neighbouring location
    int e_li_prev = e_li - 1;
    int e_li_next = e_li + 1;

    // Variable pointing at left and right neighbouring location
    int e_lj_prev = e_lj - 1;
    int e_lj_next = e_lj + 1;

    __shared__ float po[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    __shared__ float pn[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    __shared__ float oo[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    //__shared__ float ee[BLOCK_SIZE + 2][BLOCK_SIZE + 2];

    if (i2 < 1) // copy top and bottom halo
    {
        //Copy Top Halo Element
        if (blockIdx.y > 0) // Boundary check
        {
            po[i2][e_lj] = psi_o[l - n];
            //pn[i2][e_lj] = psi_n[l - n];
            oo[i2][e_lj] = omega[l - n];
            //printf ("i_pn[%d][%d] = %f\n", i2, e_lj, oo[i2][e_lj]);
        }

        //Copy Bottom Halo Element
        if (blockIdx.y < (gridDim.y - 1)) // Boundary check
        {
            po[1 + BLOCK_SIZE][e_lj] = psi_o[l + n];
            //pn[1 + BLOCK_SIZE][e_lj] = psi_n[l + n];
            oo[1 + BLOCK_SIZE][e_lj] = omega[l + n];
            //printf ("j_pn[%d][%d] = %f\n", 1 + BLOCK_SIZE, e_lj, oo[1 + BLOCK_SIZE][e_lj]);
        }

    }

    if (j2 < 1) // copy left and right halo
    {
        if (blockIdx.x > 0) // Boundary check
        {
            po[e_li][j2] = psi_o[l - 1];
            //pn[e_li][j2] = psi_n[l - 1];
            oo[e_li][j2] = omega[l - 1];
            //printf ("k_pn[%d][%d] = %f\n", e_li, j2, oo[e_li][j2]);
        }

        if (blockIdx.x < (gridDim.x - 1)) // Boundary check
        {
            po[e_li][1 + BLOCK_SIZE] = psi_o[l + 1];
            //pn[e_li][1 + BLOCK_SIZE] = psi_n[l + 1];
            oo[e_li][1 + BLOCK_SIZE] = omega[l + 1];
            //printf ("l_pn[%d][%d] = %f\n", e_li, 1 + BLOCK_SIZE, oo[e_li][BLOCK_SIZE + 1]);
        }
    }

    // copy current location
    po[e_li][e_lj] = psi_o[l];
    //pn[e_li][e_lj] = psi_n[l];
    oo[e_li][e_lj] = omega[l];
    //printf ("o_pn[%d][%d] = %f\n", e_li, e_lj, oo[e_li][e_lj]);

    __syncthreads ();

    // Checking whether we have an internal point.
    if ((i >= 1 && i < (m - 1)) && (j >= 1 && j < (n - 1)))
    {
        //printf ("Calculating for - (%d, %d)\n", i, j);
        pn[e_li][e_lj] = 0.25 * (po[e_li_next][e_lj] + po[e_li_prev][e_lj] + po[e_li][e_lj_next] + po[e_li][e_lj_prev] + h*h*oo[e_li][e_lj]);
        //printf ("n_pn[%d][%d] (%d, %d), a(%d, %d) = %f\n", e_li_prev, e_lj, i1, j1, i, j, po[e_li_prev][e_lj]);
        pn[e_li][e_lj] = po[e_li][e_lj] + 1.0 * (pn[e_li][e_lj] - po[e_li][e_lj]);

        __syncthreads ();

        psi_n[l] = pn[e_li][e_lj];
        e[l] = po[e_li][e_lj] - pn[e_li][e_lj];
    }
}

This is how I invoke the kernel:

dim3 threadsPerBlock (4, 4);
dim3 numBlocks (4, 4);
red_psi<<<numBlocks, threadsPerBlock>>> (d_xn, d_xx, d_e, d_w, l1);

d_xx, d_xn, d_e and d_w are all float arrays of size 4096.
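
Note that this launch starts only 4x4 threads per block, whereas the question describes 16x16 threads per block (matching the 18x18 shared tiles, i.e. BLOCK_SIZE = 16); with 4x4 threads most of each shared tile is never populated. A minimal sketch of a launch consistent with that description, assuming m = n = 64 as in the kernel comments:

dim3 threadsPerBlock (BLOCK_SIZE, BLOCK_SIZE);   // 16x16 threads per block
dim3 numBlocks (m / BLOCK_SIZE, n / BLOCK_SIZE); // 4x4 blocks covering the 64x64 grid
red_psi<<<numBlocks, threadsPerBlock>>> (d_xn, d_xx, d_e, d_w, l1);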


yyhrrdl8 · #1

I switched up blockDim.x and blockDim.y while copying the top/bottom and left/right halo elements.
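
In this kernel the row index i is built from the x dimension (i = blockIdx.x * blockDim.x + threadIdx.x) and the column index j from the y dimension, so the top/bottom halo copies must be guarded by blockIdx.x / gridDim.x, and the left/right copies by blockIdx.y / gridDim.y. A minimal sketch of the corrected halo loads, assuming blockDim.x == blockDim.y == BLOCK_SIZE and letting the edge thread nearest each halo do the copy (so that the one-row / one-column global offsets are valid):

// Top halo: copied by the top row of threads (i2 == 0) from one row above.
if (i2 == 0 && blockIdx.x > 0)
{
    po[0][e_lj] = psi_o[l - n];
    oo[0][e_lj] = omega[l - n];
}
// Bottom halo: copied by the bottom row of threads from one row below.
if (i2 == blockDim.x - 1 && blockIdx.x < (gridDim.x - 1))
{
    po[BLOCK_SIZE + 1][e_lj] = psi_o[l + n];
    oo[BLOCK_SIZE + 1][e_lj] = omega[l + n];
}
// Left halo: copied by the left column of threads (j2 == 0) from one column to the left.
if (j2 == 0 && blockIdx.y > 0)
{
    po[e_li][0] = psi_o[l - 1];
    oo[e_li][0] = omega[l - 1];
}
// Right halo: copied by the right column of threads from one column to the right.
if (j2 == blockDim.y - 1 && blockIdx.y < (gridDim.y - 1))
{
    po[e_li][BLOCK_SIZE + 1] = psi_o[l + 1];
    oo[e_li][BLOCK_SIZE + 1] = omega[l + 1];
}

Note also that the second __syncthreads() in the kernel sits inside a divergent branch; __syncthreads() must be reached by every thread of the block, so it belongs outside the interior-point check.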
