c++ 对于最简单的CUDA内核,获取“遇到非法内存访问”

yws3nbqq  于 2023-08-09  发布在  其他
关注(0)|答案(1)|浏览(220)

我正在尝试使用CUDA V12.2.128在Ubuntu 22.04上的GeForce RTX 2060 GPU上学习CUDA编程。
当我编译下面的代码并尝试运行它时,

#include "cuda_runtime.h"

#include <iostream>
#include <stdexcept>
#include <cstdlib>
#include <cmath>

#define CUDACHECK(E) if (E != cudaSuccess) \
    { \
        std::cerr << "Fatal error [line " << __LINE__ << "]: " << cudaGetErrorString(E) << ".\n"; \
        std::abort(); \
    }

#define ASSERT(B) if (!(B)) \
    { \
        std::cerr << "Fatal error: test failed in line " << __LINE__ << ".\n"; \
        std::abort(); \
    }

__global__ void fill(float* x, const std::size_t& size, const float& value)
{
    const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < size)
    {
        x[tid] = value;
    }
}

int main()
{
    // allocate device data
    const auto size = std::size_t{10000};
    auto* xD = static_cast<float*>(nullptr);
    CUDACHECK(cudaMalloc(&xD, size * sizeof(float)));
    ASSERT(xD != nullptr);
    
    // run fill kernel
    const auto fill_value = static_cast<float>(-1.0);
    fill<<<1,1>>>(xD, size, fill_value);
    CUDACHECK(cudaDeviceSynchronize());
    
    // <--- control doesn't get past here!
    
    // copy to host
    //auto* xH = reinterpret_cast<float*>(std::malloc(size * sizeof(float)));
    //ASSERT(xH != nullptr);
    //CUDACHECK(cudaMemcpy(xH, xD, size * sizeof(float), cudaMemcpyDeviceToHost));
    
    //constexpr auto tol = std::is_same<float, float>::value ? static_cast<float>(1E-5) : static_cast<float>(1E-10);
    //for (auto ii = std::size_t{}; ii < size; ++ii)
    //{
    //    ASSERT(std::fabs(fill_value - xH[ii]) < tol);
    //}
    
    // free
    CUDACHECK(cudaFree(reinterpret_cast<void*>(xD)));
    //std::free(xH);
    
    // tests have been passed
    std::cout << "Tests passed! Hallo, CUDA world!" << std::endl;
    
    return 0;
}

字符串
我得到以下输出:

Fatal error [line 42]: an illegal memory access was encountered.
Aborted (core dumped)


然后,我使用compute-sanitizer和--tool memcheck,得到以下输出:

========= COMPUTE-SANITIZER
========= Invalid __global__ read of size 8 bytes
=========     at 0x20 in fill(float *, const unsigned long &, const float &)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7ffc902b1f30 is out of bounds
=========     and is 639.439.150.385 bytes after the nearest allocation at 0x7f67aea00000 of size 512 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e950]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1093e]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:cudaLaunchKernel [0x70b4e]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb235]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:__device_stub__Z4fillPfRKmRKf(float*, unsigned long const*, float const*) [0xb094]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:fill(float*, unsigned long const&, float const&) [0xb0f7]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:main [0xadaa]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xab15]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
========= 
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x47e786]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaDeviceSynchronize [0x48a64]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:main [0xadaf]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xab15]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
========= 
Fatal error [line 42]: ========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x47e786]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaDeviceSynchronize [0x48a64]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:main [0xadfb]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xab15]
=========                in /home/nitin/Documents/code/gpu/device/./hw_cuda
========= 
unspecified launch failure.
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 3 errors

问题

1.此错误的原因可能是什么?
1.如何继续调试它?
我会很乐意提供更多的信息,当要求它。先谢了。

yhxst69z

yhxst69z1#

在@AbatorAbetor在评论中提到它之后,我能够回到我的代码来验证传递对自动变量的引用会导致这种行为。所需的校正为:

__global__ void fill(float* x, const std::size_t size, const float value)

字符串
非常感谢你的帮助。

相关问题