C++ CUDA 拷贝内存

x33g5p2x  于2021-12-11 转载在 C/C++  
字(0.8k)|赞(0)|评价(0)|浏览(562)

https://developer.nvidia.com/zh-cn/blog/how-overlap-data-transfers-cuda-cc/

分批拷贝(将数据按流分块,用 cudaMemcpyAsync 在多个流上异步拷贝):

https://github.com/NVIDIA-developer-blog/code-samples/blob/master/series/cuda-cpp/overlap-data-transfers/async.cu

  // Excerpt, not a complete program: bytes, nStreams, streamSize, streamBytes,
  // blockSize, stream[], kernel and checkCuda are defined outside this snippet.
  float *a, *d_a;
  checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
  checkCuda( cudaMalloc((void**)&d_a, bytes) );   // device

  // Stage 1: enqueue one host-to-device copy per stream. Each copy moves
  // streamBytes bytes starting at element index i * streamSize.
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    // cudaMemcpyAsync takes exactly one direction argument, followed by the stream.
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice, stream[i]) );
  }

  // Stage 2: launch the kernel once per stream, passing that stream's offset.
  // NOTE(review): streamSize/blockSize is used as-is for the grid size, so this
  // presumably assumes streamSize is a multiple of blockSize -- confirm.
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    // A launch does not return a status; query it explicitly.
    checkCuda( cudaGetLastError() );
  }

  // Stage 3: enqueue one device-to-host copy per stream for the same chunk.
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost, stream[i]) );
  }
  // The copies above are only enqueued; synchronize (stream, event, or device)
  // before reading `a` on the host.

相关文章

最新文章

更多