c++ CUDA:如何创建2D纹理对象?

nukf8bse  于 2024-01-09  发布在  其他
关注(0)|答案(1)|浏览(149)

我正在尝试创建2D纹理对象,4x4 uint8_t。下面是代码:

  1. __global__ void kernel(cudaTextureObject_t tex)
  2. {
  3. int x = threadIdx.x;
  4. int y = threadIdx.y;
  5. uint8_t val = tex2D<uint8_t>(tex, x, y);
  6. printf("%d, ", val);
  7. return;
  8. }
  9. int main(int argc, char **argv)
  10. {
  11. cudaTextureObject_t tex;
  12. uint8_t dataIn[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  13. uint8_t* dataDev = 0;
  14. cudaMalloc((void**)&dataDev, 16);
  15. struct cudaResourceDesc resDesc;
  16. memset(&resDesc, 0, sizeof(resDesc));
  17. resDesc.resType = cudaResourceTypePitch2D;
  18. resDesc.res.pitch2D.devPtr = dataDev;
  19. resDesc.res.pitch2D.desc.x = 8;
  20. resDesc.res.pitch2D.desc.y = 8;
  21. resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
  22. resDesc.res.pitch2D.width = 4;
  23. resDesc.res.pitch2D.height = 4;
  24. resDesc.res.pitch2D.pitchInBytes = 4;
  25. struct cudaTextureDesc texDesc;
  26. memset(&texDesc, 0, sizeof(texDesc));
  27. cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
  28. cudaMemcpy(dataDev, &dataIn[0], 16, cudaMemcpyHostToDevice);
  29. dim3 threads(4, 4);
  30. kernel<<<1, threads>>>(tex);
  31. cudaDeviceSynchronize();
  32. return 0;
  33. }

字符串
我预计结果会是这样的:

  1. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,


即纹理对象的所有值(顺序无关紧要)。
但实际结果是:

  1. 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6,


我做错了什么?

iih3973s

iih3973s1#

当你使用pitch2D变量进行纹理操作时,底层的分配应该是一个适当的 pitched 分配。我认为通常人们会使用cudaMallocPitch来创建它。然而,要求是:
cudaResourceDesc::res::pitch 2D::pitchInline以字节为单位指定两行之间的间距,并且必须与cudaDeviceProp::texturePitchAlignment对齐。
在我的GPU上,最后一个属性是32。我不知道你的GPU,但我打赌你的GPU的属性不是4。但是你在这里指定了4:

  1. resDesc.res.pitch2D.pitchInBytes = 4;

字符串
同样,我认为人们通常会使用使用cudaMallocPitch创建的间距分配来实现这一点。然而,如果行到行的维度(以字节为单位)可以被texturePitchAlignment整除(在我的例子中是32),那么我确实可以传递普通的线性分配。
我做的另一个改变是使用cudaCreateChannelDesc<>()而不是像你一样手动设置参数。这会创建一组不同的desc参数,似乎也会影响结果。研究差异应该不难。
当我调整你的代码来解决这些问题时,我得到的结果对我来说似乎是合理的:

  1. $ cat t30.cu
  2. #include <stdio.h>
  3. #include <stdint.h>
  4. typedef uint8_t mt; // use an integer type
  5. __global__ void kernel(cudaTextureObject_t tex)
  6. {
  7. int x = threadIdx.x;
  8. int y = threadIdx.y;
  9. mt val = tex2D<mt>(tex, x, y);
  10. printf("%d, ", val);
  11. }
  12. int main(int argc, char **argv)
  13. {
  14. cudaDeviceProp prop;
  15. cudaGetDeviceProperties(&prop, 0);
  16. printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
  17. cudaTextureObject_t tex;
  18. const int num_rows = 4;
  19. const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
  20. const int ts = num_cols*num_rows;
  21. const int ds = ts*sizeof(mt);
  22. mt dataIn[ts];
  23. for (int i = 0; i < ts; i++) dataIn[i] = i;
  24. mt* dataDev = 0;
  25. cudaMalloc((void**)&dataDev, ds);
  26. cudaMemcpy(dataDev, dataIn, ds, cudaMemcpyHostToDevice);
  27. struct cudaResourceDesc resDesc;
  28. memset(&resDesc, 0, sizeof(resDesc));
  29. resDesc.resType = cudaResourceTypePitch2D;
  30. resDesc.res.pitch2D.devPtr = dataDev;
  31. resDesc.res.pitch2D.width = num_cols;
  32. resDesc.res.pitch2D.height = num_rows;
  33. resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
  34. resDesc.res.pitch2D.pitchInBytes = num_cols*sizeof(mt);
  35. struct cudaTextureDesc texDesc;
  36. memset(&texDesc, 0, sizeof(texDesc));
  37. cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
  38. dim3 threads(4, 4);
  39. kernel<<<1, threads>>>(tex);
  40. cudaDeviceSynchronize();
  41. printf("\n");
  42. return 0;
  43. }
  44. $ nvcc -o t30 t30.cu
  45. $ cuda-memcheck ./t30
  46. ========= CUDA-MEMCHECK
  47. texturePitchAlignment: 32
  48. 0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
  49. ========= ERROR SUMMARY: 0 errors
  50. $


正如评论中所问的,如果我要做类似的事情,但使用cudaMallocPitchcudaMemcpy2D,它可能看起来像这样:

  1. $ cat t1421.cu
  2. #include <stdio.h>
  3. #include <stdint.h>
  4. typedef uint8_t mt; // use an integer type
  5. __global__ void kernel(cudaTextureObject_t tex)
  6. {
  7. int x = threadIdx.x;
  8. int y = threadIdx.y;
  9. mt val = tex2D<mt>(tex, x, y);
  10. printf("%d, ", val);
  11. }
  12. int main(int argc, char **argv)
  13. {
  14. cudaDeviceProp prop;
  15. cudaGetDeviceProperties(&prop, 0);
  16. printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
  17. cudaTextureObject_t tex;
  18. const int num_rows = 4;
  19. const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
  20. const int ts = num_cols*num_rows;
  21. const int ds = ts*sizeof(mt);
  22. mt dataIn[ts];
  23. for (int i = 0; i < ts; i++) dataIn[i] = i;
  24. mt* dataDev = 0;
  25. size_t pitch;
  26. cudaMallocPitch((void**)&dataDev, &pitch, num_cols*sizeof(mt), num_rows);
  27. cudaMemcpy2D(dataDev, pitch, dataIn, num_cols*sizeof(mt), num_cols*sizeof(mt), num_rows, cudaMemcpyHostToDevice);
  28. struct cudaResourceDesc resDesc;
  29. memset(&resDesc, 0, sizeof(resDesc));
  30. resDesc.resType = cudaResourceTypePitch2D;
  31. resDesc.res.pitch2D.devPtr = dataDev;
  32. resDesc.res.pitch2D.width = num_cols;
  33. resDesc.res.pitch2D.height = num_rows;
  34. resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
  35. resDesc.res.pitch2D.pitchInBytes = pitch;
  36. struct cudaTextureDesc texDesc;
  37. memset(&texDesc, 0, sizeof(texDesc));
  38. cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
  39. dim3 threads(4, 4);
  40. kernel<<<1, threads>>>(tex);
  41. cudaDeviceSynchronize();
  42. printf("\n");
  43. return 0;
  44. }
  45. $ nvcc -o t1421 t1421.cu
  46. $ cuda-memcheck ./t1421
  47. ========= CUDA-MEMCHECK
  48. texturePitchAlignment: 32
  49. 0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
  50. ========= ERROR SUMMARY: 0 errors
  51. $


虽然我们这里有纹理对象,但很容易证明纹理引用也会发生类似的问题。你不能创建任意小的2D纹理引用,就像你不能创建任意小的2D纹理对象一样。我也不打算提供演示,因为它会在很大程度上重复上面的内容,人们不应该在新的开发工作中使用纹理引用-纹理对象是更好的方法。

展开查看全部

相关问题