c++ cudaMemcpy()的性能在使用cudaHostAlloc()后没有提高[重复]

cbjzeqam  于 2023-08-09  发布在  其他
关注(0)|答案(1)|浏览(133)

此问题在此处已有答案

Strange Cudamemcpy execution time(1个答案)
9天前关闭。
我有一个在cuBLAS中执行BLAS的gemm()子程序的程序,如下所示:

// Problem sizes for the GEMM: C (m x n) = alpha * A (m x k) * B (k x n) + beta * C.
size_t m = 10000;
size_t k = 4000;
size_t n = 6000;

// h_A is an m-by-k matrix, h_B is a k-by-n matrix and h_C is a m-by-n matrix.

// Pinned (page-locked) host buffer for the result so the device-to-host copy
// can reach full PCIe bandwidth.
// NOTE(review): unlike the calls below, this cudaHostAlloc is not wrapped in
// CUDA_CHECK — its error status is silently ignored.
double *h_C;
cudaHostAlloc((void **)(&h_C), m * n * sizeof(double), cudaHostAllocDefault);

uint64_t t0 = get_timestamp_in_microsec();

// Create the cuBLAS handle; t1 - t0 includes lazy CUDA context initialization.
CUBLAS_CHECK(cublasCreate(&cublasH));

uint64_t t1 = get_timestamp_in_microsec();

// Device buffers for A, B and C.
CUDA_CHECK(cudaMalloc((void **)(&d_A), sizeof(double) * m * k));
CUDA_CHECK(cudaMalloc((void **)(&d_B), sizeof(double) * k * n));
CUDA_CHECK(cudaMalloc((void **)(&d_C), sizeof(double) * m * n));

uint64_t t2 = get_timestamp_in_microsec();

// Blocking host-to-device copies of the input matrices (t3 - t2).
CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), sizeof(double) * m * k, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), sizeof(double) * k * n, cudaMemcpyHostToDevice));

uint64_t t3 = get_timestamp_in_microsec();

// NOTE(review): cublasDgemm is asynchronous with respect to the host, so
// t4 - t3 measures only the kernel *launch* overhead, not the GEMM itself.
// The GEMM execution time is instead absorbed by the blocking cudaMemcpy
// below (t5 - t4). Insert cudaDeviceSynchronize() before reading t4 to time
// the kernel — confirmed by the accepted answer further down this page.
CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));

uint64_t t4 = get_timestamp_in_microsec();

// Blocking device-to-host copy of the result; it implicitly waits for the
// preceding GEMM to finish, which is why it appears to take ~2.6 s here.
CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeof(double) * m * n, cudaMemcpyDeviceToHost));

uint64_t t5 = get_timestamp_in_microsec();

字符串
性能还可以,唯独最后那次把 d_C 拷回 h_C 的 cudaMemcpy() 需要大约 3 秒:

t1-t0: 12.929ms
t2-t1: 0.949ms
t3-t2: 53.256ms
t4-t3: 0.315ms
t5-t4: 2653.57ms


在搜索了SO和其他网站之后,我从malloc()迁移到cudaHostAlloc(),但上面的结果似乎没有任何变化。
我的代码中有任何问题吗?或者这已经是最佳性能了吗?

qnakjoqk

qnakjoqk1#

根据@njuffa的建议,我在CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));之后插入了cudaDeviceSynchronize();。现在的数字更合理:

t1-t0: 12.826ms
t2-t1: 0.955ms
t3-t2: 52.843ms
t4-t3: 2620.88ms
t5-t4: 36.406ms

字符串

相关问题