此问题在此处已有答案:
Strange Cudamemcpy execution time(1个答案)
9天前关闭。
我有一个在cuBLAS中执行BLAS的gemm()子程序的程序,如下所示:
size_t m = 10000;
size_t k = 4000;
size_t n = 6000;
// h_A is an m-by-k matrix, h_B is a k-by-n matrix and h_C is a m-by-n matrix.
double *h_C;
cudaHostAlloc((void **)(&h_C), m * n * sizeof(double), cudaHostAllocDefault);
uint64_t t0 = get_timestamp_in_microsec();
CUBLAS_CHECK(cublasCreate(&cublasH));
uint64_t t1 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMalloc((void **)(&d_A), sizeof(double) * m * k));
CUDA_CHECK(cudaMalloc((void **)(&d_B), sizeof(double) * k * n));
CUDA_CHECK(cudaMalloc((void **)(&d_C), sizeof(double) * m * n));
uint64_t t2 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), sizeof(double) * m * k, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), sizeof(double) * k * n, cudaMemcpyHostToDevice));
uint64_t t3 = get_timestamp_in_microsec();
CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));
uint64_t t4 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeof(double) * m * n, cudaMemcpyDeviceToHost));
uint64_t t5 = get_timestamp_in_microsec();
字符串
性能还可以,除了最后的cudaMemcpy()
将d_C
移回h_C
需要大约3秒:
t1-t0: 12.929ms
t2-t1: 0.949ms
t3-t2: 53.256ms
t4-t3: 0.315ms
t5-t4: 2653.57ms
型
在搜索了SO和其他网站之后,我从malloc()
迁移到cudaHostAlloc()
,但上面的结果似乎没有任何变化。
我的代码中有任何问题吗?或者这已经是最佳性能了吗?
1条答案
按热度按时间qnakjoqk1#
根据@njuffa的建议,我在
CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));
之后插入了cudaDeviceSynchronize();
。现在的数字更合理:字符串