#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <string>
#include <chrono>
#include <random>
using namespace std;
class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;
public:
void startCounter() {
start = std::chrono::system_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
/ 1000000.0;
}
};
int N = 10000000;
void GenData(int N, float* a)
{
for (int i = 0; i < N; i ++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}
__global__
void HelloWorld()
{
printf("Hello world\n");
}
constexpr int npoints = 6;
const string costnames[] = {"allocate", "H2D", "sort", "D2H", "hostsum", "free"};
double cost[3][npoints];
volatile double dummy = 0;
void Test1()
{
MyTimer timer;
timer.startCounter();
float *h_a = new float[N];
float *d_a;
cudaMalloc(&d_a, N * sizeof(float));
cudaDeviceSynchronize();
cost[0][0] += timer.getCounterMsPrecise();
GenData(N, h_a);
dummy = h_a[rand() % N];
timer.startCounter();
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cost[0][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[0][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
dummy = h_a[rand() % N];
cost[0][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += h_a[i];
dummy = sum;
cost[0][4] += timer.getCounterMsPrecise();
timer.startCounter();
delete[] h_a;
cudaFree(d_a);
cudaDeviceSynchronize();
cost[0][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}
void Test2()
{
MyTimer timer;
timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[1][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
cost[1][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[1][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[1][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[1][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[1][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}
void Test3()
{
MyTimer timer;
timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[2][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
//cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
//cudaDeviceSynchronize();
cost[2][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[2][2] += timer.getCounterMsPrecise();
timer.startCounter();
// cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
// cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[2][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[2][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[2][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}
int main()
{
srand(time(NULL));
HelloWorld<<<1,1>>>();
// warmup
Test1();
Test2();
for (int i = 0; i < 3; i++)
for (int j = 0; j < npoints; j++) cost[i][j] = 0;
int ntest = 10;
for (int t = 1; t <= ntest; t++) {
Test1();
Test2();
Test3();
}
for (int i = 0; i < npoints; i++) {
cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest) << " , " << (cost[2][i] / ntest) << "\n";
}
return 0;
}
// 2080ti
// Hello world
// cost allocate = 0.245438 , 0.0470603 , 0.029834
// cost H2D = 6.25315 , 6.36215 , 3.71e-05
// cost sort = 2.61625 , 2.6077 , 14.5418
// cost D2H = 8.74573 , 0.0520719 , 0.0759482
// cost hostsum = 6.98815 , 17.9619 , 18.3188
// cost free = 2.82205 , 3.8711 , 4.12887
我试着比较cudaMalloc
和cudaMallocManaged
的性能。用例是一个矩阵库,其中GPU的使用对用户是隐藏的(即他们可以像普通库一样使用它,但某些操作会自动使用GPU)。
如果一个算法只使用GPU,那么我们可以cudaMemPrefetch
内存到GPU。你可以看到cost[0][2] == cost[1][2]
,和cost[0][3]
慢得多。然而,预取不工作在相反的方向,所以cost[1][4] > cost[0][3] + cost[0][4]
,~10-15%慢。
那么,有没有办法将统一内存从设备预取到主机呢?
1条答案
按热度按时间7kqas0il1#
您使用cudaMemPrefetchAsync将数据预取到主机不正确。根据API文档:
传入dstDevice的cudaCpuDeviceId将数据预取到主机内存
按原样运行代码,我在计算机上观察到以下输出。
当我使用cudaCpuDeviceId将排序后的数据预取到主机时,hostsum时间会减少。