c++ CUDA统一内存如何从设备预取到主机？

gab6jxml 于 2023-03-25 发布在其他

关注(0)|答案(1)|浏览(126)

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <string>
#include <chrono>
#include <random>
using namespace std;

class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;

public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }

    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }

    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }

    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
                / 1000000.0;
    }
};

int N = 10000000;

void GenData(int N, float* a)
{
  for (int i = 0; i < N; i ++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}

__global__
void HelloWorld()
{
  printf("Hello world\n");
}

constexpr int npoints = 6;
const string costnames[] = {"allocate", "H2D", "sort", "D2H", "hostsum", "free"};
double cost[3][npoints];
volatile double dummy = 0;

void Test1()
{
  MyTimer timer;

  timer.startCounter();
  float *h_a = new float[N];
  float *d_a;
  cudaMalloc(&d_a, N * sizeof(float));
  cudaDeviceSynchronize();
  cost[0][0] += timer.getCounterMsPrecise();

  GenData(N, h_a);
  dummy = h_a[rand() % N];

  timer.startCounter();
  cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  cost[0][1] += timer.getCounterMsPrecise();

  timer.startCounter();
  thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
  thrust::sort(dev_ptr, dev_ptr + N);
  cudaDeviceSynchronize();
  cost[0][2] += timer.getCounterMsPrecise();

  timer.startCounter();
  cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);  
  cudaDeviceSynchronize();
  dummy = h_a[rand() % N];
  cost[0][3] += timer.getCounterMsPrecise();
  
  timer.startCounter();
  float sum = 0;
  for (int i = 0; i < N; i++) sum += h_a[i];
  dummy = sum;
  cost[0][4] += timer.getCounterMsPrecise();

  timer.startCounter();
  delete[] h_a;
  cudaFree(d_a);
  cudaDeviceSynchronize();
  cost[0][5] += timer.getCounterMsPrecise();

  for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}

void Test2()
{
  MyTimer timer;

  timer.startCounter();
  float *a;
  cudaMallocManaged(&a, N * sizeof(float));  
  cost[1][0] += timer.getCounterMsPrecise();

  GenData(N, a);
  dummy = a[rand() % N];

  timer.startCounter();
  cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
  cudaDeviceSynchronize();
  cost[1][1] += timer.getCounterMsPrecise();

  timer.startCounter();
  thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
  thrust::sort(dev_ptr, dev_ptr + N);
  cudaDeviceSynchronize();
  cost[1][2] += timer.getCounterMsPrecise();

  timer.startCounter();
  cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
  cudaDeviceSynchronize();
  dummy = a[rand() % N];
  cost[1][3] += timer.getCounterMsPrecise();

  timer.startCounter();
  float sum = 0;
  for (int i = 0; i < N; i++) sum += a[i];
  dummy = sum;
  cost[1][4] += timer.getCounterMsPrecise();

  timer.startCounter();  
  cudaFree(a);
  cudaDeviceSynchronize();
  cost[1][5] += timer.getCounterMsPrecise();

  for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}

void Test3()
{
  MyTimer timer;

  timer.startCounter();
  float *a;
  cudaMallocManaged(&a, N * sizeof(float));  
  cost[2][0] += timer.getCounterMsPrecise();

  GenData(N, a);
  dummy = a[rand() % N];

  timer.startCounter();
  //cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
  //cudaDeviceSynchronize();
  cost[2][1] += timer.getCounterMsPrecise();

  timer.startCounter();
  thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
  thrust::sort(dev_ptr, dev_ptr + N);
  cudaDeviceSynchronize();
  cost[2][2] += timer.getCounterMsPrecise();

  timer.startCounter();
  // cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
  // cudaDeviceSynchronize();
  dummy = a[rand() % N];
  cost[2][3] += timer.getCounterMsPrecise();

  timer.startCounter();
  float sum = 0;
  for (int i = 0; i < N; i++) sum += a[i];
  dummy = sum;
  cost[2][4] += timer.getCounterMsPrecise();

  timer.startCounter();  
  cudaFree(a);
  cudaDeviceSynchronize();
  cost[2][5] += timer.getCounterMsPrecise();

  for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}

int main()
{
  srand(time(NULL));
  HelloWorld<<<1,1>>>();

  // warmup
  Test1();
  Test2();
  for (int i = 0; i < 3; i++)
  for (int j = 0; j < npoints; j++) cost[i][j] = 0;  

  int ntest = 10;
  for (int t = 1; t <= ntest; t++) {
    Test1();
    Test2();
    Test3();
  }

  for (int i = 0; i < npoints; i++) {
    cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest) << " , " << (cost[2][i] / ntest) << "\n";
  }

  return 0;
}

// 2080ti
// Hello world
// cost allocate = 0.245438 , 0.0470603 , 0.029834
// cost H2D = 6.25315 , 6.36215 , 3.71e-05
// cost sort = 2.61625 , 2.6077 , 14.5418
// cost D2H = 8.74573 , 0.0520719 , 0.0759482
// cost hostsum = 6.98815 , 17.9619 , 18.3188
// cost free = 2.82205 , 3.8711 , 4.12887

我试着比较cudaMalloc和cudaMallocManaged的性能。用例是一个矩阵库，其中GPU的使用对用户是隐藏的（即他们可以像普通库一样使用它，但某些操作会自动使用GPU）。
如果一个算法只使用GPU，那么我们可以cudaMemPrefetch内存到GPU。你可以看到cost[0][2] == cost[1][2]，和cost[0][3]慢得多。然而，预取不工作在相反的方向，所以cost[1][4] > cost[0][3] + cost[0][4]，~10-15%慢。
那么，有没有办法将统一内存从设备预取到主机呢？

c++

来源：https://stackoverflow.com/questions/75819026/cuda-unified-memory-how-to-prefetch-from-device-to-host

1条答案

按热度按时间

7kqas0il1#

您使用cudaMemPrefetchAsync将数据预取到主机不正确。根据API文档：
传入dstDevice的cudaCpuDeviceId将数据预取到主机内存
按原样运行代码，我在计算机上观察到以下输出。

Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327

当我使用cudaCpuDeviceId将排序后的数据预取到主机时，hostsum时间会减少。

Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554

赞(0）回复(0）举报 2023-03-25

我来回答

c++ CUDA统一内存如何从设备预取到主机？

1条答案

相关问题

热门标签

最新问答