平铺优化gcc vs clang

nafvub8i  于 2023-01-02  发布在  其他
关注(0)|答案(1)|浏览(208)

我正在寻找使用平铺方法的L1缓存优化。
这是我正在使用的优化技术:

// Original loop: adds every element of `array` into `sum`, visiting indices
// 0 .. N-1 in order. `N`, `array` and `sum` are declared outside this snippet.
void original_loop() {
  for (int i = 0; i < N; i++) {
    sum += array[i];
  }
}

// Loop tiling: performs the same accumulation into `sum`, but walks the index
// range in tiles of 16 elements (the outer loop steps by 16, the inner loop
// covers one tile).
// NOTE(review): the inner loop reads array[i + j] without checking it against
// N, so this presumably relies on N being a multiple of 16 — confirm.
void tiled_loop() {
  for (int i = 0; i < N; i += 16) {
    for (int j = 0; j < 16; j++) {
      sum += array[i + j];
    }
  }
}

然后有趣的是,我发现clang中的平铺循环实际上比原始循环慢(或者说速度差不多)。
这是我的基准测试代码:

#include <chrono>
#include <iostream>

// Benchmark configuration and the shared state both loop variants operate on.
constexpr int N = 10000;       // number of elements summed per call
constexpr int blockSize = 16;  // tile width used by tiled_loop()
int array[N] = {};             // input data, zero-initialized until main() fills it
int sum = 0;                   // accumulator written by both loop variants

// Original loop: adds all N elements of the global `array` into the global
// `sum`, visiting indices 0 .. N-1 in order. `sum` is not reset here; the
// caller decides its starting value.
void original_loop() {
  for (int i = 0; i < N; i++) {
    sum += array[i];
  }
}

// Loop tiling: adds all N elements of the global `array` into the global
// `sum`, walking the index range in tiles of `blockSize` consecutive elements
// (the outer loop steps by one tile, the inner loop covers that tile).
// `sum` is not reset here; the caller decides its starting value.
void tiled_loop() {
  // The inner loop reads array[i + j] without a bounds check, so the last
  // tile would run past the end of `array` unless the tiles divide N evenly.
  // Checked at compile time so the loop code itself stays exactly as written.
  static_assert(blockSize > 0 && N % blockSize == 0,
                "tiled_loop requires N to be a multiple of blockSize");
  for (int i = 0; i < N; i += blockSize) {
    for (int j = 0; j < blockSize; j++) {
      sum += array[i + j];
    }
  }
}

// Benchmark driver: fills `array` with 0 .. N-1, then times kRepetitions calls
// of each loop variant and prints the elapsed time of each run in seconds.
// Always returns 0.
int main() {
  // steady_clock is monotonic, so a measured interval cannot be distorted by
  // system clock adjustments; high_resolution_clock gives no such guarantee.
  using Clock = std::chrono::steady_clock;
  // Number of times each loop variant runs inside its timed region.
  constexpr int kRepetitions = 100000;

  // Initialize array
  for (int i = 0; i < N; i++) {
    array[i] = i;
  }

  // Benchmark original loop
  auto start = Clock::now();
  for (int rep = 0; rep < kRepetitions; rep++) {
    sum = 0;  // reset so every repetition starts from the same state
    original_loop();
  }
  auto end = Clock::now();
  std::chrono::duration<double> elapsed = end - start;
  // std::endl is kept on purpose: the first result is flushed before the
  // second (possibly long) measurement starts.
  std::cout << "Original loop: " << elapsed.count() << " seconds" << std::endl;

  // Benchmark tiled loop
  start = Clock::now();
  for (int rep = 0; rep < kRepetitions; rep++) {
    sum = 0;  // reset so every repetition starts from the same state
    tiled_loop();
  }
  end = Clock::now();
  elapsed = end - start;
  std::cout << "Tiled loop: " << elapsed.count() << " seconds" << std::endl;

  return 0;
}

使用 gcc 12.2(-O3)和 clang 15.0.0(-O3)得到的一些结果:

For N = 1000000 and blockSize = 16
**GCC**
Original loop: 11.1892 seconds
Tiled loop: 9.67448 seconds
**Clang**  
Original loop: 8.52184 seconds
Tiled loop: 8.67858 seconds

较小数字:

For N = 10000 and blockSize = 16
**GCC**
Original loop: 0.094786 seconds
Tiled loop: 0.0436597 seconds
**Clang**
Original loop: 0.0416874 seconds
Tiled loop: 0.0610718 seconds

我已经试过很多次了。Clang的效果和原来的一样或者更差。有时候它比原来的好,但是即使这样,差别也没有gcc那么大。有什么想法吗?
下面是汇编代码(我找不到汇编代码的链接,也许有人可以用godbolt链接替换此代码):
GCC:

# GCC output for original_loop(): a single 128-bit accumulator (xmm0) adds
# four ints per iteration; its four lanes are summed once after the loop.
original_loop():
        # ecx = value of sum on entry; rax / rdx = start / end byte addresses of array
        mov     ecx, DWORD PTR sum[rip]
        mov     eax, OFFSET FLAT:array
        mov     edx, OFFSET FLAT:array+40000
        pxor    xmm0, xmm0
.L2:
        # Vector loop: consumes 16 bytes (4 ints) per iteration until rax reaches rdx
        paddd   xmm0, XMMWORD PTR [rax]
        add     rax, 16
        cmp     rdx, rax
        jne     .L2
        # Horizontal reduction: fold the upper 8 bytes, then the next 4 bytes, into lane 0
        movdqa  xmm1, xmm0
        psrldq  xmm1, 8
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4
        paddd   xmm0, xmm1
        # Add the entry value of sum to the reduced total and store it back
        movd    eax, xmm0
        add     eax, ecx
        mov     DWORD PTR sum[rip], eax
        ret
# GCC output for tiled_loop(): four independent 128-bit accumulators
# (xmm3, xmm0, xmm2, xmm1) consume 64 bytes (16 ints) per iteration; they are
# combined and reduced to a scalar only once, after the loop.
tiled_loop():
        # xmm1, xmm2, xmm0 start at zero; xmm3 starts with the entry value of sum in lane 0
        pxor    xmm1, xmm1
        mov     eax, OFFSET FLAT:array
        mov     edx, OFFSET FLAT:array+40000
        movd    xmm3, DWORD PTR sum[rip]
        movdqa  xmm2, xmm1
        movdqa  xmm0, xmm1
.L6:
        # One 16-int tile per iteration; rax is advanced mid-body, so the last
        # two loads use negative offsets from the updated pointer
        paddd   xmm3, XMMWORD PTR [rax]
        paddd   xmm0, XMMWORD PTR [rax+16]
        add     rax, 64
        paddd   xmm2, XMMWORD PTR [rax-32]
        paddd   xmm1, XMMWORD PTR [rax-16]
        cmp     rdx, rax
        jne     .L6
        # Combine the four accumulators into xmm0
        paddd   xmm0, xmm3
        paddd   xmm0, xmm2
        paddd   xmm0, xmm1
        # Horizontal reduction of the four lanes into lane 0, then store to sum
        movdqa  xmm1, xmm0
        psrldq  xmm1, 8
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4
        paddd   xmm0, xmm1
        movd    DWORD PTR sum[rip], xmm0
        ret
# Zero-filled storage for the globals: 4 bytes for sum, 40000 bytes for array.
sum:
        .zero   4
array:
        .zero   40000

Clang:

# Clang output for original_loop(): two 128-bit accumulators (xmm1, xmm0)
# consume 16 ints per iteration; they are combined and reduced to a scalar
# only once, after the loop.
original_loop():                     # @original_loop()
        # xmm0 starts at zero; xmm1 starts with the entry value of sum in lane 0.
        # rax is an element index that starts at 12; rcx = address of array.
        pxor    xmm0, xmm0
        mov     eax, 12
        movd    xmm1, dword ptr [rip + sum]     # xmm1 = mem[0],zero,zero,zero
        lea     rcx, [rip + array]
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        # The four loads cover elements rax-12 .. rax+3; the loop ends when rax == 10012
        paddd   xmm1, xmmword ptr [rcx + 4*rax - 48]
        paddd   xmm0, xmmword ptr [rcx + 4*rax - 32]
        paddd   xmm1, xmmword ptr [rcx + 4*rax - 16]
        paddd   xmm0, xmmword ptr [rcx + 4*rax]
        add     rax, 16
        cmp     rax, 10012
        jne     .LBB0_1
        # Combine both accumulators, reduce the four lanes to lane 0, store to sum
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1
        movd    dword ptr [rip + sum], xmm0
        ret
# Clang output for tiled_loop(): the running total is kept in a scalar
# register (alternating edx / esi). Every 16-int tile is loaded, reduced
# horizontally to one int and added to that scalar inside the loop, so the
# shuffle/add reduction sequence runs once per tile instead of once per call.
# The loop body is unrolled to handle two tiles per trip.
tiled_loop():                        # @tiled_loop()
        # edx = entry value of sum; rax = element index (starts at 0); rcx = address of array
        mov     edx, dword ptr [rip + sum]
        xor     eax, eax
        lea     rcx, [rip + array]
.LBB1_1:                                # =>This Inner Loop Header: Depth=1
        # First tile: elements rax .. rax+15, summed into xmm0 and reduced to a scalar
        movdqa  xmm0, xmmword ptr [rcx + 4*rax]
        movdqa  xmm1, xmmword ptr [rcx + 4*rax + 16]
        paddd   xmm1, xmmword ptr [rcx + 4*rax + 48]
        paddd   xmm0, xmmword ptr [rcx + 4*rax + 32]
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1
        # esi = running total including this tile
        movd    esi, xmm0
        add     esi, edx
        # Exit once rax is above 9983, i.e. the tile just processed was the last one
        cmp     rax, 9983
        ja      .LBB1_3
        # Second tile: elements rax+16 .. rax+31, handled the same way
        movdqa  xmm0, xmmword ptr [rcx + 4*rax + 64]
        movdqa  xmm1, xmmword ptr [rcx + 4*rax + 80]
        paddd   xmm1, xmmword ptr [rcx + 4*rax + 112]
        paddd   xmm0, xmmword ptr [rcx + 4*rax + 96]
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1
        # edx = running total including the second tile; advance by two tiles
        movd    edx, xmm0
        add     edx, esi
        add     rax, 32
        jmp     .LBB1_1
.LBB1_3:
        # Store the final running total back to sum
        mov     dword ptr [rip + sum], esi
        ret
# Storage for the globals: 40000 zero bytes for array, one 32-bit zero for sum.
array:
        .zero   40000

sum:
        .long   0
**编辑**

带有-march=native标志

Original loop: 0.0292406 seconds
Tiled loop: 0.173324 seconds

Clang性能差10倍

ibrsph3r

ibrsph3r1#

GCC和Clang的优化方式往往不同。如果你想要最佳结果,你可能需要使用-march。使用-O3 -march=skylake,Clang会自动展开循环,如果你在original_loop()中添加#pragma GCC unroll 16,GCC也会生成类似的代码:https://godbolt.org/z/WWr1nToaG
对于平铺版本,GCC生成的代码还可以,但是Clang生成的代码非常臃肿:https://godbolt.org/z/GYqYTc7cc ——我对Clang的代码运行较慢并不感到惊讶。
如果你想使用这样的技巧来让编译器生成更好的代码,你必须接受它是编译器特定的。对一个设置有帮助的调整可能会对另一个设置有伤害,而且这不容易预测。通常如果你写像original_loop()这样的简单代码,至少一个给定的编译器在新版本发布时会有所改进(或保持不变)。

相关问题