I'm exploring L1 cache optimization using a tiling approach.
This is the optimization technique I'm using:
// Original loop
void original_loop() {
    for (int i = 0; i < N; i++) {
        sum += array[i];
    }
}

// Loop tiling
void tiled_loop() {
    for (int i = 0; i < N; i += 16) {
        for (int j = 0; j < 16; j++) {
            sum += array[i + j];
        }
    }
}
Interestingly, I found that with clang the tiled loop is actually slower than the original loop (or about the same speed).
Here is my benchmark code:
#include <chrono>
#include <iostream>

const int N = 10000;
const int blockSize = 16;
int array[N];
int sum;

// Original loop
void original_loop() {
    for (int i = 0; i < N; i++) {
        sum += array[i];
    }
}

// Loop tiling
void tiled_loop() {
    for (int i = 0; i < N; i += blockSize) {
        for (int j = 0; j < blockSize; j++) {
            sum += array[i + j];
        }
    }
}

int main() {
    // Initialize array
    for (int i = 0; i < N; i++) {
        array[i] = i;
    }

    // Benchmark original loop
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 100000; i++) {
        sum = 0;
        original_loop();
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Original loop: " << elapsed.count() << " seconds" << std::endl;

    // Benchmark tiled loop
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 100000; i++) {
        sum = 0;
        tiled_loop();
    }
    end = std::chrono::high_resolution_clock::now();
    elapsed = end - start;
    std::cout << "Tiled loop: " << elapsed.count() << " seconds" << std::endl;

    return 0;
}
Some results with gcc 12.2 and -O3, and with clang 15.0.0 and -O3:
For N = 1000000 and blockSize = 16
**GCC**
Original loop: 11.1892 seconds
Tiled loop: 9.67448 seconds
**Clang**
Original loop: 8.52184 seconds
Tiled loop: 8.67858 seconds
With smaller numbers:
For N = 10000 and blockSize = 16
**GCC**
Original loop: 0.094786 seconds
Tiled loop: 0.0436597 seconds
**Clang**
Original loop: 0.0416874 seconds
Tiled loop: 0.0610718 seconds
I've tried this many times. With Clang, the tiled loop performs the same as the original or worse. Sometimes it is better, but even then the difference is not nearly as large as with gcc. Any ideas?
Here is the assembly (I couldn't find a link for it; maybe someone can replace this with a godbolt link):
GCC:
original_loop():
mov ecx, DWORD PTR sum[rip]
mov eax, OFFSET FLAT:array
mov edx, OFFSET FLAT:array+40000
pxor xmm0, xmm0
.L2:
paddd xmm0, XMMWORD PTR [rax]
add rax, 16
cmp rdx, rax
jne .L2
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
add eax, ecx
mov DWORD PTR sum[rip], eax
ret
tiled_loop():
pxor xmm1, xmm1
mov eax, OFFSET FLAT:array
mov edx, OFFSET FLAT:array+40000
movd xmm3, DWORD PTR sum[rip]
movdqa xmm2, xmm1
movdqa xmm0, xmm1
.L6:
paddd xmm3, XMMWORD PTR [rax]
paddd xmm0, XMMWORD PTR [rax+16]
add rax, 64
paddd xmm2, XMMWORD PTR [rax-32]
paddd xmm1, XMMWORD PTR [rax-16]
cmp rdx, rax
jne .L6
paddd xmm0, xmm3
paddd xmm0, xmm2
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd DWORD PTR sum[rip], xmm0
ret
sum:
.zero 4
array:
.zero 40000
Clang:
original_loop(): # @original_loop()
pxor xmm0, xmm0
mov eax, 12
movd xmm1, dword ptr [rip + sum] # xmm1 = mem[0],zero,zero,zero
lea rcx, [rip + array]
.LBB0_1: # =>This Inner Loop Header: Depth=1
paddd xmm1, xmmword ptr [rcx + 4*rax - 48]
paddd xmm0, xmmword ptr [rcx + 4*rax - 32]
paddd xmm1, xmmword ptr [rcx + 4*rax - 16]
paddd xmm0, xmmword ptr [rcx + 4*rax]
add rax, 16
cmp rax, 10012
jne .LBB0_1
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd dword ptr [rip + sum], xmm0
ret
tiled_loop(): # @tiled_loop()
mov edx, dword ptr [rip + sum]
xor eax, eax
lea rcx, [rip + array]
.LBB1_1: # =>This Inner Loop Header: Depth=1
movdqa xmm0, xmmword ptr [rcx + 4*rax]
movdqa xmm1, xmmword ptr [rcx + 4*rax + 16]
paddd xmm1, xmmword ptr [rcx + 4*rax + 48]
paddd xmm0, xmmword ptr [rcx + 4*rax + 32]
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd esi, xmm0
add esi, edx
cmp rax, 9983
ja .LBB1_3
movdqa xmm0, xmmword ptr [rcx + 4*rax + 64]
movdqa xmm1, xmmword ptr [rcx + 4*rax + 80]
paddd xmm1, xmmword ptr [rcx + 4*rax + 112]
paddd xmm0, xmmword ptr [rcx + 4*rax + 96]
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd edx, xmm0
add edx, esi
add rax, 32
jmp .LBB1_1
.LBB1_3:
mov dword ptr [rip + sum], esi
ret
array:
.zero 40000
sum:
.long 0
**Edit**
With the -march=native flag:
Original loop: 0.0292406 seconds
Tiled loop: 0.173324 seconds
Clang performs 10x worse.
1 Answer
GCC and Clang often optimize differently. If you want the best results, you probably need to use -march. With -O3 -march=skylake, Clang unrolls the loop automatically, and GCC generates similar code if you add #pragma GCC unroll 16 to original_loop(): https://godbolt.org/z/WWr1nToaG
For the tiled version, GCC's code is okay, but Clang's is very bloated: https://godbolt.org/z/GYqYTc7cc. I'm not surprised that the Clang code runs slower.
If you want to use tricks like this to get the compiler to generate better code, you have to accept that they are compiler-specific. A tweak that helps one setup may hurt another, and that is not easy to predict. In general, if you write simple code like original_loop(), a given compiler will at least improve (or stay the same) as new versions are released.