ARM performance issue related to register allocation in assembly

nkhmeac6  posted on 2023-04-12

I am currently measuring the performance of the following code on an embedded board with a Cortex-A77 CPU.

void kernel_func_x16(unsigned char* __restrict input_data, unsigned char* __restrict output_data)
{
    int stride_size=16;

    for(int i=0; i<100000000; i+=stride_size)
    {
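        // manually unrolled: 16 single-byte copies per iteration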
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
    }
    return;
}

Each input/output buffer is about 96 MB (100,000,000 bytes). The kernel is run 1000 times, and the reported value is the average after discarding the first 200 runs. The code above (kernel_func_x16) measures about 8.3 ms when built with -O3.
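For reference, the measurement scheme described above could look roughly like the following harness; the buffer allocation and clock choice here are assumptions for illustration, not the exact code used.

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

void kernel_func_x16(unsigned char* __restrict input_data, unsigned char* __restrict output_data);

int main()
{
    const std::size_t N = 100000000;          // ~96 MB per buffer, matching the kernels' loop bound
    std::vector<unsigned char> in(N), out(N);

    const int runs = 1000, discard = 200;
    double total_ms = 0.0;
    for (int r = 0; r < runs; ++r) {
        auto t0 = std::chrono::steady_clock::now();
        kernel_func_x16(in.data(), out.data());
        auto t1 = std::chrono::steady_clock::now();
        if (r >= discard)                     // average only the runs after the first 200
            total_ms += std::chrono::duration<double, std::milli>(t1 - t0).count();
    }
    std::printf("avg: %.3f ms\n", total_ms / (runs - discard));
    return 0;
}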
Running the code below (kernel_func_x32), the measured time is about 11.1 ms.

void kernel_func_x32(unsigned char* __restrict input_data, unsigned char* __restrict output_data)
{
    int stride_size=32;

    for(int i=0; i<100000000; i+=stride_size)
    {
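        // manually unrolled: 32 single-byte copies per iteration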
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
        *output_data++ = *input_data++;
    }
    return;
}

To understand where this performance difference comes from, I generated assembly for each function. Below is the assembly produced for kernel_func_x16 and kernel_func_x32 with -O3.

///// KERNEL_FUNC_X16 ///////
    .arch armv8.2-a+crc
    .file   "kernel.cpp"
    .text
    .align  2
    .p2align 4,,11
    .global _Z15kernel_func_x16PhS_
    .type   _Z15kernel_func_x16PhS_, %function
_Z15kernel_func_x16PhS_:
.LFB4340:
    .cfi_startproc
    mov x3, 57600
    mov x2, 0
    movk    x3, 0x5f5, lsl 16
    .p2align 3,,7
.L2:
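    // hot loop: one 16-byte load and store per iteration, addressed off bases x0 (src) and x1 (dst)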
    ldr q0, [x0, x2]
    str q0, [x1, x2]
    add x2, x2, 16
    cmp x2, x3
    bne .L2
    ret
    .cfi_endproc
.LFE4340:
    .size   _Z15kernel_func_x16PhS_, .-_Z15kernel_func_x16PhS_
    .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
    .section    .note.GNU-stack,"",@progbits
///// KERNEL_FUNC_X32 ///////
    .arch armv8.2-a+crc
    .file   "kernel.cpp"
    .text
    .align  2
    .p2align 4,,11
    .global _Z15kernel_func_x32PhS_
    .type   _Z15kernel_func_x32PhS_, %function
_Z15kernel_func_x32PhS_:
.LFB4340:
    .cfi_startproc
    mov x3, 57600
    add x5, x0, 16
    add x4, x1, 16
    mov x2, 0
    movk    x3, 0x5f5, lsl 16
    .p2align 3,,7
.L2:
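    // hot loop: two 16-byte load/store pairs per iteration; the second pair addresses off the extra bases x5 (src+16) and x4 (dst+16)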
    ldr q1, [x0, x2]
    ldr q0, [x5, x2]
    str q1, [x1, x2]
    str q0, [x4, x2]
    add x2, x2, 32
    cmp x2, x3
    bne .L2
    ret
    .cfi_endproc
.LFE4340:
    .size   _Z15kernel_func_x32PhS_, .-_Z15kernel_func_x32PhS_
    .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
    .section    .note.GNU-stack,"",@progbits
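For reference, listings like the two above can be regenerated with something along these lines (the exact invocation is an assumption; the .arch directive suggests the toolchain's default target is armv8.2-a+crc):

g++ -O3 -S kernel.cpp -o kernel.s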

In both assembly listings, the key loop is the block at the .L2 label.
At first it was hard to find the cause of kernel_func_x32's performance drop, so I modified the assembly code in various ways and analyzed it.
First, I reorganized kernel_func_x32's instructions to mimic the instruction sequence of kernel_func_x16.

    .arch armv8.2-a+crc
    .file   "kernel.cpp"
    .text
    .align  2
    .p2align 4,,11
    .global _Z15kernel_func_x32PhS_
    .type   _Z15kernel_func_x32PhS_, %function
_Z15kernel_func_x32PhS_:
.LFB4340:
    .cfi_startproc
    mov x3, 57600
    add x5, x0, 0
    add x4, x1, 0
    mov x2, 0
    movk    x3, 0x5f5, lsl 16
    .p2align 3,,7
.L2:
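    // same load/store/add sequence as the x16 loop, but the second pair still goes through the separate bases x5 and x4 (copies of x0 and x1)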
    ldr q1, [x0, x2]
    str q1, [x1, x2]
    add x2, x2, 16

    ldr q0, [x5, x2]
    str q0, [x4, x2]
    add x2, x2, 16
    cmp x2, x3
    bne .L2
    ret
    .cfi_endproc
.LFE4340:
    .size   _Z15kernel_func_x32PhS_, .-_Z15kernel_func_x32PhS_
    .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
    .section    .note.GNU-stack,"",@progbits

Since this code has a load/store instruction pattern similar to kernel_func_x16 while executing fewer compare-and-branch instructions (one cmp/bne per 32 bytes instead of per 16), I guessed it would perform similarly to or faster than kernel_func_x16.
But when I ran this code, the measured time was still about 11 ms.
While searching for the reason the performance did not improve, I happened to modify the code, replacing the x5 and x4 registers with x0 and x1, as shown below.

.arch armv8.2-a+crc
    .file   "kernel.cpp"
    .text
    .align  2
    .p2align 4,,11
    .global _Z15kernel_func_x32PhS_
    .type   _Z15kernel_func_x32PhS_, %function
_Z15kernel_func_x32PhS_:
.LFB4340:
    .cfi_startproc
    mov x3, 57600
    add x5, x0, 0
    add x4, x1, 0
    mov x2, 0
    movk    x3, 0x5f5, lsl 16
    .p2align 3,,7
.L2:
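    // identical sequence, but every access now goes through the original bases x0 and x1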
    ldr q1, [x0, x2]
    str q1, [x1, x2]
    add x2, x2, 16

    ldr q0, [x0, x2]
    str q0, [x1, x2]
    add x2, x2, 16
    cmp x2, x3
    bne .L2
    ret
    .cfi_endproc
.LFE4340:
    .size   _Z15kernel_func_x32PhS_, .-_Z15kernel_func_x32PhS_
    .ident  "GCC: (Ubuntu 11.1.0-1ubuntu1~18.04.1) 11.1.0"
    .section    .note.GNU-stack,"",@progbits

With the code above, the measured time is about 8.2 ms. The operations are identical, yet the performance improved simply because the register names changed. I searched several documents for the cause but could not find it. I would appreciate any advice.

yzuktlbb  #1

I think your experiment shows that on the Cortex-A77, the encoding of a load instruction, in particular its base register, affects hardware prefetching. This is known to be the case for Falkor, a somewhat older AArch64 CPU core designed by Qualcomm.
In Falkor's case, Linaro engineers made prefetch improvements to glibc and GCC, and mentioned that not only the base and offset registers but even the destination register affects the "tag" by which a load address is routed to one of the prefetch units:
https://inbox.sourceware.org/gcc-patches/20180724070741.25065-1-siddhesh@sourceware.org/

The falkor hardware prefetching system uses a combination of the
source, destination and offset to decide which prefetcher unit to
train with the load.  This is great when loads in a loop are
sequential but sub-optimal if there are unrelated loads in a loop that
tag to the same prefetcher unit.

https://inbox.sourceware.org/libc-alpha/1502134812-31816-1-git-send-email-siddhesh@sourceware.org/

+   FALKOR-SPECIFIC DESIGN:
+
+   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
+   which is why the redundant copies of 0-3 bytes have been replaced with
+   conditionals, since the former would unnecessarily break across multiple
+   issue groups.  The medium copy group has been enlarged to 128 bytes since
+   bumping up the small copies up to 32 bytes allows us to do that without
+   cost and also allows us the reduce the size of the prep code before loop64.
+
+   All copies are done only via two registers r6 and r7.  This is to ensure
+   that all loads hit a single hardware prefetcher which can get correctly
+   trained to prefetch a single stream.

https://inbox.sourceware.org/libc-alpha/20180503175209.2943-1-siddhesh@sourceware.org/T/

The tail of the copy loops are unable to train the falkor hardware
prefetcher because they load from a different base compared to the hot
loop.

I do not have the same kind of evidence for the Cortex-A77, but it seems plausible that a similar trick is in play.
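If the Cortex-A77 prefetcher really does key on the base register the way Falkor's does, one untested idea is to keep every load in the hot loop on a single base register (and every store on another) while still moving 32 bytes per iteration, for example with post-indexed ldp/stp. This is only a sketch under that assumption, not something measured on an A77:

.L2:
    ldp q0, q1, [x0], 32   // both 16-byte loads share base x0 (post-indexed)
    stp q0, q1, [x1], 32   // both 16-byte stores share base x1
    subs    x2, x2, 32     // x2 assumed initialized to the total byte count (100000000)
    bne .L2

Besides keeping the loads on one base register, the pairing also halves the address-generation and branch work per byte; whether either effect matters on the A77 would have to be measured.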
