C语言 32位平台上的无符号64 x64->128位整数乘法

7fyelxc5  于 2022-12-11  发布在  其他
关注(0)|答案(1)|浏览(162)

在探索性活动的背景下,我已经开始关注32位平台的整数和定点运算构建模块。(具体地说是armv7),我希望RISC-V32在嵌入式领域的重要性会不断提高。128位整数乘法。本网站上关于此构建块的其他问题不提供32位平台的详细覆盖范围。
在过去的30年里,我已经多次实现了这个和其他的算法构建块,但是总是用汇编语言,用于各种体系结构。然而,在这个时候,我的希望和愿望是这些可以用直接的ISO-C编程,理想情况下,单一版本的C代码将生成跨体系结构的良好机器代码。我知道通过操纵HLL代码来控制机器代码的方法通常是脆弱的,但我希望处理器架构和工具链已经足够成熟,使之成为可行的。
在下面的示例代码中,我选择了六个看起来适合HLL实现的变体。除了生成部分积(这是所有变体所共有的)之外,还有两个基本方法:(1)使用64位算法对部分积求和,让编译器处理32位二等分之间的进位传递。在这种情况下,有多种选择来对部分积求和。(2)使用32位算术进行求和,直接模拟进位标志。在这种情况下,我们可以选择在加法后生成进位(a = a + b; carry = a < b;)或在添加之前(carry = ~a < b; a = a + b;)。以下变体1至3属于前一类,变体5和6属于后一类。
Compiler Explorer上,我主要关注感兴趣的平台的工具链gcc12.2和clang15.0,我使用-O3进行编译,一般的发现是,clang生成的代码平均比gcc更高效,并且变体之间的差异虽然这在作为较新架构的RISC-V的情况下是可以理解的,在X1 M4 N1 X的情况下,这让我很惊讶,它已经存在了十几年。
有三个案例特别值得注意。虽然我以前和编译器工程师一起工作过,对基本的代码转换、阶段排序等问题有一定的了解,但我所知道的可能适用于此代码的唯一技术是习语识别,我看不出这本身如何解释观察结果。第一个案例是变体3。clang15.0产生了非常紧凑的代码,仅包含10条指令,我认为无法改进:

umul64wide:
        push    {r4, lr}
        umull   r12, r4, r2, r0
        umull   lr, r0, r3, r0
        umaal   lr, r4, r2, r1
        umaal   r0, r4, r3, r1
        ldr     r1, [sp, #8]
        strd    r0, r4, [r1]
        mov     r0, r12
        mov     r1, lr
        pop     {r4, pc}

相比之下,gcc生成的指令数量是gcc的两倍,需要的寄存器数量也是gcc的两倍。我假设gcc不知道如何使用乘法-累加指令umaal,但这就是全部情况吗?相反的情况,但没有那么戏剧性,发生在变体6中,其中gcc 12.2生成了这样一个18条指令的序列,寄存器使用率很低:

umul64wide:
        mov     ip, r0
        push    {r4, r5, lr}
        mov     lr, r1
        umull   r0, r1, r0, r2
        ldr     r4, [sp, #12]
        umull   r5, ip, r3, ip
        adds    r1, r1, r5
        umull   r2, r5, lr, r2
        adc     ip, ip, #0
        umull   lr, r3, lr, r3
        adds    r1, r1, r2
        adc     r2, ip, #0
        adds    r2, r2, r5
        adc     r3, r3, #0
        adds    r2, r2, lr
        adc     r3, r3, #0
        strd    r2, r3, [r4]
        pop     {r4, r5, pc}

生成的代码很好地将模拟的进位传播转换为真实的的进位传播。clang 15.0使用了9条指令和5个寄存器,如果不花更多的时间进行分析,我无法真正理解它要做什么。第三个观察结果是关于变体5和变体6生成的机器代码的差异。特别是使用clang。它们使用相同的基本算法,其中一个变体在加法之前计算模拟进位,另一个变体在加法之后计算模拟进位。最后我确实发现,一个变体(即变体4)似乎在两个工具链和两个架构中都有效。然而,在我着手讨论其他组成部分和面临类似斗争之前,我想问:
(1)在下面的代码中,是否有我没有考虑过的编码习惯或算法,这些习惯或算法可能会导致上级的结果?(2)是否有特定的优化开关,例如假设的-ffrobnicate(参见here),-O3中不包含的代码,这些代码将帮助编译器为这些类型的位操作场景更一致地生成高效代码?解释哪些编译器机制可能会导致代码生成中观察到的显著差异,以及如何影响或解决这些差异,也会有所帮助。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define VARIANT         (3)
#define USE_X64_ASM_REF (0)

/* Multiply two unsigned 64-bit operands a and b. Returns least significant 64 
   bits of product as return value, most significant 64 bits of product via h.
*/
uint64_t umul64wide (uint64_t a, uint64_t b, uint64_t *h)
{
    uint32_t a_lo = (uint32_t)a;
    uint32_t a_hi = a >> 32;
    uint32_t b_lo = (uint32_t)b;
    uint32_t b_hi = b >> 32;
    uint64_t p0 = (uint64_t)a_lo * b_lo;
    uint64_t p1 = (uint64_t)a_lo * b_hi;
    uint64_t p2 = (uint64_t)a_hi * b_lo;
    uint64_t p3 = (uint64_t)a_hi * b_hi;
#if VARIANT == 1
    uint32_t c = (uint32_t)(((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32);
    *h = p3 + (p1 >> 32) + (p2 >> 32) + c;
    return p0 + ((p1 + p2) << 32);
#elif VARIANT == 2
    uint64_t s = (p0 >> 32) + p1;
    uint64_t t = (uint32_t)s + p2;
    *h = (s >> 32) + (t >> 32) + p3;
    return (uint32_t)p0 + (t << 32);
#elif VARIANT == 3
    *h = (p1 >> 32) + (((p0 >> 32) + (uint32_t)p1 + p2) >> 32) + p3;
    return p0 + ((p1 + p2) << 32);
#elif VARIANT == 4
    uint64_t t = (p0 >> 32) + p1 + (uint32_t)p2;
    *h = (p2 >> 32) + (t >> 32) + p3;
    return (uint32_t)p0 + (t << 32);
#elif VARIANT == 5
    uint32_t r0, r1, r2, r3, r4, r5, r6;
    r0 = (uint32_t)p0;
    r1 = p0 >> 32;    
    r5 = (uint32_t)p1;
    r2 = p1 >> 32;
    r1 = r1 + r5;
    r6 = r1 < r5;
    r2 = r2 + r6;
    r6 = (uint32_t)p2;
    r5 = p2 >> 32;
    r1 = r1 + r6;
    r6 = r1 < r6;
    r2 = r2 + r6;
    r4 = (uint32_t)p3;
    r3 = p3 >> 32;
    r2 = r2 + r5;
    r6 = r2 < r5;
    r3 = r3 + r6;
    r2 = r2 + r4;
    r6 = r2 < r4;
    r3 = r3 + r6;
    *h =   ((uint64_t)r3 << 32) | r2;
    return ((uint64_t)r1 << 32) | r0;
#elif VARIANT == 6
    uint32_t r0, r1, r2, r3, r4, r5, r6;
    r0 = (uint32_t)p0;
    r1 = p0 >> 32;    
    r5 = (uint32_t)p1;
    r2 = p1 >> 32;
    r4 = ~r1;
    r4 = r4 < r5;
    r1 = r1 + r5;
    r2 = r2 + r4;
    r6 = (uint32_t)p2;
    r5 = p2 >> 32;
    r4 = ~r1;
    r4 = r4 < r6;
    r1 = r1 + r6;
    r2 = r2 + r4;
    r4 = (uint32_t)p3;
    r3 = p3 >> 32;
    r6 = ~r2;
    r6 = r6 < r5;
    r2 = r2 + r5;
    r3 = r3 + r6;
    r6 = ~r2;
    r6 = r6 < r4;
    r2 = r2 + r4;
    r3 = r3 + r6;
    *h =   ((uint64_t)r3 << 32) | r2;
    return ((uint64_t)r1 << 32) | r0;
#else
#error unsupported VARIANT
#endif
}

#if defined(__SIZEOF_INT128__)
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
    unsigned __int128 prod = ((unsigned __int128)a) * b;
    *h = (uint64_t)(prod >> 32);
    return (uint64_t)prod;
}
#elif defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
    *h = __umulh (a, b);
    return a * b;
}
#elif USE_X64_ASM_REF
uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
    uint64_t res_l, res_h;
    __asm__ (
        "movq  %2, %%rax;\n\t"          // rax = a
        "mulq  %3;\n\t"                 // rdx:rax = a * b
        "movq  %%rdx, %0;\n\t"          // res_h = rdx
        "movq  %%rax, %1;\n\t"          // res_l = rax
        : "=rm" (res_h), "=rm"(res_l)
        : "rm"(a), "rm"(b)
        : "%rax", "%rdx");
    *h = res_h;
    return res_l;
}
#else // generic (and slow) reference implementation
#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

uint64_t umul64wide_ref (uint64_t a, uint64_t b, uint64_t *h)
{
    uint32_t cy, t0, t1;
    uint32_t a_lo = (uint32_t)a;
    uint32_t a_hi = a >> 32;
    uint32_t b_lo = (uint32_t)b;
    uint32_t b_hi = b >> 32;
    uint64_t p0 = (uint64_t)a_lo * b_lo;
    uint64_t p1 = (uint64_t)a_lo * b_hi;
    uint64_t p2 = (uint64_t)a_hi * b_lo;
    uint64_t p3 = (uint64_t)a_hi * b_hi;
    uint32_t p0_lo = (uint32_t)p0;
    uint32_t p0_hi = p0 >> 32;
    uint32_t p1_lo = (uint32_t)p1;
    uint32_t p1_hi = p1 >> 32;
    uint32_t p2_lo = (uint32_t)p2;
    uint32_t p2_hi = p2 >> 32;
    uint32_t p3_lo = (uint32_t)p3;
    uint32_t p3_hi = p3 >> 32;
    uint32_t r0 = p0_lo;
    uint32_t r1 = ADDcc  (p0_hi, p1_lo, cy, t0, t1);
    uint32_t r2 = ADDCcc (p1_hi, p2_hi, cy, t0, t1);
    uint32_t r3 = ADDC   (p3_hi,     0, cy, t0, t1);
    r1 = ADDcc  (r1, p2_lo, cy, t0, t1);
    r2 = ADDCcc (r2, p3_lo, cy, t0, t1);
    r3 = ADDC   (r3,     0, cy, t0, t1);
    *h =   ((uint64_t)r3 << 32) + r2;
    return ((uint64_t)r1 << 32) + r0;
}
#endif 

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    uint64_t a, b, res_hi, res_lo, ref_hi, ref_lo, count = 0;

    printf ("Smoke test of umul64wide variant %d\n", VARIANT);
    do {
        a = KISS64;
        b = KISS64;
        ref_lo = umul64wide_ref (a, b, &ref_hi);
        res_lo = umul64wide (a, b, &res_hi);
        if ((res_lo ^ ref_lo) | (res_hi ^ ref_hi)) {
            printf ("!!!! error: a=%016llx b=%016llx res=%016llx_%016llx ref=%016llx_%016llx\n",
                    a, b, res_hi, res_lo, ref_hi, ref_lo);
            return EXIT_FAILURE;
        }
        if (!(count & 0xfffffff)) printf ("\r%llu", count);
        count++;
    } while (count);
    return EXIT_SUCCESS;
}
kmbjn2e3

kmbjn2e31#

我避免使用((x += y) < y)溢出测试,因为不是每个伊萨都能有效地处理条件标志,并且在使用标志寄存器的结果时可能会禁止重新排序; x86[-64]是一个明显的例子,尽管后来的BMI(2)指令可能会帮助缓解这个问题。我还添加了一个32 x 32 -〉64位C实现进行比较-但我希望任何现代伊萨至少能提供一个像ARM的umulh那样的“高位字”乘法。

/******************************************************************************/

/* stackoverflow.com/questions/74713642 */

#include <inttypes.h>
#include <stdio.h>

/* umul_32_32 : 32 x 32 => 64 */

/* force inline (non-portable), or implement it as macro, e.g.,
 * #define umul_32_32(rh, rl, x, y) do { ... } while (0) */

#if (1)

static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
    return (((uint64_t) x) * y);
}

#else

/* if no widening multiply is available, the compiler probably
 * generates something at least as efficient as the following -
 * or (worst case) it calls a builtin function. */

static inline __attribute__((always_inline))
uint64_t umul_32_32 (uint32_t x, uint32_t y)
{
    uint32_t m0, m1, m2, m3; /* (partial products) */
    uint32_t x0, x1, y0, y1;

    x0 = x & UINT16_MAX, x1 = x >> (16);
    y0 = y & UINT16_MAX, y1 = y >> (16);

    m0 = x0 * y0, m1 = x1 * y0;
    m2 = x0 * y1, m3 = x1 * y1;
    m1 += m0 >> (16);
    m3 += m2 >> (16);
    m1 += m2 & UINT16_MAX;

    uint32_t rh = m3 + (m1 >> (16));
    uint32_t rl = m1 << (16) | (m0 & UINT16_MAX);

    return (((uint64_t) rh) << 32 | rl);

    /* 32 x 32 => 64 : no branching or carry overflow tests. */
}

#endif

/* ensure the function is called to inspect code gen / assembly,
 * otherwise gcc and clang evaluate this at compile time. */

__attribute__((noinline)) void umul_64_64 (

    uint64_t *rh, uint64_t *rl, uint64_t x, uint64_t y)
{
    uint64_t m0, m1, m2, m3; /* (partial products) */
    uint32_t x0, x1, y0, y1;

    x0 = (uint32_t) (x), x1 = (uint32_t) (x >> (32));
    y0 = (uint32_t) (y), y1 = (uint32_t) (y >> (32));

    m0 = umul_32_32(x0, y0), m1 = umul_32_32(x1, y0);
    m2 = umul_32_32(x0, y1), m3 = umul_32_32(x1, y1);
    m1 += m0 >> (32);
    m3 += m2 >> (32);
    m1 += m2 & UINT32_MAX;

    *rh = m3 + (m1 >> (32));
    *rl = m1 << (32) | (m0 & UINT32_MAX);

    /* 64 x 64 => 128 : no branching or carry overflow tests. */
}

#if (0)

int main (void)
{
    uint64_t x = UINT64_MAX, y = UINT64_MAX, rh, rl;

    umul_64_64(& rh, & rl, x, y);
    fprintf(stdout, "0x%016" PRIX64 ":0x%016" PRIX64 "\n", rh, rl);

    return (0);
}

#endif

/******************************************************************************/

对于ARM-7,我得到的或多或少是same results作为您的“variant 3”代码,这并不奇怪,因为这是相同的基本思想。我在gcc-12和gcc-trunk上尝试了不同的标志,但无法改进它。
我敢猜测,苹果对AArch 64芯片的投资,只不过是对32位ARM-7芯片进行了更积极的优化和投资。但这纯粹是猜测。不过,对于这样一个主要平台来说,这是一个相当明显的差距。

相关问题