c++ 作为练习，尝试编写Gerd Isenberg的位向前扫描的矢量化实现

2nbm6dog 于 2022-12-20 发布在其他

关注(0)|答案(3)|浏览(104)

作为练习，我尝试编写BSF的矢量化实现，但我卡住了，它不起作用。
算法：

short bitScanForward(int16_t bb)
 {
     constexpr uint16_t two = static_cast<uint16_t>(2);
     constexpr uint16_t zero = static_cast<uint16_t>(0);
     uint16_t lsb;
     bb &= -bb;
     lsb = (unsigned short)bb
               | (unsigned short)(bb >> short(8));
     return static_cast<short>(((((((unsigned short)(bb >> 
       short(8)) != zero) * two)
    + ((lsb & unsigned short(0xf0f0)) != zero)) * two)
    + ((lsb & unsigned short(0xcccc)) != zero)) * two)
    + ((lsb & unsigned short(0xaaaa)) != zero);
}

参见：Gerd Isenberg BSF
我的矢量代码：

[[nodiscard]] inline __m128i _mm_cmpneq_epi16(const __m128i& a, const __m128i& b) noexcept
{
    const __m128i _NEG_ONE = _mm_set1_epi16(static_cast<int16_t>(-1));
    __m128i _mask = _mm_setzero_si128();

    _mask = _mm_cmpeq_epi16(a, b);
    _mask = _mm_xor_si128(_mask, _NEG_ONE);//Not Equal

    return _mask;
}//End of _mm_neq_epi16

 [[nodiscard]] inline __m128i _mm_bsf_epi16(__m128i x) noexcept
{
    __m128i _lsb = _mm_setzero_si128();
    __m128i _temp1 = _mm_setzero_si128();
    __m128i _temp2 = _mm_setzero_si128();
    __m128i _result = _mm_setzero_si128();

    const __m128i _zero = _mm_setzero_si128();
    const __m128i _one = _mm_set1_epi16(static_cast<uint16_t>(1));
    const __m128i _two = _mm_set1_epi16(static_cast<uint16_t>(2));
    const __m128i _hex2 = _mm_set1_epi16(static_cast<uint16_t>(0xf0f0));
    const __m128i _hex3 = _mm_set1_epi16(static_cast<uint16_t>(0xcccc));
    const __m128i _hex4 = _mm_set1_epi16(static_cast<uint16_t>(0xaaaa));

    x = _mm_and_si128(x, _mm_sub_epi16(_zero, x));

    _lsb = _mm_or_si128(x, _mm_srli_epi16(x, 8));

    _temp1 = _mm_mullo_epi16(_mm_abs_epi16(_mm_cmpneq_epi16(_mm_srli_epi16(x, 8), _zero)), _two);
    _temp2 = _mm_abs_epi16(_mm_cmpneq_epi16(_mm_and_si128(_lsb, _hex2), _zero));

    _result = _mm_add_epi16(_temp1, _temp2);
    _result = _mm_mullo_epi16(_result, _two);

    _temp1 = _mm_abs_epi16(_mm_cmpneq_epi16(_mm_and_si128(_lsb, _hex3), _zero));
    _temp2 = _mm_abs_epi16(_mm_cmpneq_epi16(_mm_and_si128(_lsb, _hex4), _zero));

    _result = _mm_add_epi16(_result, _temp1);
    _result = _mm_add_epi16(_result, _temp2);
            
    return _result;
}//End of _mm_bsf_epi16

这是我得到的一个常数向量的结果：

-32,768 1000000000000000 bsf: 15
  8,192 0010000000000000 bsf: 13
  2,048 0000100000000000 bsf: 11
  8,704 0010001000000000 bsf: 9
  8,832 0010001010000000 bsf: 7
-24,544 1010000000100000 bsf: 5
-24,568 1010000000001000 bsf: 3
 -8,190 1110000000000010 bsf: 1

正如你所看到的，大多数都是错误的。有可能我只是搞砸了一个嵌套函数调用，但我也可能只是偏离了基础。我很好奇，看看它是否比定标器BSF指令更快。任何帮助都将不胜感激。
好的，我让它工作了，原来我严重误读了上面混乱的括号，并且把算法做得乱了序。
至于性能，这个版本确实优于常见算法，如：

x = x & -x;

if ((x & 0xff00ff00) != 0) index += 8;
if ((x & 0xf0f0f0f0) != 0) index += 4;
if ((x & 0xcccccccc) != 0) index += 2;
if ((x & 0xaaaaaaaa) != 0) index += 1;

return index;

在x86上没有用于16位整数的BSF指令。
我的SIMD版本需要138ms来交换10亿int16_t s上的ffs（使用多线程），而上面的其他版本需要374ms（使用多线程）。

c++

来源：https://stackoverflow.com/questions/73930875/trying-to-write-a-vectorized-implementation-of-gerd-isenbergs-bit-scan-forward

3条答案

按热度按时间

wooyq4lh1#

您选择的SIMD BSF策略效率不高。利用CPU可以作为单个指令执行的其他基元操作会更好。即使是该策略的最佳实现也需要许多不同的掩码常数，并且每个向量需要许多指令。
您选择用_mm_mullo_epi16而不是_mm_slli_epi16 by 1来实现*2是非常不幸的。幸运的是，有些编译器会通过一个常量为您将mullo优化为add，但是整个策略仍然需要比必要的多得多的指令。但是其他像MSVC和ICC这样的程序相当严格地使用了内部函数，实际上使用了硬件乘法，它的延迟相对较高。
有多种好的策略，最佳选择取决于SIMD元素宽度和可用的伊萨扩展级别（许多策略要求SSSE 3用于pshufb）。实施细节中的一些微优化可能取决于英特尔与AMD或同一供应商各代产品之间的微架构差异。

*提供AVX-512 vpopcntb/w/d/q时：vpopcnt(~v & (v-1))

（vpadd -1/vpandn/vpopcnt），即，使掩码达到并且 * 不 * 包括最低设置位，并且对它进行popcount。~v & (v-1)对于零输入给出全1，因此对于16位输入，它可以产生17个不同的输出值，而不需要任何修正来完全工作于0。
3个指令，其中两个非常便宜。（vpopcnt在支持它的CPU上很便宜，Ice Lake和以后的版本，除了桤木Lake，还有Zen 4。AVX-512 VPOPCNTDQ和BITALG（对于b/w版本）。）如果你在循环中使用__tzcnt_u16，Clang会以这种方式将其矢量化。
注意，v ^ (v-1)要得到一个像标量blsmsk一样的直到 * 并包括 * 的掩码，将计数太多，并且不能区分0和0x8000;两者都产生X1 M19 N1 X。

使用AVX-512的32位或64位元素：vplzcntd/q始终可用（所有AVX-512 CPU都有AVX-512 CD）。tzcntd = 31-lzcntd(v&-v)用于非零输入。这将为全零元素给予-1。（因此，如果需要，最后一个vpminud(tz, set1(32))将把UINT_MAX箝位到32。）
*SSSE 3的16位元素：DeBruijn序列乘法以生成pshufb LUT的4位值：非常好，特别是如果您不关心input=0的情况。此策略不适用于32位或64位元素，如果没有AVX-512 VBMI vpermb，则无法实现更宽的LUT，在这种情况下，您通常也会使用vpopcnt。

每个向量5条单微指令（使用AVX），2个向量常量。（如果您想要完整的tzcnt行为，则为input=0生成16。如果-1适用于这种情况，则成本略低。）pmullw（_mm_mullo_epi16）在现代CPU上是单微指令，与pmulld不同
我认为这个策略比aqrit的聪明策略要好，aqrit把pshufb的结果和pminub结合起来（9条指令和gcc或clang）。

*32位元素：@Soonts的FP策略非常好，特别是如果你只想假设SSE 2。转换到FP以利用硬件来计算指数字段。32位是压缩SIMD int-〉float转换的自然宽度。如果输入设置了MSB，你必须处理设置的符号位，即在指数下移之后的额外and指令。

@aqrit使用2x pshufb作为原始整数的每个半字节的4位LUT的策略也很有趣，但我认为它需要一个额外的合并步骤，而@Soonts需要 * 更少 * 的步骤，不需要分割低/高和合并。
@aqrit的仅SSE 2策略（_mm_avg_epu16(r, _mm_cmpeq_epi16(_mm_and_si128(x3333, v), x0000));等）看起来比FP策略慢，特别是对于32位，它需要更多的工作，但FP策略每个向量需要 * 更少 * 的工作。

64位元素：压缩64位整数-〉FP转换在AVX-512之前不可用。Skylake-X具有AVX-512，但没有AVX-512 VPOPCNTDQ。

即使没有对SIMD popcount的直接支持，popcnt(~v & (v-1))的想法也可能是好的。SIMD popcnt是一种已知技术，例如，将2x vpshufb分成低/高半字节作为4位LUT。然后将这些高/低半字节一起_mm_add_epi8，并将psadbw与0对0相加，以在qword块内对字节求和。
(This基本上就是clang如何自动矢量化sum += __tzcnt_u16(arr[i])，即使没有-march=icelake-client '，但有一些浪费的混洗和低效的求和。）

带SSSE 3的16位元素的BSF

设置的最低有效位的位置上的答案可以调整为16位，然后可以使用SSSE 3 pshufb对8位值的16条目查找表进行矢量化。
一个De Bruijn sequence4位的位模式都在某个地方重叠，乘以2的幂（一个位集），这些序列中的一个就会移位到n的最高位，右移type_width - n就会把它们移到最低位，这样我们就在一个字节的底部得到一个4位的值，可以用作LUT索引。

SSE2 pmullw在所有现代CPU上都很快，甚至在Alder Lake E核上也是如此。单uop，尽管在P核Haswell/Skylake/Ice Lake上的延迟是5个周期。但自SKL以来，它具有2/时钟吞吐量，运行在端口0或1上。在Zen 2上也很快，例如，1/时钟吞吐量，3个周期延迟。https://uops.info/。
SIMD整数移位（psrlw）与pmullw竞争相同的端口，但幸运的是，2/时钟吞吐量应该足以避免瓶颈。pshufb在Intel上运行在端口5上，不与shift/pmul竞争。

__m128i bsf_epi16_debruijn(__m128i v)
{
    const __m128i debruijn_magic = _mm_set1_epi16( 0x09AF );
    const __m128i bit_table = _mm_setr_epi8(
         0,  1,  2,  5,  3,  9,  6, 11, 
        15,  4,  8, 10, 14,  7, 13, 12  );

    __m128i blsi = _mm_sub_epi16(_mm_setzero_si128(), v);
    blsi = _mm_and_si128(blsi, v);       // v &= -v;  a power of 2; multiplying by it is like a shift

    __m128i idx = _mm_mullo_epi16(blsi, debruijn_magic);
    idx = _mm_srli_epi16(idx, 12);       // leaving a 4-bit index from the selected position in the DeBruijn sequence
// maybe TODO: avoid the shift with PMULHW with a debruijn sequence and table crafted to use the bits "shifted" into the high half?
// But then would need to mask before pshufb without AVX-512VBMI vpermb xmm
// And if we have that (Ice Lake) we normally have AVX-512 BITALG for vpopcntw(~v & (v-1)) or vpopcntw(pandn(v, v-1))  (vpaddw / vpandn)

    __m128i bsf = _mm_shuffle_epi8(bit_table, idx);  // high half of each word looks up to 0 so no fixup needed
    // input = 0 produces output = 0, same as input=1, unless we fixup the result

#if 1
    // optional: produce -1 or 16 for input==0, instead of 0
    __m128i was_zero = _mm_cmpeq_epi16(v, _mm_setzero_si128());
    //bsf = _mm_blendv_epi8(bsf, _mm_set1_epi16(16), was_zero);  // single-uop on AMD, 2 uops on Intel; 3 on Alder Lake P and 4 on E cores.  Single uop for the legacy SSE version, though.

    // was_zero = _mm_and_si128(was_zero, _mm_set1_epi16(16));
    bsf = _mm_or_si128(bsf, was_zero);  // return -1 for v==0 .  (Or with the &16, return 16
      // alternative: bsf = _mm_sub_epi16(bsf, _mm_slli_epi16(was_zero,4));  // subtract (-1<<4) or (0).  Avoids a constant.
#endif
    return bsf;
}

我使用https://sites.google.com/site/sydfhd/articles-tutorials/de-bruijn-sequence-generator中的程序生成了16位De Bruijn序列和查找表，通过注解掉包含is_mulshift的if的2行修复了编译错误，因为程序中没有定义is_mulshift。
用这个，原始的，和（我的调整）Soonts的答案，加上aqrit的答案。也是一个标量循环，叮当声自动矢量化。

bsf_epi16_debruijn(long long __vector(2)):            # @bsf_epi16_debruijn(long long __vector(2))
        vpxor   xmm1, xmm1, xmm1              # constant can be hoisted out of loops
        vpsubw  xmm2, xmm1, xmm0
        vpand   xmm2, xmm2, xmm0
        vpmullw xmm2, xmm2, xmmword ptr [rip + .LCPI5_0]
        vpsrlw  xmm2, xmm2, 12
        vmovdqa xmm3, xmmword ptr [rip + .LCPI5_1] # xmm3 = [0,1,2,5,3,9,6,11,15,4,8,10,14,7,13,12]
        vpshufb xmm2, xmm3, xmm2
        vpcmpeqw        xmm0, xmm0, xmm1      # fixup for v==0
        vpor    xmm0, xmm2, xmm0              # fixup for v==0
        ret

因此，不算将寄存器设置为常量的指令（因为这些指令可以通过AVX从循环中提升出来，以便非破坏性地使用它们），主要工作有5条指令：两条用于乘法/移位端口，两条可以在任何端口上运行的简单整数，还有一条英特尔CPU只能在端口5上运行的shuffle。
还有2条指令用于此修正策略，该策略为0的元素提供-1，而不是在没有修正的情况下提供output = 0。（这就是为什么我们可以直接用OR代替vpblendvb，即使我们想将其设置为16，而不仅仅是-1. -1 | anything == -1，因此即使输入为0时LUT不产生0，也可以使用。）
这很容易扩展到256位矢量（AVX2）或512位矢量（AVX-512BW）。我还没有尝试将其写成标量，看看GCC或clang是否会自动矢量化移位和LUT查找;我并不乐观，但也不排除这种可能性。
在x86上没有用于16位整数的BSF指令。
不正确：bsf允许16、32或64位的操作数大小。BMI1 tzcnt也是如此。* BSF的内部函数 * 和内置函数没有在编译器之间实现良好的标准化（AFAIK没有用于16位bsf的内部函数），但Intel确实提供了_tzcnt_u16文档。（两个前导下划线），而不是Intel的名称，但clang支持两个名称（一个和两个下划线）。
那很好输入为零的bsf会产生垃圾值（它的内部函数不会暴露目标寄存器保持不变的asm行为; AMD记录的行为，但Intel和AMD都实现了）。对于非零16位输入，低16位以上的位不会影响值。因此，使用16位bsf不会有帮助，但16位tzcnt确实可以在输入为零时获得16，而不必执行_tzcnt_u32(0x10000 | x)来让32位TZCNT在你想要的位置找到设置位。

赞(0）回复(0）举报 2022-12-20

ehxuflar2#

我不喜欢那个算法，指令太多。请尝试下面的版本。

// Count number of trailing zero bits in 16-bit integers
__m128i tzcnt_epi16( __m128i vec )
{
    // Isolate the rightmost set bit by computing tmp = vec & (-vec)
    const __m128i zero = _mm_setzero_si128();
    __m128i tmp = _mm_sub_epi16( zero, vec );
    tmp = _mm_and_si128( tmp, vec );

    // Expand int16 lanes to int32, even/odd lanes in different vectors
    __m128i low = _mm_blend_epi16( zero, tmp, 0b01010101 );
    __m128i high = _mm_srli_epi32( tmp, 16 );
    // Convert int32 to fp32
    low = _mm_castps_si128( _mm_cvtepi32_ps( low ) );
    high = _mm_castps_si128( _mm_cvtepi32_ps( high ) );
    // The mantissa is 0, the input is either 0 or 2^n where n is a small integer
    // The sign bit is unset, the only part of these fp32 numbers is exponent
    // Merge two vectors into a single one
    low = _mm_srli_epi32( low, 23 );
    high = _mm_srli_epi32( high, 23 - 16 );
    tmp = _mm_or_si128( low, high );

    // Now we have a vector of 16 bit lanes containing the exponents
    // When 0, we should return 16
    // Otherwise, we should return ( val - 127 )
    const __m128i bias = _mm_set1_epi16( 127 );
    tmp = _mm_sub_epi16( tmp, bias );
    return _mm_min_epu16( tmp, _mm_set1_epi16( 16 ) );
}

由于_mm_blend_epi16和_mm_min_epu16指令，上述代码需要SSE 4.1。

赞(0）回复(0）举报 2022-12-20

c3frrgcw3#

参见Peter Cordes的answer，这个答案只对8位通道有意义。

__m128i sse2_tzcnt_epi16(__m128i v) {
    const __m128i x0000 = _mm_setzero_si128();
    const __m128i x5555 = _mm_set1_epi16(0x5555);
    const __m128i x3333 = _mm_set1_epi16(0x3333);
    const __m128i x0F0F = _mm_set1_epi16(0x0F0F);
    const __m128i x00FF = _mm_set1_epi16(0x00FF);

    __m128i r;
    v = _mm_and_si128(v, _mm_sub_epi16(x0000, v));
    r = _mm_slli_epi16(_mm_cmpeq_epi16(_mm_and_si128(x5555, v), x0000), 15);
    r = _mm_avg_epu16(r, _mm_cmpeq_epi16(_mm_and_si128(x3333, v), x0000));
    r = _mm_avg_epu16(r, _mm_cmpeq_epi16(_mm_and_si128(x0F0F, v), x0000));
    r = _mm_avg_epu16(r, _mm_cmpeq_epi16(_mm_and_si128(x00FF, v), x0000));
    r = _mm_sub_epi16(_mm_srli_epi16(r, 12), _mm_cmpeq_epi16(v, x0000));
    return r;
}

__m128i ssse3_tzcnt_epi16(__m128i v) {
    const __m128i lut_lo = _mm_set_epi8(8, 9, 8, 10, 8, 9, 8, 11, 8, 9, 8, 10, 8, 9, 8, 16);
    const __m128i lut_hi = _mm_set_epi8(12, 13, 12, 14, 12, 13, 12, 15, 12, 13, 12, 14, 12, 13, 12, 16);
    const __m128i nibble_mask = _mm_set1_epi8(0x0F);
    __m128i t;

    t = _mm_and_si128(nibble_mask, v);
    v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
    t = _mm_shuffle_epi8(lut_lo, t);
    v = _mm_shuffle_epi8(lut_hi, v);
    v = _mm_min_epu8(v, t);
    t = _mm_xor_si128(_mm_set1_epi8(8), v);
    v = _mm_min_epu8(_mm_srli_epi16(v, 8), t);
    return v;
}

赞(0）回复(0）举报 2022-12-20

我来回答

c++ 作为练习，尝试编写Gerd Isenberg的位向前扫描的矢量化实现

3条答案

带SSSE 3的16位元素的BSF

相关问题

热门标签

最新问答