我正在使用System.Buffers.Binary.BinaryPrimitives以精确的方式将值写入字节数组。没有来自MS的示例，我可以看到几种方法来完成它，但我不确定其中一种是否比另一种更好。原则上，创建大量Span<byte>对象的需要似乎不太理想？
考虑这个简单的例子：

// Builds a new 16-byte buffer containing i1, i2, s1 and s2, in that order, as little-endian values.
// Only bytes 0-11 are written; the buffer is 16 bytes long because the external protocol requires padding.
byte[] PopulateBuffer(int i1, int i2, Int16 s1, Int16 s2)
{
    byte[] result = new byte[16];
    Span<byte> destination = new Span<byte>(result);

    // Each value is written through its own exactly-sized slice of the buffer.
    BinaryPrimitives.WriteInt32LittleEndian(destination.Slice(start: 0, length: 4), value: i1);
    BinaryPrimitives.WriteInt32LittleEndian(destination.Slice(start: 4, length: 4), value: i2);
    BinaryPrimitives.WriteInt16LittleEndian(destination.Slice(start: 8, length: 2), value: s1);
    BinaryPrimitives.WriteInt16LittleEndian(destination.Slice(start: 10, length: 2), value: s2);

    return result;
}

我在这里实例化了5个Span对象。与老式的通过位移位手动获取字节的方法相比，这看起来确实很混乱，但实际上开销很大吗？有没有更好的方法来使用这个类？

TL;DR：从下面的结果来看，基于Span的方法似乎比替代方法要快得多。
注意，Span<T>是一个值类型，JIT在看穿它方面做得相当不错。
我创建了一个简化测试：

using System;
using System.Buffers.Binary;
public class C
{
    /// <summary>
    /// Returns a new 6-byte array holding <paramref name="i1"/> (bytes 0-3) followed by
    /// <paramref name="s2"/> (bytes 4-5), both little-endian, written via <see cref="BinaryPrimitives"/>.
    /// </summary>
    byte[] PopulateBufferSpan(int i1, short s2)
    {
        byte[] result = new byte[6];
        Span<byte> destination = new Span<byte>(result);

        BinaryPrimitives.WriteInt32LittleEndian(destination.Slice(start: 0, length: 4), value: i1);
        BinaryPrimitives.WriteInt16LittleEndian(destination.Slice(start: 4, length: 2), value: s2);

        return result;
    }

    /// <summary>
    /// Returns the same 6-byte little-endian layout as <see cref="PopulateBufferSpan"/>,
    /// produced by shifting out and storing one byte at a time.
    /// </summary>
    byte[] PopulateBufferExplicit(int i1, short s2)
    {
        byte[] result = new byte[6];

        // An unchecked narrowing cast keeps only the low 8 bits of each shifted value.
        unchecked
        {
            result[0] = (byte)i1;
            result[1] = (byte)(i1 >> 8);
            result[2] = (byte)(i1 >> 16);
            result[3] = (byte)(i1 >> 24);
            result[4] = (byte)s2;
            result[5] = (byte)(s2 >> 8);
        }

        return result;
    }
}

JIT 编译后生成的代码如下：

C.PopulateBufferSpan(Int32, Int16)
    L0000: push rdi
    L0001: push rsi
    L0002: sub rsp, 0x28
    L0006: mov esi, edx
    L0008: mov edi, r8d
    L000b: mov rcx, 0x7ffec35e2360
    L0015: mov edx, 0x6
    L001a: call 0x7fff230847e0
    L001f: lea rdx, [rax+0x10]
    L0023: mov ecx, 0x6
    L0028: mov r8d, ecx
    L002b: cmp r8, 0x4
    L002f: jb L0051
    L0031: mov r8, rdx
    L0034: mov [r8], esi
    L0037: mov ecx, ecx
    L0039: cmp rcx, 0x6
    L003d: jb L0057
    L003f: add rdx, 0x4
    L0043: movsx rcx, di
    L0047: mov [rdx], cx
    L004a: add rsp, 0x28
    L004e: pop rsi
    L004f: pop rdi
    L0050: ret
    L0051: call System.ThrowHelper.ThrowArgumentOutOfRangeException()
    L0056: int3
    L0057: call System.ThrowHelper.ThrowArgumentOutOfRangeException()
    L005c: int3
C.PopulateBufferExplicit(Int32, Int16)
    L0000: push rdi
    L0001: push rsi
    L0002: sub rsp, 0x28
    L0006: mov esi, edx
    L0008: mov edi, r8d
    L000b: mov rcx, 0x7ffec35e2360
    L0015: mov edx, 0x6
    L001a: call 0x7fff230847e0
    L001f: mov [rax+0x10], sil
    L0023: mov edx, esi
    L0025: sar edx, 0x8
    L0028: mov [rax+0x11], dl
    L002b: mov edx, esi
    L002d: sar edx, 0x10
    L0030: mov [rax+0x12], dl
    L0033: sar esi, 0x18
    L0036: mov [rax+0x13], sil
    L003a: movsx rdx, di
    L003e: mov [rax+0x14], dl
    L0041: sar edx, 0x8
    L0044: mov [rax+0x15], dl
    L0047: add rsp, 0x28
    L004b: pop rsi
    L004c: pop rdi
    L004d: ret

正如您所看到的，这两个版本的复杂性差别很小，只是使用BinaryPrimitives的版本有一些范围检查（这不是坏事）。
请注意，JIT现在是多层的，我认为SharpLab只显示第一层的结果，所以如果它在热路径上，这可能会得到很好的改善。
SharpLab链接
我还使用BenchmarkDotNet运行了一个基准测试：

public class MyBenchmark
{
    private byte[] buffer = new byte[32];
    /// <summary>Benchmark case: calls <c>PopulateBufferLESpanImpl</c> with the constant arguments 1, 2, 3, 4.</summary>
    [Benchmark]
    public void PopulateBufferLESpan() => PopulateBufferLESpanImpl(1, 2, 3, 4);
    /// <summary>Benchmark case: calls <c>PopulateBufferLEExplicitImpl</c> with the constant arguments 1, 2, 3, 4.</summary>
    [Benchmark]
    public void PopulateBufferLEExplicit() => PopulateBufferLEExplicitImpl(1, 2, 3, 4);
    /// <summary>Benchmark case: calls <c>PopulateBufferBESpanImpl</c> with the constant arguments 1, 2, 3, 4.</summary>
    [Benchmark]
    public void PopulateBufferBESpan() => PopulateBufferBESpanImpl(1, 2, 3, 4);
    /// <summary>Benchmark case: calls <c>PopulateBufferBEExplicitImpl</c> with the constant arguments 1, 2, 3, 4.</summary>
    [Benchmark]
    public void PopulateBufferBEExplicit() => PopulateBufferBEExplicitImpl(1, 2, 3, 4);
    /// <summary>
    /// Writes <paramref name="i1"/>, <paramref name="i2"/>, <paramref name="s1"/> and <paramref name="s2"/>
    /// little-endian into bytes 0-11 of the <c>buffer</c> field, using <see cref="BinaryPrimitives"/> on span slices.
    /// </summary>
    private void PopulateBufferLESpanImpl(int i1, int i2, short s1, short s2)
    {
        Span<byte> destination = new Span<byte>(buffer);

        BinaryPrimitives.WriteInt32LittleEndian(destination.Slice(start: 0, length: 4), value: i1);
        BinaryPrimitives.WriteInt32LittleEndian(destination.Slice(start: 4, length: 4), value: i2);
        BinaryPrimitives.WriteInt16LittleEndian(destination.Slice(start: 8, length: 2), value: s1);
        BinaryPrimitives.WriteInt16LittleEndian(destination.Slice(start: 10, length: 2), value: s2);
    }
    /// <summary>
    /// Writes <paramref name="i1"/>, <paramref name="i2"/>, <paramref name="i3"/> and <paramref name="i4"/>
    /// little-endian (least significant byte first) into bytes 0-11 of the <c>buffer</c> field, one byte at a time.
    /// </summary>
    private void PopulateBufferLEExplicitImpl(int i1, int i2, short i3, short i4)
    {
        // An unchecked narrowing cast keeps only the low 8 bits of each shifted value.
        unchecked
        {
            // i1 -> bytes 0..3
            buffer[0] = (byte)i1;
            buffer[1] = (byte)(i1 >> 8);
            buffer[2] = (byte)(i1 >> 16);
            buffer[3] = (byte)(i1 >> 24);

            // i2 -> bytes 4..7
            buffer[4] = (byte)i2;
            buffer[5] = (byte)(i2 >> 8);
            buffer[6] = (byte)(i2 >> 16);
            buffer[7] = (byte)(i2 >> 24);

            // i3 -> bytes 8..9
            buffer[8] = (byte)i3;
            buffer[9] = (byte)(i3 >> 8);

            // i4 -> bytes 10..11
            buffer[10] = (byte)i4;
            buffer[11] = (byte)(i4 >> 8);
        }
    }
    /// <summary>
    /// Writes <paramref name="i1"/>, <paramref name="i2"/>, <paramref name="s1"/> and <paramref name="s2"/>
    /// big-endian into bytes 0-11 of the <c>buffer</c> field, using <see cref="BinaryPrimitives"/> on span slices.
    /// </summary>
    private void PopulateBufferBESpanImpl(int i1, int i2, short s1, short s2)
    {
        Span<byte> destination = new Span<byte>(buffer);

        BinaryPrimitives.WriteInt32BigEndian(destination.Slice(start: 0, length: 4), value: i1);
        BinaryPrimitives.WriteInt32BigEndian(destination.Slice(start: 4, length: 4), value: i2);
        BinaryPrimitives.WriteInt16BigEndian(destination.Slice(start: 8, length: 2), value: s1);
        BinaryPrimitives.WriteInt16BigEndian(destination.Slice(start: 10, length: 2), value: s2);
    }
    /// <summary>
    /// Writes <paramref name="i1"/>, <paramref name="i2"/>, <paramref name="i3"/> and <paramref name="i4"/>
    /// big-endian (most significant byte first) into bytes 0-11 of the <c>buffer</c> field, one byte at a time.
    /// </summary>
    private void PopulateBufferBEExplicitImpl(int i1, int i2, short i3, short i4)
    {
        // i1 -> bytes 0..3, shifts 24, 16, 8, 0
        buffer[0] = (byte)((i1 >> 24) & 0xFF);
        buffer[1] = (byte)((i1 >> 16) & 0xFF);
        buffer[2] = (byte)((i1 >> 8) & 0xFF);
        buffer[3] = (byte)(i1 & 0xFF);

        // i2 -> bytes 4..7, same shift sequence as i1 (byte 6 carries bits 8..15)
        buffer[4] = (byte)((i2 >> 24) & 0xFF);
        buffer[5] = (byte)((i2 >> 16) & 0xFF);
        buffer[6] = (byte)((i2 >> 8) & 0xFF);
        buffer[7] = (byte)(i2 & 0xFF);

        // i3 -> bytes 8..9
        buffer[8] = (byte)((i3 >> 8) & 0xFF);
        buffer[9] = (byte)(i3 & 0xFF);

        // i4 -> bytes 10..11
        buffer[10] = (byte)((i4 >> 8) & 0xFF);
        buffer[11] = (byte)(i4 & 0xFF);
    }

结果如下：

BenchmarkDotNet=v0.11.5, OS=Windows 10.0.16299.1565 (1709/FallCreatorsUpdate/Redstone3)
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
Frequency=2062501 Hz, Resolution=484.8482 ns, Timer=TSC
.NET Core SDK=3.0.100
  [Host]     : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT
  DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT
|                   Method |     Mean |     Error |    StdDev |   Median |
|------------------------- |---------:|----------:|----------:|---------:|
|     PopulateBufferLESpan | 1.772 ns | 0.0629 ns | 0.0558 ns | 1.745 ns |
| PopulateBufferLEExplicit | 3.698 ns | 0.0689 ns | 0.0576 ns | 3.688 ns |
|     PopulateBufferBESpan | 2.532 ns | 0.0791 ns | 0.0740 ns | 2.531 ns |
| PopulateBufferBEExplicit | 4.003 ns | 0.1106 ns | 0.2951 ns | 3.872 ns |

也许令人惊讶的是，基于Span的方法比位操作快得多，这可能是因为x86是little-endian，BinaryPrimitives意识到它可以直接将值位块传输到数组中，而无需单独提取和分配每个字节，但BE变体也显示出相当大的差异。

展开查看全部

2条答案

按热度按时间

xqkwcwgp1#

赞(0）回复(0）举报 2023-01-22

byqmnocz2#

Span是一个ref struct类型。创建Span只会生成一个非常轻量的对象，它引用原始数组以及您所声明范围的起止位置。
Span、ReadOnlySpan和Memory是专门为更好地处理序列（尤其是内存/字节序列）而引入的。
你可以把Span看作数组段指针。创建、复制和访问这种指针结构的成本相对较低。当操作它时，你仍然在原始的底层数组上操作。（不会创建额外的数组实例。）
你提到了位移位，我想你的意思是作为BinaryPrimitives.WriteInt32LittleEndian的替代品。
如果不使用WriteInt32方法，而是手动提取四个字节并按索引逐个写入数组，就需要执行多次位操作，还可能妨碍CPU通过SIMD指令、分支预测和缓存对线性的向量化操作进行优化。
很难预测哪种方法性能更好，为了确定性能差异，您必须专门测试您的用例。
通常，使用Span不是一个昂贵的操作，使用标准库提供的方法更可取[而不是手动复制它们的行为]。

.net 使用BinaryPrimitives填充字节缓冲区的首选方法？

2条答案

相关问题

热门标签

最新问答