如何在C中将float转换为half类型以及相反的方式

qacovj5a  于 2023-08-03  发布在  其他
关注(0)|答案(3)|浏览(283)

如何在C中将浮点数(float 32)转换为半浮点数(float 16),同时考虑NaN,Infinity等边缘情况。
我不需要算术运算,因为我只需要这些类型来满足支持它们的要求。所以 half 类型可以是 uint16_t 或对应的 typedef。我只找到了一些 C++ 的方法,或者一些没有考虑 NaN 等边缘情况的方法。
我需要将一个 float 转换成 half 类型,它可以表示为一个简单的 uint16_t,这个 uint16_t 只需包含 half 的二进制表示,因为我不会对它进行算术运算。我需要这个来满足某个库的要求。我不能使用现有的实现,因为它们是作为共享库构建的(而且大多是 C++),在这种情况下我无法使用。此外,GCC/Clang 的 __fp16 / _Float16 也无法使用,因为我要把代码编译为在隔离环境中运行的 WebAssembly,因此无法使用依赖本机环境的代码(无 WASI)(并且 EMCC 在使用 _FloatXX 类型时会抛出错误)。

g9icjywg

g9icjywg1#

您有多种选择:
1. 使用现有的实现,例如 Industrial Light & Magic 的实现;此外还有一些其他实现。
2. 使用内置函数(intrinsic),例如 Intel CPU 提供 _mm_cvtps_ph 和 _mm_cvtph_ps,它们一次最多可以转换 4 个值。
3. 利用你对 IEEE 754 float 和 half 浮点格式的了解自己编写。
编辑:由于您主要是想来回转换,因此在 ILM 代码中要查看的两个函数是:float 转 half:第 85 行;half 转 float:第 62 行。

vjrehmav

vjrehmav2#

16位到32位转换的代码是here
下面是快速编写的32位到16位转换的测试代码,主要基于算法here。我现在没有时间把它适当地记录下来,它可以改进。测试不检查NaN的有效载荷位(在有效数字段中)的处理。

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

//  Bit(N) evaluates to a uint32_t with only bit N set (2**N), for 0 <= N < 32.
#define Bit(N)  ((uint32_t) 1 << (N))

//  Mask(N) evaluates to a uint32_t with the N low bits set, for 0 <= N < 32.
#define Mask(N) (((uint32_t) 1 << (N)) - 1)

/*  Convert the encoding of a 32-bit floating-point number (1 sign bit,
    8 exponent field bits, 23 significand field bits) to the encoding of a
    16-bit floating-point number (1 sign bit, 5 exponent field bits,
    10 significand field bits), rounding to nearest with ties to even.

    Infinities keep their sign.  NaNs keep their sign and the top ten bits of
    their significand field; if those ten bits are all zero, the low bit is
    turned on so the result is still a NaN.  Magnitudes too large for the
    16-bit format become infinity and magnitudes too small become zero, in
    both cases with the sign copied.
*/
uint16_t ConvertF32ToF16(uint32_t x)
{
    //  Split the input; the sign is kept already positioned at bit 15.
    const uint32_t Sign     = (x >> 31) << 15;
    const uint32_t Biased32 = (x >> 23) & Mask(8);
    const uint32_t Fraction = x & Mask(23);

    //  Exponent field of all ones in the 16-bit format (infinity or NaN).
    const uint32_t AllOnes16 = Mask(5) << 10;

    //  Infinity or NaN in the 32-bit format.
    if (Biased32 == Mask(8))
    {
        if (Fraction == 0)
            return (uint16_t) (Sign | AllOnes16);

        //  NaN: keep the high ten significand bits, but never let them be
        //  all zero, since that would encode an infinity.
        uint32_t Payload = Fraction >> (23 - 10);
        if (Payload == 0)
            Payload = 1;
        return (uint16_t) (Sign | AllOnes16 | Payload);
    }

    //  Zeros and 32-bit subnormals are all far too small for the 16-bit
    //  format and become a signed zero.
    if (Biased32 == 0)
        return (uint16_t) Sign;

    //  Normal number: rebias the exponent for the 16-bit format.
    int Exponent = (int) Biased32 - 127 + 15;

    //  Too small to round up to even the smallest 16-bit subnormal.
    if (Exponent < -11)
        return (uint16_t) Sign;

    //  Significand with its leading one; the radix point lies between
    //  bits 23 and 22.
    uint32_t Significand = Bit(23) | Fraction;

    //  Result is subnormal (or rounds up to the smallest normal).
    if (Exponent < 1)
    {
        /*  Number of low bits discarded when the significand is aligned to
            the minimum exponent of the 16-bit format (14 to 25 here).
        */
        const int Discard = (1 - Exponent) + 13;

        //  Left-justify the discarded bits so they can be compared against
        //  one half (Bit(31)) of the new low bit.
        const uint32_t Dropped = Significand << (32 - Discard);
        Significand >>= Discard;

        //  Round to nearest, ties to even.
        if (Dropped > Bit(31) || (Dropped == Bit(31) && (Significand & 1)))
            ++Significand;

        /*  Significand is at most Bit(10) here.  When rounding carried all
            the way to Bit(10), that bit lands in the exponent field and
            yields the smallest normal number, which is the correct result.
        */
        return (uint16_t) (Sign | Significand);
    }

    //  Result is normal: drop the 13 low bits with round-to-nearest-even.
    const uint32_t Low13 = Significand & Mask(13);
    const int RoundUp =
        Low13 > Bit(12) || (Low13 == Bit(12) && (Significand & Bit(13)) != 0);
    if (RoundUp)
        Significand += Bit(13);
    Significand >>= 13;

    //  Rounding may have carried the significand up to 2; renormalize.
    if (Significand >= Bit(11))
    {
        ++Exponent;
        Significand >>= 1;
    }

    //  Exponent beyond the largest finite value: overflow to infinity.
    if (Exponent >= 31)
        return (uint16_t) (Sign | AllOnes16);

    return (uint16_t) (Sign | (uint32_t) Exponent << 10 | (Significand & Mask(10)));
}

#include <stdlib.h>

/*  Check ConvertF32ToF16 on one value.  The expected encoding is obtained
    by letting the compiler convert x to _Float16 and reading back its bits.
    On a mismatch, print both encodings and exit with a failure status.
*/
static void Test(float x)
{
    //  Reference: compiler-provided conversion, reinterpreted as 16 bits.
    union { _Float16 f; uint16_t u; } Expected = { x };

    //  Routine under test works on the raw 32-bit encoding of x.
    union { float f; uint32_t u; } Input = { x };
    uint16_t Observed = ConvertF32ToF16(Input.u);

    if (Observed != Expected.u)
    {
        printf("%a -> 0x%04hx but expected 0x%04hx.\n", x, Observed, Expected.u);
        exit(EXIT_FAILURE);
    }
}

#include <math.h>

/*  Test driver: checks a negative and a positive NaN, every value from
    -infinity up to (but excluding) +infinity in ascending order, both zeros
    explicitly, and finally +infinity.  Test() exits with a failure status
    on the first mismatch, so reaching the end means every value agreed.
*/
int main(void)
{
    Test(-NAN);
    Test(+NAN);
    for (float x = -INFINITY; x < INFINITY; x = nexttowardf(x, INFINITY))
        Test(x);

    /*  The two zeros compare equal, so stepping upward with nexttowardf
        moves from a zero directly to the smallest positive subnormal and
        the sweep above cannot visit both -0 and +0.  Test them explicitly.
    */
    Test(-0.0f);
    Test(+0.0f);

    Test(INFINITY);
}

字符串

qvtsj1bj

qvtsj1bj3#

下面我展示了一个 float 到 half 转换的 ISO-C99 实现,它已经过详尽的测试。以下假设适用:float 映射到 IEEE-754 binary32,而 half 映射到 IEEE-754 binary16;浮点和整数数据类型在存储时使用相同的字节序;转换到更窄的浮点类型时应使用*就近舍入、平局取偶*(round-to-nearest-or-even)的舍入模式。
作为黄金参考,测试框架使用2011年引入的x86-64指令集扩展F16C,以支持半精度(FP 16)作为存储类型。IEEE-754 NaN处理包含一些架构特定元素,下面的float2half_rn()函数旨在模拟x86-64行为。调整,例如切换到使用单个规范NaN编码,是微不足道的。
下面的代码来自我以前在BSD许可证here下发布的代码。我使用英特尔编译器版本13.1.3.198 Build 20130607来构建此代码,并在IvyBridge CPU上运行了详尽的测试。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "immintrin.h"

/* Return the object representation of float a as a 32-bit unsigned integer
   (a bit-for-bit reinterpretation, not a numeric conversion). */
uint32_t float_as_uint32 (float a)
{
    /* Reading the other member of a union reinterprets the stored bytes. */
    union { float f; uint32_t u; } pun;
    pun.f = a;
    return pun.u;
}

/* Convert a float (IEEE-754 binary32) to the bit pattern of a half
   (IEEE-754 binary16), rounding to nearest with ties to even.

   - Infinities map to the infinity of the same sign.
   - NaNs are quietened (quiet bit forced on) and keep their sign and the
     upper payload bits.
   - Magnitudes of 2^16 or more become infinity; magnitudes below 2^-25
     become a zero of the same sign.
*/
uint16_t float2half_rn (float a)
{
    const uint32_t bits      = float_as_uint32 (a);
    const uint32_t exp_field = bits & 0x7f800000;
    uint16_t half = (uint16_t)((bits >> 16) & 0x8000);   /* sign bit */

    /* binary32 exponent all ones: infinity or NaN */
    if (exp_field == 0x7f800000) {
        if ((bits & 0x7fffffff) == 0x7f800000) {
            return (uint16_t)(half | 0x7c00);            /* infinity */
        }
        /* NaN: force the quiet bit, keep the top nine payload bits below it */
        return (uint16_t)(half | 0x7e00 | ((bits >> (24 - 11)) & 0x1ff));
    }

    /* Below 2^-25 nothing can round up to the smallest half subnormal. */
    if (exp_field < 0x33000000) {
        return half;                                     /* signed zero */
    }

    const int exponent = (int)(exp_field >> 23) - 127;   /* unbiased */
    if (exponent > 15) {
        return (uint16_t)(half | 0x7c00);                /* overflow: infinity */
    }

    /* 24-bit significand with the implicit leading one made explicit */
    const uint32_t mant = (bits & 0x007fffff) | 0x00800000;
    uint32_t rest;                /* discarded bits, left-justified in 32 bits */

    if (exponent < -14) {
        /* result is a half subnormal: shift right by 14..24 positions */
        const int drop = -1 - exponent;
        half |= mant >> drop;
        rest = mant << (32 - drop);
    } else {
        /* result is a half normal: the leading one lands on bit 10 and so
           already contributes 1 to the exponent field, hence 14 not 15 */
        half |= mant >> (24 - 11);
        rest = mant << (32 - (24 - 11));
        half = half + ((14 + exponent) << 10);
    }

    /* Round to nearest or even.  A carry out of the significand bumps the
       exponent field, which also turns the largest finite value into
       infinity when required. */
    if ((rest > 0x80000000) || ((rest == 0x80000000) && (half & 1))) {
        half++;
    }
    return half;
}

/* Reference float-to-half conversion built on the x86 F16C intrinsic
   _mm_cvtps_ph, requested with round-to-nearest.  Returns the 16-bit half
   encoding of a.  Needs a CPU and compiler configuration with F16C. */
uint16_t float2half_rn_ref (float a)
{
    /* place a in every lane of the vector */
    __m128 pa = _mm_set_ps1 (a);
    /* convert the packed floats to packed halves */
    __m128i r16 = _mm_cvtps_ph (pa, _MM_FROUND_TO_NEAREST_INT);
    uint16_t res;
    /* copy out the first 16 bits of the result vector (the first half) */
    memcpy (&res, &r16, sizeof res);
    return res;
}

/* Return the float whose object representation is the 32-bit pattern a
   (a bit-for-bit reinterpretation, not a numeric conversion). */
float uint32_as_float (uint32_t a)
{
    /* Reading the other member of a union reinterprets the stored bytes. */
    union { uint32_t u; float f; } pun;
    pun.u = a;
    return pun.f;
}

/* Exhaustive test: for every 32-bit pattern, reinterpret it as a float and
   compare float2half_rn() against the F16C-based reference
   float2half_rn_ref().  Stops at the first mismatch and returns
   EXIT_FAILURE; returns EXIT_SUCCESS once all 2^32 patterns agree. */
int main (void)
{
    float arg;
    uint16_t resi, refi;
    uint32_t argi = 0;
    do {
        arg = uint32_as_float (argi);
        /* The reference implementation in this file is float2half_rn_ref();
           the previously called float_to_half() is not defined anywhere. */
        refi = float2half_rn_ref (arg);
        resi = float2half_rn (arg);
        if (resi != refi) {
            printf ("error @ %15.8e (%08x): resi=%04x  refi=%04x\n",
                    arg, argi, resi, refi);
            return EXIT_FAILURE;
        }
        argi++;
        /* Progress indicator every 2^24 values.  It ends in no newline, so
           flush explicitly or a buffered stdout would never show it. */
        if ((argi & 0xffffff) == 0) {
            printf ("\r%08x", argi);
            fflush (stdout);
        }
    } while (argi);   /* argi wraps to 0 after the last pattern */
    return EXIT_SUCCESS;
}

字符串

相关问题