C语言 x86内部:如何优化FIR的外/内环

bvuwiixz  于 2023-04-19  发布在  其他
关注(0)|答案(2)|浏览(131)

以下代码用于计算FIR:

void Fir(float* pIn, float* pOut, float* pCoeff, float* pStage, uint32_t N, uint32_t FilterLength)
{
    int n, k;
    float* pSrc;
    float* pCoeffSrc = pCoeff;
    float* pDst = pOut;
    float s0, s1, s2, s3;
    __m128 Vec, Mul;
    __m128 Sum0,Sum1,Sum2,Sum3;

    __m128 Zero = _mm_set_ps1(0);
    memcpy(&pStage[FilterLength - 1], pIn, N * sizeof(float));

    for (n = 0; n < N; n+=4)
    {
        //Sum0
        pSrc = &pStage[n];
        Sum0 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;

        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc); 
            Sum0  = _mm_fmadd_ps(Coeff, Vec, Sum0);
            pCoeffSrc += 4;
            pSrc += 4;
        }

        Sum0 = _mm_hadd_ps(Sum0, Zero);
        Sum0 = _mm_hadd_ps(Sum0, Zero);

        //Sum1
        pSrc = &pStage[n+1];
        Sum1 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;

        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum1 = _mm_fmadd_ps(Coeff, Vec, Sum1);
            pCoeffSrc += 4;
            pSrc += 4;
        }

        Sum1 = _mm_hadd_ps(Sum1, Zero);
        Sum1 = _mm_hadd_ps(Sum1, Zero);

        //Sum2
        pSrc = &pStage[n+2];
        Sum2 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;

        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum2 = _mm_fmadd_ps(Coeff, Vec, Sum2);
            pCoeffSrc += 4;
            pSrc += 4;
        }

        Sum2 = _mm_hadd_ps(Sum2, Zero);
        Sum2 = _mm_hadd_ps(Sum2, Zero);

        //Sum3
        pSrc = &pStage[n+3];
        Sum3 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;

        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum3 = _mm_fmadd_ps(Coeff, Vec, Sum3);
            pCoeffSrc += 4;
            pSrc += 4;
        }

        Sum3 = _mm_hadd_ps(Sum3, Zero);
        Sum3 = _mm_hadd_ps(Sum3, Zero);

        Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
        _mm_store_ps(pDst, Vec);
        pDst+=4;
    }
}

每个内部循环(4)的结果是一个向量的标量和。然后我通过以下方式从4个标量创建一个向量:
Vec = _mm_set_ps(Sum3.m128_f32[0],Sum2.m128_f32[0],Sum1.m128_f32[0],Sum0.m128_f32[0]);
Vec通过以下方式存储在RAM中:_mm_store_ps(pDst,Vec);
我可以优化这个代码吗?
谢谢你,兹维卡

thtygnil

thtygnil1#

以下版本速度更快,但未完全优化。没有memcpy,内部循环较小,并且取决于外部循环的索引。

void Fir(float* pIn, float* pOut, float* pCoeff, uint32_t N, uint32_t FilterLength)
{
    int n, k;
    float* pSrc;
    float* pCoeffSrc = pCoeff;
    float* pDst = pOut;
    __m128 Vec, Mul;
    __m128 Sum;;
    __m128 Zero = _mm_set_ps1(0);
    uint32_t Offset;

    for (n = 0; n < N; n++)
    {
        pSrc = pIn;
        Sum = _mm_set_ps1(0);
        Offset = FilterLength - 1 - n;
        pCoeffSrc = pCoeff + Offset;

        for (k= Offset; k<FilterLength; k+=4)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc); 
            Sum  = _mm_fmadd_ps(Coeff, Vec, Sum);
            pCoeffSrc += 4;
            pSrc += 4;
        }

        Sum = _mm_hadd_ps(Sum, Zero);
        Sum = _mm_hadd_ps(Sum, Zero);
        *pDst = Sum.m128_f32[0];
        pDst++;
    }
}

我知道_mm_load_ps在对齐地址上不起作用。

w6lpcovy

w6lpcovy2#

下面的代码运行速度比前一个快了x3。主要变化:__m256而不是__m128

void Fir(float* pIn, float* pOut, float* pCoeff, uint32_t N, uint32_t FilterLength)
{
    int n, k;
    float* pSrc;
    float* pCoeffSrc = pCoeff;
    float* pDst = pOut;

    __m256 Vec, Mul;
    __m256 Sum;;
    __m256 Zero = _mm256_setr_ps(0, 0, 0, 0, 0, 0, 0, 0);
    uint32_t Offset;

    for (n = 0; n < N; n++)
    {
        pSrc = pIn;
        Sum = _mm256_setr_ps(0,0,0,0,0,0,0,0);
        Offset = FilterLength - 1 - n;
        pCoeffSrc = pCoeff + Offset;

        for (k = Offset; k < FilterLength; k += 8)
        {
            __m256 Coeff = _mm256_load_ps(pCoeffSrc);
            Vec = _mm256_load_ps(pSrc);
            Sum = _mm256_fmadd_ps(Coeff, Vec, Sum);
            pCoeffSrc += 8;
            pSrc += 8;
        }

        *pDst = Sum.m256_f32[0] + Sum.m256_f32[1] + Sum.m256_f32[2] + Sum.m256_f32[3] + Sum.m256_f32[4] + Sum.m256_f32[5] + Sum.m256_f32[6] + Sum.m256_f32[7];
        pDst++;
    }
}

我敢肯定这段代码没有完全优化。例如,8个元素的总和。

相关问题