以下代码用于计算FIR:
void Fir(float* pIn, float* pOut, float* pCoeff, float* pStage, uint32_t N, uint32_t FilterLength)
{
int n, k;
float* pSrc;
float* pCoeffSrc = pCoeff;
float* pDst = pOut;
float s0, s1, s2, s3;
__m128 Vec, Mul;
__m128 Sum0,Sum1,Sum2,Sum3;
__m128 Zero = _mm_set_ps1(0);
memcpy(&pStage[FilterLength - 1], pIn, N * sizeof(float));
for (n = 0; n < N; n+=4)
{
//Sum0
pSrc = &pStage[n];
Sum0 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum0 = _mm_fmadd_ps(Coeff, Vec, Sum0);
pCoeffSrc += 4;
pSrc += 4;
}
Sum0 = _mm_hadd_ps(Sum0, Zero);
Sum0 = _mm_hadd_ps(Sum0, Zero);
//Sum1
pSrc = &pStage[n+1];
Sum1 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum1 = _mm_fmadd_ps(Coeff, Vec, Sum1);
pCoeffSrc += 4;
pSrc += 4;
}
Sum1 = _mm_hadd_ps(Sum1, Zero);
Sum1 = _mm_hadd_ps(Sum1, Zero);
//Sum2
pSrc = &pStage[n+2];
Sum2 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum2 = _mm_fmadd_ps(Coeff, Vec, Sum2);
pCoeffSrc += 4;
pSrc += 4;
}
Sum2 = _mm_hadd_ps(Sum2, Zero);
Sum2 = _mm_hadd_ps(Sum2, Zero);
//Sum3
pSrc = &pStage[n+3];
Sum3 = _mm_set_ps1(0);
pCoeffSrc = pCoeff;
for (k = 0; k < FilterLength >> 2; k++)
{
__m128 Coeff = _mm_load_ps(pCoeffSrc);
Vec = _mm_load_ps(pSrc);
Sum3 = _mm_fmadd_ps(Coeff, Vec, Sum3);
pCoeffSrc += 4;
pSrc += 4;
}
Sum3 = _mm_hadd_ps(Sum3, Zero);
Sum3 = _mm_hadd_ps(Sum3, Zero);
Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
_mm_store_ps(pDst, Vec);
pDst+=4;
}
}
每个内部循环(4)的结果是一个向量的标量和。然后我通过以下方式从4个标量创建一个向量:
Vec = _mm_set_ps(Sum3.m128_f32[0],Sum2.m128_f32[0],Sum1.m128_f32[0],Sum0.m128_f32[0]);
Vec通过以下方式存储在RAM中:_mm_store_ps(pDst,Vec);
我可以优化这个代码吗?
谢谢你,兹维卡
2条答案
按热度按时间thtygnil1#
以下版本速度更快,但未完全优化。没有memcpy,内部循环较小,并且取决于外部循环的索引。
我知道_mm_load_ps在对齐地址上不起作用。
w6lpcovy2#
下面的代码运行速度比前一个快了x3。主要变化:__m256而不是__m128
我敢肯定这段代码没有完全优化。例如,8个元素的总和。