My input consists of two float vectors: one holds the real part of a complex signal, and the other holds the imaginary part of the same signal.
I developed the following code to compute an FIR filter on float input. For the FIR case, I suspect the input should be interleaved complex float (Re, Im, Re, Im, ...) rather than separate float arrays (Re, Re, ...).
// Adds the eight float lanes of an AVX register and returns the total.
static inline float FirHorizontalSum(__m256 Vec)
{
    // Fold the upper 128-bit half onto the lower half. _mm256_extractf128_ps
    // only needs AVX, unlike the AVX-512VL form _mm256_extractf32x4_ps.
    const __m128 Low = _mm256_castps256_ps128(Vec);
    const __m128 High = _mm256_extractf128_ps(Vec, 1);
    __m128 Sum128 = _mm_add_ps(Low, High);
    // Two horizontal adds leave the sum of all four lanes in lane 0.
    Sum128 = _mm_hadd_ps(Sum128, Sum128);
    Sum128 = _mm_hadd_ps(Sum128, Sum128);
    return _mm_cvtss_f32(Sum128);
}

// Applies a real-coefficient FIR filter to a complex signal stored as two
// separate (split) float arrays, one for the real and one for the imaginary
// part. The same real taps are applied to both parts independently, so no
// interleaving or shuffling of Re/Im is required.
//
// For every n in [0, N):
//   out[n] = sum over j of pCoeff[FilterLength - 1 - (n - j)] * in[j],
//   with j running from max(0, n - FilterLength + 1) to n.
// That is, pCoeff[FilterLength - 1] multiplies the newest sample in[n] and
// pCoeff[0] the oldest one in the window; samples before in[0] count as zero.
//
// pInRe, pInIm   : input real / imaginary parts, N floats each.
// pOutRe, pOutIm : output real / imaginary parts, N floats each. They must not
//                  alias the inputs, because later outputs re-read earlier
//                  input samples.
// pCoeff         : FilterLength real taps, in the order described above.
// N              : number of samples to produce.
// FilterLength   : number of taps; 0 yields an all-zero output.
//
// No alignment or padding is required of any buffer: all vector loads are
// unaligned and the tail that does not fill a whole vector is done in scalar.
void CVector::Fir(float* pInRe, float *pInIm, float* pOutRe,
    float *pOutIm, float* pCoeff, uint32_t N, uint32_t FilterLength)
{
    constexpr uint32_t kLanes = 8; // floats per __m256

    for (uint32_t n = 0; n < N; ++n)
    {
        // During start-up only n + 1 samples exist; afterwards the window is
        // the full filter length. Clamping here avoids the unsigned wrap that
        // "FilterLength - 1 - n" suffers once n reaches FilterLength.
        const uint32_t Taps = (n + 1 < FilterLength) ? (n + 1) : FilterLength;

        // The window always ends at the last tap and at sample n.
        const float* pTap = pCoeff + (FilterLength - Taps);
        const float* pSrcRe = pInRe + (n + 1 - Taps);
        const float* pSrcIm = pInIm + (n + 1 - Taps);

        __m256 SumRe = _mm256_setzero_ps();
        __m256 SumIm = _mm256_setzero_ps();

        // Whole groups of eight taps. The window start moves by one float per
        // output sample, so the addresses are generally not 32-byte aligned.
        uint32_t k = 0;
        for (; Taps - k >= kLanes; k += kLanes)
        {
            const __m256 Coeff = _mm256_loadu_ps(pTap + k);
            SumRe = _mm256_fmadd_ps(Coeff, _mm256_loadu_ps(pSrcRe + k), SumRe);
            SumIm = _mm256_fmadd_ps(Coeff, _mm256_loadu_ps(pSrcIm + k), SumIm);
        }

        float AccRe = FirHorizontalSum(SumRe);
        float AccIm = FirHorizontalSum(SumIm);

        // Remaining (at most seven) taps: scalar, so nothing is read past the
        // end of the coefficient array or beyond sample n.
        for (; k < Taps; ++k)
        {
            AccRe += pTap[k] * pSrcRe[k];
            AccIm += pTap[k] * pSrcIm[k];
        }

        pOutRe[n] = AccRe;
        pOutIm[n] = AccIm;
    }
}
Can you please advise whether the input (and output) should be interleaved (Re, Im, Re, Im, ...) or kept as separate real and imaginary arrays?