x86 Intrinsic : FIR for complex float input

63 Views Asked by At

My input is 2 float vectors. One is the real part of complex input. The second is the imaginary part of the same complex input.

I developed the following code to compute an FIR filter on float input. For an FIR on complex data, I suspect the input should be interleaved complex float (Re, Im, Re, Im, ...) rather than separate float arrays (Re, Re, ...).

    // Adds the eight float lanes of v together and returns the scalar total.
    // Uses only AVX/SSE3 instructions (no AVX-512 extract is required).
    static inline float FirHorizontalSum(__m256 v)
    {
        const __m128 Low = _mm256_castps256_ps128(v);      // lanes 0..3
        const __m128 High = _mm256_extractf128_ps(v, 1);   // lanes 4..7
        __m128 Sum128 = _mm_add_ps(Low, High);
        Sum128 = _mm_hadd_ps(Sum128, Sum128);
        Sum128 = _mm_hadd_ps(Sum128, Sum128);
        return _mm_cvtss_f32(Sum128);
    }

    // Real-coefficient FIR applied to a complex signal held as two separate
    // float arrays (real parts and imaginary parts).
    //
    // For every output index n the function computes
    //     out[n] = sum_{i} pCoeff[FilterLength - Taps + i] * in[n + 1 - Taps + i],
    //     i = 0 .. Taps - 1,  Taps = min(n + 1, FilterLength)
    // independently for the real and the imaginary array. While n < FilterLength
    // only the last n + 1 coefficients overlap the input (start-up transient);
    // afterwards the full coefficient array slides over the input.
    //
    // pInRe, pInIm    input real / imaginary samples, at least N floats each
    // pOutRe, pOutIm  output real / imaginary samples, at least N floats each;
    //                 must not overlap the inputs, since earlier input samples
    //                 are re-read for later outputs
    // pCoeff          FilterLength real coefficients
    // N               number of output samples to produce
    // FilterLength    number of coefficients; 0 yields all-zero output
    //
    // No alignment is required on any pointer.
    void CVector::Fir(float* pInRe, float *pInIm, float* pOutRe,
                      float *pOutIm, float* pCoeff, uint32_t N, uint32_t FilterLength)
    {
        for (uint32_t n = 0; n < N; n++)
        {
            // Number of coefficients that overlap the input seen so far.
            const uint32_t Taps = (n + 1 < FilterLength) ? (n + 1) : FilterLength;

            // During start-up only the tail of the coefficient array is used;
            // once n >= FilterLength the input window slides instead.
            const float* pCoeffSrc = pCoeff + (FilterLength - Taps);
            const float* pSrcRe = pInRe + (n + 1 - Taps);
            const float* pSrcIm = pInIm + (n + 1 - Taps);

            __m256 SumRe = _mm256_setzero_ps();
            __m256 SumIm = _mm256_setzero_ps();

            // Full 8-wide blocks. Unaligned loads are mandatory: the coefficient
            // and input windows start at arbitrary float offsets.
            uint32_t k = 0;
            for (; Taps - k >= 8; k += 8)
            {
                const __m256 Coeff = _mm256_loadu_ps(pCoeffSrc + k);
                const __m256 VecRe = _mm256_loadu_ps(pSrcRe + k);
                const __m256 VecIm = _mm256_loadu_ps(pSrcIm + k);
                SumRe = _mm256_fmadd_ps(Coeff, VecRe, SumRe);
                SumIm = _mm256_fmadd_ps(Coeff, VecIm, SumIm);
            }

            float AccRe = FirHorizontalSum(SumRe);
            float AccIm = FirHorizontalSum(SumIm);

            // Scalar tail: handles the remaining (Taps % 8) products without
            // reading past the end of the coefficient or input arrays.
            for (; k < Taps; k++)
            {
                AccRe += pCoeffSrc[k] * pSrcRe[k];
                AccIm += pCoeffSrc[k] * pSrcIm[k];
            }

            pOutRe[n] = AccRe;
            pOutIm[n] = AccIm;
        }
    }

Can you please advise whether the input (and output) should be interleaved (Re, Im, Re, Im, ...) or kept as separate real and imaginary arrays?

0

There are 0 best solutions below