I am learning how to use SIMD for image processing. However, I wonder why I have not seen much improvement in the performance after using SIMD.

  • Image size: 3840*2160
  • Image format: PixelFormat.Format24bppRgb
  • Function: 24bit to 8bit
  1. Normal way with unsafe & Parallel.For: 48 ms on average
/// <summary>
/// Converts a 3-bytes-per-pixel image to one gray byte per pixel, processing rows in parallel.
/// </summary>
/// <param name="org">Source bitmap data; each pixel is read as three bytes in blue, green, red order.</param>
/// <param name="des">Destination bitmap data; one byte is written per pixel.</param>
/// <remarks>
/// Uses 16.16 fixed-point weights (19595 + 38469 + 7472 = 65536), so the weighted sum is
/// shifted right by 16 to land back in the 0..255 range.
/// NOTE(review): presumably <paramref name="des"/> is an 8bpp buffer with the same
/// width/height as <paramref name="org"/> - confirm at the call site.
/// </remarks>
static void GrayViaParallel(BitmapData org, BitmapData des)
{
    int columns = org.Width;
    int rows = org.Height;

    byte* source = (byte*)org.Scan0.ToPointer();
    byte* target = (byte*)des.Scan0.ToPointer();

    Parallel.For(0, rows, row =>
    {
        // Rows are addressed through the stride because a scan line may be padded.
        int srcIndex = row * org.Stride;
        int dstIndex = row * des.Stride;
        int dstEnd = dstIndex + columns;

        while (dstIndex < dstEnd)
        {
            int blue = source[srcIndex];
            int green = source[srcIndex + 1];
            int red = source[srcIndex + 2];

            target[dstIndex] = (byte)((red * 19595 + green * 38469 + blue * 7472) >> 16);

            dstIndex++;
            srcIndex += 3;
        }
    });
}
  2. Implemented SIMD: 32 ms on average
/// <summary>
/// Converts <paramref name="count"/> tightly packed 3-byte (blue, green, red) pixels to one
/// gray byte each, using SSE2 for whole 5-pixel blocks and scalar code for the remainder.
/// </summary>
/// <param name="src">Pointer to the first source byte; <c>count * 3</c> bytes are read.</param>
/// <param name="dst">Pointer to the first destination byte; <c>count</c> bytes are written.</param>
/// <param name="count">Number of pixels to convert. Values less than or equal to zero do nothing.</param>
/// <remarks>
/// <para>
/// Each SIMD step loads 16 bytes but consumes only 15 (five pixels); the 16th byte is
/// multiplied by a zero coefficient. A block is therefore only vectorized when all 16
/// bytes lie inside the source buffer. Every pixel not covered that way is converted by
/// the scalar loop at the end, so all <paramref name="count"/> output bytes are written.
/// </para>
/// <para>
/// Each channel is scaled with <c>(value * weight) &gt;&gt; 16</c> before the three channels
/// are added. Because every product is truncated separately, the result can be up to 2
/// lower than summing first and shifting once. The scalar tail uses the same arithmetic
/// so the whole output is consistent.
/// </para>
/// <para>
/// NOTE(review): mulBlue / mulGreen / mulRed are declared outside this block; presumably
/// they are the unsigned 16-bit fixed-point weights - confirm. The buffer is treated as
/// contiguous, so callers with padded scan lines must call this once per row.
/// </para>
/// </remarks>
static void GrayViaParallelAndSIMD(byte* src, byte* dst, int count)
{
    if (count <= 0)
    {
        return;
    }

    const int BytesPerBlock = 15;   // 5 pixels * 3 bytes consumed per 16-byte load
    const int PixelsPerBlock = 5;
    const int LoadWidth = 16;       // bytes touched by one LoadVector128
    const int BlocksPerTask = 4096; // batch size so one delegate call covers many blocks

    int allBytes = count * 3;

    // Block i reads bytes [15 * i, 15 * i + 16), so it is safe only while
    // 15 * i + 16 <= allBytes. Without SSE2 everything goes through the scalar loop.
    int simdBlocks = 0;
    if (Sse2.IsSupported && allBytes >= LoadWidth)
    {
        simdBlocks = (allBytes - LoadWidth) / BytesPerBlock + 1;
    }

    if (simdBlocks > 0)
    {
        // Lane layout matches the byte order B,G,R,B,G,R,... after widening to 16 bits.
        var coeLeft = Vector128.Create(mulBlue, mulGreen, mulRed, mulBlue, mulGreen, mulRed, mulBlue, mulGreen);
        // Last lane is zero: it lines up with the 16th byte, which belongs to the next block.
        var coeRight = Vector128.Create(mulRed, mulBlue, mulGreen, mulRed, mulBlue, mulGreen, mulRed, 0);

        int taskCount = (simdBlocks + BlocksPerTask - 1) / BlocksPerTask;

        Parallel.For(0, taskCount, task =>
        {
            int firstBlock = task * BlocksPerTask;
            int lastBlock = firstBlock + BlocksPerTask;
            if (lastBlock > simdBlocks)
            {
                lastBlock = simdBlocks;
            }

            // Copy to locals so the inner loop does not re-read closure fields.
            var left = coeLeft;
            var right = coeRight;
            byte* s = src + firstBlock * BytesPerBlock;
            byte* d = dst + firstBlock * PixelsPerBlock;

            for (int block = firstBlock; block < lastBlock; block++)
            {
                Vector128<byte> pixels = Sse2.LoadVector128(s);

                // Zero-extend the low and high 8 bytes to eight 16-bit lanes each.
                Vector128<ushort> low = Sse2.UnpackLow(pixels, Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> high = Sse2.UnpackHigh(pixels, Vector128<byte>.Zero).AsUInt16();

                // MultiplyHigh keeps the upper 16 bits of each product, i.e. (value * weight) >> 16.
                var lowMul = Sse2.MultiplyHigh(left, low);
                var highMul = Sse2.MultiplyHigh(right, high);

                //                 Blue                    Green                   Red
                d[0] = (byte)(lowMul.GetElement(0)  + lowMul.GetElement(1)  + lowMul.GetElement(2));
                d[1] = (byte)(lowMul.GetElement(3)  + lowMul.GetElement(4)  + lowMul.GetElement(5));
                d[2] = (byte)(lowMul.GetElement(6)  + lowMul.GetElement(7)  + highMul.GetElement(0));
                d[3] = (byte)(highMul.GetElement(1) + highMul.GetElement(2) + highMul.GetElement(3));
                d[4] = (byte)(highMul.GetElement(4) + highMul.GetElement(5) + highMul.GetElement(6));

                s += BytesPerBlock;
                d += PixelsPerBlock;
            }
        });
    }

    // Scalar tail: pixels that did not fit in a full, safely loadable SIMD block.
    for (int pixel = simdBlocks * PixelsPerBlock; pixel < count; pixel++)
    {
        byte* p = src + pixel * 3;
        dst[pixel] = (byte)(((p[0] * mulBlue) >> 16) + ((p[1] * mulGreen) >> 16) + ((p[2] * mulRed) >> 16));
    }
}

Is there a better way to do this, or how can I enhance it, please? I would be very grateful for any advice.

I have experimented with different SIMD approaches, but none of them seem to work well. I am looking for a substantial boost in efficiency.

0

There are 0 best solutions below