I am learning how to use SIMD for image processing. However, I wonder why I have not seen much improvement in the performance after using SIMD.
- Image size: 3840*2160
- Image format: PixelFormat.Format24bppRgb
- Function: 24bit to 8bit
- Normal way with unsafe & Parallel.For: 48ms in average
static void GrayViaParallel(BitmapData org, BitmapData des)
{
int width = org.Width;
int height = org.Height;
var orgp = (byte*)org.Scan0.ToPointer();
var desp = (byte*)des.Scan0.ToPointer();
Parallel.For(0, height, i =>
{
int orgSd = i * org.Stride;
int desSd = i * des.Stride;
for (int j = 0; j < width; j++)
{
// Red Green Blue
desp[desSd] = (byte)((orgp[orgSd + 2] * 19595 + orgp[orgSd + 1] * 38469 + orgp[orgSd] * 7472) >> 16);
desSd++;
orgSd += 3;
}
});
}
- Implemented SIMD: 32ms in average
static void GrayViaParallelAndSIMD(byte* src, byte* dst, int count)
{
var Coeleft = Vector128.Create(mulBlue, mulGreen, mulRed, mulBlue, mulGreen, mulRed, mulBlue, mulGreen);
var CoeRight = Vector128.Create(mulRed, mulBlue, mulGreen, mulRed, mulBlue, mulGreen, mulRed, 0);
int allPixels = count * 3;
byte* srcEnd = src + allPixels; //Is it wrong?
int stride = 15; //Proceed 15 bytes per step
int loopCount = (int)((srcEnd - src) / stride);
Parallel.For(0, loopCount, i =>
{
int curPos = (i + 1) * stride;
if (curPos < allPixels) //If not added, it will exceed the image data
{
// Load the first 16 bytes of the pixels
var _1st16bytes = Sse2.LoadVector128(src + i * stride);
// Get the first 8 bytes
var low = Sse2.UnpackLow(_1st16bytes, Vector128<byte>.Zero).AsUInt16();
//Get the next 8 bytes
var high = Sse2.UnpackHigh(_1st16bytes, Vector128<byte>.Zero).AsUInt16();
// Calculate the first 8 bytes
var lowMul = Sse2.MultiplyHigh(Coeleft, low);
// Calculate the next 8 bytes
var highMul = Sse2.MultiplyHigh(CoeRight, high);
// Blue Green Red
var px1 = lowMul.GetElement(0) + lowMul.GetElement(1) + lowMul.GetElement(2);
var px2 = lowMul.GetElement(3) + lowMul.GetElement(4) + lowMul.GetElement(5);
var px3 = lowMul.GetElement(6) + lowMul.GetElement(7) + highMul.GetElement(0);
var px4 = highMul.GetElement(1) + highMul.GetElement(2) + highMul.GetElement(3);
var px5 = highMul.GetElement(4) + highMul.GetElement(5) + highMul.GetElement(6);
//15 bytes for 5 pixels
var i5 = i * 5;
dst[i5 ] = (byte)px1;
dst[i5 + 1] = (byte)px2;
dst[i5 + 2] = (byte)px3;
dst[i5 + 3] = (byte)px4;
dst[i5 + 4] = (byte)px5;
}
});
}
Is there a better way to do this, or how can I enhance it, please? I would be very grateful for any advice.
I have experimented with different SIMD approaches, but none of them seem to work well. I am looking for a substantial boost in efficiency.