Hi I have the following code:
public unsafe class MultiplyAndAdd : IDisposable
{
float[] rawFirstData = new float[1024];
float[] rawSecondData = new float[1024];
static int alignment = 32;
float[] alignedFirstData = new float[1024 + alignment / sizeof(float)];
int alignedFirstDataOffset;
GCHandle alignedFirstDataHandle;
float* alignedFirstDataPointer;
float[] alignedSecondData = new float[1024 + alignment / sizeof(float)];
int alignedSecondDataOffset;
GCHandle alignedSecondDataHandle;
float* alignedSecondDataPointer;
public IEnumerable<object[]> Data { get; set; }
public void Dispose()
{
this.alignedFirstDataHandle.Free();
this.alignedSecondDataHandle.Free();
}
//Calculate the offset that needs to be applied to ensure that the array is aligned with 32.
private int CalculateAlignmentOffset(GCHandle handle)
{
var handlePointer = handle.AddrOfPinnedObject().ToInt64();
long lPtr2 = (handlePointer + alignment - 1) & ~(alignment - 1);
return (int)(lPtr2 - handlePointer);
}
public MultiplyAndAdd()
{
Random random = new Random(1055);
for (var i = 0; i < 1024; i++)
{
rawFirstData[i] = (float)random.NextDouble() * 4f - 2f;
rawSecondData[i] = (float)random.NextDouble() * 4f - 2f;
}
alignedFirstDataHandle = GCHandle.Alloc(alignedFirstData, GCHandleType.Pinned);
alignedFirstDataOffset = CalculateAlignmentOffset(alignedFirstDataHandle);
alignedFirstDataPointer = (float*)(alignedFirstDataHandle.AddrOfPinnedObject() + alignedFirstDataOffset);
alignedSecondDataHandle = GCHandle.Alloc(alignedSecondData, GCHandleType.Pinned);
alignedSecondDataOffset = CalculateAlignmentOffset(alignedSecondDataHandle);
alignedSecondDataPointer = (float*)(alignedSecondDataHandle.AddrOfPinnedObject() + alignedSecondDataOffset);
for (var i = 0; i < 1024; i++)
{
alignedFirstData[i + alignedFirstDataOffset / sizeof(float)] = rawFirstData[i];
alignedSecondData[i + alignedSecondDataOffset / sizeof(float)] = rawSecondData[i];
}
Data = new[] {
//7,
8,
//11,
//16,
20,
//30,
32,
//40,
50 }.Select(x => new object[] { x }).ToList();
}
public void Validate()
{
for(var i = 0; i < 1024; i++)
{
if (rawFirstData[i] != alignedFirstData[i + alignedFirstDataOffset / sizeof(float)])
{
throw new InvalidOperationException("Diff found!");
}
if (rawFirstData[i] != *(alignedFirstDataPointer + i))
{
throw new InvalidOperationException("Diff found!");
}
if (rawSecondData[i] != alignedSecondData[i + alignedSecondDataOffset / sizeof(float)])
{
throw new InvalidOperationException("Diff found!");
}
if (rawSecondData[i] != *(alignedSecondDataPointer + i))
{
throw new InvalidOperationException("Diff found!");
}
}
Action<string, float, float> ensureAlmostSame = delegate (string name, float normal, float other)
{
var diff = MathF.Abs(normal - other);
if (diff > 0.00001)
{
throw new InvalidOperationException($"The difference between normal and {name} was {diff}");
}
};
foreach (var count in Data.Select(x => (int)x[0]))
{
var normal = Normal(count);
var vectorUnaligned = VectorUnaligned(count);
ensureAlmostSame(nameof(vectorUnaligned), normal, vectorUnaligned);
var vectorAligned = VectorAligned(count);
ensureAlmostSame(nameof(vectorAligned), normal, vectorAligned);
var avx2Aligned = Avx2Aligned(count);
ensureAlmostSame(nameof(avx2Aligned), normal, avx2Aligned);
var fmaAligned = FmaAligned(count);
ensureAlmostSame(nameof(fmaAligned), normal, fmaAligned);
}
}
//[Benchmark(Baseline = true)]
[ArgumentsSource(nameof(Data))]
public float Normal(int count)
{
var result = 0f;
for (var i = 0; i < count; i++)
{
result += rawFirstData[i] * rawSecondData[i];
}
return result;
}
[Benchmark]
[ArgumentsSource(nameof(Data))]
public float VectorUnaligned(int count)
{
int vectorSize = Vector<float>.Count;
var accVector = Vector<float>.Zero;
int i = 0;
for (; i <= count - vectorSize; i += vectorSize)
{
var firstVector = new Vector<float>(rawFirstData, i);
var secondVector = new Vector<float>(rawSecondData, i);
var v = Vector.Multiply(firstVector, secondVector);
accVector = Vector.Add(v, accVector);
}
float result = Vector.Sum(accVector);
for (; i < count; i++)
{
result += rawFirstData[i] * rawSecondData[i];
}
return result;
}
//[Benchmark]
[ArgumentsSource(nameof(Data))]
public float VectorAligned(int count)
{
int vectorSize = Vector<float>.Count;
var accVector = Vector<float>.Zero;
int i = 0;
for (; i <= count - vectorSize; i += vectorSize)
{
var firstVector = new Vector<float>(alignedFirstData, alignedFirstDataOffset / sizeof(float) + i);
var secondVector = new Vector<float>(alignedSecondData, alignedSecondDataOffset / sizeof(float) + i);
var v = Vector.Multiply(firstVector, secondVector);
accVector = Vector.Add(v, accVector);
}
float result = Vector.Sum(accVector);
for (; i < count; i++)
{
result += rawFirstData[i] * rawSecondData[i];
}
return result;
}
[Benchmark]
[ArgumentsSource(nameof(Data))]
public float Avx2Aligned(int count)
{
int vectorSize = Vector256<float>.Count;
var accumulationVector = Vector256<float>.Zero;
var i = 0;
for (;i <= count - vectorSize; i += vectorSize)
{
var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);
var resultVector = Avx2.Multiply(firstVector, secondVector);
accumulationVector = Avx2.Add(accumulationVector, resultVector);
}
var result = 0f;
var temp = stackalloc float[vectorSize];
Avx2.Store(temp, accumulationVector);
for (int j = 0; j < vectorSize; j++)
{
result += temp[j];
}
for (; i < count; i++)
{
result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
}
return result;
}
[Benchmark]
[ArgumentsSource(nameof(Data))]
public float FmaAligned(int count)
{
int vectorSize = Vector256<float>.Count;
var accumulationVector = Vector256<float>.Zero;
var i = 0;
for (; i <= count - vectorSize; i += vectorSize)
{
var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);
accumulationVector = Fma.MultiplyAdd(firstVector, secondVector, accumulationVector);
}
var result = 0f;
var temp = stackalloc float[vectorSize];
Avx2.Store(temp, accumulationVector);
for (int j = 0; j < vectorSize; j++)
{
result += temp[j];
}
for (; i < count; i++)
{
result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
}
return result;
}
}
If I run this benchmark on my Zen3 CPU, I get the following result:
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19042.1586 (20H2/October2020Update)
AMD Ryzen 5 5600X, 1 CPU, 12 logical and 6 physical cores
.NET SDK=6.0.200
[Host] : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT
DefaultJob : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT
| Method | count | Mean | Error | StdDev |
|---------------- |------ |---------:|----------:|----------:|
| VectorUnaligned | 8 | 1.231 ns | 0.0093 ns | 0.0082 ns |
| Avx2Aligned | 8 | 3.576 ns | 0.0208 ns | 0.0195 ns |
| FmaAligned | 8 | 3.408 ns | 0.0259 ns | 0.0243 ns |
| VectorUnaligned | 20 | 4.428 ns | 0.0146 ns | 0.0122 ns |
| Avx2Aligned | 20 | 6.321 ns | 0.0578 ns | 0.0541 ns |
| FmaAligned | 20 | 5.845 ns | 0.0121 ns | 0.0113 ns |
| VectorUnaligned | 32 | 4.022 ns | 0.0098 ns | 0.0087 ns |
| Avx2Aligned | 32 | 5.205 ns | 0.0161 ns | 0.0150 ns |
| FmaAligned | 32 | 4.776 ns | 0.0265 ns | 0.0221 ns |
| VectorUnaligned | 50 | 6.901 ns | 0.0337 ns | 0.0315 ns |
| Avx2Aligned | 50 | 7.207 ns | 0.0476 ns | 0.0422 ns |
| FmaAligned | 50 | 7.246 ns | 0.0169 ns | 0.0158 ns |
Why is VectorUnaligned
so much faster that the more optimized AVX2
and Fma
code?
If I enable VectorAligned
its also slower than VectorUnaligned
.
Not an answer but a tip for "Fastest way to multiply".
Sorry, I don't know how to deal with alignment but you missed the option of casting the array type. It might be faster than picking floats from source arrays in the loop.
It makes a bit more JIT Assembler code than
VectorUnaligned
method but the first loop looks like twice shorter because if contains only one out-of-range check instead of 4. Give it a chance to test with different types of vectors and alignment.this one
VectorUnaligned
loop, looks like JIT failed to optimize itCompiled code got from https://sharplab.io/. Real generated code may vary from CPU to CPU because
Vector<T>.Count
on certain CPUs may vary.