Benchmarking on Godbolt.org:

-march=native: https://godbolt.org/z/Ys6rqbGGe
-march=cascadelake: https://godbolt.org/z/W3EM5hKsq

With -mprefer-vector-width=128, each sqrt completes in about 0.67 cycles; with -mprefer-vector-width=512, each sqrt takes about 1.95 cycles.
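(For reference, the cycles-per-sqrt figure is the one printed at the end of the program below; the arithmetic behind it, assuming the TSC ticks at roughly the core clock, is:

    cycles per sqrt = (t2 - t1) / (10000 repeats * 15 chained sqrts * n elements)
                    = 0.0001 * (t2 - t1) / (15 * n)

so 0.67 and 1.95 are TSC ticks per element-wise sqrt, averaged over the whole array.)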
Source code:
#include <omp.h>
#include <iostream>
#include <string>
#include <functional>
#include <cmath>

template<typename Type, int Simd>
struct KernelData
{
    alignas(32)
    Type data[Simd];

    inline void readFrom(const Type * const __restrict__ ptr) noexcept
    {
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            data[i] = ptr[i];
        }
    }

    inline void writeTo(Type * const __restrict__ ptr) const noexcept
    {
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            ptr[i] = data[i];
        }
    }

    inline const KernelData<Type,Simd> sqrt() const noexcept
    {
        KernelData<Type,Simd> result;
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            result.data[i] = std::sqrt(data[i]);
        }
        return result;
    }
};
template<int mask>
struct KernelDataFactory
{
    KernelDataFactory()
    {
    }

    template<typename Type>
    inline KernelData<Type,mask> generate() const
    {
        return KernelData<Type,mask>();
    }
};
template<int SimdWidth, typename... Args>
class Kernel
{
public:
    Kernel(std::function<void(int,int, Args...)> kernelPrm)
    {
        kernel = kernelPrm;
    }

    void run(int n, Args... args)
    {
        // full SIMD-width groups
        const int nLoop = (n/SimdWidth);
        for(int i=0;i<nLoop;i++)
        {
            kernel(i*SimdWidth, SimdWidth, args...);
        }

        // remaining elements are processed one at a time (width == 1)
        if((n/SimdWidth)*SimdWidth != n)
        {
            const int m = n%SimdWidth;
            for(int i=0;i<m;i++)
            {
                kernel(nLoop*SimdWidth+i, 1, args...);
            }
        }
    }

private:
    std::function<void(int,int, Args...)> kernel;
};
// cpu cycles from stackoverflow
#include <stdint.h> // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
inline
uint64_t readTSC() {
    // _mm_lfence();  // optionally wait for earlier insns to retire before reading the clock
    uint64_t tsc = __rdtsc();
    // _mm_lfence();  // optionally block later instructions until rdtsc retires
    return tsc;
}
int main(int argC, char** argV)
{
    constexpr int simd = 16;
    constexpr int n = 1003;

    Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
        const int id = simdGroupId;
        if(simdWidth == simd)
        {
            // full-width path: 16 floats per group, 15 chained sqrts
            const KernelDataFactory<simd> factory;
            auto a = factory.generate<float>();
            a.readFrom(input+id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output+id);
        }
        else
        {
            // remainder path: one element at a time
            const KernelDataFactory<1> factory;
            auto a = factory.generate<float>();
            a.readFrom(input+id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output+id);
        }
    });

    alignas(32)
    float i[n], o[n];
    for(int j=0;j<n;j++)
        i[j] = j;

    // time 10000 runs of the kernel with rdtsc
    auto t1 = readTSC();
    for(int k=0;k<10000;k++)
        kernel.run(n, i, o);
    auto t2 = readTSC();

    for(int i=n-10;i<n;i++)
    {
        std::cout<<"i="<<i<<" value="<<o[i]<<std::endl;
    }

    // 10000 repeats * 15 sqrts * n elements
    std::cout<<0.0001f*(t2-t1)/(float)(15*n)<<" cycles per sqrt"<<std::endl;
    return 0;
}
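For n = 1003 and simd = 16, Kernel::run above splits the work as follows (just the arithmetic implied by the code, for orientation when reading the assembly):

    nLoop     = 1003 / 16 = 62 full 16-float groups (992 elements, the vectorized path)
    remainder = 1003 % 16 = 11 elements handled one at a time via KernelDataFactory<1>

so the 16-wide lambda body dominates the timing, and that appears to be the part shown in the listings below.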
Godbolt output for the 128-bit preference (-std=c++2a -O3 -march=native -mprefer-vector-width=128 -ftree-vectorize -fno-math-errno -lpthread):
push rbp
add rsi, rcx
mov rbp, rsp
and rsp, -32
sub rsp, 8
vsqrtps xmm0, XMMWORD PTR [rdx]
vsqrtps xmm3, XMMWORD PTR [rdx+16]
vsqrtps xmm2, XMMWORD PTR [rdx+32]
vsqrtps xmm1, XMMWORD PTR [rdx+48]
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vsqrtps xmm0, xmm0
vsqrtps xmm3, xmm3
vsqrtps xmm2, xmm2
vsqrtps xmm1, xmm1
vmovaps XMMWORD PTR [rsp-40], xmm3
vmovaps XMMWORD PTR [rsp-24], xmm2
vmovaps XMMWORD PTR [rsp-8], xmm1
vmovdqu XMMWORD PTR [rsi], xmm0
vmovdqa xmm4, XMMWORD PTR [rsp-40]
vmovdqu XMMWORD PTR [rsi+16], xmm4
vmovdqa xmm5, XMMWORD PTR [rsp-24]
vmovdqu XMMWORD PTR [rsi+32], xmm5
vmovdqa xmm6, XMMWORD PTR [rsp-8]
vmovdqu XMMWORD PTR [rsi+48], xmm6
leave
ret
Godbolt output for the 512-bit preference (-std=c++2a -O3 -march=native -mprefer-vector-width=512 -ftree-vectorize -fno-math-errno -lpthread):
push rbp
add rsi, rcx
mov rbp, rsp
and rsp, -64
sub rsp, 8
vmovdqu xmm2, XMMWORD PTR [rdx]
vmovdqu xmm3, XMMWORD PTR [rdx+16]
vmovdqu xmm4, XMMWORD PTR [rdx+32]
vmovdqu xmm5, XMMWORD PTR [rdx+48]
vmovdqa XMMWORD PTR [rsp-120], xmm2
vmovdqa XMMWORD PTR [rsp-104], xmm3
vmovdqa XMMWORD PTR [rsp-88], xmm4
vmovdqa XMMWORD PTR [rsp-72], xmm5
vsqrtps zmm0, ZMMWORD PTR [rsp-120]
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vsqrtps zmm0, zmm0
vmovaps ZMMWORD PTR [rsp-56], zmm0
vmovdqu XMMWORD PTR [rsi], xmm0
vmovdqa xmm6, XMMWORD PTR [rsp-40]
vmovdqu XMMWORD PTR [rsi+16], xmm6
vmovdqa xmm7, XMMWORD PTR [rsp-24]
vmovdqu XMMWORD PTR [rsi+32], xmm7
vmovdqa xmm1, XMMWORD PTR [rsp-8]
vmovdqu XMMWORD PTR [rsi+48], xmm1
vzeroupper
leave
ret
Why does the AVX-512 packed square-root version run so much slower than the 128-bit version?