In optimization for an AABB collision detection algorithm's inner-most 4-versus-4 comparison part, I am stuck at simplifying code at the same time gaining(or just retaining) performance.
Here is the version with hand-unrolled initialization:
https://godbolt.org/z/TMGMhdsss
inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
return !((maxx < minx2) || (maxx2 < minx));
}
inline
void comp4vs4( const int * const __restrict__ partId1, const int * const __restrict__ partId2,
const float * const __restrict__ minx1, const float * const __restrict__ minx2,
const float * const __restrict__ miny1, const float * const __restrict__ miny2,
const float * const __restrict__ minz1, const float * const __restrict__ minz2,
const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
int * const __restrict__ out
)
{
alignas(32)
int result[16]={
// 0v0 0v1 0v2 0v3
// 1v0 1v1 1v2 1v3
// 2v0 2v1 2v2 2v3
// 3v0 3v1 3v2 3v3
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0
};
alignas(32)
int tileId1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
partId1[0],partId1[1],partId1[2],partId1[3],
partId1[0],partId1[1],partId1[2],partId1[3],
partId1[0],partId1[1],partId1[2],partId1[3],
partId1[0],partId1[1],partId1[2],partId1[3]
};
alignas(32)
int tileId2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
partId2[0],partId2[0],partId2[0],partId2[0],
partId2[1],partId2[1],partId2[1],partId2[1],
partId2[2],partId2[2],partId2[2],partId2[2],
partId2[3],partId2[3],partId2[3],partId2[3]
};
alignas(32)
float tileMinX1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
minx1[0],minx1[1],minx1[2],minx1[3],
minx1[0],minx1[1],minx1[2],minx1[3],
minx1[0],minx1[1],minx1[2],minx1[3],
minx1[0],minx1[1],minx1[2],minx1[3]
};
alignas(32)
float tileMinX2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
minx2[0],minx2[0],minx2[0],minx2[0],
minx2[1],minx2[1],minx2[1],minx2[1],
minx2[2],minx2[2],minx2[2],minx2[2],
minx2[3],minx2[3],minx2[3],minx2[3]
};
alignas(32)
float tileMinY1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
miny1[0],miny1[1],miny1[2],miny1[3],
miny1[0],miny1[1],miny1[2],miny1[3],
miny1[0],miny1[1],miny1[2],miny1[3],
miny1[0],miny1[1],miny1[2],miny1[3]
};
alignas(32)
float tileMinY2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
miny2[0],miny2[0],miny2[0],miny2[0],
miny2[1],miny2[1],miny2[1],miny2[1],
miny2[2],miny2[2],miny2[2],miny2[2],
miny2[3],miny2[3],miny2[3],miny2[3]
};
alignas(32)
float tileMinZ1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
minz1[0],minz1[1],minz1[2],minz1[3],
minz1[0],minz1[1],minz1[2],minz1[3],
minz1[0],minz1[1],minz1[2],minz1[3],
minz1[0],minz1[1],minz1[2],minz1[3]
};
alignas(32)
float tileMinZ2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
minz2[0],minz2[0],minz2[0],minz2[0],
minz2[1],minz2[1],minz2[1],minz2[1],
minz2[2],minz2[2],minz2[2],minz2[2],
minz2[3],minz2[3],minz2[3],minz2[3]
};
alignas(32)
float tileMaxX1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
maxx1[0],maxx1[1],maxx1[2],maxx1[3],
maxx1[0],maxx1[1],maxx1[2],maxx1[3],
maxx1[0],maxx1[1],maxx1[2],maxx1[3],
maxx1[0],maxx1[1],maxx1[2],maxx1[3]
};
alignas(32)
float tileMaxX2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
maxx2[0],maxx2[0],maxx2[0],maxx2[0],
maxx2[1],maxx2[1],maxx2[1],maxx2[1],
maxx2[2],maxx2[2],maxx2[2],maxx2[2],
maxx2[3],maxx2[3],maxx2[3],maxx2[3]
};
alignas(32)
float tileMaxY1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
maxy1[0],maxy1[1],maxy1[2],maxy1[3],
maxy1[0],maxy1[1],maxy1[2],maxy1[3],
maxy1[0],maxy1[1],maxy1[2],maxy1[3],
maxy1[0],maxy1[1],maxy1[2],maxy1[3]
};
alignas(32)
float tileMaxY2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
maxy2[0],maxy2[0],maxy2[0],maxy2[0],
maxy2[1],maxy2[1],maxy2[1],maxy2[1],
maxy2[2],maxy2[2],maxy2[2],maxy2[2],
maxy2[3],maxy2[3],maxy2[3],maxy2[3]
};
alignas(32)
float tileMaxZ1[16]={
// 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
maxz1[0],maxz1[1],maxz1[2],maxz1[3],
maxz1[0],maxz1[1],maxz1[2],maxz1[3],
maxz1[0],maxz1[1],maxz1[2],maxz1[3],
maxz1[0],maxz1[1],maxz1[2],maxz1[3]
};
alignas(32)
float tileMaxZ2[16]={
// 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
maxz2[0],maxz2[0],maxz2[0],maxz2[0],
maxz2[1],maxz2[1],maxz2[1],maxz2[1],
maxz2[2],maxz2[2],maxz2[2],maxz2[2],
maxz2[3],maxz2[3],maxz2[3],maxz2[3]
};
for(int i=0;i<16;i++)
result[i] = (tileId1[i] < tileId2[i]);
for(int i=0;i<16;i++)
result[i] = result[i] &&
intersectDim(tileMinX1[i], tileMaxX1[i], tileMinX2[i], tileMaxX2[i]) &&
intersectDim(tileMinY1[i], tileMaxY1[i], tileMinY2[i], tileMaxY2[i]) &&
intersectDim(tileMinZ1[i], tileMaxZ1[i], tileMinZ2[i], tileMaxZ2[i]);
for(int i=0;i<16;i++)
out[i]=result[i];
}
#include<iostream>
int main()
{
int tile1[4];int tile2[4];
float tile3[4];float tile4[4];
float tile5[4];float tile6[4];
float tile7[4];float tile8[4];
float tile9[4];float tile10[4];
float tile11[4];float tile12[4];
float tile13[4];float tile14[4];
for(int i=0;i<4;i++)
{
std::cin>>tile1[i];
std::cin>>tile2[i];
std::cin>>tile3[i];
std::cin>>tile4[i];
std::cin>>tile5[i];
std::cin>>tile6[i];
std::cin>>tile7[i];
std::cin>>tile8[i];
std::cin>>tile9[i];
std::cin>>tile10[i];
std::cin>>tile11[i];
std::cin>>tile12[i];
std::cin>>tile13[i];
std::cin>>tile14[i];
}
int out[16];
comp4vs4(tile1,tile2,tile3,tile4,tile5,tile6,tile7,tile8,tile9,
tile10,tile11,tile12,tile13,tile14,out);
for(int i=0;i<16;i++)
std::cout<<out[i];
return 0;
}
and its output from godbolt:
comp4vs4(int const*, int const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, int*):
push rbp
mov rbp, rsp
and rsp, -32
sub rsp, 8
mov rax, QWORD PTR [rbp+80]
vmovups xmm0, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+16]
vmovups xmm6, XMMWORD PTR [rcx]
vmovups xmm5, XMMWORD PTR [r9]
vmovups xmm9, XMMWORD PTR [r8]
vmovdqu xmm15, XMMWORD PTR [rsi]
vmovdqu xmm8, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+24]
vpermilps xmm1, xmm6, 0
vmovdqa XMMWORD PTR [rsp-88], xmm15
vmovups xmm4, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+32]
vmovups xmm14, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+40]
vmovups xmm11, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+48]
vcmpleps xmm3, xmm1, xmm14
vmovups xmm13, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+56]
vpermilps xmm1, xmm11, 0
vcmpleps xmm1, xmm0, xmm1
vmovups xmm10, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+64]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 0
vcmpleps xmm3, xmm3, xmm13
vmovups xmm7, XMMWORD PTR [rdx]
mov rdx, QWORD PTR [rbp+72]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 0
vmovaps XMMWORD PTR [rsp-72], xmm7
vcmpleps xmm3, xmm9, xmm3
vmovups xmm7, XMMWORD PTR [rdx]
vpand xmm1, xmm1, xmm3
vpshufd xmm3, xmm15, 0
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm7, 0
vcmpleps xmm12, xmm2, xmm3
vpermilps xmm3, xmm4, 0
vcmpleps xmm3, xmm3, XMMWORD PTR [rsp-72]
vpand xmm3, xmm3, xmm12
vmovdqa xmm12, XMMWORD PTR .LC0[rip]
vpand xmm3, xmm3, xmm12
vpand xmm1, xmm1, xmm3
vmovdqa XMMWORD PTR [rsp-104], xmm1
vpermilps xmm1, xmm6, 85
vcmpleps xmm3, xmm1, xmm14
vpermilps xmm1, xmm11, 85
vcmpleps xmm1, xmm0, xmm1
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 85
vcmpleps xmm3, xmm3, xmm13
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 85
vcmpleps xmm3, xmm9, xmm3
vpand xmm1, xmm1, xmm3
vpshufd xmm3, xmm15, 85
vpermilps xmm15, xmm4, 85
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm7, 85
vcmpleps xmm15, xmm15, XMMWORD PTR [rsp-72]
vcmpleps xmm3, xmm2, xmm3
vpand xmm3, xmm3, xmm15
vpermilps xmm15, xmm4, 170
vpand xmm3, xmm3, xmm12
vpermilps xmm4, xmm4, 255
vcmpleps xmm15, xmm15, XMMWORD PTR [rsp-72]
vpand xmm1, xmm1, xmm3
vcmpleps xmm4, xmm4, XMMWORD PTR [rsp-72]
vmovdqa XMMWORD PTR [rsp-120], xmm1
vpermilps xmm1, xmm6, 170
vpermilps xmm6, xmm6, 255
vcmpleps xmm3, xmm1, xmm14
vpermilps xmm1, xmm11, 170
vpermilps xmm11, xmm11, 255
vcmpleps xmm6, xmm6, xmm14
vcmpleps xmm1, xmm0, xmm1
vcmpleps xmm11, xmm0, xmm11
vpshufd xmm0, XMMWORD PTR [rsp-88], 255
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm5, 170
vpermilps xmm5, xmm5, 255
vcmpleps xmm3, xmm3, xmm13
vpand xmm6, xmm11, xmm6
vcmpleps xmm13, xmm5, xmm13
vmovdqa xmm5, XMMWORD PTR [rsp-104]
vpand xmm1, xmm1, xmm3
vpermilps xmm3, xmm10, 170
vpermilps xmm10, xmm10, 255
vcmpleps xmm3, xmm9, xmm3
vpand xmm6, xmm6, xmm13
vmovdqu XMMWORD PTR [rax], xmm5
vcmpleps xmm9, xmm9, xmm10
vpand xmm1, xmm1, xmm3
vpshufd xmm3, XMMWORD PTR [rsp-88], 170
vpand xmm9, xmm6, xmm9
vpcomltd xmm3, xmm8, xmm3
vpand xmm1, xmm1, xmm3
vpcomltd xmm8, xmm8, xmm0
vmovdqa xmm0, XMMWORD PTR [rsp-120]
vpermilps xmm3, xmm7, 170
vpermilps xmm7, xmm7, 255
vcmpleps xmm3, xmm2, xmm3
vpand xmm8, xmm9, xmm8
vcmpleps xmm2, xmm2, xmm7
vmovdqu XMMWORD PTR [rax+16], xmm0
vpand xmm3, xmm3, xmm15
vpand xmm2, xmm2, xmm4
vpand xmm3, xmm3, xmm12
vpand xmm12, xmm2, xmm12
vpand xmm3, xmm1, xmm3
vpand xmm12, xmm8, xmm12
vmovdqu XMMWORD PTR [rax+32], xmm3
vmovdqu XMMWORD PTR [rax+48], xmm12
leave
ret
main: // character limit 30k
It is ~123 lines of vector instructions. Since it runs somewhat ok performance, I tried to simplify it with simple bitwise operations:
https://godbolt.org/z/zKqe49a73
inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
return !((maxx < minx2) || (maxx2 < minx));
}
inline
void comp4vs4( const int * const __restrict__ partId1, const int * const __restrict__ partId2,
const float * const __restrict__ minx1, const float * const __restrict__ minx2,
const float * const __restrict__ miny1, const float * const __restrict__ miny2,
const float * const __restrict__ minz1, const float * const __restrict__ minz2,
const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
int * const __restrict__ out
)
{
alignas(32)
int result[16]={
// 0v0 0v1 0v2 0v3
// 1v0 1v1 1v2 1v3
// 2v0 2v1 2v2 2v3
// 3v0 3v1 3v2 3v3
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0
};
for(int i=0;i<16;i++)
result[i] = partId1[i&3]<partId2[i/4];
for(int i=0;i<16;i++)
result[i] = result[i] &&
intersectDim(minx1[i&3], maxx1[i&3], minx2[i/4], maxx2[i/4]) &&
intersectDim(miny1[i&3], maxy1[i&3], miny2[i/4], maxy2[i/4]) &&
intersectDim(minz1[i&3], maxz1[i&3], minz2[i/4], maxz2[i/4]);
for(int i=0;i<16;i++)
out[i]=result[i];
}
#include<iostream>
int main()
{
int tile1[4];int tile2[4];
float tile3[4];float tile4[4];
float tile5[4];float tile6[4];
float tile7[4];float tile8[4];
float tile9[4];float tile10[4];
float tile11[4];float tile12[4];
float tile13[4];float tile14[4];
for(int i=0;i<4;i++)
{
std::cin>>tile1[i];
std::cin>>tile2[i];
std::cin>>tile3[i];
std::cin>>tile4[i];
std::cin>>tile5[i];
std::cin>>tile6[i];
std::cin>>tile7[i];
std::cin>>tile8[i];
std::cin>>tile9[i];
std::cin>>tile10[i];
std::cin>>tile11[i];
std::cin>>tile12[i];
std::cin>>tile13[i];
std::cin>>tile14[i];
}
int out[16];
comp4vs4(tile1,tile2,tile3,tile4,tile5,tile6,tile7,tile8,tile9,
tile10,tile11,tile12,tile13,tile14,out);
for(int i=0;i<16;i++)
std::cout<<out[i];
return 0;
}
how godbolt outputs:
main:
// character limit 30k
vpxor xmm0, xmm0, xmm0
vmovdqa xmm3, XMMWORD PTR .LC0[rip]
lea rax, [rsp+240]
vpxor xmm4, xmm4, xmm4
vmovdqa XMMWORD PTR [rsp+224], xmm0
vmovdqa XMMWORD PTR [rsp+240], xmm0
vmovdqa XMMWORD PTR [rsp+256], xmm0
vmovdqa XMMWORD PTR [rsp+272], xmm0
vpcmpeqd xmm0, xmm0, xmm0
vmovdqa xmm7, xmm0
vmovdqa xmm6, xmm0
vmovdqa xmm5, xmm0
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
vmovdqa xmm4, XMMWORD PTR .LC1[rip]
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm6
vmovdqa xmm7, xmm0
vmovdqa xmm6, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+224], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm6
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
vmovdqa xmm6, xmm0
vmovdqa xmm7, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+240], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm5
vmovdqa xmm5, XMMWORD PTR .LC2[rip]
vpgatherdd xmm2, DWORD PTR [rsp+16+xmm5*4], xmm6
vmovdqa xmm5, XMMWORD PTR .LC3[rip]
vmovdqa xmm6, xmm0
vpcomltd xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vmovdqa XMMWORD PTR [rsp+256], xmm1
vpgatherdd xmm1, DWORD PTR [rdx+xmm3*4], xmm7
vpgatherdd xmm0, DWORD PTR [rsp+16+xmm5*4], xmm6
vmovdqa xmm7, XMMWORD PTR .LC4[rip]
vpxor xmm6, xmm6, xmm6
lea rdx, [rsp+304]
vpcomltd xmm0, xmm1, xmm0
vpand xmm0, xmm0, xmm4
vmovdqa XMMWORD PTR [rsp+272], xmm0
.L3:
vmovdqa xmm0, XMMWORD PTR [rax-16]
vmovdqa xmm2, xmm3
prefetcht0 [rax]
add rax, 16
vpaddd xmm3, xmm3, xmm7
vpsrad xmm8, xmm2, 2
vpand xmm2, xmm2, xmm5
vpcomneqd xmm1, xmm0, xmm6
vmovaps xmm0, xmm1
vmovaps xmm11, xmm1
vmovaps xmm12, xmm1
vmovaps xmm13, xmm1
vgatherdps xmm11, DWORD PTR [rsp+144+xmm8*4], xmm0
vmovaps xmm14, xmm1
vmovaps xmm0, xmm1
vmovaps xmm10, xmm1
vmovaps xmm9, xmm1
vgatherdps xmm10, DWORD PTR [rsp+128+xmm2*4], xmm13
vgatherdps xmm0, DWORD PTR [r13+0+xmm8*4], xmm12
vgatherdps xmm9, DWORD PTR [rsp+32+xmm2*4], xmm14
vcmpleps xmm0, xmm0, xmm10
vcmpleps xmm9, xmm9, xmm11
vpand xmm0, xmm0, xmm9
vpand xmm1, xmm0, xmm1
vmovaps xmm0, xmm1
vmovaps xmm11, xmm1
vmovaps xmm15, xmm1
vmovaps xmm10, xmm1
vgatherdps xmm11, DWORD PTR [r15+xmm8*4], xmm0
vmovaps xmm12, xmm1
vmovaps xmm0, xmm1
vmovaps xmm9, xmm1
vmovaps xmm13, xmm1
vgatherdps xmm10, DWORD PTR [r12+xmm2*4], xmm12
vgatherdps xmm0, DWORD PTR [rsp+80+xmm8*4], xmm15
vgatherdps xmm9, DWORD PTR [rsp+64+xmm2*4], xmm13
vcmpleps xmm0, xmm0, xmm10
vcmpleps xmm9, xmm9, xmm11
vpand xmm0, xmm0, xmm9
vpand xmm0, xmm0, xmm1
vmovaps xmm1, xmm0
vmovaps xmm10, xmm0
vmovaps xmm9, xmm0
vmovaps xmm14, xmm0
vgatherdps xmm10, DWORD PTR [rsp+208+xmm8*4], xmm1
vmovaps xmm1, xmm0
vgatherdps xmm9, DWORD PTR [r14+xmm8*4], xmm1
vmovaps xmm1, xmm0
vmovaps xmm8, xmm0
vgatherdps xmm8, DWORD PTR [rsp+192+xmm2*4], xmm1
vmovaps xmm1, xmm0
vgatherdps xmm1, DWORD PTR [rsp+96+xmm2*4], xmm14
vcmpleps xmm2, xmm9, xmm8
vcmpleps xmm1, xmm1, xmm10
vpand xmm1, xmm1, xmm2
vpand xmm1, xmm1, xmm4
vpand xmm0, xmm0, xmm1
vmovdqa XMMWORD PTR [rax-32], xmm0
cmp rdx, rax
jne .L3
vmovdqa xmm5, XMMWORD PTR [rsp+224]
vmovdqa xmm7, XMMWORD PTR [rsp+240]
vmovdqa xmm4, XMMWORD PTR [rsp+256]
lea rbx, [rsp+288]
lea r12, [rsp+352]
vmovdqa XMMWORD PTR [rsp+288], xmm5
vmovdqa xmm5, XMMWORD PTR [rsp+272]
vmovdqa XMMWORD PTR [rsp+304], xmm7
vmovdqa XMMWORD PTR [rsp+320], xmm4
vmovdqa XMMWORD PTR [rsp+336], xmm5
.L4:
// character limit 30k
it has ~110 lines of vector instructions. Despite having less instructions than first version, it runs at half performance (at least on bdver1 compiler flag). Is it because of "AND" and division operations for indexing?
Also, parameters using the restrict keyword are pointing to same memory occasionally. Could this be a problem for performance?
If it helps, here are performance-test source codes on some online-service with avx512-cpu (maximum 32 AABBs per leaf node):
- Unrolled version: https://rextester.com/YKDN52107
- Readable version: https://rextester.com/TAFR72415 (slow)
A bit less performance difference when 128 AABBs per leaf node(tested in godbolt server):
- Unrolled: https://godbolt.org/z/rx13zorjr
- Readable: https://godbolt.org/z/e1cfbEPKn