Auto-vectorization for hand-unrolled initialized tiled-computation versus simple loop with no initialization

83 Views Asked by At

In optimization for an AABB collision detection algorithm's inner-most 4-versus-4 comparison part, I am stuck at simplifying code at the same time gaining(or just retaining) performance.

Here is the version with hand-unrolled initialization:

https://godbolt.org/z/TMGMhdsss

inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
    return !((maxx < minx2) || (maxx2 < minx));
}


inline
void comp4vs4(  const int * const __restrict__ partId1, const int * const __restrict__ partId2,
                    const float * const __restrict__ minx1, const float * const __restrict__ minx2,
                    const float * const __restrict__ miny1, const float * const __restrict__ miny2,
                    const float * const __restrict__ minz1, const float * const __restrict__ minz2,
                    const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
                    const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
                    const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
                    int * const __restrict__ out
                    )
{
            alignas(32)
            int result[16]={
                // 0v0 0v1 0v2 0v3
                // 1v0 1v1 1v2 1v3
                // 2v0 2v1 2v2 2v3
                // 3v0 3v1 3v2 3v3
                0, 0, 0, 0,
                0, 0, 0, 0,
                0, 0, 0, 0,
                0, 0, 0, 0
            };

            alignas(32)
            int tileId1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    partId1[0],partId1[1],partId1[2],partId1[3],
                    partId1[0],partId1[1],partId1[2],partId1[3],
                    partId1[0],partId1[1],partId1[2],partId1[3],
                    partId1[0],partId1[1],partId1[2],partId1[3]
            };

            alignas(32)
            int tileId2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    partId2[0],partId2[0],partId2[0],partId2[0],
                    partId2[1],partId2[1],partId2[1],partId2[1],
                    partId2[2],partId2[2],partId2[2],partId2[2],
                    partId2[3],partId2[3],partId2[3],partId2[3]
            };

            alignas(32)
            float tileMinX1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    minx1[0],minx1[1],minx1[2],minx1[3],
                    minx1[0],minx1[1],minx1[2],minx1[3],
                    minx1[0],minx1[1],minx1[2],minx1[3],
                    minx1[0],minx1[1],minx1[2],minx1[3]
            };

            alignas(32)
            float tileMinX2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    minx2[0],minx2[0],minx2[0],minx2[0],
                    minx2[1],minx2[1],minx2[1],minx2[1],
                    minx2[2],minx2[2],minx2[2],minx2[2],
                    minx2[3],minx2[3],minx2[3],minx2[3]
            };

            alignas(32)
            float tileMinY1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    miny1[0],miny1[1],miny1[2],miny1[3],
                    miny1[0],miny1[1],miny1[2],miny1[3],
                    miny1[0],miny1[1],miny1[2],miny1[3],
                    miny1[0],miny1[1],miny1[2],miny1[3]
            };

            alignas(32)
            float tileMinY2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    miny2[0],miny2[0],miny2[0],miny2[0],
                    miny2[1],miny2[1],miny2[1],miny2[1],
                    miny2[2],miny2[2],miny2[2],miny2[2],
                    miny2[3],miny2[3],miny2[3],miny2[3]
            };

            alignas(32)
            float tileMinZ1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    minz1[0],minz1[1],minz1[2],minz1[3],
                    minz1[0],minz1[1],minz1[2],minz1[3],
                    minz1[0],minz1[1],minz1[2],minz1[3],
                    minz1[0],minz1[1],minz1[2],minz1[3]
            };

            alignas(32)
            float tileMinZ2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    minz2[0],minz2[0],minz2[0],minz2[0],
                    minz2[1],minz2[1],minz2[1],minz2[1],
                    minz2[2],minz2[2],minz2[2],minz2[2],
                    minz2[3],minz2[3],minz2[3],minz2[3]
            };











            alignas(32)
            float tileMaxX1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    maxx1[0],maxx1[1],maxx1[2],maxx1[3],
                    maxx1[0],maxx1[1],maxx1[2],maxx1[3],
                    maxx1[0],maxx1[1],maxx1[2],maxx1[3],
                    maxx1[0],maxx1[1],maxx1[2],maxx1[3]
            };

            alignas(32)
            float tileMaxX2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    maxx2[0],maxx2[0],maxx2[0],maxx2[0],
                    maxx2[1],maxx2[1],maxx2[1],maxx2[1],
                    maxx2[2],maxx2[2],maxx2[2],maxx2[2],
                    maxx2[3],maxx2[3],maxx2[3],maxx2[3]
            };

            alignas(32)
            float tileMaxY1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    maxy1[0],maxy1[1],maxy1[2],maxy1[3],
                    maxy1[0],maxy1[1],maxy1[2],maxy1[3],
                    maxy1[0],maxy1[1],maxy1[2],maxy1[3],
                    maxy1[0],maxy1[1],maxy1[2],maxy1[3]
            };

            alignas(32)
            float tileMaxY2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    maxy2[0],maxy2[0],maxy2[0],maxy2[0],
                    maxy2[1],maxy2[1],maxy2[1],maxy2[1],
                    maxy2[2],maxy2[2],maxy2[2],maxy2[2],
                    maxy2[3],maxy2[3],maxy2[3],maxy2[3]
            };

            alignas(32)
            float tileMaxZ1[16]={
                    // 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3
                    maxz1[0],maxz1[1],maxz1[2],maxz1[3],
                    maxz1[0],maxz1[1],maxz1[2],maxz1[3],
                    maxz1[0],maxz1[1],maxz1[2],maxz1[3],
                    maxz1[0],maxz1[1],maxz1[2],maxz1[3]
            };

            alignas(32)
            float tileMaxZ2[16]={
                    // 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
                    maxz2[0],maxz2[0],maxz2[0],maxz2[0],
                    maxz2[1],maxz2[1],maxz2[1],maxz2[1],
                    maxz2[2],maxz2[2],maxz2[2],maxz2[2],
                    maxz2[3],maxz2[3],maxz2[3],maxz2[3]
            };




            for(int i=0;i<16;i++)
                result[i] =  (tileId1[i] < tileId2[i]);

            for(int i=0;i<16;i++)
                result[i] = result[i] &&
                intersectDim(tileMinX1[i], tileMaxX1[i], tileMinX2[i], tileMaxX2[i]) &&
                intersectDim(tileMinY1[i], tileMaxY1[i], tileMinY2[i], tileMaxY2[i]) &&
                intersectDim(tileMinZ1[i], tileMaxZ1[i], tileMinZ2[i], tileMaxZ2[i]);

            for(int i=0;i<16;i++)
                out[i]=result[i];
}

#include<iostream>
int main()
{
    int tile1[4];int tile2[4];
    float tile3[4];float tile4[4];
    float tile5[4];float tile6[4];
    float tile7[4];float tile8[4];
    float tile9[4];float tile10[4];
    float tile11[4];float tile12[4];
    float tile13[4];float tile14[4];
    for(int i=0;i<4;i++)
    {
        std::cin>>tile1[i];
        std::cin>>tile2[i];
        std::cin>>tile3[i];
        std::cin>>tile4[i];
        std::cin>>tile5[i];
        std::cin>>tile6[i];
        std::cin>>tile7[i];
        std::cin>>tile8[i];
        std::cin>>tile9[i];
        std::cin>>tile10[i];
        std::cin>>tile11[i];
        std::cin>>tile12[i];
        std::cin>>tile13[i];
        std::cin>>tile14[i];
    }
    int out[16];
    comp4vs4(tile1,tile2,tile3,tile4,tile5,tile6,tile7,tile8,tile9,
        tile10,tile11,tile12,tile13,tile14,out);

    for(int i=0;i<16;i++)
        std::cout<<out[i];
    return 0;
}

and its output from godbolt:

comp4vs4(int const*, int const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, float const*, int*):
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        sub     rsp, 8
        mov     rax, QWORD PTR [rbp+80]
        vmovups xmm0, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+16]
        vmovups xmm6, XMMWORD PTR [rcx]
        vmovups xmm5, XMMWORD PTR [r9]
        vmovups xmm9, XMMWORD PTR [r8]
        vmovdqu xmm15, XMMWORD PTR [rsi]
        vmovdqu xmm8, XMMWORD PTR [rdi]
        vmovups xmm2, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+24]
        vpermilps       xmm1, xmm6, 0
        vmovdqa XMMWORD PTR [rsp-88], xmm15
        vmovups xmm4, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+32]
        vmovups xmm14, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+40]
        vmovups xmm11, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+48]
        vcmpleps        xmm3, xmm1, xmm14
        vmovups xmm13, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+56]
        vpermilps       xmm1, xmm11, 0
        vcmpleps        xmm1, xmm0, xmm1
        vmovups xmm10, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+64]
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm5, 0
        vcmpleps        xmm3, xmm3, xmm13
        vmovups xmm7, XMMWORD PTR [rdx]
        mov     rdx, QWORD PTR [rbp+72]
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm10, 0
        vmovaps XMMWORD PTR [rsp-72], xmm7
        vcmpleps        xmm3, xmm9, xmm3
        vmovups xmm7, XMMWORD PTR [rdx]
        vpand   xmm1, xmm1, xmm3
        vpshufd xmm3, xmm15, 0
        vpcomltd        xmm3, xmm8, xmm3
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm7, 0
        vcmpleps        xmm12, xmm2, xmm3
        vpermilps       xmm3, xmm4, 0
        vcmpleps        xmm3, xmm3, XMMWORD PTR [rsp-72]
        vpand   xmm3, xmm3, xmm12
        vmovdqa xmm12, XMMWORD PTR .LC0[rip]
        vpand   xmm3, xmm3, xmm12
        vpand   xmm1, xmm1, xmm3
        vmovdqa XMMWORD PTR [rsp-104], xmm1
        vpermilps       xmm1, xmm6, 85
        vcmpleps        xmm3, xmm1, xmm14
        vpermilps       xmm1, xmm11, 85
        vcmpleps        xmm1, xmm0, xmm1
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm5, 85
        vcmpleps        xmm3, xmm3, xmm13
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm10, 85
        vcmpleps        xmm3, xmm9, xmm3
        vpand   xmm1, xmm1, xmm3
        vpshufd xmm3, xmm15, 85
        vpermilps       xmm15, xmm4, 85
        vpcomltd        xmm3, xmm8, xmm3
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm7, 85
        vcmpleps        xmm15, xmm15, XMMWORD PTR [rsp-72]
        vcmpleps        xmm3, xmm2, xmm3
        vpand   xmm3, xmm3, xmm15
        vpermilps       xmm15, xmm4, 170
        vpand   xmm3, xmm3, xmm12
        vpermilps       xmm4, xmm4, 255
        vcmpleps        xmm15, xmm15, XMMWORD PTR [rsp-72]
        vpand   xmm1, xmm1, xmm3
        vcmpleps        xmm4, xmm4, XMMWORD PTR [rsp-72]
        vmovdqa XMMWORD PTR [rsp-120], xmm1
        vpermilps       xmm1, xmm6, 170
        vpermilps       xmm6, xmm6, 255
        vcmpleps        xmm3, xmm1, xmm14
        vpermilps       xmm1, xmm11, 170
        vpermilps       xmm11, xmm11, 255
        vcmpleps        xmm6, xmm6, xmm14
        vcmpleps        xmm1, xmm0, xmm1
        vcmpleps        xmm11, xmm0, xmm11
        vpshufd xmm0, XMMWORD PTR [rsp-88], 255
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm5, 170
        vpermilps       xmm5, xmm5, 255
        vcmpleps        xmm3, xmm3, xmm13
        vpand   xmm6, xmm11, xmm6
        vcmpleps        xmm13, xmm5, xmm13
        vmovdqa xmm5, XMMWORD PTR [rsp-104]
        vpand   xmm1, xmm1, xmm3
        vpermilps       xmm3, xmm10, 170
        vpermilps       xmm10, xmm10, 255
        vcmpleps        xmm3, xmm9, xmm3
        vpand   xmm6, xmm6, xmm13
        vmovdqu XMMWORD PTR [rax], xmm5
        vcmpleps        xmm9, xmm9, xmm10
        vpand   xmm1, xmm1, xmm3
        vpshufd xmm3, XMMWORD PTR [rsp-88], 170
        vpand   xmm9, xmm6, xmm9
        vpcomltd        xmm3, xmm8, xmm3
        vpand   xmm1, xmm1, xmm3
        vpcomltd        xmm8, xmm8, xmm0
        vmovdqa xmm0, XMMWORD PTR [rsp-120]
        vpermilps       xmm3, xmm7, 170
        vpermilps       xmm7, xmm7, 255
        vcmpleps        xmm3, xmm2, xmm3
        vpand   xmm8, xmm9, xmm8
        vcmpleps        xmm2, xmm2, xmm7
        vmovdqu XMMWORD PTR [rax+16], xmm0
        vpand   xmm3, xmm3, xmm15
        vpand   xmm2, xmm2, xmm4
        vpand   xmm3, xmm3, xmm12
        vpand   xmm12, xmm2, xmm12
        vpand   xmm3, xmm1, xmm3
        vpand   xmm12, xmm8, xmm12
        vmovdqu XMMWORD PTR [rax+32], xmm3
        vmovdqu XMMWORD PTR [rax+48], xmm12
        leave
        ret
main: // character limit 30k

It is ~123 lines of vector instructions. Since it runs somewhat ok performance, I tried to simplify it with simple bitwise operations:

https://godbolt.org/z/zKqe49a73

inline
const int intersectDim(const float minx, const float maxx, const float minx2, const float maxx2) noexcept
{
    return !((maxx < minx2) || (maxx2 < minx));
}


inline
void comp4vs4(  const int * const __restrict__ partId1, const int * const __restrict__ partId2,
                    const float * const __restrict__ minx1, const float * const __restrict__ minx2,
                    const float * const __restrict__ miny1, const float * const __restrict__ miny2,
                    const float * const __restrict__ minz1, const float * const __restrict__ minz2,
                    const float * const __restrict__ maxx1, const float * const __restrict__ maxx2,
                    const float * const __restrict__ maxy1, const float * const __restrict__ maxy2,
                    const float * const __restrict__ maxz1, const float * const __restrict__ maxz2,
                    int * const __restrict__ out
                    )
{
        alignas(32)
        int result[16]={
            // 0v0 0v1 0v2 0v3
            // 1v0 1v1 1v2 1v3
            // 2v0 2v1 2v2 2v3
            // 3v0 3v1 3v2 3v3
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0
        };



        for(int i=0;i<16;i++)
            result[i] = partId1[i&3]<partId2[i/4];


        for(int i=0;i<16;i++)
            result[i] = result[i] &&
            intersectDim(minx1[i&3], maxx1[i&3], minx2[i/4], maxx2[i/4]) &&
            intersectDim(miny1[i&3], maxy1[i&3], miny2[i/4], maxy2[i/4]) &&
            intersectDim(minz1[i&3], maxz1[i&3], minz2[i/4], maxz2[i/4]);


        for(int i=0;i<16;i++)
            out[i]=result[i];
}

#include<iostream>
int main()
{
    int tile1[4];int tile2[4];
    float tile3[4];float tile4[4];
    float tile5[4];float tile6[4];
    float tile7[4];float tile8[4];
    float tile9[4];float tile10[4];
    float tile11[4];float tile12[4];
    float tile13[4];float tile14[4];
    for(int i=0;i<4;i++)
    {
        std::cin>>tile1[i];
        std::cin>>tile2[i];
        std::cin>>tile3[i];
        std::cin>>tile4[i];
        std::cin>>tile5[i];
        std::cin>>tile6[i];
        std::cin>>tile7[i];
        std::cin>>tile8[i];
        std::cin>>tile9[i];
        std::cin>>tile10[i];
        std::cin>>tile11[i];
        std::cin>>tile12[i];
        std::cin>>tile13[i];
        std::cin>>tile14[i];
    }
    int out[16];
    comp4vs4(tile1,tile2,tile3,tile4,tile5,tile6,tile7,tile8,tile9,
        tile10,tile11,tile12,tile13,tile14,out);

    for(int i=0;i<16;i++)
        std::cout<<out[i];
    return 0;
}

how godbolt outputs:

main:
       // character limit 30k
        vpxor   xmm0, xmm0, xmm0
        vmovdqa xmm3, XMMWORD PTR .LC0[rip]
        lea     rax, [rsp+240]
        vpxor   xmm4, xmm4, xmm4
        vmovdqa XMMWORD PTR [rsp+224], xmm0
        vmovdqa XMMWORD PTR [rsp+240], xmm0
        vmovdqa XMMWORD PTR [rsp+256], xmm0
        vmovdqa XMMWORD PTR [rsp+272], xmm0
        vpcmpeqd        xmm0, xmm0, xmm0
        vmovdqa xmm7, xmm0
        vmovdqa xmm6, xmm0
        vmovdqa xmm5, xmm0
        vpgatherdd      xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
        vmovdqa xmm4, XMMWORD PTR .LC1[rip]
        vpgatherdd      xmm1, DWORD PTR [rdx+xmm3*4], xmm6
        vmovdqa xmm7, xmm0
        vmovdqa xmm6, xmm0
        vpcomltd        xmm1, xmm1, xmm2
        vpand   xmm1, xmm1, xmm4
        vmovdqa XMMWORD PTR [rsp+224], xmm1
        vpgatherdd      xmm1, DWORD PTR [rdx+xmm3*4], xmm6
        vpgatherdd      xmm2, DWORD PTR [rsp+16+xmm4*4], xmm7
        vmovdqa xmm6, xmm0
        vmovdqa xmm7, xmm0
        vpcomltd        xmm1, xmm1, xmm2
        vpand   xmm1, xmm1, xmm4
        vmovdqa XMMWORD PTR [rsp+240], xmm1
        vpgatherdd      xmm1, DWORD PTR [rdx+xmm3*4], xmm5
        vmovdqa xmm5, XMMWORD PTR .LC2[rip]
        vpgatherdd      xmm2, DWORD PTR [rsp+16+xmm5*4], xmm6
        vmovdqa xmm5, XMMWORD PTR .LC3[rip]
        vmovdqa xmm6, xmm0
        vpcomltd        xmm1, xmm1, xmm2
        vpand   xmm1, xmm1, xmm4
        vmovdqa XMMWORD PTR [rsp+256], xmm1
        vpgatherdd      xmm1, DWORD PTR [rdx+xmm3*4], xmm7
        vpgatherdd      xmm0, DWORD PTR [rsp+16+xmm5*4], xmm6
        vmovdqa xmm7, XMMWORD PTR .LC4[rip]
        vpxor   xmm6, xmm6, xmm6
        lea     rdx, [rsp+304]
        vpcomltd        xmm0, xmm1, xmm0
        vpand   xmm0, xmm0, xmm4
        vmovdqa XMMWORD PTR [rsp+272], xmm0
.L3:
        vmovdqa xmm0, XMMWORD PTR [rax-16]
        vmovdqa xmm2, xmm3
        prefetcht0      [rax]
        add     rax, 16
        vpaddd  xmm3, xmm3, xmm7
        vpsrad  xmm8, xmm2, 2
        vpand   xmm2, xmm2, xmm5
        vpcomneqd       xmm1, xmm0, xmm6
        vmovaps xmm0, xmm1
        vmovaps xmm11, xmm1
        vmovaps xmm12, xmm1
        vmovaps xmm13, xmm1
        vgatherdps      xmm11, DWORD PTR [rsp+144+xmm8*4], xmm0
        vmovaps xmm14, xmm1
        vmovaps xmm0, xmm1
        vmovaps xmm10, xmm1
        vmovaps xmm9, xmm1
        vgatherdps      xmm10, DWORD PTR [rsp+128+xmm2*4], xmm13
        vgatherdps      xmm0, DWORD PTR [r13+0+xmm8*4], xmm12
        vgatherdps      xmm9, DWORD PTR [rsp+32+xmm2*4], xmm14
        vcmpleps        xmm0, xmm0, xmm10
        vcmpleps        xmm9, xmm9, xmm11
        vpand   xmm0, xmm0, xmm9
        vpand   xmm1, xmm0, xmm1
        vmovaps xmm0, xmm1
        vmovaps xmm11, xmm1
        vmovaps xmm15, xmm1
        vmovaps xmm10, xmm1
        vgatherdps      xmm11, DWORD PTR [r15+xmm8*4], xmm0
        vmovaps xmm12, xmm1
        vmovaps xmm0, xmm1
        vmovaps xmm9, xmm1
        vmovaps xmm13, xmm1
        vgatherdps      xmm10, DWORD PTR [r12+xmm2*4], xmm12
        vgatherdps      xmm0, DWORD PTR [rsp+80+xmm8*4], xmm15
        vgatherdps      xmm9, DWORD PTR [rsp+64+xmm2*4], xmm13
        vcmpleps        xmm0, xmm0, xmm10
        vcmpleps        xmm9, xmm9, xmm11
        vpand   xmm0, xmm0, xmm9
        vpand   xmm0, xmm0, xmm1
        vmovaps xmm1, xmm0
        vmovaps xmm10, xmm0
        vmovaps xmm9, xmm0
        vmovaps xmm14, xmm0
        vgatherdps      xmm10, DWORD PTR [rsp+208+xmm8*4], xmm1
        vmovaps xmm1, xmm0
        vgatherdps      xmm9, DWORD PTR [r14+xmm8*4], xmm1
        vmovaps xmm1, xmm0
        vmovaps xmm8, xmm0
        vgatherdps      xmm8, DWORD PTR [rsp+192+xmm2*4], xmm1
        vmovaps xmm1, xmm0
        vgatherdps      xmm1, DWORD PTR [rsp+96+xmm2*4], xmm14
        vcmpleps        xmm2, xmm9, xmm8
        vcmpleps        xmm1, xmm1, xmm10
        vpand   xmm1, xmm1, xmm2
        vpand   xmm1, xmm1, xmm4
        vpand   xmm0, xmm0, xmm1
        vmovdqa XMMWORD PTR [rax-32], xmm0
        cmp     rdx, rax
        jne     .L3
        vmovdqa xmm5, XMMWORD PTR [rsp+224]
        vmovdqa xmm7, XMMWORD PTR [rsp+240]
        vmovdqa xmm4, XMMWORD PTR [rsp+256]
        lea     rbx, [rsp+288]
        lea     r12, [rsp+352]
        vmovdqa XMMWORD PTR [rsp+288], xmm5
        vmovdqa xmm5, XMMWORD PTR [rsp+272]
        vmovdqa XMMWORD PTR [rsp+304], xmm7
        vmovdqa XMMWORD PTR [rsp+320], xmm4
        vmovdqa XMMWORD PTR [rsp+336], xmm5
.L4:
        // character limit 30k 
       

it has ~110 lines of vector instructions. Despite having less instructions than first version, it runs at half performance (at least on bdver1 compiler flag). Is it because of "AND" and division operations for indexing?

Also, parameters using the restrict keyword are pointing to same memory occasionally. Could this be a problem for performance?

If it helps, here are performance-test source codes on some online-service with avx512-cpu (maximum 32 AABBs per leaf node):

A bit less performance difference when 128 AABBs per leaf node(tested in godbolt server):

0

There are 0 best solutions below