Avx2 intrinsics don't use all registers available. .NET 8

87 Views Asked by At

I have optimised certain algorithms using SIMD, to the point where they are limited by L1 cache latency. For reasons known only to the C# compiler, the JIT inexplicably emits code that uses only ymm0 and ignores the other 15 ymmX registers. Indeed, after every operation it appears to store the result back to memory (a stack slot, so L1 cache) and then immediately reload it from the very same location.

Below is the actual JIT output, followed by the output I would expect given my current understanding.

When observing disassembly:

    mov         qword ptr [rbp-0B0h],rcx                                
Vector256<ulong> Accum_vec = Avx2.BroadcastScalarToVector256(&zero);
    vpbroadcastq ymm0,mmword ptr [rbp-0B0h]  
    vmovups     ymmword ptr [rbp-0D0h],ymm0  

Vector256<ulong> A_vec = Avx2.LoadVector256(A_Ptr);
    mov         rcx,qword ptr [rbp-60h]  
    vmovups     ymm0,ymmword ptr [rcx]  
    vmovups     ymmword ptr [rbp-1B0h],ymm0 
    
Vector256<ulong> B_vec = Avx2.LoadVector256(B_Ptr);
    mov         rcx,qword ptr [rbp-70h]  
    vmovups     ymm0,ymmword ptr [rcx]  
    vmovups     ymmword ptr [rbp-0F0h],ymm0  
    
Vector256<ulong> C_vec = Avx2.LoadVector256(C_Ptr);
    mov         rcx,qword ptr [rbp-80h]  
    vmovups     ymm0,ymmword ptr [rcx]  
    vmovups     ymmword ptr [rbp-110h],ymm0  
    
Vector256<ulong> D_vec = Avx2.LoadVector256(D_Ptr);
    mov         rcx,qword ptr [rbp-90h]  
    vmovups     ymm0,ymmword ptr [rcx]  
    vmovups     ymmword ptr [rbp-130h],ymm0  
    
Accum_vec = Avx2.Add(A_vec, B_vec);
    vmovups     ymm0,ymmword ptr [rbp-1B0h]  
    vpaddq      ymm0,ymm0,ymmword ptr [rbp-0F0h]  
    vmovups     ymmword ptr [rbp-0D0h],ymm0  
    
Accum_vec = Avx2.Add(Accum_vec, C_vec);
    vmovups     ymm0,ymmword ptr [rbp-0D0h]  
    vpaddq      ymm0,ymm0,ymmword ptr [rbp-110h]  
    vmovups     ymmword ptr [rbp-0D0h],ymm0  
    
Accum_vec = Avx2.Add(Accum_vec, D_vec);
    vmovups     ymm0,ymmword ptr [rbp-0D0h]  
    vpaddq      ymm0,ymm0,ymmword ptr [rbp-130h]  
    vmovups     ymmword ptr [rbp-0D0h],ymm0  
    
Avx2.Store(store_Ptr, Accum_vec);
    mov         rcx,qword ptr [rbp-0A0h]  
    vmovups     ymm0,ymmword ptr [rbp-0D0h]  
    vmovups     ymmword ptr [rcx],ymm0  

Surely it should be:

    mov         qword ptr [rbp-0B0h],rcx  
Vector256<ulong> Accum_vec = Avx2.BroadcastScalarToVector256(&zero);
    vpbroadcastq ymm0,mmword ptr [rbp-0B0h]

Vector256<ulong> A_vec = Avx2.LoadVector256(A_Ptr);
    mov         rcx,qword ptr [rbp-60h]  
    vmovups     ymm1,ymmword ptr [rcx]
    
Vector256<ulong> B_vec = Avx2.LoadVector256(B_Ptr);
    mov         rcx,qword ptr [rbp-70h]  
    vmovups     ymm2,ymmword ptr [rcx]
    
Vector256<ulong> C_vec = Avx2.LoadVector256(C_Ptr);
    mov         rcx,qword ptr [rbp-80h]  
    vmovups     ymm3,ymmword ptr [rcx]
    
Vector256<ulong> D_vec = Avx2.LoadVector256(D_Ptr);
    mov         rcx,qword ptr [rbp-90h]  
    vmovups     ymm4,ymmword ptr [rcx] 
    
Accum_vec = Avx2.Add(A_vec, B_vec);
    vpaddq      ymm0, ymm1, ymm2
    
Accum_vec = Avx2.Add(Accum_vec, C_vec);
    vpaddq      ymm0, ymm0, ymm3
    
Accum_vec = Avx2.Add(Accum_vec, D_vec);
    vpaddq      ymm0, ymm0, ymm4
    
Avx2.Store(store_Ptr, Accum_vec);
    mov         rcx,qword ptr [rbp-0A0h] 
    vmovups     ymmword ptr [rcx],ymm0  

Is there a reason C# intrinsics only "understand" ymm0 (and ymm1 and ymm2, given other functions and their arguments)?

Why all the loads and stores, even when the value being reloaded is the same variable that was just stored?

I tried all manner of rearranging the SIMD code to get the JIT to emit code that uses all sixteen AVX2 registers. It didn't work: the JIT still emitted code that only uses ymm0.

0

There are 0 best solutions below