Extra register usage with if

I was working on a large CUDA kernel and noticed that it was using 43 registers per thread. To understand what was going on, I wrote a smaller program to study register usage. I noticed that whenever I use an if statement, register usage goes up. The small code is as follows:

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>

using namespace std;

// MAX_COMP and THREADSPERBLOCK must be defined at compile time (e.g. via -D on the nvcc command line)
__global__ void test_ifs(unsigned int* result){
  unsigned int k = 0;
  for(int j=0;j<MAX_COMP;j++){
    //if(j <= threadIdx.x%MAX_COMP){
      k += j;
      //}
  }
  result[threadIdx.x] = k;
}

int main(){
  unsigned int* result;
  cudaError_t e1 = cudaMalloc((void**) &result, THREADSPERBLOCK*sizeof(unsigned int));
  if(e1 == cudaSuccess){
    test_ifs<<<1, THREADSPERBLOCK>>>(result);
    cudaError_t e2 = cudaGetLastError();
    if(e2 == cudaSuccess){
    }
    else{
      cout << "kernel failed to launch" << endl;
    }
  }
  else{
    cout << "Failed to allocate results memory" << endl;
  }
}
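
(The ptxas reports below were produced with register usage reporting enabled. The original compile command isn't shown, so the -D values and file name in the following line are assumptions, but a command along these lines

nvcc -arch=sm_20 -Xptxas -v -DMAX_COMP=32 -DTHREADSPERBLOCK=256 -c test_ifs.cu

produces this kind of report.)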

When I compile this code, each thread uses 5 registers:

ptxas info    : Compiling entry function '_Z8test_ifsPj' for 'sm_20'
ptxas info    : Function properties for _Z8test_ifsPj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 5 registers, 40 bytes cmem[0]

But if I uncomment the if statement, each thread uses 8 registers. Can anyone explain what is going on?

ptxas info    : Compiling entry function '_Z8test_ifsPj' for 'sm_20'
ptxas info    : Function properties for _Z8test_ifsPj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 40 bytes cmem[0]
1 Answer

The behaviour you are seeing in this example is due to compiler optimization. In the simple loop case, the result of the loop can be calculated at compile time, so the compiler replaces the entire loop with a constant. In the case containing the if statement, the result depends on threadIdx.x, whose value is not known to the compiler, so the loop must be kept and executed at run time.

To prove this is the case, let's look at a slightly modified version of your kernel:

#define MAX_COMP (32)

template<unsigned int s>
__global__ void test_ifs(unsigned int * result){
    unsigned int k = 0;
    for(int j=0;j<MAX_COMP;j++){
        switch (s) {
            case 1:
                // conditional version: only accumulate when the thread-dependent test passes
                if (j <= threadIdx.x%MAX_COMP){ k += j; }
                break;

            case 0:
                // unconditional version: plain accumulation
                { k += j; }
        }
    }
    result[threadIdx.x] = k;
}

template __global__ void test_ifs<0>(unsigned int *);
template __global__ void test_ifs<1>(unsigned int *);

and the PTX it emits (the explicit instantiations force both versions of the kernel to be generated so their code can be compared). For the first case, the loop has been folded away completely:

    .entry _Z8test_ifsILj0EEvPj (
        .param .u32 __cudaparm__Z8test_ifsILj0EEvPj_result)
    {
    .reg .u16 %rh<3>;
    .reg .u32 %r<6>;
    .loc    14  4   0
$LDWbegin__Z8test_ifsILj0EEvPj:
    .loc    14  16  0
    mov.u32     %r1, 496;  <--- here the loop has been replaced with 496
    ld.param.u32    %r2, [__cudaparm__Z8test_ifsILj0EEvPj_result];
    mov.u16     %rh1, %tid.x;
    mul.wide.u16    %r3, %rh1, 4;
    add.u32     %r4, %r2, %r3;
    st.global.u32   [%r4+0], %r1;
    .loc    14  17  0
    exit;
$LDWend__Z8test_ifsILj0EEvPj:
    } // _Z8test_ifsILj0EEvPj
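
Here 496 is just 0 + 1 + ... + 31 = 31*32/2, the value of k that every thread would compute, so the compiler replaces the whole loop with that constant and only needs a handful of registers for the address arithmetic and the store.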

In the second case, the loop remains intact:

    .entry _Z8test_ifsILj1EEvPj (
        .param .u32 __cudaparm__Z8test_ifsILj1EEvPj_result)
    {
    .reg .u32 %r<11>;
    .reg .pred %p<4>;
    .loc    14  4   0
$LDWbegin__Z8test_ifsILj1EEvPj:
    cvt.u32.u16     %r1, %tid.x;
    and.b32     %r2, %r1, 31;
    mov.s32     %r3, 0;
    mov.u32     %r4, 0;
$Lt_1_3842:
 //<loop> Loop body line 4, nesting depth: 1, iterations: 32
    .loc    14  7   0
    add.u32     %r5, %r3, %r4;
    setp.le.u32     %p1, %r3, %r2;
    selp.u32    %r4, %r5, %r4, %p1;
    add.s32     %r3, %r3, 1;
    mov.u32     %r6, 32;
    setp.ne.s32     %p2, %r3, %r6;
    @%p2 bra    $Lt_1_3842;
    .loc    14  16  0
    ld.param.u32    %r7, [__cudaparm__Z8test_ifsILj1EEvPj_result];
    mul24.lo.u32    %r8, %r1, 4;
    add.u32     %r9, %r7, %r8;
    st.global.u32   [%r9+0], %r4;
    .loc    14  17  0
    exit;
$LDWend__Z8test_ifsILj1EEvPj:
    } // _Z8test_ifsILj1EEvPj
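
Note that the if hasn't even been compiled to a branch here: the setp.le.u32/selp.u32 pair predicates the accumulation instead. The extra registers come from the state that must now stay live across the loop iterations: the thread's masked index (%r2), the loop counter (%r3), the running sum (%r4), a temporary for the candidate sum (%r5) and the predicates, none of which are needed in the constant-folded version.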

You should not conclude that differences in register usage will always be due to compiler optimization, because that depends strongly on the code and on the compiler. But in this case, that is the explanation.