cooperative_groups::this_grid() causes any CUDA API call to return 'unknown error'


Following the same steps as the CUDA samples to launch a kernel and sync across the grid using cooperative_groups::this_grid().sync() causes any CUDA API call to fail, while using cooperative_groups::this_thread_block().sync() works fine and gives correct results.

I used the following code and CMakeLists.txt (cmake version 3.11.1) to test it with CUDA 10 on a TITAN V GPU (driver version 410.73) running Ubuntu 16.04.5 LTS. The code is also available on GitHub to make it easy to reproduce the error.

The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]). To do this, each thread reads a single element from the array, syncs, and then writes its element to the right destination. The code can easily be modified to verify that this_thread_block().sync() works fine: simply change arr_size to be less than 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead.

test_cg.cu

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

//********************** CUDA_ERROR
//Error-handling macro; wrap it around CUDA calls whenever possible
inline void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);

#ifdef _WIN32
        system("pause");
#else
        exit(EXIT_FAILURE);
#endif
    }
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************


//********************** cg kernel 
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
    uint32_t *d_arr){
    uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;

    //read my element (guarded); note that every thread must still reach
    //the sync below, since calling sync() on a grid-wide barrier from
    //inside a divergent branch is not allowed
    uint32_t my_element = (tid < num_elements) ? d_arr[tid] : 0;

    //to sync across the whole grid 
    cg::grid_group barrier = cg::this_grid();

    //to sync within a single block 
    //cg::thread_block barrier = cg::this_thread_block();

    //wait for all reads 
    barrier.sync();

    if (tid < num_elements){
        uint32_t tar_id = num_elements - tid - 1;
        d_arr[tar_id] = my_element;
    }
}
//******************************************************************************


//********************** execute  
void execute_test(const int sm_count){

    //host array 
    const uint32_t arr_size = 1 << 20; //1M 
    uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
    //fill with sequential numbers
    std::iota(h_arr, h_arr + arr_size, 0);

    //device array 
    uint32_t* d_arr;
    CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
    CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyHostToDevice));

    //launch config
    const int threads = 512;

    //following the same steps done in conjugateGradientMultiBlockCG.cu 
    //cuda sample to launch kernel that sync across grid 
    //https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436

    int num_blocks_per_sm = 0;
    CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
        (void*)testing_cg_grid_sync, threads, 0));

    dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);

    if(arr_size > grid_dim.x*block_dim.x){
         printf("\n The grid size (numBlocks*numThreads) is less than array size.\n");
         exit(EXIT_FAILURE);
    }
    printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
        block_dim.x);

    //argument passed to the kernel     
    void *kernel_args[] = {
        (void *)&arr_size,
        (void *)&d_arr, };


    //finally launch the kernel 
    CUDA_ERROR(cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
        grid_dim, block_dim, kernel_args));


    //make sure everything went okay
    CUDA_ERROR(cudaGetLastError());
    CUDA_ERROR(cudaDeviceSynchronize());


    //get results on the host 
    CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyDeviceToHost));

    //validate 
    for (uint32_t i = 0; i < arr_size; i++){
        if (h_arr[i] != arr_size - i - 1){
            printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
            exit(EXIT_FAILURE);
        }
    }

    //cleanup
    free(h_arr);
    CUDA_ERROR(cudaFree(d_arr));
}
//******************************************************************************

int main(int argc, char**argv) {

    //set to Titan V
    int device_id = 0;
    CUDA_ERROR(cudaSetDevice(device_id));

    //get sm count 
    cudaDeviceProp devProp;
    CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
    int sm_count = devProp.multiProcessorCount;

    //execute 
    execute_test(sm_count);

    printf("\n Mission accomplished \n");
    return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.8 FATAL_ERROR)

set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)  

#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
    set(CMAKE_BUILD_TYPE Release)
endif ()

SET(CUDA_SEPARABLE_COMPILATION ON)

########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################


########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE  ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)    
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################

Running this code gives:

unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67

This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device; the result is 700. Kindly let me know if you spot anything wrong in the code or the CMakeLists.txt file.
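
For reference, here is a minimal sketch of how both sanity checks can be done in a standalone program: querying which architecture the kernel is actually compiled for, and querying whether the device supports cooperative launch at all. It relies only on the standard cudaDevAttrCooperativeLaunch device attribute and the __CUDA_ARCH__ macro:

#include <stdio.h>
#include <cuda_runtime_api.h>

//prints the architecture this kernel was compiled for (e.g. 700 for sm_70)
__global__ void print_arch(){
    if (threadIdx.x == 0 && blockIdx.x == 0){
        printf("__CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
    }
}

int main(){
    int device_id = 0, coop_launch = 0;

    //1 if the device supports cudaLaunchCooperativeKernel, 0 otherwise
    cudaDeviceGetAttribute(&coop_launch, cudaDevAttrCooperativeLaunch,
        device_id);
    printf("cudaDevAttrCooperativeLaunch = %d\n", coop_launch);

    print_arch<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}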

1 Answer

With external help, the solution that got the code working is to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS ...) line. The --cudart shared part is needed because I only have libcudadevrt.a under my /usr/local/cuda-10.0/lib64/, so I have to tell CUDA to link against the shared/dynamic runtime library (the default is to link against the static one). The -gencode part is needed because the sm_70 flag was otherwise not passed to the linker properly.

Additionally, using only CUDA_NVCC_FLAGS passes the sm_70 information to the compiler but not to the linker, while using only CMAKE_CUDA_FLAGS reports error: namespace "cooperative_groups" has no member "grid_group".
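
As a side note on the symptom itself: an architecture or linking problem like this tends to surface at the very first CUDA runtime API call (here the cudaMalloc), because that call is what triggers context initialization. Below is a minimal sketch that uses the common cudaFree(0) idiom to force initialization up front, so the failure is reported explicitly at startup:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>

int main(){
    //cudaFree(0) is effectively a no-op that forces CUDA context creation,
    //so an initialization failure is reported here instead of at the first
    //"real" API call such as cudaMalloc
    cudaError_t err = cudaFree(0);
    if (err != cudaSuccess){
        printf("CUDA initialization failed: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    printf("CUDA context initialized successfully\n");
    return 0;
}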