I have run the example in the CUDA documentation,but I get unexpected results. So how to use the wmma function? Is my wmma::load_matrix_sync
wrong? Or something else we should notice?...
WMMA_M,WMMA_N,WMMA_K = 16
__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
//Declare the fragment
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,float> acc_frag;
//Load the matrix to fragment
wmma::load_matrix_sync(a_frag, a, WMMA_M);
wmma::load_matrix_sync(b_frag, b, WMMA_K);
//perform mma
wmma::fill_fragment(acc_frag, 0.0f);
for(int i=0; i<1e4; i++)
{
wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
}
//store the result
wmma::store_matrix_sync(c, acc_frag, WMMA_M, wmma::mem_row_major);
}
... I set the value of element (d_a[i] d_b[i]) to 1.0f, and c[i]=0.0f. After performing the wmma_kernel function, the c[i] is still 0.0f,and elapsedTime is also 0.0f.
matrix_size = 16 x 16
//create the event
cudaEvent_t start, stop;
CUDA_CHECK_RETURN(cudaEventCreate(&start));
CUDA_CHECK_RETURN(cudaEventCreate(&stop));
//perform the wmma_kernel
CUDA_CHECK_RETURN(cudaEventRecord(start));
wmma_kernel<<<1,256>>>(d_a, d_b, d_c, matrix_size);
CUDA_CHECK_RETURN(cudaEventRecord(stop));
CUDA_CHECK_RETURN(cudaEventSynchronize(stop));
//calculate the elapsed time
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed Time : %f\n",elapsedTime);
I would suggest switching to CUDA 10. It has made
half
datatype considerably easier to use.However, the following example should work similarly whether using CUDA 9.2 or CUDA 10:
To get the kernel time measurement, you could use
cudaEvent
based timing, but it seems easier to me just to usenvprof
: