CUBLAS_STATUS_INVALID_VALUE

145 views

I am doing a so-called "tensor contraction" using the cublasSgemmStridedBatched API. I have tensor A of shape 60000×20×9 and tensor B of shape 9×32, both of them row-major. By definition, C = A * B should give me the result tensor C of shape 60000×20×32. The code that I wrote is the following:


    // Batched GEMM computing C = A * B where (all row-major):
    //   A: batch_count x M x K,  B: K x N (shared by every batch entry),
    //   C: batch_count x M x N.
    // cuBLAS is column-major, so a row-major matrix X is seen by cuBLAS as X^T.
    // We therefore compute C^T = B^T * A^T by swapping the operand order and
    // passing the dimensions of C^T (N rows, M columns); no explicit transpose
    // flags are needed.
    int batch_count = 60000;
    int M = 20;  // rows of each A slice / each C slice
    int K = 9;   // contracted (inner) dimension
    int N = 32;  // columns of B and of each C slice

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f;
    float beta = 0.0f;
    // Strides are in elements between consecutive batch slices; the API takes
    // them as long long. strideB == 0 reuses the single B for every batch.
    long long strideA = (long long)M * K;
    long long strideB = 0;
    long long strideC = (long long)M * N;
    // A(60000 * 20 * 9) * B(9 * 32) = C(60000 * 20 * 32)
    cublasStatus_t ret = cublasSgemmStridedBatched(
                              handle,
                              CUBLAS_OP_N, // row-major B already acts as B^T in column-major view
                              CUBLAS_OP_N, // likewise for A
                              N,           // m: rows of C^T
                              M,           // n: columns of C^T
                              K,           // k: contracted dimension
                              &alpha,
                              B.data<float>(), // device pointer
                              N,               // lda: leading dimension of B^T
                              strideB,
                              A.data<float>(), // device pointer
                              K,               // ldb: leading dimension of A^T
                              strideA,
                              &beta,
                              C.data<float>(), // device pointer
                              N,               // ldc: leading dimension of C^T
                              strideC,
                              batch_count);    // BUG FIX: was the undeclared name `batchCount`
    // Check the status before tearing down the handle.
    if (ret != CUBLAS_STATUS_SUCCESS) {
      printf("cublasSgemmStridedBatched failed %d line (%d)\n", ret, __LINE__);
    }
    cublasDestroy(handle);

The above code does not get the work done and keeps printing cublasSgemmStridedBatched failed 7, which according to the manual corresponds to CUBLAS_STATUS_INVALID_VALUE. Any help or suggestion is appreciated!

1

There is 1 answer below.

4
Homer512 On BEST ANSWER

Here is a minimal version that works and tests the result:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <Eigen/Dense>

int main()
{
  // Minimal repro: batched C = A * B with row-major tensors
  //   A: batch_count x M x K,  B: K x N (shared),  C: batch_count x M x N.
  // cuBLAS is column-major, so we compute C^T = B^T * A^T by swapping the
  // operands and passing the dimensions of C^T (N, M, K). The result is then
  // verified against an Eigen reference computed the same (column-major) way.
  cublasHandle_t cubl;
  cublasCreate(&cubl);
  int batch_count = 60000;
  int M = 20;
  int K = 9;
  int N = 32;
  float* A, *B, *C;
  // Managed memory so both the host-side fill/verify loops and the GPU
  // kernel can touch the same buffers without explicit copies.
  cudaMallocManaged(&A, sizeof(float) * batch_count * M * K);
  cudaMallocManaged(&B, sizeof(float) * K * N);
  cudaMallocManaged(&C, sizeof(float) * batch_count * M * N);
  // Deterministic, non-trivial fill so every (b, m, k) entry is distinct.
  for(int b = 0; b < batch_count; ++b)
    for(int m = 0; m < M; ++m)
      for(int k = 0; k < K; ++k)
    A[((b * M) + m) * K + k] = (float)(b + 1) * (m + 2) * (k + 3) / (M*N*K);
  for(int k = 0; k < K; ++k)
    for(int n = 0; n < N; ++n)
      B[k * N + n] = (float) (k + 1) * (n + 2) / (N*K);
  const float alpha = 1.f, beta = 0.f;
  const int strideA = K * M, strideB = 0, strideC = M * N;  // strideB = 0: B reused per batch
  cublasStatus_t ret = cublasSgemmStridedBatched(
    cubl, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
    B, N /*lda*/, strideB,
    A, K /*ldb*/, strideA, &beta,
    C, N /*ldc*/, strideC, batch_count);
  if(ret != CUBLAS_STATUS_SUCCESS)
    std::printf("cublasSgemmStridedBatched failed %d line (%d)\n",
                ret, __LINE__);
  // Synchronize so C is ready for the host-side check; this also surfaces
  // any asynchronous execution error from the batched GEMM.
  cudaError_t curet = cudaDeviceSynchronize();
  std::printf("Device sync: %d\n", curet);  // BUG FIX: previously printed `ret`
  // Eigen reference: since Eigen defaults to column-major, mapping the
  // row-major buffers with swapped dimensions gives B^T (N x K) and
  // A^T (K x M), so B_map * A_map reproduces exactly what cuBLAS computed.
  Eigen::ArrayXXf reference, error = Eigen::ArrayXXf::Zero(N, M);
  const auto B_map = Eigen::MatrixXf::Map(B, N, K);
  for(int b = 0; b < batch_count; ++b) {
    const auto A_map = Eigen::MatrixXf::Map(A + strideA * b, K, M);
    reference.matrix().noalias() = B_map * A_map;
    const auto C_map = Eigen::ArrayXXf::Map(C + strideC * b, N, M);
    // Element-wise relative error, guarded against 0/0 by dividing by the
    // larger of the two magnitudes.
    const auto rel_error = (C_map - reference).abs() /
          C_map.abs().max(reference.abs());
    error = error.max(rel_error);
  }
  std::printf("Max relative error %g\n", error.maxCoeff());
  // Release GPU resources (the original leaked the handle and buffers).
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  cublasDestroy(cubl);
}

Reports a maximum relative error of 2.5e-7