I'm using cuBLAS to create a library for some matrix operations. The first routine I implemented is a matrix multiplication.
Snippet of library header class (.h file)
#include "cusolverDn.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
#include "device_launch_parameters.h" // NOLINT
namespace perception_core {
namespace matrix_transform {
class CudaMatrixTransformations {
public:
CudaMatrixTransformations();
~CudaMatrixTransformations();
void MatrixMultiplicationDouble(double *A, double *B, double *C, const int m, const int k, const int n);
private:
// Cublas stuff
cudaError_t cudaStat1;
cudaError_t cudaStat2;
cublasHandle_t cublasH;
cublasStatus_t cublas_status;
};
} // namespace matrix_transform
} // namespace perception_core
#endif // LIB_CUDA_ROUTINES_INCLUDE_MATRIX_TRANSFORMS_H_
Snippet of library class implementation for multiplication (.cu file)
// This calculates the matrix mult C(m,n) = A(m,k) * B(k,n)
void CudaMatrixTransformations::MatrixMultiplicationDouble(
double *A, double *B, double *C, const int m, const int k, const int n) {
// Calculate size of each array
size_t s_A = m * k;
size_t s_B = k * n;
size_t s_C = m * n;
// Create the arrays to use in the GPU
double *d_A = NULL;
double *d_B = NULL;
double *d_C = NULL;
// Allocate memory
cudaStat1 = cudaMallocManaged(&d_A, s_A * sizeof(double));
cudaStat2 = cudaMallocManaged(&d_B, s_B * sizeof(double));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMallocManaged(&d_C, s_C * sizeof(double));
assert(cudaSuccess == cudaStat1);
// Copy the data to the device data
memcpy(d_A, A, s_A * sizeof(double));
memcpy(d_B, B, s_B * sizeof(double));
// Set up stuff for using CUDA
int lda = m;
int ldb = k;
int ldc = m;
const double alf = 1;
const double bet = 0;
const double *alpha = &alf;
const double *beta = &bet;
cublas_status = cublasCreate(&cublasH);
assert(cublas_status == CUBLAS_STATUS_SUCCESS);
// Perform multiplication
cublas_status = cublasDgemm(cublasH, // CUDA handle
CUBLAS_OP_N, CUBLAS_OP_N, // no operation on matrices
m, n, k, // dimensions in the matrices
alpha, // scalar for multiplication
d_A, lda, // matrix d_A and its leading dim
d_B, ldb, // matrix d_B and its leading dim
beta, // scalar for multiplication
d_C, ldc // matrix d_C and its leading dim
);
cudaStat1 = cudaDeviceSynchronize();
assert(cublas_status == CUBLAS_STATUS_SUCCESS);
assert(cudaSuccess == cudaStat1);
// Destroy the handle
cublasDestroy(cublasH);
C = (double*)malloc(s_C * sizeof(double));
memcpy(C, d_C, s_C * sizeof(double));
// Make sure to free resources
if (d_A) cudaFree(d_A);
if (d_B) cudaFree(d_B);
if (d_C) cudaFree(d_C);
return;
}
CudaMatrixTransformations::CudaMatrixTransformations() {
cublas_status = CUBLAS_STATUS_SUCCESS;
cudaStat1 = cudaSuccess;
cudaStat2 = cudaSuccess;
}
Then I created a gtest program to test my function, where I pass double *result = NULL; as the C parameter to MatrixMultiplicationDouble.
Snippet of gtest program (.cc file)
TEST_F(MatrixTransformsTest, MatrixMultiplication) {
double loc_q[] = {3, 4, 5, 6, 7, 8};
double *q = loc_q;
double loc_w[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
double *w = loc_w;
double *result = NULL;
double loc_result[M_ROWS * M_COLS] = {14, 50, 86, 122, 23, 86, 149, 212};
matrix_result = loc_result;
size_t m = 4;
size_t k = 3;
size_t n = 2;
perception_core::matrix_transform::CudaMatrixTransformations transforms;
transforms.MatrixMultiplicationDouble(w, q, result, m, k, n);
auto rr = std::addressof(result);
printf("\nC addr: %p\n", rr);
std::cout << "result:\n";
print_matrix(result, m, n);
EXPECT_TRUE(compare<double>(result, matrix_result, m * n));
}
The cuBLAS routine works fine, as I can see the result when I print the matrix inside the .cu file. However, when I try to access result in my gtest file, I get a segfault. Upon further inspection I noticed that the address of the result pointer is different inside the .cu file and in the .cpp file. As an example I get:
C addr: 0x7ffc5749db08 (inside .cu)
C addr: 0x7ffc5749dba0 (inside .cpp)
I thought that by using Unified Memory I could access that pointer from either the host or the device. I can't seem to find an answer as to why this address changes, or how to fix the segfault. Is there something I'm missing about using Unified Memory? Thank you!
This line isn't doing what you need:

C = (double*)malloc(s_C * sizeof(double));

When you modify the numerical value of the C pointer, that modification will not show up in the calling environment. That is the nature of pass-by-value, and the numerical value of the C pointer is being passed by value when you call CudaMatrixTransformations::MatrixMultiplicationDouble. So that line will work inside your function (perhaps), but the results won't be passed back to the calling environment that way.
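To make the pass-by-value point concrete, here is a minimal standalone sketch (hypothetical names, not taken from the code above) showing that reassigning a pointer parameter does not change the caller's pointer:

#include <cstdio>
#include <cstdlib>

// 'p' is a local copy of the caller's pointer value.
void fill(double *p) {
  p = (double*)malloc(3 * sizeof(double));  // only the local copy changes (and the allocation leaks)
  p[0] = 42.0;
}

int main() {
  double *result = NULL;
  fill(result);
  // 'result' is still NULL here; dereferencing it would segfault,
  // which is exactly what happens to 'result' in the gtest above.
  printf("result is %s\n", result == NULL ? "NULL" : "non-NULL");
  return 0;
}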
I would suggest reworking your code so that you handle C in a fashion similar to how you are handling A and B. Create an extra pointer d_C, do your cudaMallocManaged on that, then before returning, memcpy the results from d_C back to C. This assumes you are allocating properly for the C pointer before calling this function.
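For concreteness, here is a sketch of what that could look like, assuming the caller allocates the C buffer before the call (this illustrates the suggestion; it is not the poster's actual code):

// Sketch: C(m,n) = A(m,k) * B(k,n); C must already point to m*n doubles.
// (Assumes the same includes as the original .cu file: <cassert>, <cstring>, the CUDA/cuBLAS headers.)
void CudaMatrixTransformations::MatrixMultiplicationDouble(
    double *A, double *B, double *C, const int m, const int k, const int n) {
  size_t s_A = m * k, s_B = k * n, s_C = m * n;
  double *d_A = NULL, *d_B = NULL, *d_C = NULL;
  // Managed allocations for the device-visible copies
  cudaStat1 = cudaMallocManaged(&d_A, s_A * sizeof(double));
  cudaStat2 = cudaMallocManaged(&d_B, s_B * sizeof(double));
  assert(cudaSuccess == cudaStat1 && cudaSuccess == cudaStat2);
  cudaStat1 = cudaMallocManaged(&d_C, s_C * sizeof(double));
  assert(cudaSuccess == cudaStat1);
  memcpy(d_A, A, s_A * sizeof(double));
  memcpy(d_B, B, s_B * sizeof(double));
  const double alpha = 1.0;
  const double beta = 0.0;
  cublas_status = cublasCreate(&cublasH);
  assert(cublas_status == CUBLAS_STATUS_SUCCESS);
  // Column-major GEMM: leading dimensions are m, k, and m respectively.
  cublas_status = cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N,
                              m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);
  cudaStat1 = cudaDeviceSynchronize();
  assert(cublas_status == CUBLAS_STATUS_SUCCESS);
  assert(cudaSuccess == cudaStat1);
  cublasDestroy(cublasH);
  // Copy into the caller's buffer; do NOT reassign C here.
  memcpy(C, d_C, s_C * sizeof(double));
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
}

The test would then own the result buffer, e.g. double *result = (double*)malloc(m * n * sizeof(double)); before the call and free(result); afterwards. (An alternative is to pass the pointer as double *&C or double **C so an allocation made inside the function becomes visible to the caller, but caller-owned allocation keeps ownership obvious.)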
Also note that at the end you are freeing A and B - that's not what you want, I don't think. You should free d_A, d_B, and d_C before returning.

There are other issues with your code as well. For example you refer to returning a result pointer but I don't see any evidence of that. I don't see any pointer named result, actually. Furthermore, the function prototype (in the class definition) doesn't match your implementation. The prototype suggests a return double* whereas your implementation returns void.

And since I'm listing observations, I don't think your use of addressof is giving you the information you presume it is. If you're going to compare numerical pointer values, you need to compare the values themselves, not the address of the location where those values are stored.
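In other words, to compare the addresses the two sides actually hold, print the pointer values themselves, for example:

printf("C value: %p\n", (void*)result);                  // the address stored in the pointer
printf("&result: %p\n", (void*)std::addressof(result));  // the address of the local variable holding it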