I have a sparse tridiagonal NxN matrix A built by some rule and want to solve the system Ax=b. For this I'm using cusolverSpScsrlsvqr() from the cuSolverSp module. Is it normal for the device version to be many times slower than cusolverSpScsrlsvqrHost() for large N? E.g. for N=2^14 the times are 174.1 ms on the device and 3.5 ms on the host. I'm on an RTX 2060.
Code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
    case CUSOLVER_STATUS_SUCCESS:
        str = "CUSOLVER_STATUS_SUCCESS";
        break;
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        str = "CUSOLVER_STATUS_NOT_INITIALIZED";
        break;
    case CUSOLVER_STATUS_ALLOC_FAILED:
        str = "CUSOLVER_STATUS_ALLOC_FAILED";
        break;
    case CUSOLVER_STATUS_INVALID_VALUE:
        str = "CUSOLVER_STATUS_INVALID_VALUE";
        break;
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        str = "CUSOLVER_STATUS_ARCH_MISMATCH";
        break;
    case CUSOLVER_STATUS_MAPPING_ERROR:
        str = "CUSOLVER_STATUS_MAPPING_ERROR";
        break;
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        str = "CUSOLVER_STATUS_EXECUTION_FAILED";
        break;
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        str = "CUSOLVER_STATUS_INTERNAL_ERROR";
        break;
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSOLVER_STATUS_ZERO_PIVOT:
        str = "CUSOLVER_STATUS_ZERO_PIVOT";
        break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}
// Builds the tridiagonal matrix A in CSR form plus the right-hand side b.
// Launched with n = N - 1, so the matrix is (n + 1) x (n + 1). Thread i writes
// row i's diagonal and super-diagonal entries and row i+1's sub-diagonal entry;
// thread 0 additionally writes the final diagonal entry, the first and last
// row pointers, and the last element of b.
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        float xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    float xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;      // diagonal of row i
    aValues[i * 3 + 1] = xi + 5;  // super-diagonal of row i (column i + 1)
    aValues[i * 3 + 2] = xi - 5;  // sub-diagonal of row i + 1 (column i)
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}
int main() {
    int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    float *const aValues = new float[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    float *const b = new float[n];
    float *const x = new float[n];
    float *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    float *dev_b;
    float *dev_x;
    int aValuesSize = sizeof(float) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(float) * n;
    int xSize = sizeof(float) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    // build A and b on the device, then copy them back for the host solver variant
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    // warm-up call, not timed
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    cudaDeviceSynchronize();
    // time 10 solves and report the average
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
        //status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(3)
         << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}
My guess is that the issue here is that it is a tridiagonal matrix. I suspect that may eliminate certain parallelism aspects that would otherwise be a benefit to the GPU cusolver routine. I don't really have any justification for this statement other than statements to that effect that I read in the cusparse docs.
I can't say what those mean, exactly, except that they suggest to me that for a tridiagonal matrix a different approach may be warranted. And cusparse provides solvers specifically for the tridiagonal case (the gtsv2 routines).
If we use one of those, we can get results that are faster than your specific host cusolver example, on your test case. Here is an example:
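What follows is a sketch along those lines rather than the full benchmark: it uses cusparseSgtsv2_nopivot, builds the three diagonals directly with the same fill rule as your fillAB kernel instead of converting from CSR, and the fillDiags kernel, variable names, and launch configuration are my own. Status checks are omitted for brevity.

#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <chrono>
#include <iostream>

// Fill the three diagonals and the RHS with the same rule as fillAB:
// d[i] = 10*(i+1), du[i] = d[i] + 5, dl[i] = d[i] - 15, b[i] = 2*d[i].
// gtsv2 expects dl[0] == 0 and du[n-1] == 0.
__global__ void fillDiags(float *dl, float *d, float *du, float *b, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    float xi = 10.0f * (i + 1);
    d[i]  = xi;                              // main diagonal
    du[i] = (i < n - 1) ? xi + 5.0f : 0.0f;  // super-diagonal
    dl[i] = (i > 0)     ? xi - 15.0f : 0.0f; // sub-diagonal
    b[i]  = xi * 2.0f;                       // RHS, overwritten with the solution
}

int main() {
    int const n = 1 << 14;
    float *dl, *d, *du, *b;
    cudaMalloc((void**)&dl, sizeof(float) * n);
    cudaMalloc((void**)&d,  sizeof(float) * n);
    cudaMalloc((void**)&du, sizeof(float) * n);
    cudaMalloc((void**)&b,  sizeof(float) * n);
    fillDiags<<<(n + 255) / 256, 256>>>(dl, d, du, b, n);

    cusparseHandle_t handle;
    cusparseCreate(&handle);

    // gtsv2 needs an explicitly sized work buffer
    size_t bufferSize = 0;
    cusparseSgtsv2_nopivot_bufferSizeExt(handle, n, 1, dl, d, du, b, n, &bufferSize);
    void *buffer;
    cudaMalloc(&buffer, bufferSize);

    // warm-up solve, then time 10 solves (b is overwritten each time, so for a
    // strict benchmark you would re-fill it between iterations)
    cusparseSgtsv2_nopivot(handle, n, 1, dl, d, du, b, n, buffer);
    cudaDeviceSynchronize();
    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        cusparseSgtsv2_nopivot(handle, n, 1, dl, d, du, b, n, buffer);
    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();
    std::cout << "gtsv2_nopivot: "
              << std::chrono::duration<double, std::milli>(t1 - t0).count() / 10.0
              << " ms per solve" << std::endl;

    cudaFree(dl); cudaFree(d); cudaFree(du); cudaFree(b); cudaFree(buffer);
    cusparseDestroy(handle);
    return 0;
}

Since gtsv2 solves in place, b holds the solution on return, so copy it off the device (or re-fill it) before solving again.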
Notes:
- The results in the float case appear to be within about 1% of each other between the two methods. I think some of this is due to float resolution, but there may be other factors. Without further study I wouldn't have any reason to claim one is "more correct" than the other. (A quick way to compare the two solution vectors is sketched after these notes.)
- I chose the nopivot variant of gtsv2 because the documentation seemed to suggest that it would be faster in the power-of-2 size case, which is your case. And according to my testing it was faster.
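For that comparison, something like this can be used (hypothetical xQr and xGtsv arrays holding the two solutions copied back to the host; the names are mine); it reports the largest element-wise relative difference:

#include <algorithm>
#include <cmath>
#include <iostream>

// Largest element-wise relative difference between two solution vectors,
// e.g. x from csrlsvqr vs. x from gtsv2_nopivot, both copied to the host.
float maxRelDiff(float const *xQr, float const *xGtsv, int n) {
    float worst = 0.0f;
    for (int i = 0; i < n; ++i) {
        float denom = std::max(std::fabs(xQr[i]), 1e-20f);
        worst = std::max(worst, std::fabs(xQr[i] - xGtsv[i]) / denom);
    }
    return worst;
}

// usage: std::cout << "max rel diff: " << maxRelDiff(xQr, xGtsv, n) << std::endl;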