"CUDA_ERROR_ILLEGAL_ADDRESS" when executing cuCtxSynchronize() in JCUDA

I am learning JCuda and studying with JCuda samples.

While studying a KMeans implementation that uses JCuda, I got a "CUDA_ERROR_ILLEGAL_ADDRESS" error when the line cuCtxSynchronize(); was executed.

It confused me a lot. How can I solve it?

Here is KMeansKernel.cu

// Element-wise addition kernel: for every index i below n it stores
// a[i] + b[i] into sum[i].
// Expected arguments, in order: an int element count followed by three
// float pointers (two inputs and one output).
// NOTE(review): the braces around the function body are missing in this excerpt.
// extern "C" keeps the symbol name unmangled so it can be looked up as "add".
extern "C"
__global__ void add(int n, float *a, float *b, float *sum)
    // Index of this thread, built from the x components of the block and thread ids.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads whose index is n or larger skip the store below.
    if (i<n)
    sum[i] = a[i] + b[i];

Main method (my class is named "CUDA"):

/**
 * Entry point. Reads the input image, flattens each pixel into five ints
 * (three raster samples followed by its x and y coordinates), uploads that
 * array to the GPU, and then, for every cluster count k from startClusters
 * to endClusters, calls KmeansKernel and writes the resulting image as a BMP.
 *
 * NOTE(review): several closing braces and the opening "try" around
 * preparePtxFile are missing in this excerpt; picFiles, st, startClusters,
 * endClusters, distanceWeight, colorWeight and outputDirectory are declared
 * in code that is not shown.
 */
public static void main(String[] args){
    // (Code that reads the various input parameters is omitted here.)

    try {
        // Open image file
        BufferedImage bi = ImageIO.read(picFiles);      
        if (bi == null) {
            System.out.println("ERROR: File input error.");

        // Read image data
        int length = bi.getWidth() * bi.getHeight();
        // Five ints per pixel: pixel[0], pixel[1], pixel[2], x, y.
        int[] imageProperty = new int[length*5];
        int[] pixel;
        // NOTE(review): no increment of count is visible in this excerpt; as shown,
        // every pixel would be written to slots 0..4 — presumably a count++ was lost.
        int count = 0;
        for (int y = 0; y < bi.getHeight(); y++) {
            for (int x = 0; x < bi.getWidth(); x++) {
                pixel = bi.getRaster().getPixel(x, y, new int[4]);
                imageProperty[count*5  ] = pixel[0];
                imageProperty[count*5+1] = pixel[1];
                imageProperty[count*5+2] = pixel[2];
                imageProperty[count*5+3] = x;
                imageProperty[count*5+4] = y;


        // Create the PTX file
        // preparePtxFile is defined elsewhere; presumably it compiles the .cu file
        // and returns the path of the generated PTX — TODO confirm.
        String ptxFileName;
            ptxFileName = preparePtxFile("KmeansKernel.cu");
        catch (IOException e)

        // Select device 0 and create a context on it.
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        // Load the PTX file as a module.
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFileName);

        // Fetch the function named "add" from the module; this handle is what
        // KmeansKernel later receives as kmeansFunction.
        CUfunction kmeansFunction = new CUfunction();
        cuModuleGetFunction(kmeansFunction, module, "add");

        //copy host input to device
        CUdeviceptr imageDevice = new CUdeviceptr();
        cuMemAlloc(imageDevice, imageProperty.length * Sizeof.INT);
        cuMemcpyHtoD(imageDevice, Pointer.to(imageProperty), imageProperty.length * Sizeof.INT);

        // Enough blocks of blockSizeX threads to cover imageProperty.length / 5 pixels.
        int blockSizeX = 256;
        int gridSizeX = (int) Math.ceil((double)(imageProperty.length / 5) / blockSizeX);

        // Print the time elapsed since st (st is set in code not shown here).
        long et = System.currentTimeMillis();
        System.out.println(((double)(et-st)/1000.0) + "s");

        // Run the clustering once for each requested number of clusters.
        for (int k = startClusters; k <= endClusters; k++) {
            long startTime = System.currentTimeMillis();

            // clusters: one int per pixel. c: five ints per centroid.
            int[] clusters = new int[length];
            int[] c = new int[k*5];
            // Seed the k centroids with pixels taken at a stride of length / k.
            int h = 0;
            for(int i = 0; i < k; i++) {
                c[i*5] = imageProperty[h*5];
                c[i*5+1] = imageProperty[h*5+1];
                c[i*5+2] = imageProperty[h*5+2];
                c[i*5+3] = imageProperty[h*5+3];
                c[i*5+4] = imageProperty[h*5+4];
                h += length / k;

            double tolerance = 1e-4;
            **//got warning in following line
            CUDA.KmeansKernel(kmeansFunction, imageDevice, imageProperty, clusters, c, k, tolerance, distanceWeight, colorWeight, blockSizeX, gridSizeX);** 

            // calculateAveragePixels is defined elsewhere; its result is read below
            // as five ints per pixel.
            int[] output = calculateAveragePixels(imageProperty, clusters);

            BufferedImage outputImage = new BufferedImage(bi.getWidth(), bi.getHeight(), BufferedImage.TYPE_INT_RGB);

            // Pack the first three values of each 5-int record into a single int
            // ((v0 * 256 + v1) * 256 + v2) and store it at the pixel's column/row.
            for (int i = 0; i < length; i++) {
                int rgb = output[i*5];
                rgb = (rgb * 256) + output[i*5+1];
                rgb = (rgb * 256) + output[i*5+2];
                outputImage.setRGB(i%bi.getWidth(), i/bi.getWidth(), rgb);

            // NOTE(review): the file name does not depend on k, so each pass of the
            // k loop writes to the same path.
            String fileName = (picFiles.getName()) + ".bmp";

            File outputFile = new File("output/" + fileName);
            ImageIO.write(outputImage, "BMP", outputFile);

            long runTime = System.currentTimeMillis() - startTime;
            System.out.println("Completed iteration k=" + k + " in " + ((double)runTime/1000.0) + "s");

        System.out.println("Files saved to " + outputDirectory.getAbsolutePath() + "\\");

    } catch (IOException e) {

Method KmeansKernel:

/**
 * Runs the iterative clustering loop: uploads the current centroids, launches
 * kmeansFunction, copies the per-pixel cluster assignments back, recomputes the
 * centroids on the host, and repeats until the summed error changes by no more
 * than tolerance between two consecutive iterations.
 *
 * @param kmeansFunction kernel handle to launch
 * @param imageDevice    device copy of imageProperty (not referenced in the visible body)
 * @param imageProperty  host pixel data, five ints per pixel
 * @param clusters       output: cluster index per pixel, filled from clustersDevice
 * @param c              centroids, five ints each; read and overwritten in place
 * @param k              number of clusters
 * @param tolerance      convergence threshold on |old_error - error|
 * @param distanceWeight passed to the kernel as a double
 * @param colorWeight    passed to the kernel as a double
 * @param blockSizeX     block size in x used for the launch
 * @param gridSizeX      grid size in x used for the launch
 *
 * NOTE(review): closing braces and the line that opens the cuLaunchKernel call
 * are missing in this excerpt, and no cuMemFree is visible for the three
 * device buffers allocated here.
 */
private static void KmeansKernel(CUfunction kmeansFunction, CUdeviceptr imageDevice, int[] imageProperty, int[] clusters, int[] c,
                                        int k, double tolerance, double distanceWeight, double colorWeight,
                                        int blockSizeX, int gridSizeX) {

    // Device buffer for the per-pixel cluster assignments.
    CUdeviceptr clustersDevice = new CUdeviceptr();
    cuMemAlloc(clustersDevice, clusters.length * Sizeof.INT);

    // Alloc device output
    CUdeviceptr centroidPixels = new CUdeviceptr();
    cuMemAlloc(centroidPixels, k * 5 * Sizeof.INT);

    // Device buffer holding one double per pixel; read back as "errors" below.
    CUdeviceptr errorDevice = new CUdeviceptr();
    cuMemAlloc(errorDevice, Sizeof.DOUBLE * clusters.length);

    // Host accumulator for the per-cluster component sums.
    int[] c1 = new int[k*5];

    // Upload the initial centroids.
    cuMemcpyHtoD(centroidPixels, Pointer.to(c), Sizeof.INT * 5 * k);

    // begin algorithm
    int[] counts = new int[k];
    double old_error, error = Double.MAX_VALUE;
    // NOTE(review): l is printed as the iteration count below but is never
    // incremented in this excerpt.
    int l = 0;

    do {
        old_error = error;
        error = 0;

        // Reset the accumulators and push the current centroids to the device.
        Arrays.fill(counts, 0);
        Arrays.fill(c1, 0);
        cuMemcpyHtoD(centroidPixels, Pointer.to(c), k * 5 * Sizeof.INT);

        // NOTE(review): as far as this excerpt shows, the parameter list is
        // (int, int, double, double), while kmeansFunction is looked up in main as
        // "add", whose signature in KMeansKernel.cu is (int, float*, float*, float*).
        // None of the device buffers (imageDevice, clustersDevice, centroidPixels,
        // errorDevice) appears in the list. The kernel would then treat non-pointer
        // values as float* arguments — the likely cause of CUDA_ERROR_ILLEGAL_ADDRESS.
        Pointer kernelParameters = Pointer.to(
                Pointer.to(new int[] {clusters.length}),
                Pointer.to(new int[] {k}),
                Pointer.to(new double[] {colorWeight}),
                Pointer.to(new double[] {distanceWeight}),

                // Remaining arguments of the launch call whose opening line is missing:
                // grid dimensions, block dimensions, shared-memory bytes and stream,
                // then the kernel parameters and the "extra" argument.
                gridSizeX, 1, 1,
                blockSizeX, 1, 1,
                0, null,
                kernelParameters, null
        // The launch is asynchronous, so an error raised while the kernel runs is
        // reported by this synchronization call rather than by the launch itself.
        **cuCtxSynchronize(); //got warning here.why?**

        // Fetch the cluster index assigned to each pixel.
        cuMemcpyDtoH(Pointer.to(clusters), clustersDevice, Sizeof.INT*clusters.length);

        // Sum the five components of every pixel into its cluster's accumulator.
        // NOTE(review): no update of counts[cluster] is visible in this excerpt.
        for (int i = 0; i < clusters.length; i++) {
            int cluster = clusters[i];
            c1[cluster*5] += imageProperty[i*5];
            c1[cluster*5+1] += imageProperty[i*5+1];
            c1[cluster*5+2] += imageProperty[i*5+2];
            c1[cluster*5+3] += imageProperty[i*5+3];
            c1[cluster*5+4] += imageProperty[i*5+4];

        // New centroid = component sums divided by the cluster's pixel count;
        // clusters with a zero count take the raw sums instead.
        for (int i = 0; i < k; i++) {
            if (counts[i] > 0) {
                c[i*5] = c1[i*5] / counts[i];
                c[i*5+1] = c1[i*5+1] / counts[i];
                c[i*5+2] = c1[i*5+2] / counts[i];
                c[i*5+3] = c1[i*5+3] / counts[i];
                c[i*5+4] = c1[i*5+4] / counts[i];
            } else {
                c[i*5] = c1[i*5];
                c[i*5+1] = c1[i*5+1];
                c[i*5+2] = c1[i*5+2];
                c[i*5+3] = c1[i*5+3];
                c[i*5+4] = c1[i*5+4];

        // Download one double per pixel from errorDevice and reduce it with
        // sumArray (defined elsewhere) to get this iteration's error.
        double[] errors = new double[clusters.length];
        cuMemcpyDtoH(Pointer.to(errors), errorDevice, Sizeof.DOUBLE*clusters.length);
        error = sumArray(errors);
        System.out.println("" + l + " iterations");

    // Repeat while the error still changes by more than tolerance.
    } while (Math.abs(old_error - error) > tolerance);

    // Final copy of the assignments into the caller's clusters array.
    cuMemcpyDtoH(Pointer.to(clusters), clustersDevice, clusters.length * Sizeof.INT);


Stack trace:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
    at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
    at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
    at com.test.CUDA.KmeansKernel(CUDA.java:269)
    at com.test.CUDA.main(CUDA.java:184)

As @talonmies mentions, the kernelParameters you are passing to the cuLaunchKernel method do not match the signature of the add kernel function.

You get the error at cuCtxSynchronize because the CUDA execution model is asynchronous: cuLaunchKernel returns immediately, and the kernel actually executes on the device asynchronously. The cuCtxSynchronize documentation reads:

Note that this function may also return error codes from previous, asynchronous launches.

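The second kernelParameters entry is an int (k), whereas the second parameter of the add kernel is a pointer to float. The kernel therefore treats the value of k as a device address and dereferences it, which is most probably the cause of the illegal-address error.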