Matrix multiplication in OpenCL

802 Views Asked by At

I'm a beginner in OpenCL and I'm using OpenCL on an Intel GPU.

I have the following C program for multiplying two matrices; the kernel is kept in a separate file from the host code.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <sys/stat.h>

#include <CL/cl.h>


/**
 * Read a whole text file into a freshly allocated, NUL-terminated string.
 *
 * @param nom_fichier  Path of the file to load (here: the OpenCL kernel source).
 * @return Heap buffer holding the file contents followed by '\0'. The caller
 *         owns the buffer and must release it with free(). This function never
 *         returns NULL: on any failure (file missing, empty, unreadable, or
 *         out of memory) it prints a diagnostic to stderr and exits.
 */
char* lire_fichier_source ( const char *nom_fichier )
{
 // Ask the filesystem for the size so the buffer can be allocated up front.
 struct stat st ;
 if ( stat ( nom_fichier , &st ) != 0 || st.st_size <= 0 ) {
  fprintf ( stderr , "lire_fichier_source: '%s' is missing or empty\n" , nom_fichier );
  exit ( EXIT_FAILURE );
 }
 size_t taille = (size_t) st.st_size ;

 // Explicit check instead of assert(): assert disappears when NDEBUG is set.
 FILE* fich = fopen ( nom_fichier , "r" );
 if ( fich == NULL ) {
  fprintf ( stderr , "lire_fichier_source: cannot open '%s'\n" , nom_fichier );
  exit ( EXIT_FAILURE );
 }

 // One extra byte for the terminating NUL.
 char* programme = malloc ( taille + 1 );
 if ( programme == NULL ) {
  fprintf ( stderr , "lire_fichier_source: out of memory reading '%s'\n" , nom_fichier );
  fclose ( fich );
  exit ( EXIT_FAILURE );
 }

 size_t lus = fread ( programme , sizeof(char) , taille , fich );
 int erreur_lecture = ferror ( fich );
 fclose ( fich );
 if ( lus == 0 || erreur_lecture ) {
  fprintf ( stderr , "lire_fichier_source: cannot read '%s'\n" , nom_fichier );
  free ( programme );
  exit ( EXIT_FAILURE );
 }

 // Terminate at the number of bytes actually read: in text mode this can be
 // smaller than st_size (e.g. when line endings are translated).
 programme [ lus ] = '\0';
 return programme ;
}
// #define PRECISION 0.01
/**
 * Fill an N x N row-major matrix with ones.
 *
 * @param a  Destination storage with room for at least N*N floats.
 * @param N  Matrix dimension (number of rows and of columns).
 */
void initialiser ( float *a , size_t N )
{
 // The N*N elements are contiguous, so a single cursor visits them all.
 float *fin = a + N*N ;
 for ( float *p = a ; p != fin ; ++p ) {
  *p = 1.0f ;
 }
}
/*void imprimer ( const char *nom , float *a , size_t N )
{
 printf("%s::\n", nom );
  for ( int i = 0; (unsigned)i<N ; i++) {
    for ( int j = 0; (unsigned)j<N ; j++) {
      printf ( " %f ", a[i*N+j] );
    }
  printf ("\n ");
  }
}*/
int main (int argc, char** argv )
{
 
 // On lit l’argument du programme , qui indique la taille
 // des matrices a generer et multiplier .
 //assert( argc >= 2 );
 size_t N = atoi(argv[1]); //char to num
 
 // On obtient un device 
 cl_platform_id plateforme ;
 clGetPlatformIDs (1, &plateforme,NULL );//Obtenir la liste des plateformes disponibles
 cl_device_id device ;
 
 //Interroge les appareils disponibles sur une plate-forme
 clGetDeviceIDs ( plateforme , CL_DEVICE_TYPE_ALL , 1 , &device , NULL );  
 
 // On cree un contexte et une queue d’execution .
 cl_context contexte = clCreateContext ( NULL , 1 , &device , NULL , NULL , NULL );
  
 //Création d'une file d'attente de commandes
 cl_command_queue queue = clCreateCommandQueueWithProperties( contexte , device , 0 , NULL );
 char *source = lire_fichier_source ( "produit_matrices.cl" );
 
//créer un programme à partir de votre code source(du kernel)
 cl_program programme = clCreateProgramWithSource ( contexte , 1 ,( const char**)&source,NULL, NULL );
 free ( source );

 //compiler le programme le périphérique cible
 clBuildProgram ( programme , 0 , NULL , NULL , NULL , NULL );

 cl_kernel kernel=clCreateKernel ( programme , " produit_matrices " , NULL );

 //float a[ N*N ],b[ N*N ];
 double a[ N*N ],b[ N*N ],c[ N*N ];
 initialiser ( a , N );
 initialiser ( b , N ); 
 initialiser ( c , N ); 
 //L’allocation de la mémoire device pour les arguments en spésifiant le type de lecture par le kernel
 cl_mem buffer_a=clCreateBuffer(contexte,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,sizeof( double //float )*N*N , a , NULL );
 cl_mem buffer_b=clCreateBuffer( contexte ,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,sizeof( double //float )*N*N , b , NULL );
 cl_mem buffer_c= clCreateBuffer( contexte , CL_MEM_WRITE_ONLY ,sizeof( double //float )*N*N , NULL , NULL );

 // On specifie les arguments du kernel 
 clSetKernelArg( kernel , 0 , sizeof( cl_mem ) , &buffer_a );
 clSetKernelArg( kernel , 1 , sizeof( cl_mem ) , &buffer_b );
 clSetKernelArg( kernel , 2 , sizeof( cl_mem ) , &buffer_c );
 clSetKernelArg( kernel , 3 , sizeof( int ) , &N );
 size_t nb_taches[]={N,N};
 //et on lance l'execution le kernel sur le périphérique
 clEnqueueNDRangeKernel( queue , kernel , 2 , NULL , nb_taches ,NULL , 0 , NULL , NULL );

 // On va chercher le resultat : CL_TRUE = > bloquant !
 float c[N*N];
 clEnqueueReadBuffer( queue , buffer_c , CL_TRUE , 0 ,sizeof( float )*N*N , c , 0 , NULL , NULL );
 
 printf("matrice a\n");
  for ( int i = 0; (size_t)i < N ; i++ ) {
   for ( int j = 0; (size_t)j <N ;j ++) {
   printf("%f \t",a[i*N+j]);
   }
   printf("\n");}
   
   printf("matrice b\n");
  for ( int i = 0; (size_t)i < N ; i++ ) {
   for ( int j = 0; (size_t)j <N ;j ++) {
   printf("%f \t",b[i*N+j]);
   }
   printf("\n");}
   
 printf("matrice c\n");
  for ( int i = 0; (size_t)i < N ; i++ ) {
   for ( int j = 0; (size_t)j <N ;j ++) {
   printf("%f \t",c[i*N+j]);
   }
   printf("\n");}
   
   printf("je suis dans10\n");
/*for ( int i = 0; (unsigned)i<N ; i++ ) {
   for ( int j = 0; (unsigned)j<N ;j++) {
     if ( fabs(c[i*N+j]-(1.0*N))>PRECISION) {
        printf("*** Erreur : resultat incorrect [%d , %d ]: %f\n" ,i , j , c[i*N+j]);
        exit( -1 );
     }
   }
  }*/
 //libirer les ressources
 clReleaseMemObject( buffer_a );
 clReleaseMemObject( buffer_b );
 clReleaseMemObject( buffer_c );
 clReleaseKernel(kernel );
 clReleaseProgram(programme );
 clReleaseCommandQueue(queue );
 clReleaseContext(contexte );
 return 0;
}

My kernel:

// Computes c = a * b for N x N row-major float matrices.
// Each work-item writes the single output element c[i*N+j], where i and j are
// its global ids in dimensions 0 and 1.
// a and b are read-only inputs in global memory: that is spelled
// "__global const"; "__global constant" names two address spaces at once.
__kernel void produit_matrices ( __global const float* a ,
__global const float* b ,
__global float* c ,
uint N )
{
uint i = get_global_id (0);
uint j = get_global_id (1);
// Defensive guard in case the host launches more work-items than N x N.
if ( i >= N || j >= N ) {
return;
}
// 0.0f keeps the accumulator single precision (0.0 is a double literal).
float total = 0.0f;
for ( uint k = 0; k<N ; k++ ) {
total += a [i*N+k]*b[k*N+j];
}
c[i*N+j]=total ;
}

When I run the program, it prints 0 or other nonsensical results. I want to know whether the problem is in the program or in the OpenCL installation.

1

There is 1 best solution below

1
On

In addition to the general advice in the comments, I can see at least two problems straight away:

1. Kernel name

cl_kernel kernel=clCreateKernel ( programme , " produit_matrices " , NULL );
                                               ^                ^

The spaces around the name of your kernel function will probably cause it to fail.

Again, this is easily detected if you check for errors:

cl_int error = CL_SUCCESS;
cl_kernel kernel=clCreateKernel ( programme , " produit_matrices " , &error );
if (error != CL_SUCCESS)
{
    fprintf(stderr, "clCreateKernel failed: %d (%s)\n", error, clewErrorString(error));
    abort();
}

(here, clewErrorString is the function defined in this answer about converting numeric OpenCL error codes to names.)

2. Conflicting memory space specifiers

Your kernel is defined with the signature:

__kernel void produit_matrices ( __global constant float* a ,
__global constant float* b ,
__global float* c ,
uint N )

__global and constant on the same variable are mutually exclusive. Data can either be located in global or constant memory, not both. If you want to indicate that some global memory is or should be treated as read-only, use const:

__global const float* a ,
__global const float* b ,

clBuildProgram should tell you about this sort of error if you only let it - it can provide you with a compile log for your kernel.