I'm a beginner in OpenCL and I'm running it on an Intel GPU.
I have the following C program that multiplies two matrices; the kernel is kept in a separate file from the host code.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <CL/cl.h>
/*
 * Read the whole file nom_fichier into a freshly allocated,
 * NUL-terminated buffer.
 *
 * nom_fichier : path of the file to load.
 *
 * Returns a pointer that the caller must release with free(). The
 * function never returns NULL: if the file cannot be examined, opened
 * or read completely, or if the allocation fails, a message is written
 * to stderr and the process terminates with exit(1). An empty file
 * yields an empty string.
 */
char* lire_fichier_source ( const char *nom_fichier )
{
    // Query the file size so the buffer can be allocated in one step.
    struct stat st ;
    if ( stat ( nom_fichier , &st ) != 0 || st.st_size < 0 ) {
        perror ( nom_fichier );
        exit ( 1 );
    }
    size_t taille = (size_t) st.st_size ;

    // Binary mode: no newline translation, so the number of bytes read
    // matches the size reported by stat().
    FILE* fich = fopen ( nom_fichier , "rb" );
    if ( fich == NULL ) {
        perror ( nom_fichier );
        exit ( 1 );
    }

    // One extra byte for the terminating '\0'.
    char* programme = malloc ( taille + 1 );
    if ( programme == NULL ) {
        fclose ( fich );
        fprintf ( stderr , "%s: out of memory\n" , nom_fichier );
        exit ( 1 );
    }

    size_t lus = fread ( programme , sizeof(char) , taille , fich );
    fclose ( fich );
    if ( lus != taille ) {
        free ( programme );
        fprintf ( stderr , "%s: short read\n" , nom_fichier );
        exit ( 1 );
    }

    // The buffer is used as a C string, so it must end with '\0'.
    programme [ taille ] = '\0';
    return programme ;
}
// #define PRECISION 0.01
/*
 * Set every element of the N x N matrix a to 1.
 *
 * a : buffer holding at least N*N floats, addressed as a[i*N + j]
 * N : order of the square matrix; N == 0 leaves the buffer untouched
 */
void initialiser ( float *a , size_t N )
{
    // The counters have the same type as N, so the bounds are compared
    // without narrowing casts and the index i*N+j is computed in size_t.
    for ( size_t i = 0 ; i < N ; i++ ) {
        for ( size_t j = 0 ; j < N ; j++ ) {
            a[ i*N+j ] = 1.0f ;
        }
    }
}
/*void imprimer ( const char *nom , float *a , size_t N )
{
printf("%s::\n", nom );
for ( int i = 0; (unsigned)i<N ; i++) {
for ( int j = 0; (unsigned)j<N ; j++) {
printf ( " %f ", a[i*N+j] );
}
printf ("\n ");
}
}*/
int main (int argc, char** argv )
{
// On lit l’argument du programme , qui indique la taille
// des matrices a generer et multiplier .
//assert( argc >= 2 );
size_t N = atoi(argv[1]); //char to num
// On obtient un device
cl_platform_id plateforme ;
clGetPlatformIDs (1, &plateforme,NULL );//Obtenir la liste des plateformes disponibles
cl_device_id device ;
//Interroge les appareils disponibles sur une plate-forme
clGetDeviceIDs ( plateforme , CL_DEVICE_TYPE_ALL , 1 , &device , NULL );
// On cree un contexte et une queue d’execution .
cl_context contexte = clCreateContext ( NULL , 1 , &device , NULL , NULL , NULL );
//Création d'une file d'attente de commandes
cl_command_queue queue = clCreateCommandQueueWithProperties( contexte , device , 0 , NULL );
char *source = lire_fichier_source ( "produit_matrices.cl" );
//créer un programme à partir de votre code source(du kernel)
cl_program programme = clCreateProgramWithSource ( contexte , 1 ,( const char**)&source,NULL, NULL );
free ( source );
//compiler le programme le périphérique cible
clBuildProgram ( programme , 0 , NULL , NULL , NULL , NULL );
cl_kernel kernel=clCreateKernel ( programme , " produit_matrices " , NULL );
//float a[ N*N ],b[ N*N ];
double a[ N*N ],b[ N*N ],c[ N*N ];
initialiser ( a , N );
initialiser ( b , N );
initialiser ( c , N );
//L’allocation de la mémoire device pour les arguments en spésifiant le type de lecture par le kernel
cl_mem buffer_a=clCreateBuffer(contexte,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,sizeof( double //float )*N*N , a , NULL );
cl_mem buffer_b=clCreateBuffer( contexte ,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,sizeof( double //float )*N*N , b , NULL );
cl_mem buffer_c= clCreateBuffer( contexte , CL_MEM_WRITE_ONLY ,sizeof( double //float )*N*N , NULL , NULL );
// On specifie les arguments du kernel
clSetKernelArg( kernel , 0 , sizeof( cl_mem ) , &buffer_a );
clSetKernelArg( kernel , 1 , sizeof( cl_mem ) , &buffer_b );
clSetKernelArg( kernel , 2 , sizeof( cl_mem ) , &buffer_c );
clSetKernelArg( kernel , 3 , sizeof( int ) , &N );
size_t nb_taches[]={N,N};
//et on lance l'execution le kernel sur le périphérique
clEnqueueNDRangeKernel( queue , kernel , 2 , NULL , nb_taches ,NULL , 0 , NULL , NULL );
// On va chercher le resultat : CL_TRUE = > bloquant !
float c[N*N];
clEnqueueReadBuffer( queue , buffer_c , CL_TRUE , 0 ,sizeof( float )*N*N , c , 0 , NULL , NULL );
printf("matrice a\n");
for ( int i = 0; (size_t)i < N ; i++ ) {
for ( int j = 0; (size_t)j <N ;j ++) {
printf("%f \t",a[i*N+j]);
}
printf("\n");}
printf("matrice b\n");
for ( int i = 0; (size_t)i < N ; i++ ) {
for ( int j = 0; (size_t)j <N ;j ++) {
printf("%f \t",b[i*N+j]);
}
printf("\n");}
printf("matrice c\n");
for ( int i = 0; (size_t)i < N ; i++ ) {
for ( int j = 0; (size_t)j <N ;j ++) {
printf("%f \t",c[i*N+j]);
}
printf("\n");}
printf("je suis dans10\n");
/*for ( int i = 0; (unsigned)i<N ; i++ ) {
for ( int j = 0; (unsigned)j<N ;j++) {
if ( fabs(c[i*N+j]-(1.0*N))>PRECISION) {
printf("*** Erreur : resultat incorrect [%d , %d ]: %f\n" ,i , j , c[i*N+j]);
exit( -1 );
}
}
}*/
//libirer les ressources
clReleaseMemObject( buffer_a );
clReleaseMemObject( buffer_b );
clReleaseMemObject( buffer_c );
clReleaseKernel(kernel );
clReleaseProgram(programme );
clReleaseCommandQueue(queue );
clReleaseContext(contexte );
return 0;
}
My kernel:
/*
 * Compute one element of the product c = a * b of two N x N float
 * matrices addressed as m[row*N + col].
 *
 * Intended for a 2-D launch: the work-item with global id (i, j) writes
 * c[i*N + j]. a and b are only read, hence const; they stay in the
 * __global address space (an object cannot be both __global and
 * constant).
 */
__kernel void produit_matrices ( __global const float* a ,
                                 __global const float* b ,
                                 __global float* c ,
                                 uint N )
{
    const uint i = (uint) get_global_id (0);
    const uint j = (uint) get_global_id (1);

    // Work-items outside the matrix (e.g. a padded global size) do nothing.
    if ( i >= N || j >= N ) {
        return;
    }

    // Single-precision literal: keeps the kernel free of double arithmetic.
    float total = 0.0f;
    for ( uint k = 0; k < N ; k++ ) {
        total += a[ i*N+k ] * b[ k*N+j ];
    }
    c[ i*N+j ] = total ;
}
When I run the program, it prints 0 or other nonsensical results. I want to know whether the problem is in my program or in my OpenCL installation.
In addition to the general advice in the comments, I can see at least two problems straight away:
1. Kernel name
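The spaces around the name of your kernel function will probably cause it to fail:

    cl_kernel kernel=clCreateKernel ( programme , " produit_matrices " , NULL );

The string must match the kernel's name exactly, i.e. "produit_matrices".
Again, this is easily detected if you check for errors, for example:

    cl_int err = CL_SUCCESS;
    cl_kernel kernel = clCreateKernel ( programme , "produit_matrices" , &err );
    if ( err != CL_SUCCESS )
        fprintf ( stderr , "clCreateKernel failed: %s\n" , clewErrorString ( err ) );

(Here, `clewErrorString` is the function defined in this answer about converting numeric OpenCL error codes to names.)

2. Conflicting memory space specifiers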
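Your kernel is defined with the signature:

    __kernel void produit_matrices ( __global constant float* a ,
                                     __global constant float* b ,
                                     __global float* c ,
                                     uint N )

`__global` and `constant` on the same variable are mutually exclusive. Data can be located either in `global` or in `constant` memory, not both. If you want to indicate that some global memory is or should be treated as read-only, use `const`:

    __kernel void produit_matrices ( __global const float* a ,
                                     __global const float* b ,
                                     __global float* c ,
                                     uint N )

`clBuildProgram` should tell you about this sort of error if you only let it - it can provide you with a compile log for your kernel.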