The DPC++ code is very simple: it just declares a work-group local array, sets every element of the array to 0, and then issues a local-memory barrier.
#include <iostream>

#include <CL/sycl.hpp>
using namespace sycl;
#define WRAP_SIZE 32
int main(){
sycl::gpu_selector selector;
queue exec_queue(selector);
int num_blocks=128;
int num_threads=256;
int casBeg=0;
int casEnd=2;
auto device_mem=sycl::malloc_device(227000*sizeof(double),exec_queue);
exec_queue.submit([&](sycl::handler& cgh)
{
sycl::stream out{ 4096, 128, cgh };
auto sharedmem = sycl::accessor<int, 1, sycl::access_mode::read_write, sycl::access::target::local>(11, cgh);
cgh.parallel_for(
sycl::nd_range<1>(num_blocks * num_threads, num_threads),
[=](sycl::nd_item<1> item_ct1) [[intel::reqd_sub_group_size(WRAP_SIZE)]] {
int blkId = item_ct1.get_group(0);
int tid = item_ct1.get_local_id(0);
int stride = item_ct1.get_local_range().get(0);
out<<"inter\n";
if (tid == 0)
for (int i = 0; i < 11; ++i)
sharedmem[i] = 0;
item_ct1.barrier(sycl::access::fence_space::local_space);
});
}).wait();
return 0;
}
The build command is:
dpcpp -DMKL_ILP64 -lmkl_sycl -lmkl_intel_ilp64 -lmkl_tbb_thread -lmkl_core -pthread -std=c++17 -O0 -o <project_name> <code_name>.cpp
The compiled program works fine on the P690 GPU, but it does not work on the NDK Intel ATS-P GPU. Why? Thanks.