I expect a cuda shared memory bank conflict in the following two situations:
- Accessing successive half floats (2 words) with successive threads
- Accessing vectorized int4 datatypes by successive threads
A tabular representation of the shared memory banks and the memory access in the two situations is:
| Banks | 0 | 1 | ... | 15 | 16 | .. | 31 | 0 |
|---|---|---|---|---|---|---|---|---|
| TwoBanks | 0, 1 | 2, 3 | 30, 31 | - | - | - | ||
| Vectorized | 0 | 0 | 3 | 4 | 7 | 8 |
When accessing half floats, two threads should compete for the first 16 memory banks. When accessing int4s, 4 threads should compete for 4 banks each. However, nisght-compute tells me there are no conflicts.
To illustrate, consider the following program (available on godbolt):
#include <iostream>
#include <cuda_fp16.h>
using T = half;
__global__ void TwoBanks(T* sum) {
__shared__ T shmem[32];
shmem[threadIdx.x] = (T)threadIdx.x;
__syncthreads();
T accum{0};
*sum = accum; // set to zero
for (size_t i = 0; i < 32; ++i) {
accum += shmem[i];
}
*sum = accum;
}
__global__ void Vectorized(T* sum) {
__shared__ T shmem[32 * 8];
T write[8];
for (uint i = 0; i < 8; ++i) {
write[i] = static_cast<T>(threadIdx.x) + static_cast<T>(i);
}
T* dst = shmem + threadIdx.x * 8;
reinterpret_cast<int4*>(dst)[0] = reinterpret_cast<int4*>(write)[0];
__syncthreads();
T accum{0};
*sum = accum; // set to zero
for (size_t i = 0; i < 32 * 8; ++i) {
accum += shmem[i];
}
*sum = accum;
}
int main() {
dim3 GridDim(32);
dim3 BlockDim(1);
constexpr size_t sz = sizeof(T);
T* device_sum;
T host_sum[1];
cudaMalloc((void**)&device_sum, sz);
TwoBanks<<<BlockDim, GridDim>>>(device_sum);
cudaMemcpy(host_sum, device_sum, sz, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
std::cout << "The sum for TwoBanks is: " << (float)host_sum[0] << std::endl;
Vectorized<<<BlockDim, GridDim>>>(device_sum);
cudaMemcpy(host_sum, device_sum, sz, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
std::cout << "The sum for Vectorized: " << (float)host_sum[0] << std::endl;
}
When I run the program with nsight-compute, I am told there are no conflicts:
Section: Command line profiler metrics
-------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
-------------------------------------------------------- ----------- ------------
Vectorized(__half *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
Section: Command line profiler metrics
-------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------- ----------- ------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
-------------------------------------------------------- ----------- ------------
What's wrong with my analysis?