I am writing a kernel module on NVIDIA's Jetson AGX Xavier (ARM architecture). I am creating a shared memory between kernel space and user space in the following way:
- Allocate memory in kernel space using kzalloc().
- Use dma_map_single() to get a bus address for the allocated memory.
- Use remap_pfn_range() to map the memory to user space.
- In user space, use mmap() to get a pointer to this shared memory.
- Write data to the shared memory via memcpy().
- Read the shared memory in kernel space to see if it contains the data that was written from user space.
I am facing an issue where, when I read the memory from kernel space, it sometimes contains up-to-date data and sometimes it does not, which makes me suspect a cache-related issue. I have used pgprot_noncached() in the kernel module to make the memory non-cached, but I am still facing this issue. The relevant portion of the kernel module code is shown below.
static int tx1_mem_get(struct file* file_ptr, struct vm_area_struct* mem_struct)
{
int ret_val;
unsigned int i;
unsigned long start_addr;
unsigned long page_frame_num;
unsigned long mem_size;
struct device* device_ptr;
if (tx1_mem_count < BUFFER_COUNT) {
/* Obtain the device pointer for the device which is requesting the memory */
device_ptr = &pcidev_global_ptr->dev;
/* Get the array index to store the addresses */
i = tx1_mem_count;
/* Allocate TX1 Buffer */
tx1_data_buffers[i] = kzalloc(BUFFER_SIZE, GFP_DMA | GFP_ATOMIC);
if (tx1_data_buffers[i] == NULL) {
sprintf(msg_buffer, "WARNING: failed to allocate tx1 buffer %d", i);
umtrx_ep_msg(msg_buffer);
return 0;
}
/* Get the TX1 DMA address */
tx1_dma_buffers[i] = dma_map_single(device_ptr, tx1_data_buffers[i], BUFFER_SIZE, DMA_BIDIRECTIONAL);
if (tx1_dma_buffers[i] == DMA_MAPPING_ERROR) {
sprintf(msg_buffer, "WARNING: failed to obtain dma memory for tx1 buffer %d", i);
umtrx_ep_msg(msg_buffer);
kfree(tx1_data_buffers[i]);
tx1_data_buffers[i] = NULL;
return 0;
}
/* Set memory attribute flags (VM_DONTEXPAND | VM_DONTDUMP = VM_RESERVED; VM_RESERVED flag not supported in new kernel versions)*/
mem_struct->vm_flags |= VM_READ | VM_WRITE | VM_SHARED | VM_LOCKED | VM_DONTEXPAND | VM_DONTDUMP;
/* Set the page to non-cached */
mem_struct->vm_page_prot = pgprot_noncached(mem_struct->vm_page_prot);
/* Obtain required parameters for creating a mapping */
start_addr = mem_struct->vm_start;
page_frame_num = virt_to_phys(tx1_data_buffers[i]) >> PAGE_SHIFT;
mem_size = mem_struct->vm_end - mem_struct->vm_start;
if (mem_size > BUFFER_SIZE) {
sprintf(msg_buffer, "WARNING: couldn't map tx1 buffer %d, can't map more than %d bytes", i, BUFFER_SIZE);
umtrx_ep_msg(msg_buffer);
dma_unmap_single(device_ptr, tx1_dma_buffers[i], BUFFER_SIZE, DMA_BIDIRECTIONAL);
tx1_dma_buffers[i] = 0;
kfree(tx1_data_buffers[i]);
tx1_data_buffers[i] = NULL;
return 0;
}
ret_val = remap_pfn_range(mem_struct, start_addr, page_frame_num, mem_size, mem_struct->vm_page_prot);
if (ret_val != 0) {
sprintf(msg_buffer, "WARNING: couldn't map tx1 buffer %d", i);
umtrx_ep_msg(msg_buffer);
dma_unmap_single(device_ptr, tx1_dma_buffers[i], BUFFER_SIZE, DMA_BIDIRECTIONAL);
tx1_dma_buffers[i] = 0;
kfree(tx1_data_buffers[i]);
tx1_data_buffers[i] = NULL;
return 0;
}
/* Reserve the obtained memory, so that it is not swapped out by the kernel. Kernel space and user space can both access this buffer, so it must be
* reserved. */
reserve_mem_buffer(tx1_data_buffers[i], (unsigned int)(BUFFER_SIZE));
/* Memory has been obtained, and reserved. Now populate it with the physical address using GPC-DMA */
write_phys_addr(tx1_data_buffers[i]);
sprintf(msg_buffer, "tx1 buffer %d acquired", i);
umtrx_ep_msg(msg_buffer);
tx1_mem_count = tx1_mem_count + 1;
} else {
umtrx_ep_msg("WARNING: failed to acquire tx1 buffer. tx1 memory is full");
}
return 0;
}
I have tried to use dma_mmap_attrs() API instead of remap_pfn_range() (as per my understanding, dma_mmap_attrs() creates a mapping to user space without the need for a physical address or page frame number). However, this makes no difference.
Currently SMMU is enabled (it is enabled by default on NVIDIA AGX Xavier). If I disable SMMU and use the following user space code to explicitly flush cache, then the issue is resolved.
/*
 * flush_cache() - clean and invalidate the data cache for the range
 * [start, start + size) from user space (AArch64 "dc civac"; usable at
 * EL0 when the kernel has set SCTLR_EL1.UCI, as Linux does).
 *
 * Fixes vs. the original: the inline asm now carries "memory" clobbers
 * so the compiler cannot reorder or keep memory accesses in registers
 * across the cache-maintenance operations, and the volatile keyword is
 * spelled consistently.
 */
void flush_cache(void* start, size_t size)
{
    // Round the start address down to a cache-line boundary.
    uintptr_t addr = (uintptr_t)start & ~(uintptr_t)(CACHE_LINE_SIZE - 1);
    // Round the end address up to a cache-line boundary.
    uintptr_t end = ((uintptr_t)start + size + CACHE_LINE_SIZE - 1) & ~(uintptr_t)(CACHE_LINE_SIZE - 1);
    // Clean & Invalidate by VA to the Point of Coherency, one line at a time.
    for (; addr < end; addr += CACHE_LINE_SIZE) {
        __asm__ __volatile__("dc civac, %0" : : "r"(addr) : "memory");
    }
    // Ensure all cache maintenance has completed before returning.
    __asm__ __volatile__("dsb sy" : : : "memory");
}
However, I can't disable SMMU due to some other limitation in my application. As per my understanding, the method I have used to create a shared memory should give me an uncached memory. What am I doing wrong here? Am I missing something obvious here?
Any help would be greatly appreciated.
Note: The size of the shared memory is 4MB, but I have tried going all the way down to 4KB but still no luck (4KB is the page size, and the smallest size which the kernel allocates successfully using kzalloc()).