Reading many rows of a 2D array from file in C++

51 Views Asked by At

I have a file storing a 2D float array of shape M*128, where M is quite large. I want to read N=8,000,000 rows of the array into memory; the row indices to read are randomly picked and stored in a 1D array idx. I can do it with the following code (using pread or mmap):

#include <fcntl.h>
#include <omp.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>

// Benchmark: gather N rows (each `dim` floats) from "2dArray.bin" into a
// contiguous buffer, where the row numbers come from "idx8M.bin" (N int64
// values). The gather itself is timed and reported in milliseconds.
//
// Returns 0 on success and 1 on any I/O, allocation, or validation failure.
int main() {
    using ms = std::chrono::duration<float, std::milli>;
    using hr_clock = std::chrono::high_resolution_clock;

    constexpr int N = 8'000'000;
    constexpr int dim = 128;
    constexpr size_t size = dim * sizeof(float);  // bytes per row
    constexpr size_t idx_bytes = static_cast<size_t>(N) * sizeof(int64_t);

    // Owns a malloc'ed buffer so that every early return releases it.
    struct FreeDeleter {
        void operator()(void* p) const { free(p); }
    };

    std::unique_ptr<int64_t, FreeDeleter> idx_owner(
        static_cast<int64_t*>(malloc(idx_bytes)));
    std::unique_ptr<char, FreeDeleter> data_owner(
        static_cast<char*>(malloc(N * size)));
    if (!idx_owner || !data_owner) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    int64_t* const idx = idx_owner.get();
    char* const data = data_owner.get();

    // reading indices from file
    int fd = open("idx8M.bin", O_RDONLY);
    if (fd == -1) {
        perror("open idx8M.bin");
        return 1;
    }
    // read() may legally return fewer bytes than requested, so loop until the
    // whole index array has arrived, EOF is hit, or a real error occurs.
    size_t got = 0;
    while (got < idx_bytes) {
        const ssize_t r =
            read(fd, reinterpret_cast<char*>(idx) + got, idx_bytes - got);
        if (r < 0) {
            if (errno == EINTR) continue;
            perror("read idx8M.bin");
            close(fd);
            return 1;
        }
        if (r == 0) break;  // EOF
        got += static_cast<size_t>(r);
    }
    close(fd);
    if (got != idx_bytes) {
        fprintf(stderr, "idx8M.bin is too short: got %zu of %zu bytes\n", got,
                idx_bytes);
        return 1;
    }

    fd = open("2dArray.bin", O_RDONLY);
    if (fd == -1) {
        perror("open 2dArray.bin");
        return 1;
    }

    const off_t end = lseek(fd, 0, SEEK_END);
    if (end < 0) {
        perror("lseek 2dArray.bin");
        close(fd);
        return 1;
    }
    const size_t len = static_cast<size_t>(end);

    // Reject indices that would make the copy read outside the mapping.
    const uint64_t rows_in_file = len / size;
    for (int i = 0; i < N; ++i) {
        if (idx[i] < 0 || static_cast<uint64_t>(idx[i]) >= rows_in_file) {
            fprintf(stderr,
                    "idx[%d] = %lld is outside the %llu rows of 2dArray.bin\n",
                    i, static_cast<long long>(idx[i]),
                    static_cast<unsigned long long>(rows_in_file));
            close(fd);
            return 1;
        }
    }

    void* const map = mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap 2dArray.bin");
        close(fd);
        return 1;
    }
    const char* const addr = static_cast<const char*>(map);

    // Advisory hint only: tell the kernel the mapping is accessed at random
    // offsets. A failure here does not affect correctness, so it is ignored.
    (void)posix_madvise(map, len, POSIX_MADV_RANDOM);

    const auto start = hr_clock::now();

    #pragma omp parallel for num_threads(32)
    for (int i = 0; i < N; ++i) {
        // pread(fd, data + i * size, size, idx[i] * size);
        memcpy(data + i * size, addr + idx[i] * size, size);
    }

    const auto stop = hr_clock::now();
    printf("Cost %f ms\n", std::chrono::duration_cast<ms>(stop - start).count());

    munmap(map, len);
    close(fd);
    return 0;
}

The problem is the reading speed. I ran the code on a server with an SSD (advertised as having a maximum throughput of 180MB/s). With either pread or mmap, the read took over 600s to finish. Is there any way to speed up the reads?

0

There are 0 best solutions below