How to load (map) thousands of files as fast as possible?

I want to load the contents of thousands of files of varying sizes (1 KB to 50 MB) on Windows.

Currently, I am using the mapping library from here, but it is still too slow, even with the std::async approach below.

void RegisterArchives(const std::vector<std::string_view>& archives)
{
    std::vector<std::future<bool>> archives_future;
    archives_future.reserve(archives.size());

    for (auto& i : archives)
        archives_future.emplace_back(std::async(RegisterArchive, i));

    for (auto& i : archives_future)
        i.get();
}

bool RegisterArchive(const std::string_view archive)
{
    for (auto& i : std::filesystem::recursive_directory_iterator(archive))
    {
        if (i.is_directory())
            continue;

        mapped_file map(i.path().string().c_str());
    }

    return true;
}

My question is: how can I map these files as quickly as possible?

I have tried without std::async, and I have also tried other loading methods such as FILE* and std::ifstream.

1 Answer

Answer by huseyin tugrul buyukisik

If what you need is to reduce file-system API calls for redundant read-only file reads, you should use a cache.

Here is my multi-level cache implementation (direct-mapped + LRU), which handles cache misses automatically when data is requested:

Test code:

#include "CpuBenchmarker.h"
#include "LruClockCache.h"
#include "DirectMappedCache.h"
#include "CacheThreader.h"
#include <atomic>
#include <memory>
#include <iostream>
#include <random>
#include<omp.h>
int main()
{
    std::atomic<size_t> total(0); // direct-init: std::atomic is not copy-initializable before C++17
    for (float benchmark = 2; benchmark >= 0.5f; benchmark *= 0.97f)
    {
        const int LLCsize = 1024 * 128;

        // randomly fill the simulated backing-store
        std::vector<int> backingStore(5000000);
        std::random_device rd;
        std::mt19937 rng(rd());
        std::uniform_int_distribution<int> rnd(0, 4999999);
        for (int i = 0; i < 5000000; i++)
            backingStore[i] = rnd(rng);

        auto LLC = std::make_shared<LruClockCache<int, int>>(LLCsize,
            [&](int key) {
        
                // cache read-miss: runs when cache.get() does not find data in cache container
                // replace this with your file-read api (that is called only when data was not found in this cache)
                return backingStore[key];
            },
            [&](int key, int value) {

                // write-miss
                // don't use cache.set() because this is only a read-only mod!! but code is here just in case you want to build a read-write cache later
                backingStore[key] = value;
            });

        const int L2size = LLCsize / 4;
        const int L1size = L2size / 4; // L1 size has to be integer power of 2 !!!


        const size_t N = 100 * benchmark;
        const size_t repeat = 50 / (benchmark * benchmark * benchmark * benchmark * benchmark);
        const int numThreads = 20;
        const size_t totalWork = repeat * N * N * numThreads;
        CpuBenchmarker bench(totalWork * sizeof(int), "image pixels = " + std::to_string(N) + "x" + std::to_string(N) + "=" + std::to_string(N * N) +
            "   20 threads (LLC=" + std::to_string(LLCsize) +
            ", L2=" + std::to_string(L2size) +
            " x" + std::to_string(numThreads) + ", L1=" + std::to_string(L1size) + " x" + std::to_string(numThreads) + ")", totalWork);
        {

            #pragma omp parallel for num_threads(numThreads)
            for (int k = 0; k < numThreads; k++)
                {
                        CacheThreader<LruClockCache,int,int> cache(LLC,L1size,L2size);
                        size_t subTotal = 0;
                        for (int m = 0; m < repeat; m++)
                            for (int j = 0; j < N; j++)
                                for (int i = 0; i < N; i++)
                                {
                                    subTotal += cache.get(i + j * N);
                                }

                        total += subTotal;
                }

        }
    }
    std::cout << "sum of random numbers:" << total << std::endl;
    return 0;
}

output:

image pixels = 105x105=11025   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 66022200 nanoseconds     (bandwidth = 507.65 MB/s)      (throughput = 7.88 nanoseconds per iteration)
image pixels = 102x102=10404   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 61253300 nanoseconds     (bandwidth = 597.88 MB/s)      (throughput = 6.69 nanoseconds per iteration)
image pixels = 99x99=9801   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 56015600 nanoseconds     (bandwidth = 713.87 MB/s)      (throughput = 5.60 nanoseconds per iteration)
image pixels = 96x96=9216   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 52906400 nanoseconds     (bandwidth = 836.13 MB/s)      (throughput = 4.78 nanoseconds per iteration)
image pixels = 93x93=8649   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 45101200 nanoseconds     (bandwidth = 1073.90 MB/s)      (throughput = 3.72 nanoseconds per iteration)
image pixels = 90x90=8100   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 37863000 nanoseconds     (bandwidth = 1386.26 MB/s)      (throughput = 2.89 nanoseconds per iteration)
image pixels = 87x87=7569   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 39471500 nanoseconds     (bandwidth = 1457.37 MB/s)      (throughput = 2.74 nanoseconds per iteration)
image pixels = 85x85=7225   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 42749400 nanoseconds     (bandwidth = 1500.79 MB/s)      (throughput = 2.67 nanoseconds per iteration)
image pixels = 82x82=6724   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 44301300 nanoseconds     (bandwidth = 1566.36 MB/s)      (throughput = 2.55 nanoseconds per iteration)
image pixels = 80x80=6400   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 50015600 nanoseconds     (bandwidth = 1535.52 MB/s)      (throughput = 2.60 nanoseconds per iteration)
image pixels = 77x77=5929   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 50565400 nanoseconds     (bandwidth = 1641.56 MB/s)      (throughput = 2.44 nanoseconds per iteration)
image pixels = 75x75=5625   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 54548700 nanoseconds     (bandwidth = 1682.90 MB/s)      (throughput = 2.38 nanoseconds per iteration)
image pixels = 73x73=5329   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 58442300 nanoseconds     (bandwidth = 1728.85 MB/s)      (throughput = 2.31 nanoseconds per iteration)
image pixels = 71x71=5041   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 63226100 nanoseconds     (bandwidth = 1766.81 MB/s)      (throughput = 2.26 nanoseconds per iteration)
image pixels = 68x68=4624   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 66321500 nanoseconds     (bandwidth = 1796.01 MB/s)      (throughput = 2.23 nanoseconds per iteration)
image pixels = 66x66=4356   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 72890800 nanoseconds     (bandwidth = 1792.82 MB/s)      (throughput = 2.23 nanoseconds per iteration)
image pixels = 64x64=4096   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 78564000 nanoseconds     (bandwidth = 1822.67 MB/s)      (throughput = 2.19 nanoseconds per iteration)
image pixels = 62x62=3844   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 85039200 nanoseconds     (bandwidth = 1840.65 MB/s)      (throughput = 2.17 nanoseconds per iteration)
image pixels = 60x60=3600   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 91043900 nanoseconds     (bandwidth = 1875.84 MB/s)      (throughput = 2.13 nanoseconds per iteration)
image pixels = 59x59=3481   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 101974900 nanoseconds     (bandwidth = 1884.30 MB/s)      (throughput = 2.12 nanoseconds per iteration)
image pixels = 57x57=3249   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 109013800 nanoseconds     (bandwidth = 1916.97 MB/s)      (throughput = 2.09 nanoseconds per iteration)
image pixels = 55x55=3025   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 118682500 nanoseconds     (bandwidth = 1910.59 MB/s)      (throughput = 2.09 nanoseconds per iteration)
image pixels = 53x53=2809   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 128109100 nanoseconds     (bandwidth = 1913.76 MB/s)      (throughput = 2.09 nanoseconds per iteration)
image pixels = 52x52=2704   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 142076600 nanoseconds     (bandwidth = 1933.65 MB/s)      (throughput = 2.07 nanoseconds per iteration)
image pixels = 50x50=2500   20 threads (LLC=131072, L2=32768 x20, L1=8192 x20): 151400200 nanoseconds     (bandwidth = 1953.76 MB/s)      (throughput = 2.05 nanoseconds per iteration)
sum of random numbers:2099302252640500

This benchmark moves only 4 bytes per request. With files of 1 kB and larger, each cache hit would return that much data at CPU-cache or RAM speed.
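
Applied to the original question, the keys can be file paths and the values whole file contents, so the file system is touched only on the first access to each file. A minimal sketch of that idea, assuming a read-only cache of 4096 entries (the std::ifstream read and the example path are illustrative, not part of the library):

#include "LruClockCache.h"
#include <fstream>
#include <sstream>
#include <string>

LruClockCache<std::string, std::string> fileCache(4096,
    [](std::string path) {
        // read-miss: hits the file system only when the file is not cached yet
        std::ifstream f(path, std::ios::binary);
        std::ostringstream ss;
        ss << f.rdbuf();
        return ss.str();
    },
    [](std::string, std::string) {
        // write-miss: intentionally empty, this usage is read-only
    });

// first call reads from disk; repeated calls with the same path are served from RAM
std::string contents = fileCache.get("archives/pack0/model.bin");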

Implementation files (header-only; compile with optimizations enabled, the AVX-512 instruction set if your CPU supports it, and C++14 or later):

CacheThreader.h:

/*
 * CacheThreader.h
 *
 *  Created on: Oct 7, 2021
 *      Author: tugrul
 */

#ifndef CACHETHREADER_H_
#define CACHETHREADER_H_


#include<vector>
#include<memory>
#include<thread>
#include<atomic>
#include"DirectMappedCache.h"
#include"LruClockCache.h"
 /* L1: direct-mapped cache, one instance per thread (size must be an integer power of 2)
  * L2: LRU-clock cache, one instance per thread
  * LLC: user-defined cache with thread-safe get/set methods; slower, but shared by all threads
  * each CacheThreader instance must be used by only one thread
 */
template<template<typename, typename, typename> class Cache, typename CacheKey, typename CacheValue, typename CacheInternalCounterTypeInteger = size_t>
class CacheThreader
{
private:
    // last level cache, slow because of lock-guard
    std::shared_ptr<Cache<CacheKey, CacheValue, CacheInternalCounterTypeInteger>> LLC;
    std::shared_ptr<LruClockCache<CacheKey, CacheValue, CacheInternalCounterTypeInteger>> L2;
    std::shared_ptr<DirectMappedCache<CacheKey, CacheValue>> L1;


public:
    CacheThreader(std::shared_ptr<Cache<CacheKey, CacheValue, CacheInternalCounterTypeInteger>> cacheLLC, int sizeCacheL1, int sizeCacheL2)
    {

        LLC = cacheLLC;
        // backing-store of L1 is LLC
        L2 = std::make_shared<LruClockCache<CacheKey, CacheValue, CacheInternalCounterTypeInteger>>(sizeCacheL2, [this](CacheKey key) {

            return this->LLC->getThreadSafe(key);
            }, [this](CacheKey key, CacheValue value) {

                this->LLC->setThreadSafe(key, value);
            });
        L1 = std::make_shared<DirectMappedCache<CacheKey, CacheValue>>(sizeCacheL1, [this](CacheKey key) {

            return this->L2->get(key);
            }, [this](CacheKey key, CacheValue value) {

                this->L2->set(key, value);
            });
    }

    // get data from closest cache
    // currently only 1 thread supported
    inline
        const CacheValue get(CacheKey key) const
    {
        return L1->get(key);
    }

    // set data to closest cache
    // currently only 1 thread supported
    inline
        void set(CacheKey key, CacheValue value) const
    {
        L1->set(key, value);
    }

    // currently only 1 thread supported for read+write
    // only read-only usage for multi-threaded apps
    // must be called from all threads
    // does not flush LLC
    // LLC needs to be flushed manually by main-thread
    void flush()
    {
        L1->flush();
        L2->flush();
    }

    ~CacheThreader() {  }
};


#endif /* CACHETHREADER_H_ */

DirectMappedCache.h:

/*
 * DirectMappedCache.h
 *
 *  Created on: Oct 8, 2021
 *      Author: root
 */

#ifndef DIRECTMAPPEDCACHE_H_
#define DIRECTMAPPEDCACHE_H_

#include<vector>
#include<functional>
#include<mutex>


 /* Direct-mapped cache implementation
  * Only usable for integer type keys in range [0,maxPositive-1]
  *
  * CacheKey: type of key (only integers: int, char, size_t)
  * CacheValue: type of value that is bound to key (same as above)
  */
template<   typename CacheKey, typename CacheValue>
class DirectMappedCache
{
public:
    // allocates buffers for numElements number of cache slots/lanes
    // readMiss:    cache-miss for read operations. User needs to give this function
    //              to let the cache automatically get data from backing-store
    //              example: [&](MyClass key){ return redis.get(key); }
    //              takes a CacheKey as key, returns CacheValue as value
    // writeMiss:   cache-miss for write operations. User needs to give this function
    //              to let the cache automatically set data to backing-store
    //              example: [&](MyClass key, MyAnotherClass value){ redis.set(key,value); }
    //              takes a CacheKey as key and CacheValue as value
    // numElements: has to be integer-power of 2 (e.g. 2,4,8,16,...)
    DirectMappedCache(CacheKey numElements,
        const std::function<CacheValue(CacheKey)>& readMiss,
        const std::function<void(CacheKey, CacheValue)>& writeMiss,
        const int zenithShards = 4, /* unused for DirectMappedCache alone */
        const int zenithLane = 0 /* unused for DirectMappedCache alone */
    ) :size(numElements), sizeM1(numElements - 1), loadData(readMiss), saveData(writeMiss)
    {
        // initialize buffers
        for (size_t i = 0; i < numElements; i++)
        {
            valueBuffer.push_back(CacheValue());
            isEditedBuffer.push_back(0);
            keyBuffer.push_back(CacheKey() - 1); // initialize to an impossible key so that key 0 and above can be mapped
        }
    }



    // get element from cache
    // if cache doesn't find it in buffers,
    // then cache gets data from backing-store
    // then returns the result to user
    // then cache is available from RAM on next get/set access with same key
    inline
        const CacheValue get(const CacheKey& key)  noexcept
    {
        return accessDirect(key, nullptr);
    }

    // only syntactic difference
    inline
        const std::vector<CacheValue> getMultiple(const std::vector<CacheKey>& key)  noexcept
    {
        const int n = key.size();
        std::vector<CacheValue> result(n);

        for (int i = 0; i < n; i++)
        {
            result[i] = accessDirect(key[i], nullptr);
        }
        return result;
    }


    // thread-safe but slower version of get()
    inline
        const CacheValue getThreadSafe(const CacheKey& key)  noexcept
    {
        std::lock_guard<std::mutex> lg(mut);
        return accessDirect(key, nullptr);
    }

    // set element to cache
    // if cache doesn't find it in buffers,
    // then cache sets data on just cache
    // writing to backing-store only happens when
    //                  another access evicts the cache slot containing this key/value
    //                  or when cache is flushed by flush() method
    // then returns the given value back
    // then cache is available from RAM on next get/set access with same key
    inline
        void set(const CacheKey& key, const CacheValue& val) noexcept
    {
        accessDirect(key, &val, 1);
    }

    // thread-safe but slower version of set()
    inline
        void setThreadSafe(const CacheKey& key, const CacheValue& val)  noexcept
    {
        std::lock_guard<std::mutex> lg(mut);
        accessDirect(key, &val, 1);
    }

    // use this before closing the backing-store to store the latest bits of data
    void flush()
    {
        try
        {
            std::lock_guard<std::mutex> lg(mut);
            for (size_t i = 0; i < size; i++)
            {
                if (isEditedBuffer[i] == 1)
                {
                    isEditedBuffer[i] = 0;
                    auto oldKey = keyBuffer[i];
                    auto oldValue = valueBuffer[i];
                    saveData(oldKey, oldValue);
                }
            }
        }
        catch (std::exception& ex) { std::cout << ex.what() << std::endl; }
    }

    // direct mapped access
    // opType=0: get
    // opType=1: set
    CacheValue const accessDirect(const CacheKey& key, const CacheValue* value, const bool opType = 0)
    {

        // find tag mapped to the key
        CacheKey tag = key & sizeM1;

        // compare keys
        if (keyBuffer[tag] == key)
        {
            // cache-hit

            // "set"
            if (opType == 1)
            {
                isEditedBuffer[tag] = 1;
                valueBuffer[tag] = *value;
            }

            // cache hit value
            return valueBuffer[tag];
        }
        else // cache-miss
        {
            CacheValue oldValue = valueBuffer[tag];
            CacheKey oldKey = keyBuffer[tag];

            // eviction algorithm start
            if (isEditedBuffer[tag] == 1)
            {
                // if it is "get"
                if (opType == 0)
                {
                    isEditedBuffer[tag] = 0;
                }

                saveData(oldKey, oldValue);

                // "get"
                if (opType == 0)
                {
                    const CacheValue&& loadedData = loadData(key);
                    valueBuffer[tag] = loadedData;
                    keyBuffer[tag] = key;
                    return loadedData;
                }
                else /* "set" */
                {
                    valueBuffer[tag] = *value;
                    keyBuffer[tag] = key;
                    return *value;
                }
            }
            else // not edited
            {
                // "set"
                if (opType == 1)
                {
                    isEditedBuffer[tag] = 1;
                }

                // "get"
                if (opType == 0)
                {
                    const CacheValue&& loadedData = loadData(key);
                    valueBuffer[tag] = loadedData;
                    keyBuffer[tag] = key;
                    return loadedData;
                }
                else // "set"
                {
                    valueBuffer[tag] = *value;
                    keyBuffer[tag] = key;
                    return *value;
                }
            }

        }
    }


private:
    const CacheKey size;
    const CacheKey sizeM1;
    std::mutex mut;

    std::vector<CacheValue> valueBuffer;
    std::vector<unsigned char> isEditedBuffer;
    std::vector<CacheKey> keyBuffer;
    const std::function<CacheValue(CacheKey)>  loadData;
    const std::function<void(CacheKey, CacheValue)>  saveData;

};


#endif /* DIRECTMAPPEDCACHE_H_ */
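
For completeness, DirectMappedCache can also be used on its own. A minimal sketch with a plain array as the backing store (store, cache and demo are illustrative names; the slot count must be an integer power of 2):

#include "DirectMappedCache.h"
#include <vector>

std::vector<int> store(1000, 7);

DirectMappedCache<size_t, int> cache(256,
    [](size_t key) { return store[key]; },              // read-miss: fetch from the backing store
    [](size_t key, int value) { store[key] = value; }); // write-miss: evicted dirty slots land here

void demo()
{
    int v = cache.get(42);   // first access: read-miss, loads store[42]
    cache.set(42, v + 1);    // cached write; store[42] is updated on eviction or flush()
    cache.flush();           // write all pending edits back to the store
}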

LruClockCache.h:

#ifndef LRUCLOCKCACHE_H_
#define LRUCLOCKCACHE_H_

#include<vector>
#include<algorithm>
#include<unordered_map>
#include<functional>
#include<mutex>


 /* LRU-CLOCK-second-chance implementation
  *
  * LruKey: type of key (std::string, int, char, size_t, objects)
  * LruValue: type of value that is bound to key (same as above)
  * ClockHandInteger: an optional optimization to reduce memory consumption when the cache size fits in 8, 16 or 32 bits (at most 255, 65535 or 4294967295 elements)
  */
template<   typename LruKey, typename LruValue, typename ClockHandInteger = size_t>
class LruClockCache
{
public:
    // allocates circular buffers for numElements number of cache slots
    // readMiss:    cache-miss for read operations. User needs to give this function
    //              to let the cache automatically get data from backing-store
    //              example: [&](MyClass key){ return redis.get(key); }
    //              takes a LruKey as key, returns LruValue as value
    // writeMiss:   cache-miss for write operations. User needs to give this function
    //              to let the cache automatically set data to backing-store
    //              example: [&](MyClass key, MyAnotherClass value){ redis.set(key,value); }
    //              takes a LruKey as key and LruValue as value
    LruClockCache(ClockHandInteger numElements,
        const std::function<LruValue(LruKey)>& readMiss,
        const std::function<void(LruKey, LruValue)>& writeMiss) :size(numElements), loadData(readMiss), saveData(writeMiss)
    {
        ctr = 0;
        // 50% phase difference between eviction and second-chance hands of the "second-chance" CLOCK algorithm
        ctrEvict = numElements / 2;

        //loadData=readMiss;
        //saveData=writeMiss;

        // initialize circular buffers
        for (ClockHandInteger i = 0; i < numElements; i++)
        {
            valueBuffer.push_back(LruValue());
            chanceToSurviveBuffer.push_back(0);
            isEditedBuffer.push_back(0);
            keyBuffer.push_back(LruKey());
        }
        mapping.reserve(numElements);
    }



    // get element from cache
    // if cache doesn't find it in circular buffers,
    // then cache gets data from backing-store
    // then returns the result to user
    // then cache is available from RAM on next get/set access with same key
    inline
        const LruValue get(const LruKey& key)  noexcept
    {
        return accessClock2Hand(key, nullptr);
    }

    // only syntactic difference
    inline
        const std::vector<LruValue> getMultiple(const std::vector<LruKey>& key)  noexcept
    {
        const int n = key.size();
        std::vector<LruValue> result(n);

        for (int i = 0; i < n; i++)
        {
            result[i] = accessClock2Hand(key[i], nullptr);
        }
        return result;
    }


    // thread-safe but slower version of get()
    inline
        const LruValue getThreadSafe(const LruKey& key) noexcept
    {
        std::lock_guard<std::mutex> lg(mut);
        return accessClock2Hand(key, nullptr);
    }

    // set element to cache
    // if cache doesn't find it in circular buffers,
    // then cache sets data on just cache
    // writing to backing-store only happens when
    //                  another access evicts the cache slot containing this key/value
    //                  or when cache is flushed by flush() method
    // then returns the given value back
    // then cache is available from RAM on next get/set access with same key
    inline
        void set(const LruKey& key, const LruValue& val) noexcept
    {
        accessClock2Hand(key, &val, 1);
    }

    // thread-safe but slower version of set()
    inline
        void setThreadSafe(const LruKey& key, const LruValue& val)  noexcept
    {
        std::lock_guard<std::mutex> lg(mut);
        accessClock2Hand(key, &val, 1);
    }

    // use this before closing the backing-store to store the latest bits of data
    void flush()
    {
        std::lock_guard<std::mutex> lg(mut);
        for (auto mp = mapping.cbegin(); mp != mapping.cend() /* not hoisted */; /* no increment */)
        {
            if (isEditedBuffer[mp->second] == 1)
            {
                isEditedBuffer[mp->second] = 0;
                auto oldKey = keyBuffer[mp->second];
                auto oldValue = valueBuffer[mp->second];
                saveData(oldKey, oldValue);
                mapping.erase(mp++);    // or "it = m.erase(it)" since C++11
            }
            else
            {
                ++mp;
            }
        }
    }

    // CLOCK algorithm with 2 hand counters (1 for second chance for a cache slot to survive, 1 for eviction of cache slot)
    // opType=0: get
    // opType=1: set
    LruValue const accessClock2Hand(const LruKey& key, const LruValue* value, const bool opType = 0)
    {

        // check if it is a cache-hit (in-cache)
        typename std::unordered_map<LruKey, ClockHandInteger>::iterator it = mapping.find(key);
        if (it != mapping.end())
        {

            chanceToSurviveBuffer[it->second] = 1;
            if (opType == 1)
            {
                isEditedBuffer[it->second] = 1;
                valueBuffer[it->second] = *value;
            }
            return valueBuffer[it->second];
        }
        else // key not found in cache, so search the circular buffer for a slot to evict
        {
            long long ctrFound = -1;
            LruValue oldValue;
            LruKey oldKey;
            while (ctrFound == -1)
            {
                // the second-chance hand clears the "chance" flag if it is set, sparing the slot from eviction this round
                // the slot then has one more chance to take a cache-hit before the eviction hand reaches it
                if (chanceToSurviveBuffer[ctr] > 0)
                {
                    chanceToSurviveBuffer[ctr] = 0;
                }

                // wrap the second-chance hand around the circular buffer
                ctr++;
                if (ctr >= size)
                {
                    ctr = 0;
                }

                // unlucky slot is selected for eviction by eviction hand
                if (chanceToSurviveBuffer[ctrEvict] == 0)
                {
                    ctrFound = ctrEvict;
                    oldValue = valueBuffer[ctrFound];
                    oldKey = keyBuffer[ctrFound];
                }

                // wrap the eviction hand around the circular buffer
                ctrEvict++;
                if (ctrEvict >= size)
                {
                    ctrEvict = 0;
                }
            }

            // eviction algorithm start
            if (isEditedBuffer[ctrFound] == 1)
            {
                // if it is "get"
                if (opType == 0)
                {
                    isEditedBuffer[ctrFound] = 0;
                }

                saveData(oldKey, oldValue);

                // "get"
                if (opType == 0)
                {
                    const LruValue&& loadedData = loadData(key);
                    mapping.erase(keyBuffer[ctrFound]);
                    valueBuffer[ctrFound] = loadedData;
                    chanceToSurviveBuffer[ctrFound] = 0;

                    mapping.emplace(key, ctrFound);
                    keyBuffer[ctrFound] = key;

                    return loadedData;
                }
                else /* "set" */
                {
                    mapping.erase(keyBuffer[ctrFound]);


                    valueBuffer[ctrFound] = *value;
                    chanceToSurviveBuffer[ctrFound] = 0;

                    mapping.emplace(key, ctrFound);
                    keyBuffer[ctrFound] = key;
                    return *value;
                }
            }
            else // not edited
            {
                // "set"
                if (opType == 1)
                {
                    isEditedBuffer[ctrFound] = 1;
                }

                // "get"
                if (opType == 0)
                {
                    const LruValue&& loadedData = loadData(key);
                    mapping.erase(keyBuffer[ctrFound]);
                    valueBuffer[ctrFound] = loadedData;
                    chanceToSurviveBuffer[ctrFound] = 0;

                    mapping.emplace(key, ctrFound);
                    keyBuffer[ctrFound] = key;

                    return loadedData;
                }
                else // "set"
                {
                    mapping.erase(keyBuffer[ctrFound]);


                    valueBuffer[ctrFound] = *value;
                    chanceToSurviveBuffer[ctrFound] = 0;

                    mapping.emplace(key, ctrFound);
                    keyBuffer[ctrFound] = key;
                    return *value;
                }
            }

        }
    }


private:
    const ClockHandInteger size;
    std::mutex mut;
    std::unordered_map<LruKey, ClockHandInteger> mapping;
    std::vector<LruValue> valueBuffer;
    std::vector<unsigned char> chanceToSurviveBuffer;
    std::vector<unsigned char> isEditedBuffer;
    std::vector<LruKey> keyBuffer;
    const std::function<LruValue(LruKey)>  loadData;
    const std::function<void(LruKey, LruValue)>  saveData;
    ClockHandInteger ctr;
    ClockHandInteger ctrEvict;
};

#endif /* LRUCLOCKCACHE_H_ */

CpuBenchmarker.h:

#ifndef CPUBENCHMARKER_H_
#define CPUBENCHMARKER_H_

#include <chrono>
#include <string>
#include <iostream>
#include <iomanip>

 // RAII type benchmarker
class CpuBenchmarker
{
public:
    CpuBenchmarker() :CpuBenchmarker(0, "", 0)
    {
        measurementTarget = nullptr;
    }

    CpuBenchmarker(size_t bytesToBench) :CpuBenchmarker(bytesToBench, "", 0)
    {
        measurementTarget = nullptr;
    }

    CpuBenchmarker(size_t bytesToBench, std::string infoExtra) :CpuBenchmarker(bytesToBench, infoExtra, 0)
    {
        measurementTarget = nullptr;
    }

    CpuBenchmarker(size_t bytesToBench, std::string infoExtra, size_t countForThroughput) :t1(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()))
    {
        bytes = bytesToBench;
        info = infoExtra;
        count = countForThroughput;
        measurementTarget = nullptr;
    }

    // writes elapsed time (in seconds) to this variable upon destruction
    void addTimeWriteTarget(double* measurement)
    {
        measurementTarget = measurement;
    }

    ~CpuBenchmarker()
    {
        std::chrono::nanoseconds t2 = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now().time_since_epoch());
        size_t t = t2.count() - t1.count();
        if (measurementTarget != nullptr)
        {
            *measurementTarget = t / 1000000000.0; // seconds
        }
        if (info != std::string(""))
            std::cout << info << ": ";
        std::cout << t << " nanoseconds    ";
        if (bytes > 0)
        {
            std::cout << " (bandwidth = ";
            std::cout << std::fixed;
            std::cout << std::setprecision(2);
            std::cout << (bytes / (((double)t) / 1000000000.0)) / 1000000.0 << " MB/s)     ";
        }
        if (count > 0)
        {
            std::cout << " (throughput = ";
            std::cout << std::fixed;
            std::cout << std::setprecision(2);
            std::cout << (((double)t) / count) << " nanoseconds per iteration) ";
        }
        std::cout << std::endl;
    }

private:
    std::chrono::nanoseconds t1;
    size_t bytes;
    size_t count;
    std::string info;
    double* measurementTarget;
};

#endif /* CPUBENCHMARKER_H_ */
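
Usage of the benchmarker is scope-based: the clock starts at construction and the result is printed at destruction. A minimal sketch (the vector and label are illustrative):

#include "CpuBenchmarker.h"
#include <vector>

int main()
{
    std::vector<int> data(1000000, 1);
    long long sum = 0;
    {
        // bytes touched, label, iteration count for the throughput column
        CpuBenchmarker bench(data.size() * sizeof(int), "sum of 1M ints", data.size());
        for (int x : data)
            sum += x;
    } // destructor prints nanoseconds, MB/s and nanoseconds per iteration
    return (int)(sum & 1); // use the result so the loop is not optimized away
}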

If you need read+write coherence in multithreaded use, you can look at the implementation here:

https://github.com/tugrul512bit/LruClockCache/blob/main/AsyncCache.h


If you are using just a single thread, then it's better to use the non-thread-safe version, like this:

using MyKeyType = std::string;
using MyValueType = MinecraftChunk;

LruClockCache<MyKeyType, MyValueType> cache(1024 * 5,
  [&](MyKeyType key) {
    // cache miss (read)
    // access the data-store (network, HDD, GPU: anything slower or higher-latency than RAM)
    return readChunkFromHDD(key);
  },
  [&](MyKeyType key, MyValueType value) {
    // cache miss (write)
    // access the data-store
    writeChunkToHDD(key, value);
  });

// the cache invokes the cache-miss functions automatically
MinecraftChunk chunk = cache.get("world coordinates 1500 35 2000");

// the cache invokes the cache-miss functions automatically
cache.set("world coordinates 1502 35 1999", chunk);

cache.flush(); // clears all pending writes in the cache and writes them to the backing-store

It is also automatically coherent when you set/write something through the cache: a later get() with the same key returns the updated value without a flush().
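
A short demonstration of that coherence, continuing the snippet above (chunk is reused from it):

cache.set("world coordinates 1500 35 2000", chunk);

// served straight from the cache with the value just written; no flush() needed
MinecraftChunk updated = cache.get("world coordinates 1500 35 2000");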