STXXL underperforms when using very fast SSDs

189 Views Asked by At

I'm currently writing a tool that uses STXXL to find similarities between a large file on the hard drive and vectors residing in RAM. I wrote an example (for Windows) below to show what I mean. The referenced file is about 38 GB in size and was created with said tool.

On normal HDDs and SATA SSDs it performs quite well, but on M.2 or faster SSDs it uses at most a third of the bandwidth, even though more is available.

The VS2017 Profiler indicates that the

const_reference const_element(const blocked_index_type& offset) const

function of the STXXL which indirectly calls

https://stxxl.org/tags/1.4.1/group__reqlayer.html#ga016f4e02b691aa4818ad3305de3f47bd

seems to be the slowest part. Does anyone have an idea as to why it underperforms and what needs to be changed?

Thank you in advance.

EDIT: Performance table for the new code:

Threads  CPU%   SSD MB/s   Time in s
8        72     450        265
6        64     560        286
4        52     720        329
2        29     790        600
1        17     900        1051

Code:

#include <inttypes.h>

#include <algorithm>
#include <chrono>
#include <ctime>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <vector>

#include <stxxl/vector>
#include <stxxl/bits/io/wincall_file.h>

#include <omp.h>

using namespace std;

// External-memory vector of (uint32_t, uint32_t) tuples, produced by
// stxxl::VECTOR_GENERATOR with the arguments 16, 16, 4096 * 8 and stxxl::RC.
// NOTE(review): per the STXXL 1.4.1 docs these are presumably blocks per page,
// number of cached pages, block size in bytes and the allocation strategy —
// confirm against the library version in use.
using contentVecType = stxxl::VECTOR_GENERATOR<std::tuple<uint32_t, uint32_t>, 16U, 16U, 4096 * 8, stxxl::RC>::result;

int main() {
    auto start = std::chrono::high_resolution_clock::now();
    const string& sLibFile = "I:/DA/merged_s_80_h";
    const int32_t& iNumOfThreads = 1, iNumOfVecs = 24;
    const int32_t& iDiv = iNumOfVecs / iNumOfThreads;
    omp_set_num_threads(iNumOfThreads);

    ifstream fLibInfo(sLibFile + "_info.txt");
    uint64_t iSizeOfLib = 0;
    fLibInfo >> iSizeOfLib;

    stxxl::wincall_file* stxxlLibFile = new stxxl::wincall_file(sLibFile, stxxl::file::RDONLY);
    const contentVecType** vLib = new const contentVecType*[iNumOfThreads];
    for (int32_t i = 0; i < iNumOfThreads; ++i) {
        vLib[i] = new const contentVecType(stxxlLibFile, iSizeOfLib);
    }

    mt19937_64 rng(time(0));
    uniform_int_distribution<uint32_t> unii(1, numeric_limits<uint32_t>::max());
    vector<tuple<uint32_t,uint32_t>> vInput[24];
    for (int32_t i = 0; i < iNumOfVecs; ++i) {
        for (int32_t j = 0; j < 100000; ++j) {
            vInput[i].push_back(make_tuple(unii(rng),0));
        }
    }

    #pragma omp parallel for
    for (int32_t i = 0; i < iNumOfVecs; ++i) {
        sort(vInput[i].begin(), vInput[i].end());
    }

    #pragma omp parallel for
    for (int32_t i = 0; i < iNumOfThreads; ++i) {
        for (int32_t j = i* iDiv; j < (i+1)*iDiv; ++j) {
            vector<tuple<uint32_t, uint32_t>> vDummy(vInput[j].size());
            set_intersection(vInput[j].cbegin(), vInput[j].cend(), vLib[i]->cbegin(), vLib[i]->cend(), vDummy.begin(), [](const tuple<uint32_t, uint32_t>& a, const tuple<uint32_t, uint32_t>& b) { return get<0>(a) < get<0>(b); });
        }
    }


    for (int32_t i = 0; i < iNumOfThreads; ++i) {
        delete vLib[i];
    }
    delete[] vLib;
    delete stxxlLibFile;

    auto end = std::chrono::high_resolution_clock::now();
    cout << "Time: " << chrono::duration_cast<std::chrono::seconds>(end - start).count() << endl;
    return 0;
}
0

There are 0 best solutions below