What is the fastest way to append 4 KB writes to a file?
I tried concurrently writing 4 KB blocks from multiple threads, with each thread tracking its own offset in the file. No data is corrupted and no writes are lost. But 4 writer threads are slower than 1 writer thread, and disk utilization is no different.
I then tried writing to a preallocated file that already contains 4 GB of data. A single writer thread is faster here than when appending to an empty file, but even in the preallocated case, 4 writer threads are slower than 1 writer thread.
#include <chrono>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include "gtest/gtest.h"
// Writes `data_in_gb` gigabytes of fixed-size 4 KB records to `filename`,
// striping the records round-robin across `num_threads` writer threads
// (thread i writes records i, i+num_threads, i+2*num_threads, ...), times the
// write phase, and then reads the whole file back to verify that every record
// contains the byte its writer thread was assigned.
//
// Parameters:
//   filename     - path of the file to write; must already exist (each thread
//                  opens it with in|out, which does not create the file).
//   data_in_gb   - total amount of data to write, in gigabytes.
//   num_threads  - number of concurrent writer threads.
//   start_char   - thread i fills its records with (start_char + i).
//   stream_cache - when false, fstream buffering is disabled via pubsetbuf so
//                  every write() goes straight to the OS.
void write_concurrently(string filename, const int data_in_gb, const int num_threads, const char start_char,
    bool stream_cache = true) {
    const int offset = 4 * 1024; // record size: 4 KB
    // Per-thread record count. Use 64-bit arithmetic so the total byte count
    // cannot overflow a 32-bit int, and divide once so no precision is lost
    // when num_threads * offset does not divide 1 MiB evenly.
    const long long num_records_each_thread =
        (data_in_gb * 1024LL * 1024LL * 1024LL) / (static_cast<long long>(num_threads) * offset);
    {
        auto write_file_fn = [&](int index) {
            // Each thread opens its own handle so the streams do not share a
            // put position or a buffer.
            fstream file_handle(filename, fstream::in | fstream::out | fstream::ate | fstream::binary);
            if (!stream_cache) {
                // NOTE(review): calling pubsetbuf after open works on MSVC,
                // but the standard only guarantees an effect before the first
                // I/O operation — verify on other standard libraries.
                file_handle.rdbuf()->pubsetbuf(nullptr, 0); // no buffering in fstream
            }
            vector<char> data(offset, (char)(index + start_char));
            // Thread `index` owns every num_threads-th record, starting at
            // record `index`; each iteration seeks to its own record slot.
            long long myoffset = index * offset;
            for (long long i = 0; i < num_records_each_thread; ++i) {
                file_handle.seekp(myoffset, ios_base::beg);
                file_handle.write(data.data(), offset);
                myoffset += num_threads * offset;
            }
            // file_handle.flush();
        };
        auto start_time = chrono::high_resolution_clock::now();
        vector<thread> writer_threads;
        for (int i = 0; i < num_threads; ++i) {
            writer_threads.push_back(std::thread(write_file_fn, i));
        }
        for (int i = 0; i < num_threads; ++i) {
            writer_threads[i].join();
        }
        auto end_time = chrono::high_resolution_clock::now();
        // duration_cast makes the tick unit explicit; high_resolution_clock's
        // native period is implementation-defined, so dividing .count() by a
        // fixed constant is not portable.
        const auto elapsed_micros =
            chrono::duration_cast<chrono::microseconds>(end_time - start_time).count();
        std::cout << "Data written : " << data_in_gb << " GB, " << num_threads << " threads "
            << ", cache " << (stream_cache ? "true " : "false ") << ", size " << offset << " bytes ";
        std::cout << "Time taken: " << elapsed_micros << " micro-secs" << std::endl;
    }
    {
        // Verification pass: the file must be exactly the expected size and
        // every record must hold its writer thread's fill byte.
        ifstream file(filename, fstream::in | fstream::binary);
        file.seekg(0, ios_base::end);
        EXPECT_EQ(num_records_each_thread * num_threads * offset, file.tellg());
        file.seekg(0, ios_base::beg);
        EXPECT_TRUE(file);
        char data[offset]{ 0 };
        for (long long i = 0; i < (num_records_each_thread * num_threads); ++i) {
            file.read(data, offset);
            EXPECT_TRUE(file || file.eof()); // should be able to read until eof
            // Records were striped round-robin, so record i was written by
            // thread (i % num_threads).
            char expected_char = (char)((i % num_threads) + start_char);
            bool same = true;
            for (auto & c : data) {
                same = same && (c == expected_char);
            }
            EXPECT_TRUE(same);
            if (!same) {
                std::cout << "corruption detected !!!" << std::endl;
                break;
            }
            if (file.eof()) {
                EXPECT_EQ(num_records_each_thread * num_threads, i + 1);
                break;
            }
        }
    }
}
// analysis: concurrent writes work, but multiple threads are slower because of exclusive locking in the filesystem : ExAcquireResourceExclusiveLite
//Name Inc % Inc Exc % Exc Fold When First Last
//module ntoskrnl << ntoskrnl!ExAcquireResourceExclusiveLite >> 11.5 1, 030 11.5 1, 030 3 1o010___110110110___11011011____ 63.097 12, 941.694
//+ module ntfs.sys << ntfs!NtfsCopyWriteA >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//| +module fltmgr.sys << fltmgr!FltpFastIoWrite >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//| +module ntoskrnl << ntoskrnl!NtWriteFile >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//+ module ntfs.sys << ntfs!NtfsFsdWrite >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module ntoskrnl << ntoskrnl!IofCallDriver >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module fltmgr.sys << fltmgr!FltpDispatch >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module ntoskrnl << ntoskrnl!NtWriteFile >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
// Appending to an initially-empty file: runs the same 4 GB workload with one
// writer thread and then with four, so the two timings can be compared.
TEST(fstream, write_concurrently_to_same_file) {
    const string filename = "file4.log";
    const long data_in_gb = 4;
    for (const int thread_count : { 1, 4 }) {
        {
            // Truncate/create the file before the writer threads start.
            fstream file(filename, fstream::in | fstream::out | fstream::trunc | fstream::binary);
        }
        // Observation: the multi-threaded run ends up slower than one thread.
        write_concurrently(filename, data_in_gb, thread_count, 'A');
    }
    std::remove(filename.c_str());
}
// Preallocated file
// Writing over a preallocated file: fill the file once so the filesystem has
// already allocated all 4 GB, then time single- and multi-threaded overwrites.
TEST(fstream, preallocated_file_concurrent_writes) {
    const string log_name = "file5.log";
    const int gigabytes = 4;
    {
        // First pass creates the file and writes through it end to end,
        // leaving 4 GB of allocated extents behind.
        fstream preallocator(log_name, fstream::in | fstream::out | fstream::trunc | fstream::binary);
        write_concurrently(log_name, gigabytes, 1, 'A');
    }
    std::cout << "Preallocated file." << std::endl;
    // Subsequent passes overwrite the already-allocated blocks in place.
    write_concurrently(log_name, gigabytes, 1, 'B');
    write_concurrently(log_name, gigabytes, 4, 'C');
    std::remove(log_name.c_str());
}
Built in release mode with the Visual Studio 2017 MSVC compiler. The code above is cross-platform.
Running main() from gtest_main.cc
Note: Google Test filter = *write_concurrently_to_same_file
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from fstream
[ RUN ] fstream.write_concurrently_to_same_file
Data written : 4 GB, 1 threads , cache true , size 4096 bytes Time taken: 21895907 micro-secs
Data written : 4 GB, 4 threads , cache true , size 4096 bytes Time taken: 30937922 micro-secs
[ OK ] fstream.write_concurrently_to_same_file (84657 ms)
[----------] 1 test from fstream (84658 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (84660 ms total)
[ PASSED ] 1 test.
>.\x64\Release\tests.exe --gtest_filter=*preallocated_file_concurrent_writes
Running main() from gtest_main.cc
Note: Google Test filter = *preallocated_file_concurrent_writes
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from fstream
[ RUN ] fstream.preallocated_file_concurrent_writes
Data written : 4 GB, 1 threads , cache true , size 4096 bytes Time taken: 20529264 micro-secs
Preallocated file.
Data written : 4 GB, 1 threads , cache true , size 4096 bytes Time taken: 15053958 micro-secs
Data written : 4 GB, 4 threads , cache true , size 4096 bytes Time taken: 16394996 micro-secs
[ OK ] fstream.preallocated_file_concurrent_writes (87653 ms)
[----------] 1 test from fstream (87654 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (87659 ms total)
[ PASSED ] 1 test.
I looked at a profile, and one difference I noticed was a lock-acquire function in the filesystem showing up in the multi-threaded case.
// analysis: multiple threads are slower because of exclusive locking in the filesystem : ExAcquireResourceExclusiveLite
//Name Inc % Inc Exc % Exc Fold When First Last
//module ntoskrnl << ntoskrnl!ExAcquireResourceExclusiveLite >> 11.5 1, 030 11.5 1, 030 3 1o010___110110110___11011011____ 63.097 12, 941.694
//+ module ntfs.sys << ntfs!NtfsCopyWriteA >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//| +module fltmgr.sys << fltmgr!FltpFastIoWrite >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//| +module ntoskrnl << ntoskrnl!NtWriteFile >> 7.7 693 0.0 0 0 00000___00000010o___01o10000____ 63.097 12, 941.694
//+ module ntfs.sys << ntfs!NtfsFsdWrite >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module ntoskrnl << ntoskrnl!IofCallDriver >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module fltmgr.sys << fltmgr!FltpDispatch >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
//+ module ntoskrnl << ntoskrnl!NtWriteFile >> 3.8 337 0.0 0 0 o0000___00o00o00o___00o00o00____ 68.759 12, 912.626
The test runs above are on Windows.