diff --git a/README.adoc b/README.adoc index 757752a..5b3018e 100644 --- a/README.adoc +++ b/README.adoc @@ -975,7 +975,7 @@ This setup: + -- ** can run most examples, including those for other CPU architectures, with the notable exception of examples that rely on kernel modules -** can run reproducible approximate performance experiments with gem5, see e.g. <> +** can run reproducible approximate performance experiments with gem5, see e.g. <> -- * from full system simulation as shown at: <>. + @@ -9961,7 +9961,7 @@ Now you can play a fun little game with your friends: * make a program that solves the computation problem, and outputs output to stdout * write the code that runs the correct computation in the smallest number of cycles possible -To find out why your program is slow, a good first step is to have a look at <> file. +To find out why your program is slow, a good first step is to have a look at <> file. ==== Skip extra benchmark instructions @@ -10210,36 +10210,79 @@ Buildroot built-in libraries, mostly under Libraries > Other: There are not yet enabled, but it should be easy to so, see: <> -===== BST vs heap +===== BST vs heap vs hashmap -https://stackoverflow.com/questions/6147242/heap-vs-binary-search-tree-bst/29548834#29548834 +The following benchmark setup works both: -First we build it with <> enabled, and then we extract the stats: +* on host through timers + link:https://stackoverflow.com/questions/51952471/why-do-i-get-a-constant-instead-of-logarithmic-curve-for-an-insert-time-benchmar/51953081#51953081[granule] +* gem5 with <>, which can get more precise results with `granule == 1` + +It has been used to answer: + +* BST vs heap: https://stackoverflow.com/questions/6147242/heap-vs-binary-search-tree-bst/29548834#29548834 +* `std::set`: https://stackoverflow.com/questions/2558153/what-is-the-underlying-data-structure-of-a-stl-set-in-c/51944661#51944661 +* `std::map`: 
https://stackoverflow.com/questions/18414579/what-data-structure-is-inside-stdmap-in-c/51945119#51945119 + +To benchmark on the host, we do: + +.... +./build-userland-in-tree --force-rebuild --optimization-level 3 ./userland/cpp/bst_vs_heap_vs_hashmap.cpp +./userland/cpp/bst_vs_heap_vs_hashmap.out | tee bst_vs_heap_vs_hashmap.dat +gnuplot \ + -e 'input_noext="bst_vs_heap_vs_hashmap"' \ + -e 'heap_zoom_max=50' \ + -e 'hashmap_zoom_max=400' \ + ./bst-vs-heap-vs-hashmap.gnuplot \ +; +xdg-open bst_vs_heap_vs_hashmap.tmp.png +.... + +The parameters `heap_zoom_max` and `hashmap_zoom_max` are chosen manually and interactively to best showcase the regions of interest in those plots. + +First we build the benchmark with <> enabled, and then we run it and extract the stats: .... ./build-userland \ - --arch aarch64 \ + --arch x86_64 \ --ccflags='-DLKMC_M5OPS_ENABLE=1' \ - --force-rebuild cpp/bst_vs_heap \ + --force-rebuild userland/cpp/bst_vs_heap_vs_hashmap.cpp \ --static \ + --optimization-level 3 \ ; ./run \ - --arch aarch64 \ + --arch x86_64 \ --emulator gem5 \ --static \ - --userland userland/cpp/bst_vs_heap.cpp \ - --userland-args='1000' \ + --userland userland/cpp/bst_vs_heap_vs_hashmap.cpp \ + --userland-args='100000' \ + -- \ + --cpu-type=DerivO3CPU \ + --caches \ + --l2cache \ + --l1d_size=32kB \ + --l1i_size=32kB \ + --l2_size=256kB \ + --l3_size=20MB \ ; -./bst-vs-heap --arch aarch64 > bst_vs_heap.dat -./bst-vs-heap.gnuplot -xdg-open bst-vs-heap.tmp.png +./bst-vs-heap-vs-hashmap-gem5-stats --arch x86_64 | tee bst_vs_heap_vs_hashmap_gem5.dat +gnuplot \ + -e 'input_noext="bst_vs_heap_vs_hashmap_gem5"' \ + -e 'heap_zoom_max=500' \ + -e 'hashmap_zoom_max=400' \ + ./bst-vs-heap-vs-hashmap.gnuplot \ +; +xdg-open bst_vs_heap_vs_hashmap_gem5.tmp.png .... +The cache sizes were chosen to match the host <> to improve the comparison. Ideally we should also use the same standard library. 
+ +Note that this will take a long time, and will produce a humongous ~40Gb stats file due to: <> + Sources: -* link:userland/cpp/bst_vs_heap.cpp[] -* link:bst-vs-heap[] -* link:bst-vs-heap.gnuplot[] +* link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[] +* link:bst-vs-heap-vs-hashmap-gem5-stats[] +* link:bst-vs-heap-vs-hashmap.gnuplot[] ===== BLAS @@ -11110,7 +11153,7 @@ Contains UART output, both from the Linux kernel or from the baremetal system. Can also be seen live on <>. -==== stats.txt +==== gem5 stats.txt This file contains important statistics about the run: @@ -11136,6 +11179,14 @@ system.cpu.dtb.inst_hits For x86, it is interesting to try and correlate `numCycles` with: +===== gem5 only dump selected stats + +TODO + +https://stackoverflow.com/questions/52014953/how-to-dump-only-a-single-or-certain-selected-stats-in-gem5 + +To prevent the stats file from becoming humongous. + ==== config.ini The `config.ini` file, contains a very good high level description of the system: @@ -12974,7 +13025,7 @@ RDTSC stores its output to EDX:EAX, even in 64-bit mode, top bits are zeroed out TODO: review this section, make a more controlled userland experiment with <> instrumentation. -Let's have some fun and try to correlate the gem5 <> `system.cpu.numCycles` cycle count with the link:https://en.wikipedia.org/wiki/Time_Stamp_Counter[x86 RDTSC instruction] that is supposed to do the same thing: +Let's have some fun and try to correlate the gem5 <> `system.cpu.numCycles` cycle count with the link:https://en.wikipedia.org/wiki/Time_Stamp_Counter[x86 RDTSC instruction] that is supposed to do the same thing: .... 
./build-userland --static userland/arch/x86_64/inline_asm/rdtsc.S diff --git a/bst-vs-heap b/bst-vs-heap-vs-hashmap-gem5-stats similarity index 59% rename from bst-vs-heap rename to bst-vs-heap-vs-hashmap-gem5-stats index a3b4eb5..7662d3f 100755 --- a/bst-vs-heap +++ b/bst-vs-heap-vs-hashmap-gem5-stats @@ -18,13 +18,19 @@ Convert a BST vs heap stat file into a gnuplot input stats = self.get_stats() it = iter(stats) i = 1 - for stat in it: + for heap_num_cycles in it: try: - next_stat = next(it) + bst_num_cycles = next(it) + hashmap_num_cycles = next(it) except StopIteration: - # Automatic dumpstats at end may lead to odd number of stats. + # Automatic dumpstats at end may lead to one extra stat at the end. break - print('{} {} {}'.format(i, stat, next_stat)) + print('{} {} {} {}'.format( + i, + heap_num_cycles, + bst_num_cycles, + hashmap_num_cycles, + )) i += 1 if __name__ == '__main__': diff --git a/bst-vs-heap-vs-hashmap.gnuplot b/bst-vs-heap-vs-hashmap.gnuplot new file mode 100755 index 0000000..5137ee7 --- /dev/null +++ b/bst-vs-heap-vs-hashmap.gnuplot @@ -0,0 +1,27 @@ +#!/usr/bin/env gnuplot + +set terminal png noenhanced size 800, 1400 +set output input_noext . ".tmp.png" +set multiplot layout 5,1 title "\nC++ Heap vs BST vs Hash map insert time" font ",22" +set xlabel "container size" +set ylabel "insert time (ns)" +set title font ",16" + +set title "Heap (std::priority_queue)" +plot input_noext . ".dat" using 1:2 notitle + +set title "Heap (zoom)" +set yrange [0:heap_zoom_max] +plot input_noext . ".dat" using 1:2 notitle + +set title "BST (std::set)" +set yrange [*:*] +plot input_noext . ".dat" using 1:3 notitle + +set title "Hash map (std::unordered_set)" +set yrange [*:*] +plot input_noext . ".dat" using 1:4 notitle + +set title "Hash map zoom" +set yrange [0:hashmap_zoom_max] +plot input_noext . 
".dat" using 1:4 notitle diff --git a/bst-vs-heap.gnuplot b/bst-vs-heap.gnuplot deleted file mode 100755 index 4f025c2..0000000 --- a/bst-vs-heap.gnuplot +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env gnuplot -set terminal png size 1024, 2048 -set output "bst-vs-heap.tmp.png" -set multiplot layout 5,1 title "Heap vs BST vs Hash map insert time" -set xlabel "size" -set ylabel "nanoseconds" - -set title "Heap" -plot "bst_vs_heap.dat" using 1:2 notitle - -set title "Heap (zoom)" -set yrange [0:25] -plot "bst_vs_heap.dat" using 1:2 notitle - -set title "BST" -set yrange [*:*] -plot "bst_vs_heap.dat" using 1:3 notitle - -#set title "Hash map" -#set yrange [*:*] -#plot "bst_vs_heap.dat" using 1:4 notitle -# -#set title "Hash map zoom" -#set yrange [0:350] -#plot "bst_vs_heap.dat" using 1:4 notitle diff --git a/path_properties.py b/path_properties.py index 059543d..9efb79e 100644 --- a/path_properties.py +++ b/path_properties.py @@ -474,6 +474,12 @@ path_properties_tuples = ( 'return2.c': {'exit_status': 2}, } ), + 'cpp': ( + {}, + { + 'bst_vs_heap_vs_hashmap.cpp': {'more_than_1s': True}, + }, + ), 'gcc': ( gnu_extension_properties, { diff --git a/userland/cpp/bst_vs_heap.cpp b/userland/cpp/bst_vs_heap.cpp deleted file mode 100644 index dca4028..0000000 --- a/userland/cpp/bst_vs_heap.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// https://github.com/cirosantilli/linux-kernel-module-cheat#bst-vs-heap - -#include -#include -#include -#include -#include - -#include - -int main(int argc, char **argv) { - typedef uint64_t I; - std::vector randoms; - size_t i, n; - std::priority_queue heap; - std::set bst; - unsigned int seed = std::random_device()(); - - // CLI arguments. - if (argc > 1) { - n = std::stoi(argv[1]); - } else { - n = 1; - } - - // Action. - for (i = 0; i < n; ++i) { - randoms.push_back(i); - } - std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed)); - for (i = 0; i < n; ++i) { - auto random = randoms[i]; - - // Heap. 
- LKMC_M5OPS_RESETSTATS; - heap.emplace(random); - LKMC_M5OPS_DUMPSTATS; - - // BST. - LKMC_M5OPS_RESETSTATS; - bst.insert(random); - LKMC_M5OPS_DUMPSTATS; - } -} diff --git a/userland/cpp/bst_vs_heap_vs_hashmap.cpp b/userland/cpp/bst_vs_heap_vs_hashmap.cpp new file mode 100644 index 0000000..4fe3e3d --- /dev/null +++ b/userland/cpp/bst_vs_heap_vs_hashmap.cpp @@ -0,0 +1,154 @@ +// https://github.com/cirosantilli/linux-kernel-module-cheat#bst-vs-heap-vs-hashmap + +//#include +//#include +//#include +//#include +//#include +// +//#include +// +//int main(int argc, char **argv) { +// typedef uint64_t I; +// std::vector randoms; +// size_t i, n; +// std::priority_queue heap; +// std::set bst; +// unsigned int seed = std::random_device()(); +// +// // CLI arguments. +// if (argc > 1) { +// n = std::stoi(argv[1]); +// } else { +// n = 1; +// } +// +// // Action. +// for (i = 0; i < n; ++i) { +// randoms.push_back(i); +// } +// std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed)); +// for (i = 0; i < n; ++i) { +// auto random = randoms[i]; +// +// // Heap. +// LKMC_M5OPS_RESETSTATS; +// heap.emplace(random); +// LKMC_M5OPS_DUMPSTATS; +// +// // BST. +// LKMC_M5OPS_RESETSTATS; +// bst.insert(random); +// LKMC_M5OPS_DUMPSTATS; +// } +//} + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char **argv) { + typedef uint64_t I; + std::vector randoms; + size_t i, j, n, granule, base; + std::priority_queue heap; + std::set bst; + std::unordered_set hashmap; + unsigned int seed = std::random_device()(); + + // CLI arguments. + if (argc > 1) { + n = std::stoi(argv[1]); + } else { + n = 10000000; + } +#ifdef LKMC_M5OPS_ENABLE + // Let's comment useless stuff out to speed up gem5 simulations. + granule = 1; + j = 0; +#else + if (argc > 2) { + granule = std::stoi(argv[2]); + } else { + granule = 10000; + } +#endif + + // Action. 
+ for (i = 0; i < n; ++i) { + randoms.push_back(i); + } + std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed)); + for (i = 0; i < n / granule; ++i) { +#ifndef LKMC_M5OPS_ENABLE + using clk = std::chrono::high_resolution_clock; + decltype(clk::now()) start, end; +#endif + base = i * granule; + + // Heap. +#ifndef LKMC_M5OPS_ENABLE + start = clk::now(); + for (j = 0; j < granule; ++j) { +#endif + LKMC_M5OPS_RESETSTATS; + heap.emplace(randoms[base + j]); + LKMC_M5OPS_DUMPSTATS; +#ifndef LKMC_M5OPS_ENABLE + } + end = clk::now(); + auto dt_heap = (end - start) / granule; +#endif + + // BST. +#ifndef LKMC_M5OPS_ENABLE + start = clk::now(); + for (j = 0; j < granule; ++j) { +#endif + LKMC_M5OPS_RESETSTATS; + bst.insert(randoms[base + j]); + LKMC_M5OPS_DUMPSTATS; +#ifndef LKMC_M5OPS_ENABLE + } + end = clk::now(); + auto dt_bst = (end - start) / granule; +#endif + + // Hashmap. +#ifndef LKMC_M5OPS_ENABLE + start = clk::now(); + for (j = 0; j < granule; ++j) { +#endif + LKMC_M5OPS_RESETSTATS; + hashmap.insert(randoms[base + j]); + LKMC_M5OPS_DUMPSTATS; +#ifndef LKMC_M5OPS_ENABLE + } + end = clk::now(); + auto dt_hashmap = (end - start) / granule; +#endif + +#ifndef LKMC_M5OPS_ENABLE + // Output. + std::cout + << base << " " + << std::chrono::duration_cast(dt_heap).count() << " " + << std::chrono::duration_cast(dt_bst).count() << " " + << std::chrono::duration_cast(dt_hashmap).count() << std::endl + ; +#endif + } + + // Sanity check. + for (auto it = bst.rbegin(); it != bst.rend(); ++it) { + assert(*it == heap.top()); + heap.pop(); + } +}