mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
bst vs heap: move in fully from cpp-cheat
This commit is contained in:
87
README.adoc
87
README.adoc
@@ -975,7 +975,7 @@ This setup:
|
|||||||
+
|
+
|
||||||
--
|
--
|
||||||
** can run most examples, including those for other CPU architectures, with the notable exception of examples that rely on kernel modules
|
** can run most examples, including those for other CPU architectures, with the notable exception of examples that rely on kernel modules
|
||||||
** can run reproducible approximate performance experiments with gem5, see e.g. <<bst-vs-heap>>
|
** can run reproducible approximate performance experiments with gem5, see e.g. <<bst-vs-heap-vs-hashmap>>
|
||||||
--
|
--
|
||||||
* from full system simulation as shown at: <<qemu-buildroot-setup-getting-started>>.
|
* from full system simulation as shown at: <<qemu-buildroot-setup-getting-started>>.
|
||||||
+
|
+
|
||||||
@@ -9961,7 +9961,7 @@ Now you can play a fun little game with your friends:
|
|||||||
* make a program that solves the computation problem, and outputs output to stdout
|
* make a program that solves the computation problem, and outputs output to stdout
|
||||||
* write the code that runs the correct computation in the smallest number of cycles possible
|
* write the code that runs the correct computation in the smallest number of cycles possible
|
||||||
|
|
||||||
To find out why your program is slow, a good first step is to have a look at <<stats-txt>> file.
|
To find out why your program is slow, a good first step is to have a look at the <<gem5-stats-txt>> file.
|
||||||
|
|
||||||
==== Skip extra benchmark instructions
|
==== Skip extra benchmark instructions
|
||||||
|
|
||||||
@@ -10210,36 +10210,79 @@ Buildroot built-in libraries, mostly under Libraries > Other:
|
|||||||
|
|
||||||
These are not yet enabled, but it should be easy to do so, see: <<add-new-buildroot-packages>>
|
These are not yet enabled, but it should be easy to do so, see: <<add-new-buildroot-packages>>
|
||||||
|
|
||||||
===== BST vs heap
|
===== BST vs heap vs hashmap
|
||||||
|
|
||||||
https://stackoverflow.com/questions/6147242/heap-vs-binary-search-tree-bst/29548834#29548834
|
The following benchmark setup works both:
|
||||||
|
|
||||||
First we build it with <<m5ops-instructions>> enabled, and then we extract the stats:
|
* on host through timers + link:https://stackoverflow.com/questions/51952471/why-do-i-get-a-constant-instead-of-logarithmic-curve-for-an-insert-time-benchmar/51953081#51953081[granule]
|
||||||
|
* on gem5 with <<m5ops-instructions,dumpstats>>, which can get more precise results with `granule == 1`
|
||||||
|
|
||||||
|
It has been used to answer:
|
||||||
|
|
||||||
|
* BST vs heap: https://stackoverflow.com/questions/6147242/heap-vs-binary-search-tree-bst/29548834#29548834
|
||||||
|
* `std::set`: https://stackoverflow.com/questions/2558153/what-is-the-underlying-data-structure-of-a-stl-set-in-c/51944661#51944661
|
||||||
|
* `std::map`: https://stackoverflow.com/questions/18414579/what-data-structure-is-inside-stdmap-in-c/51945119#51945119
|
||||||
|
|
||||||
|
To benchmark on the host, we do:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-userland-in-tree --force-rebuild --optimization-level 3 ./userland/cpp/bst_vs_heap_vs_hashmap.cpp
|
||||||
|
./userland/cpp/bst_vs_heap_vs_hashmap.out | tee bst_vs_heap_vs_hashmap.dat
|
||||||
|
gnuplot \
|
||||||
|
-e 'input_noext="bst_vs_heap_vs_hashmap"' \
|
||||||
|
-e 'heap_zoom_max=50' \
|
||||||
|
-e 'hashmap_zoom_max=400' \
|
||||||
|
./bst-vs-heap-vs-hashmap.gnuplot \
|
||||||
|
;
|
||||||
|
xdg-open bst_vs_heap_vs_hashmap.tmp.png
|
||||||
|
....
|
||||||
|
|
||||||
|
The parameters `heap_zoom_max` and `hashmap_zoom_max` are chosen manually and interactively to best showcase the regions of interest in those plots.
|
||||||
|
|
||||||
|
To benchmark on gem5, we first build the benchmark with <<m5ops-instructions>> enabled, and then we run it and extract the stats:
|
||||||
|
|
||||||
....
|
....
|
||||||
./build-userland \
|
./build-userland \
|
||||||
--arch aarch64 \
|
--arch x86_64 \
|
||||||
--ccflags='-DLKMC_M5OPS_ENABLE=1' \
|
--ccflags='-DLKMC_M5OPS_ENABLE=1' \
|
||||||
--force-rebuild cpp/bst_vs_heap \
|
--force-rebuild userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
||||||
--static \
|
--static \
|
||||||
|
--optimization-level 3 \
|
||||||
;
|
;
|
||||||
./run \
|
./run \
|
||||||
--arch aarch64 \
|
--arch x86_64 \
|
||||||
--emulator gem5 \
|
--emulator gem5 \
|
||||||
--static \
|
--static \
|
||||||
--userland userland/cpp/bst_vs_heap.cpp \
|
--userland userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
||||||
--userland-args='1000' \
|
--userland-args='100000' \
|
||||||
|
-- \
|
||||||
|
--cpu-type=DerivO3CPU \
|
||||||
|
--caches \
|
||||||
|
--l2cache \
|
||||||
|
--l1d_size=32kB \
|
||||||
|
--l1i_size=32kB \
|
||||||
|
--l2_size=256kB \
|
||||||
|
--l3_size=20MB \
|
||||||
;
|
;
|
||||||
./bst-vs-heap --arch aarch64 > bst_vs_heap.dat
|
./bst-vs-heap-vs-hashmap-gem5-stats --arch x86_64 | tee bst_vs_heap_vs_hashmap_gem5.dat
|
||||||
./bst-vs-heap.gnuplot
|
gnuplot \
|
||||||
xdg-open bst-vs-heap.tmp.png
|
-e 'input_noext="bst_vs_heap_vs_hashmap_gem5"' \
|
||||||
|
-e 'heap_zoom_max=500' \
|
||||||
|
-e 'hashmap_zoom_max=400' \
|
||||||
|
./bst-vs-heap-vs-hashmap.gnuplot \
|
||||||
|
;
|
||||||
|
xdg-open bst_vs_heap_vs_hashmap_gem5.tmp.png
|
||||||
....
|
....
|
||||||
|
|
||||||
|
The cache sizes were chosen to match the host <<p51>> to improve the comparison. Ideally we should also use the same standard library.
|
||||||
|
|
||||||
|
Note that this will take a long time, and will produce a humongous ~40Gb stats file due to: <<gem5-only-dump-selected-stats>>
|
||||||
|
|
||||||
Sources:
|
Sources:
|
||||||
|
|
||||||
* link:userland/cpp/bst_vs_heap.cpp[]
|
* link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[]
|
||||||
* link:bst-vs-heap[]
|
* link:bst-vs-heap-vs-hashmap-gem5-stats[]
|
||||||
* link:bst-vs-heap.gnuplot[]
|
* link:bst-vs-heap-vs-hashmap.gnuplot[]
|
||||||
|
|
||||||
===== BLAS
|
===== BLAS
|
||||||
|
|
||||||
@@ -11110,7 +11153,7 @@ Contains UART output, both from the Linux kernel or from the baremetal system.
|
|||||||
|
|
||||||
Can also be seen live on <<m5term>>.
|
Can also be seen live on <<m5term>>.
|
||||||
|
|
||||||
==== stats.txt
|
==== gem5 stats.txt
|
||||||
|
|
||||||
This file contains important statistics about the run:
|
This file contains important statistics about the run:
|
||||||
|
|
||||||
@@ -11136,6 +11179,14 @@ system.cpu.dtb.inst_hits
|
|||||||
|
|
||||||
For x86, it is interesting to try and correlate `numCycles` with:
|
For x86, it is interesting to try and correlate `numCycles` with:
|
||||||
|
|
||||||
|
===== gem5 only dump selected stats
|
||||||
|
|
||||||
|
TODO
|
||||||
|
|
||||||
|
https://stackoverflow.com/questions/52014953/how-to-dump-only-a-single-or-certain-selected-stats-in-gem5
|
||||||
|
|
||||||
|
To prevent the stats file from becoming humongous.
|
||||||
|
|
||||||
==== config.ini
|
==== config.ini
|
||||||
|
|
||||||
The `config.ini` file contains a very good high level description of the system:
|
The `config.ini` file contains a very good high level description of the system:
|
||||||
@@ -12974,7 +13025,7 @@ RDTSC stores its output to EDX:EAX, even in 64-bit mode, top bits are zeroed out
|
|||||||
|
|
||||||
TODO: review this section, make a more controlled userland experiment with <<m5ops>> instrumentation.
|
TODO: review this section, make a more controlled userland experiment with <<m5ops>> instrumentation.
|
||||||
|
|
||||||
Let's have some fun and try to correlate the gem5 <<stats-txt>> `system.cpu.numCycles` cycle count with the link:https://en.wikipedia.org/wiki/Time_Stamp_Counter[x86 RDTSC instruction] that is supposed to do the same thing:
|
Let's have some fun and try to correlate the gem5 <<gem5-stats-txt>> `system.cpu.numCycles` cycle count with the link:https://en.wikipedia.org/wiki/Time_Stamp_Counter[x86 RDTSC instruction] that is supposed to do the same thing:
|
||||||
|
|
||||||
....
|
....
|
||||||
./build-userland --static userland/arch/x86_64/inline_asm/rdtsc.S
|
./build-userland --static userland/arch/x86_64/inline_asm/rdtsc.S
|
||||||
|
|||||||
@@ -18,13 +18,19 @@ Convert a BST vs heap stat file into a gnuplot input
|
|||||||
stats = self.get_stats()
|
stats = self.get_stats()
|
||||||
it = iter(stats)
|
it = iter(stats)
|
||||||
i = 1
|
i = 1
|
||||||
for stat in it:
|
for heap_num_cycles in it:
|
||||||
try:
|
try:
|
||||||
next_stat = next(it)
|
bst_num_cycles = next(it)
|
||||||
|
hashmap_num_cycles = next(it)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
# Automatic dumpstats at end may lead to odd number of stats.
|
# Automatic dumpstats at end may lead to one extra stat at the end.
|
||||||
break
|
break
|
||||||
print('{} {} {}'.format(i, stat, next_stat))
|
print('{} {} {} {}'.format(
|
||||||
|
i,
|
||||||
|
heap_num_cycles,
|
||||||
|
bst_num_cycles,
|
||||||
|
hashmap_num_cycles,
|
||||||
|
))
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
27
bst-vs-heap-vs-hashmap.gnuplot
Executable file
27
bst-vs-heap-vs-hashmap.gnuplot
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env gnuplot
|
||||||
|
|
||||||
|
set terminal png noenhanced size 800, 1400
|
||||||
|
set output input_noext . ".tmp.png"
|
||||||
|
set multiplot layout 5,1 title "\nC++ Heap vs BST vs Hash map insert time" font ",22"
|
||||||
|
set xlabel "container size"
|
||||||
|
set ylabel "insert time (ns)"
|
||||||
|
set title font ",16"
|
||||||
|
|
||||||
|
set title "Heap (std::priority_queue)"
|
||||||
|
plot input_noext . ".dat" using 1:2 notitle
|
||||||
|
|
||||||
|
set title "Heap (zoom)"
|
||||||
|
set yrange [0:heap_zoom_max]
|
||||||
|
plot input_noext . ".dat" using 1:2 notitle
|
||||||
|
|
||||||
|
set title "BST (std::set)"
|
||||||
|
set yrange [*:*]
|
||||||
|
plot input_noext . ".dat" using 1:3 notitle
|
||||||
|
|
||||||
|
set title "Hash map (std::unordered_set)"
|
||||||
|
set yrange [*:*]
|
||||||
|
plot input_noext . ".dat" using 1:4 notitle
|
||||||
|
|
||||||
|
set title "Hash map zoom"
|
||||||
|
set yrange [0:hashmap_zoom_max]
|
||||||
|
plot input_noext . ".dat" using 1:4 notitle
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
#!/usr/bin/env gnuplot
|
|
||||||
set terminal png size 1024, 2048
|
|
||||||
set output "bst-vs-heap.tmp.png"
|
|
||||||
set multiplot layout 5,1 title "Heap vs BST vs Hash map insert time"
|
|
||||||
set xlabel "size"
|
|
||||||
set ylabel "nanoseconds"
|
|
||||||
|
|
||||||
set title "Heap"
|
|
||||||
plot "bst_vs_heap.dat" using 1:2 notitle
|
|
||||||
|
|
||||||
set title "Heap (zoom)"
|
|
||||||
set yrange [0:25]
|
|
||||||
plot "bst_vs_heap.dat" using 1:2 notitle
|
|
||||||
|
|
||||||
set title "BST"
|
|
||||||
set yrange [*:*]
|
|
||||||
plot "bst_vs_heap.dat" using 1:3 notitle
|
|
||||||
|
|
||||||
#set title "Hash map"
|
|
||||||
#set yrange [*:*]
|
|
||||||
#plot "bst_vs_heap.dat" using 1:4 notitle
|
|
||||||
#
|
|
||||||
#set title "Hash map zoom"
|
|
||||||
#set yrange [0:350]
|
|
||||||
#plot "bst_vs_heap.dat" using 1:4 notitle
|
|
||||||
@@ -474,6 +474,12 @@ path_properties_tuples = (
|
|||||||
'return2.c': {'exit_status': 2},
|
'return2.c': {'exit_status': 2},
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
|
'cpp': (
|
||||||
|
{},
|
||||||
|
{
|
||||||
|
'bst_vs_heap_vs_hashmap.cpp': {'more_than_1s': True},
|
||||||
|
},
|
||||||
|
),
|
||||||
'gcc': (
|
'gcc': (
|
||||||
gnu_extension_properties,
|
gnu_extension_properties,
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,44 +0,0 @@
|
|||||||
// https://github.com/cirosantilli/linux-kernel-module-cheat#bst-vs-heap
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <queue>
|
|
||||||
#include <random>
|
|
||||||
#include <set>
|
|
||||||
|
|
||||||
#include <lkmc/m5ops.h>
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
typedef uint64_t I;
|
|
||||||
std::vector<I> randoms;
|
|
||||||
size_t i, n;
|
|
||||||
std::priority_queue<I> heap;
|
|
||||||
std::set<I> bst;
|
|
||||||
unsigned int seed = std::random_device()();
|
|
||||||
|
|
||||||
// CLI arguments.
|
|
||||||
if (argc > 1) {
|
|
||||||
n = std::stoi(argv[1]);
|
|
||||||
} else {
|
|
||||||
n = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Action.
|
|
||||||
for (i = 0; i < n; ++i) {
|
|
||||||
randoms.push_back(i);
|
|
||||||
}
|
|
||||||
std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed));
|
|
||||||
for (i = 0; i < n; ++i) {
|
|
||||||
auto random = randoms[i];
|
|
||||||
|
|
||||||
// Heap.
|
|
||||||
LKMC_M5OPS_RESETSTATS;
|
|
||||||
heap.emplace(random);
|
|
||||||
LKMC_M5OPS_DUMPSTATS;
|
|
||||||
|
|
||||||
// BST.
|
|
||||||
LKMC_M5OPS_RESETSTATS;
|
|
||||||
bst.insert(random);
|
|
||||||
LKMC_M5OPS_DUMPSTATS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
154
userland/cpp/bst_vs_heap_vs_hashmap.cpp
Normal file
154
userland/cpp/bst_vs_heap_vs_hashmap.cpp
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
// https://github.com/cirosantilli/linux-kernel-module-cheat#bst-vs-heap-vs-hashmap
|
||||||
|
|
||||||
|
//#include <algorithm>
|
||||||
|
//#include <iostream>
|
||||||
|
//#include <queue>
|
||||||
|
//#include <random>
|
||||||
|
//#include <set>
|
||||||
|
//
|
||||||
|
//#include <lkmc/m5ops.h>
|
||||||
|
//
|
||||||
|
//int main(int argc, char **argv) {
|
||||||
|
// typedef uint64_t I;
|
||||||
|
// std::vector<I> randoms;
|
||||||
|
// size_t i, n;
|
||||||
|
// std::priority_queue<I> heap;
|
||||||
|
// std::set<I> bst;
|
||||||
|
// unsigned int seed = std::random_device()();
|
||||||
|
//
|
||||||
|
// // CLI arguments.
|
||||||
|
// if (argc > 1) {
|
||||||
|
// n = std::stoi(argv[1]);
|
||||||
|
// } else {
|
||||||
|
// n = 1;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // Action.
|
||||||
|
// for (i = 0; i < n; ++i) {
|
||||||
|
// randoms.push_back(i);
|
||||||
|
// }
|
||||||
|
// std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed));
|
||||||
|
// for (i = 0; i < n; ++i) {
|
||||||
|
// auto random = randoms[i];
|
||||||
|
//
|
||||||
|
// // Heap.
|
||||||
|
// LKMC_M5OPS_RESETSTATS;
|
||||||
|
// heap.emplace(random);
|
||||||
|
// LKMC_M5OPS_DUMPSTATS;
|
||||||
|
//
|
||||||
|
// // BST.
|
||||||
|
// LKMC_M5OPS_RESETSTATS;
|
||||||
|
// bst.insert(random);
|
||||||
|
// LKMC_M5OPS_DUMPSTATS;
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
|
#include <chrono>
|
||||||
|
#include <iostream>
|
||||||
|
#include <queue>
|
||||||
|
#include <random>
|
||||||
|
#include <set>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
#include <lkmc/m5ops.h>
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
typedef uint64_t I;
|
||||||
|
std::vector<I> randoms;
|
||||||
|
size_t i, j, n, granule, base;
|
||||||
|
std::priority_queue<I> heap;
|
||||||
|
std::set<I> bst;
|
||||||
|
std::unordered_set<I> hashmap;
|
||||||
|
unsigned int seed = std::random_device()();
|
||||||
|
|
||||||
|
// CLI arguments.
|
||||||
|
if (argc > 1) {
|
||||||
|
n = std::stoi(argv[1]);
|
||||||
|
} else {
|
||||||
|
n = 10000000;
|
||||||
|
}
|
||||||
|
#ifdef LKMC_M5OPS_ENABLE
|
||||||
|
// Let's comment useless stuff out to speed up gem5 simulations.
|
||||||
|
granule = 1;
|
||||||
|
j = 0;
|
||||||
|
#else
|
||||||
|
if (argc > 2) {
|
||||||
|
granule = std::stoi(argv[2]);
|
||||||
|
} else {
|
||||||
|
granule = 10000;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Action.
|
||||||
|
for (i = 0; i < n; ++i) {
|
||||||
|
randoms.push_back(i);
|
||||||
|
}
|
||||||
|
std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed));
|
||||||
|
for (i = 0; i < n / granule; ++i) {
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
using clk = std::chrono::high_resolution_clock;
|
||||||
|
decltype(clk::now()) start, end;
|
||||||
|
#endif
|
||||||
|
base = i * granule;
|
||||||
|
|
||||||
|
// Heap.
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
start = clk::now();
|
||||||
|
for (j = 0; j < granule; ++j) {
|
||||||
|
#endif
|
||||||
|
LKMC_M5OPS_RESETSTATS;
|
||||||
|
heap.emplace(randoms[base + j]);
|
||||||
|
LKMC_M5OPS_DUMPSTATS;
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
}
|
||||||
|
end = clk::now();
|
||||||
|
auto dt_heap = (end - start) / granule;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// BST.
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
start = clk::now();
|
||||||
|
for (j = 0; j < granule; ++j) {
|
||||||
|
#endif
|
||||||
|
LKMC_M5OPS_RESETSTATS;
|
||||||
|
bst.insert(randoms[base + j]);
|
||||||
|
LKMC_M5OPS_DUMPSTATS;
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
}
|
||||||
|
end = clk::now();
|
||||||
|
auto dt_bst = (end - start) / granule;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Hashmap.
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
start = clk::now();
|
||||||
|
for (j = 0; j < granule; ++j) {
|
||||||
|
#endif
|
||||||
|
LKMC_M5OPS_RESETSTATS;
|
||||||
|
hashmap.insert(randoms[base + j]);
|
||||||
|
LKMC_M5OPS_DUMPSTATS;
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
}
|
||||||
|
end = clk::now();
|
||||||
|
auto dt_hashmap = (end - start) / granule;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
// Output.
|
||||||
|
std::cout
|
||||||
|
<< base << " "
|
||||||
|
<< std::chrono::duration_cast<std::chrono::nanoseconds>(dt_heap).count() << " "
|
||||||
|
<< std::chrono::duration_cast<std::chrono::nanoseconds>(dt_bst).count() << " "
|
||||||
|
<< std::chrono::duration_cast<std::chrono::nanoseconds>(dt_hashmap).count() << std::endl
|
||||||
|
;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sanity check.
|
||||||
|
for (auto it = bst.rbegin(); it != bst.rend(); ++it) {
|
||||||
|
assert(*it == heap.top());
|
||||||
|
heap.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user