mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
start moving algorithm in
This commit is contained in:
900
README.adoc
900
README.adoc
@@ -1007,7 +1007,7 @@ Notable userland content included / moving into this repository includes:
|
|||||||
* <<c>>
|
* <<c>>
|
||||||
* <<cpp>>
|
* <<cpp>>
|
||||||
* <<posix>>
|
* <<posix>>
|
||||||
* https://github.com/cirosantilli/algorithm-cheat TODO will be good to move here for performance analysis <<gem5-run-benchmark,with gem5>>
|
* <<algorithms>>
|
||||||
|
|
||||||
==== Userland setup getting started
|
==== Userland setup getting started
|
||||||
|
|
||||||
@@ -3930,6 +3930,46 @@ The workaround:
|
|||||||
|
|
||||||
fixes some of the problems, but not all, so we are just skipping those tests for now.
|
fixes some of the problems, but not all, so we are just skipping those tests for now.
|
||||||
|
|
||||||
|
=== syscall emulation mode program stdin
|
||||||
|
|
||||||
|
The following work on both QEMU and gem5 as of LKMC 99d6bc6bc19d4c7f62b172643be95d9c43c26145 + 1. Interactive input:
|
||||||
|
|
||||||
|
....
|
||||||
|
./run --userland userland/c/getchar.c
|
||||||
|
....
|
||||||
|
|
||||||
|
Source: link:userland/c/getchar.c[]
|
||||||
|
|
||||||
|
A line of the following type should show:
|
||||||
|
|
||||||
|
....
|
||||||
|
enter a character:
|
||||||
|
....
|
||||||
|
|
||||||
|
and after pressing say `a` and Enter, we get:
|
||||||
|
|
||||||
|
....
|
||||||
|
you entered: a
|
||||||
|
....
|
||||||
|
|
||||||
|
Note however that due to <<qemu-user-mode-does-not-show-stdout-immediately>> we don't really see the initial `enter a character` line.
|
||||||
|
|
||||||
|
Non-interactive input from a file by forwarding the emulator's stdin implicitly through our Python scripts:
|
||||||
|
|
||||||
|
....
|
||||||
|
printf a > f.tmp
|
||||||
|
./run --userland userland/c/getchar.c < f.tmp
|
||||||
|
....
|
||||||
|
|
||||||
|
Input from a file by explicitly requesting our scripts to use it via the Python API:
|
||||||
|
|
||||||
|
....
|
||||||
|
printf a > f.tmp
|
||||||
|
./run --emulator gem5 --userland userland/c/getchar.c --static --stdin-file f.tmp
|
||||||
|
....
|
||||||
|
|
||||||
|
This is especially useful when running tests that require stdin input.
|
||||||
|
|
||||||
=== gem5 syscall emulation mode
|
=== gem5 syscall emulation mode
|
||||||
|
|
||||||
Less robust than QEMU's, but still usable:
|
Less robust than QEMU's, but still usable:
|
||||||
@@ -4014,25 +4054,6 @@ which we parse in link:run[] and then exit with the correct result ourselves...
|
|||||||
|
|
||||||
Related thread: https://stackoverflow.com/questions/56032347/is-there-a-way-to-identify-if-gem5-run-got-over-successfully
|
Related thread: https://stackoverflow.com/questions/56032347/is-there-a-way-to-identify-if-gem5-run-got-over-successfully
|
||||||
|
|
||||||
==== gem5 syscall emulation mode program stdin
|
|
||||||
|
|
||||||
gem5 shows its own stdout to terminal, and does not allow you to type stdin to programs.
|
|
||||||
|
|
||||||
Instead, you must pass stdin non-interactively through a file with the `--se.py --input` option, e.g.:
|
|
||||||
|
|
||||||
....
|
|
||||||
printf a > f
|
|
||||||
./run --emulator gem5 --userland userland/c/getchar.c --static -- --input f
|
|
||||||
....
|
|
||||||
|
|
||||||
leads to gem5 output:
|
|
||||||
|
|
||||||
....
|
|
||||||
enter a character: you entered: a
|
|
||||||
....
|
|
||||||
|
|
||||||
Source: link:userland/c/getchar.c[]
|
|
||||||
|
|
||||||
==== gem5 syscall emulation mode syscall tracing
|
==== gem5 syscall emulation mode syscall tracing
|
||||||
|
|
||||||
Since gem5 has to implement syscalls itself in syscall emulation mode, it can of course clearly see which syscalls are being made, and we can log them for debug purposes with <<gem5-tracing>>, e.g.:
|
Since gem5 has to implement syscalls itself in syscall emulation mode, it can of course clearly see which syscalls are being made, and we can log them for debug purposes with <<gem5-tracing>>, e.g.:
|
||||||
@@ -10645,6 +10666,11 @@ Now you can play a fun little game with your friends:
|
|||||||
* make a program that solves the computation problem, and outputs output to stdout
|
* make a program that solves the computation problem, and outputs output to stdout
|
||||||
* write the code that runs the correct computation in the smallest number of cycles possible
|
* write the code that runs the correct computation in the smallest number of cycles possible
|
||||||
|
|
||||||
|
Interesting algorithms and benchmarks for this game are being collected at:
|
||||||
|
|
||||||
|
* <<algorithms>>
|
||||||
|
* <<benchmarks>>
|
||||||
|
|
||||||
To find out why your program is slow, a good first step is to have a look at the <<gem5-m5out-stats-txt-file>>.
|
To find out why your program is slow, a good first step is to have a look at the <<gem5-m5out-stats-txt-file>>.
|
||||||
|
|
||||||
==== Skip extra benchmark instructions
|
==== Skip extra benchmark instructions
|
||||||
@@ -11028,386 +11054,6 @@ TODO: why doesn't this exist:
|
|||||||
ls /sys/devices/system/cpu/cpu0/cpufreq
|
ls /sys/devices/system/cpu/cpu0/cpufreq
|
||||||
....
|
....
|
||||||
|
|
||||||
==== Interesting benchmarks
|
|
||||||
|
|
||||||
Buildroot built-in libraries, mostly under Libraries > Other:
|
|
||||||
|
|
||||||
* Armadillo `C++`: linear algebra
|
|
||||||
* fftw: Fourier transform
|
|
||||||
* Flann
|
|
||||||
* GSL: various
|
|
||||||
* liblinear
|
|
||||||
* libspacialindex
|
|
||||||
* libtommath
|
|
||||||
* qhull
|
|
||||||
|
|
||||||
Open source but not in Buildroot:
|
|
||||||
|
|
||||||
* https://github.com/kozyraki/stamp transactional memory benchmarks
|
|
||||||
|
|
||||||
These are not yet enabled, but it should be easy to do so, see: xref:add-new-buildroot-packages[xrefstyle=full]
|
|
||||||
|
|
||||||
===== Dhrystone
|
|
||||||
|
|
||||||
https://en.wikipedia.org/wiki/Dhrystone
|
|
||||||
|
|
||||||
Created in the 80's, it is not a representative measure of performance in modern computers anymore. It has mostly been replaced by https://en.wikipedia.org/wiki/SPECint[SPEC], which is... closed source! Unbelievable.
|
|
||||||
|
|
||||||
<<buildroot>> has a `dhrystone` package, but because it is so interesting to us, we decided to also build it ourselves, which allows things like static and baremetal compilation more easily.
|
|
||||||
|
|
||||||
Build and run on QEMU <<user-mode-simulation>>:
|
|
||||||
|
|
||||||
....
|
|
||||||
git submodule update --init submodules/dhrystone
|
|
||||||
./build-dhrystone --mode userland
|
|
||||||
./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
|
||||||
....
|
|
||||||
|
|
||||||
Build and run on gem5 user mode:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-dhrystone --mode userland --static --force-rebuild
|
|
||||||
./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
|
||||||
....
|
|
||||||
|
|
||||||
TODO automate run more nicely.
|
|
||||||
|
|
||||||
Build for <<baremetal>> execution and run it in baremetal QEMU:
|
|
||||||
|
|
||||||
....
|
|
||||||
# Build our Newlib stubs.
|
|
||||||
./build-baremetal --arch aarch64
|
|
||||||
./build-dhrystone --arch aarch64 --mode baremetal
|
|
||||||
./run --arch aarch64 --baremetal "$(./getvar baremetal_build_dir)/submodules/dhrystone/dhrystone"
|
|
||||||
....
|
|
||||||
|
|
||||||
TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
|
|
||||||
|
|
||||||
If you really want the Buildroot package for some reason, build it with:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-buildroot --config 'BR2_PACKAGE_DHRYSTONE=y'
|
|
||||||
....
|
|
||||||
|
|
||||||
and run inside the guest from `PATH` with:
|
|
||||||
|
|
||||||
....
|
|
||||||
dhrystone
|
|
||||||
....
|
|
||||||
|
|
||||||
===== BST vs heap vs hashmap
|
|
||||||
|
|
||||||
TODO: move benchmark graph from link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[] to link:userland/algorithm/set[].
|
|
||||||
|
|
||||||
The following benchmark setup works both:
|
|
||||||
|
|
||||||
* on host through timers + https://stackoverflow.com/questions/51952471/why-do-i-get-a-constant-instead-of-logarithmic-curve-for-an-insert-time-benchmar/51953081#51953081[granule]
|
|
||||||
* gem5 with <<m5ops-instructions,dumpstats>>, which can get more precise results with `granule == 1`
|
|
||||||
|
|
||||||
It has been used to answer:
|
|
||||||
|
|
||||||
* BST vs heap: https://stackoverflow.com/questions/6147243/heap-vs-binary-search-tree-bst/29548834#29548834
|
|
||||||
* `std::set`: https://stackoverflow.com/questions/2558153/what-is-the-underlying-data-structure-of-a-stl-set-in-c/51944661#51944661
|
|
||||||
* `std::map`: https://stackoverflow.com/questions/18414579/what-data-structure-is-inside-stdmap-in-c/51945119#51945119
|
|
||||||
|
|
||||||
To benchmark on the host, we do:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-userland-in-tree \
|
|
||||||
--force-rebuild \
|
|
||||||
--optimization-level 3 \
|
|
||||||
./userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
|
||||||
;
|
|
||||||
./userland/cpp/bst_vs_heap_vs_hashmap.out 10000000 10000 0 | tee bst_vs_heap_vs_hashmap.dat
|
|
||||||
gnuplot \
|
|
||||||
-e 'input_noext="bst_vs_heap_vs_hashmap"' \
|
|
||||||
-e 'heap_zoom_max=50' \
|
|
||||||
-e 'hashmap_zoom_max=400' \
|
|
||||||
./bst-vs-heap-vs-hashmap.gnuplot \
|
|
||||||
;
|
|
||||||
xdg-open bst_vs_heap_vs_hashmap.tmp.png
|
|
||||||
....
|
|
||||||
|
|
||||||
The parameters `heap_zoom_max` and `hashmap_zoom_max` are chosen manually interactively to best showcase the regions of interest in those plots.
|
|
||||||
|
|
||||||
To benchmark on gem5, we first build the benchmark with <<m5ops-instructions>> enabled, and then we run it and extract the stats:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-userland \
|
|
||||||
--arch x86_64 \
|
|
||||||
--ccflags='-DLKMC_M5OPS_ENABLE=1' \
|
|
||||||
--force-rebuild userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
|
||||||
--static \
|
|
||||||
--optimization-level 3 \
|
|
||||||
;
|
|
||||||
./run \
|
|
||||||
--arch x86_64 \
|
|
||||||
--emulator gem5 \
|
|
||||||
--static \
|
|
||||||
--userland userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
|
||||||
--userland-args='100000 1 0' \
|
|
||||||
-- \
|
|
||||||
--cpu-type=DerivO3CPU \
|
|
||||||
--caches \
|
|
||||||
--l2cache \
|
|
||||||
--l1d_size=32kB \
|
|
||||||
--l1i_size=32kB \
|
|
||||||
--l2_size=256kB \
|
|
||||||
--l3_size=20MB \
|
|
||||||
;
|
|
||||||
./bst-vs-heap-vs-hashmap-gem5-stats --arch x86_64 | tee bst_vs_heap_vs_hashmap_gem5.dat
|
|
||||||
gnuplot \
|
|
||||||
-e 'input_noext="bst_vs_heap_vs_hashmap_gem5"' \
|
|
||||||
-e 'heap_zoom_max=500' \
|
|
||||||
-e 'hashmap_zoom_max=400' \
|
|
||||||
./bst-vs-heap-vs-hashmap.gnuplot \
|
|
||||||
;
|
|
||||||
xdg-open bst_vs_heap_vs_hashmap_gem5.tmp.png
|
|
||||||
....
|
|
||||||
|
|
||||||
TODO: the gem5 simulation blows up on a tcmalloc allocation somewhere near 25k elements as of 3fdd83c2c58327d9714fa2347c724b78d7c05e2b + 1, likely linked to the extreme inefficiency of the stats collection?
|
|
||||||
|
|
||||||
The cache sizes were chosen to match the host <<p51>> to improve the comparison. Ideally we should also use the same standard library.
|
|
||||||
|
|
||||||
Note that this will take a long time, and will produce a humongous ~40Gb stats file as explained at: xref:gem5-only-dump-selected-stats[xrefstyle=full]
|
|
||||||
|
|
||||||
Sources:
|
|
||||||
|
|
||||||
* link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[]
|
|
||||||
* link:bst-vs-heap-vs-hashmap-gem5-stats[]
|
|
||||||
* link:bst-vs-heap-vs-hashmap.gnuplot[]
|
|
||||||
|
|
||||||
===== BLAS
|
|
||||||
|
|
||||||
Buildroot supports it, which makes everything just trivial:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-buildroot --config 'BR2_PACKAGE_OPENBLAS=y'
|
|
||||||
./build-userland --package openblas -- userland/libs/openblas/hello.c
|
|
||||||
./run --eval-after './libs/openblas/hello.out; echo $?'
|
|
||||||
....
|
|
||||||
|
|
||||||
Outcome: the test passes:
|
|
||||||
|
|
||||||
....
|
|
||||||
0
|
|
||||||
....
|
|
||||||
|
|
||||||
Source: link:userland/libs/openblas/hello.c[]
|
|
||||||
|
|
||||||
The test performs a general matrix multiplication:
|
|
||||||
|
|
||||||
....
|
|
||||||
| 1.0 -3.0 | | 1.0 2.0 1.0 | | 0.5 0.5 0.5 | | 11.0 - 9.0 5.0 |
|
|
||||||
1 * | 2.0 4.0 | * | -3.0 4.0 -1.0 | + 2 * | 0.5 0.5 0.5 | = | - 9.0 21.0 -1.0 |
|
|
||||||
| 1.0 -1.0 | | 0.5 0.5 0.5 | | 5.0 - 1.0 3.0 |
|
|
||||||
....
|
|
||||||
|
|
||||||
This can be deduced from the Fortran interfaces at
|
|
||||||
|
|
||||||
....
|
|
||||||
less "$(./getvar buildroot_build_build_dir)"/openblas-*/reference/dgemmf.f
|
|
||||||
....
|
|
||||||
|
|
||||||
which we can map to our call as:
|
|
||||||
|
|
||||||
....
|
|
||||||
C := alpha*op( A )*op( B ) + beta*C,
|
|
||||||
SUBROUTINE DGEMMF( TRANA, TRANB, M,N,K, ALPHA,A,LDA,B,LDB,BETA,C,LDC)
|
|
||||||
cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans,3,3,2 ,1, A,3, B,3, 2 ,C,3 );
|
|
||||||
....
|
|
||||||
|
|
||||||
===== Eigen
|
|
||||||
|
|
||||||
Header only linear algebra library with a mainline Buildroot package:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-buildroot --config 'BR2_PACKAGE_EIGEN=y'
|
|
||||||
./build-userland --package eigen -- userland/libs/eigen/hello.cpp
|
|
||||||
....
|
|
||||||
|
|
||||||
Just create an array and print it:
|
|
||||||
|
|
||||||
....
|
|
||||||
./run --eval-after './libs/eigen/hello.out'
|
|
||||||
....
|
|
||||||
|
|
||||||
Output:
|
|
||||||
|
|
||||||
....
|
|
||||||
3 -1
|
|
||||||
2.5 1.5
|
|
||||||
....
|
|
||||||
|
|
||||||
Source: link:userland/libs/eigen/hello.cpp[]
|
|
||||||
|
|
||||||
This example just creates a matrix and prints it out.
|
|
||||||
|
|
||||||
Tested on: https://github.com/cirosantilli/linux-kernel-module-cheat/commit/a4bdcf102c068762bb1ef26c591fcf71e5907525[a4bdcf102c068762bb1ef26c591fcf71e5907525]
|
|
||||||
|
|
||||||
===== PARSEC benchmark
|
|
||||||
|
|
||||||
We have ported parts of the http://parsec.cs.princeton.edu[PARSEC benchmark] for cross compilation at: https://github.com/cirosantilli/parsec-benchmark See the documentation on that repo to find out which benchmarks have been ported. Some of the benchmarks are segfaulting; they are documented in that repo.
|
|
||||||
|
|
||||||
There are two ways to run PARSEC with this repo:
|
|
||||||
|
|
||||||
* <<parsec-benchmark-without-parsecmgmt,without `parsecmgmt`>>, most likely what you want
|
|
||||||
* <<parsec-benchmark-with-parsecmgmt,with `parsecmgmt`>>
|
|
||||||
|
|
||||||
====== PARSEC benchmark without parsecmgmt
|
|
||||||
|
|
||||||
....
|
|
||||||
./build --arch arm --download-dependencies gem5-buildroot parsec-benchmark
|
|
||||||
./build-buildroot --arch arm --config 'BR2_PACKAGE_PARSEC_BENCHMARK=y'
|
|
||||||
./run --arch arm --emulator gem5
|
|
||||||
....
|
|
||||||
|
|
||||||
Once inside the guest, launch one of the `test` input sized benchmarks manually as in:
|
|
||||||
|
|
||||||
....
|
|
||||||
cd /parsec/ext/splash2x/apps/fmm/run
|
|
||||||
../inst/arm-linux.gcc/bin/fmm 1 < input_1
|
|
||||||
....
|
|
||||||
|
|
||||||
To find out how to run many of the benchmarks, have a look at the `test.sh` script of the `parsec-benchmark` repo.
|
|
||||||
|
|
||||||
From the guest, you can also run it as:
|
|
||||||
|
|
||||||
....
|
|
||||||
cd /parsec
|
|
||||||
./test.sh
|
|
||||||
....
|
|
||||||
|
|
||||||
but this might be a bit time consuming in gem5.
|
|
||||||
|
|
||||||
====== PARSEC change the input size
|
|
||||||
|
|
||||||
Running a benchmark of a size different than `test`, e.g. `simsmall`, requires a rebuild with:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-buildroot \
|
|
||||||
--arch arm \
|
|
||||||
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
|
||||||
--config 'BR2_PACKAGE_PARSEC_BENCHMARK_INPUT_SIZE="simsmall"' \
|
|
||||||
-- parsec_benchmark-reconfigure \
|
|
||||||
;
|
|
||||||
....
|
|
||||||
|
|
||||||
Large input may also require tweaking:
|
|
||||||
|
|
||||||
* <<br2-target-rootfs-ext2-size>> if the unpacked inputs are large
|
|
||||||
* <<memory-size>>, unless you want to meet the OOM killer, which is admittedly kind of fun
|
|
||||||
|
|
||||||
`test.sh` only contains the run commands for the `test` size, and cannot be used for `simsmall`.
|
|
||||||
|
|
||||||
The easiest thing to do, is to https://superuser.com/questions/231002/how-can-i-search-within-the-output-buffer-of-a-tmux-shell/1253137#1253137[scroll up on the host shell] after the build, and look for a line of type:
|
|
||||||
|
|
||||||
....
|
|
||||||
Running /root/linux-kernel-module-cheat/out/aarch64/buildroot/build/parsec-benchmark-custom/ext/splash2x/apps/ocean_ncp/inst/aarch64-linux.gcc/bin/ocean_ncp -n2050 -p1 -e1e-07 -r20000 -t28800
|
|
||||||
....
|
|
||||||
|
|
||||||
and then tweak the command found in `test.sh` accordingly.
|
|
||||||
|
|
||||||
Yes, we do run the benchmarks on the host just to unpack / generate inputs. They are expected to fail to run since they were built for the guest instead of the host, including for the x86_64 guest, which has a different interpreter than the host's (see `file myexecutable`).
|
|
||||||
|
|
||||||
The rebuild is required because we unpack input files on the host.
|
|
||||||
|
|
||||||
Separating input sizes also allows to create smaller images when only running the smaller benchmarks.
|
|
||||||
|
|
||||||
This limitation exists because `parsecmgmt` generates the input files just before running via the Bash scripts, but we can't run `parsecmgmt` on gem5 as it is too slow!
|
|
||||||
|
|
||||||
One option would be to do that inside the guest with QEMU.
|
|
||||||
|
|
||||||
Also, we can't generate all input sizes at once, because many of them have the same name and would overwrite one another...
|
|
||||||
|
|
||||||
PARSEC simply wasn't designed with non native machines in mind...
|
|
||||||
|
|
||||||
====== PARSEC benchmark with parsecmgmt
|
|
||||||
|
|
||||||
Most users won't want to use this method because:
|
|
||||||
|
|
||||||
* running the `parsecmgmt` Bash scripts takes forever before it ever starts running the actual benchmarks on gem5
|
|
||||||
+
|
|
||||||
Running on QEMU is feasible, but not the main use case, since QEMU cannot be used for performance measurements
|
|
||||||
* it requires putting the full `.tar` inputs on the guest, which makes the image twice as large (1x for the `.tar`, 1x for the unpacked input files)
|
|
||||||
|
|
||||||
It would be awesome if it were possible to use this method, since this is what Parsec supports officially, and so:
|
|
||||||
|
|
||||||
* you don't have to dig into what raw command to run
|
|
||||||
* there is an easy way to run all the benchmarks in one go to test them out
|
|
||||||
* you can just run any of the benchmarks that you want
|
|
||||||
|
|
||||||
but it simply is not feasible in gem5 because it takes too long.
|
|
||||||
|
|
||||||
If you still want to run this, try it out with:
|
|
||||||
|
|
||||||
....
|
|
||||||
./build-buildroot \
|
|
||||||
--arch aarch64 \
|
|
||||||
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
|
||||||
--config 'BR2_PACKAGE_PARSEC_BENCHMARK_PARSECMGMT=y' \
|
|
||||||
--config 'BR2_TARGET_ROOTFS_EXT2_SIZE="3G"' \
|
|
||||||
-- parsec_benchmark-reconfigure \
|
|
||||||
;
|
|
||||||
....
|
|
||||||
|
|
||||||
And then you can run it just as you would on the host:
|
|
||||||
|
|
||||||
....
|
|
||||||
cd /parsec/
|
|
||||||
bash
|
|
||||||
. env.sh
|
|
||||||
parsecmgmt -a run -p splash2x.fmm -i test
|
|
||||||
....
|
|
||||||
|
|
||||||
====== PARSEC uninstall
|
|
||||||
|
|
||||||
If you want to remove PARSEC later, Buildroot doesn't provide an automated package removal mechanism as mentioned at: xref:remove-buildroot-packages[xrefstyle=full], but the following procedure should be satisfactory:
|
|
||||||
|
|
||||||
....
|
|
||||||
rm -rf \
|
|
||||||
"$(./getvar buildroot_download_dir)"/parsec-* \
|
|
||||||
"$(./getvar buildroot_build_dir)"/build/parsec-* \
|
|
||||||
"$(./getvar buildroot_build_dir)"/build/packages-file-list.txt \
|
|
||||||
"$(./getvar buildroot_build_dir)"/images/rootfs.* \
|
|
||||||
"$(./getvar buildroot_build_dir)"/target/parsec-* \
|
|
||||||
;
|
|
||||||
./build-buildroot --arch arm
|
|
||||||
....
|
|
||||||
|
|
||||||
====== PARSEC benchmark hacking
|
|
||||||
|
|
||||||
If you end up going inside link:submodules/parsec-benchmark[] to hack up the benchmark (you will!), these tips will be helpful.
|
|
||||||
|
|
||||||
Buildroot was not designed to deal with large images, and currently cross rebuilds are a bit slow, due to some image generation and validation steps.
|
|
||||||
|
|
||||||
A few workarounds are:
|
|
||||||
|
|
||||||
* develop in host first as much as you can. Our PARSEC fork supports it.
|
|
||||||
+
|
|
||||||
If you do this, don't forget to do a:
|
|
||||||
+
|
|
||||||
....
|
|
||||||
cd "$(./getvar parsec_source_dir)"
|
|
||||||
git clean -xdf .
|
|
||||||
....
|
|
||||||
before going for the cross compile build.
|
|
||||||
+
|
|
||||||
* patch Buildroot to work well, and keep cross compiling all the way. This should be totally viable, and we should do it.
|
|
||||||
+
|
|
||||||
Don't forget to explicitly rebuild PARSEC with:
|
|
||||||
+
|
|
||||||
....
|
|
||||||
./build-buildroot \
|
|
||||||
--arch arm \
|
|
||||||
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
|
||||||
-- parsec_benchmark-reconfigure \
|
|
||||||
;
|
|
||||||
....
|
|
||||||
+
|
|
||||||
You may also want to test if your patches are still functionally correct inside of QEMU first, which is a faster emulator.
|
|
||||||
* sell your soul, and compile natively inside the guest. We won't do this, not only because it is evil, but also because Buildroot explicitly does not support it: https://buildroot.org/downloads/manual/manual.html#faq-no-compiler-on-target ARM employees have been known to do this: https://github.com/arm-university/arm-gem5-rsk/blob/aa3b51b175a0f3b6e75c9c856092ae0c8f2a7cdc/parsec_patches/qemu-patch.diff
|
|
||||||
|
|
||||||
=== gem5 kernel command line parameters
|
=== gem5 kernel command line parameters
|
||||||
|
|
||||||
Analogous <<kernel-command-line-parameters,to QEMU>>:
|
Analogous <<kernel-command-line-parameters,to QEMU>>:
|
||||||
@@ -14209,9 +13855,7 @@ Example: link:userland/c/memory_leak.c[]
|
|||||||
|
|
||||||
Maybe some day someone will use this setup to study the performance of interpreters:
|
Maybe some day someone will use this setup to study the performance of interpreters:
|
||||||
|
|
||||||
* <<node-js>>
|
==== Node.js
|
||||||
|
|
||||||
=== Node.js
|
|
||||||
|
|
||||||
Parent section: <<interpreted-languages>>.
|
Parent section: <<interpreted-languages>>.
|
||||||
|
|
||||||
@@ -14237,6 +13881,456 @@ Examples:
|
|||||||
** link:rootfs_overlay/lkmc/nodejs/file_write_read.js[]
|
** link:rootfs_overlay/lkmc/nodejs/file_write_read.js[]
|
||||||
** link:rootfs_overlay/lkmc/nodejs/read_stdin_to_string.js[] Question: https://stackoverflow.com/questions/30441025/read-all-text-from-stdin-to-a-string
|
** link:rootfs_overlay/lkmc/nodejs/read_stdin_to_string.js[] Question: https://stackoverflow.com/questions/30441025/read-all-text-from-stdin-to-a-string
|
||||||
|
|
||||||
|
=== Algorithms
|
||||||
|
|
||||||
|
link:userland/algorithm[]
|
||||||
|
|
||||||
|
This is still work in progress and needs better automation, but is already a good sketch. Key missing features:
|
||||||
|
|
||||||
|
* actually check that outputs are correct in `./test`
|
||||||
|
* create a mechanism to run all or some selected hand coded inputs
|
||||||
|
* create a mechanism to run generated input
|
||||||
|
|
||||||
|
The idea was originally started at: https://github.com/cirosantilli/algorithm-cheat
|
||||||
|
|
||||||
|
The key idea is that input / output pairs are present in human readable files generated either:
|
||||||
|
|
||||||
|
* manually for small test inputs
|
||||||
|
* with a Python script for larger randomized tests
|
||||||
|
|
||||||
|
Test programs then:
|
||||||
|
|
||||||
|
* read input from stdin
|
||||||
|
* produce output to stdout
|
||||||
|
|
||||||
|
so that we can compare the output to the expected one.
|
||||||
|
|
||||||
|
This way, tests can be reused across several implementations in different languages, emulating the many multi-language programming competition websites out there.
|
||||||
|
|
||||||
|
For example, for a <<userland-setup-getting-started-natively,native run>> we can run a set / sorting test:
|
||||||
|
|
||||||
|
....
|
||||||
|
cd userland/algorithm/set
|
||||||
|
./build
|
||||||
|
|
||||||
|
# Run with a small hand written test.
|
||||||
|
./std_set.out < test_data/8.i > tmp.raw
|
||||||
|
|
||||||
|
# Extract the output from the sorted stdout, which also
|
||||||
|
# contained some timing information.
|
||||||
|
./parse_output output < tmp.raw > tmp.o
|
||||||
|
|
||||||
|
# Compare the output to the Expected one.
|
||||||
|
cmp tmp.o test_data/8.e
|
||||||
|
|
||||||
|
# Same but now with a large randomly generated input.
|
||||||
|
./generate_io
|
||||||
|
./std_set.out < tmp.i | ./parse_output output > tmp.o
|
||||||
|
cmp tmp.o tmp.e
|
||||||
|
....
|
||||||
|
|
||||||
|
It is also possible to run the algorithm tests normally from emulators in <<user-mode-simulation>> by setting stdin as explained at <<syscall-emulation-mode-program-stdin>>, e.g.:
|
||||||
|
|
||||||
|
....
|
||||||
|
./run --arch aarch64 -u userland/algorithm/set/std_set.cpp --stdin-file userland/algorithm/set/test_data/8.i
|
||||||
|
....
|
||||||
|
|
||||||
|
Sources:
|
||||||
|
|
||||||
|
* link:userland/algorithm/set/generate_input[]
|
||||||
|
* link:userland/algorithm/set/main.hpp[]
|
||||||
|
* link:userland/algorithm/set/parse_output[]
|
||||||
|
* link:userland/algorithm/set/std_set.cpp[]
|
||||||
|
* link:userland/algorithm/set/test_data/8.e[]
|
||||||
|
* link:userland/algorithm/set/test_data/8.i[]
|
||||||
|
|
||||||
|
link:userland/algorithm/set/parse_output[] is needed because timing instrumentation measurements must be embedded in the program itself to allow:
|
||||||
|
|
||||||
|
* discounting the input reading / output writing operations from the actual "read / write to / from memory algorithm" itself
|
||||||
|
* measuring the evolution of the benchmark mid way, e.g. to see how the current container size affects insertion time: <<bst-vs-heap-vs-hashmap>>
|
||||||
|
|
||||||
|
The following are also interesting Buildroot libraries that we could benchmark:
|
||||||
|
|
||||||
|
* Armadillo `C++`: linear algebra
|
||||||
|
* fftw: Fourier transform
|
||||||
|
* Flann
|
||||||
|
* GSL: various
|
||||||
|
* liblinear
|
||||||
|
* libspacialindex
|
||||||
|
* libtommath
|
||||||
|
* qhull
|
||||||
|
|
||||||
|
These are good targets for <<gem5-run-benchmark,performance analysis with gem5>>, and there is some overlap between this section and <<benchmarks>>.
|
||||||
|
|
||||||
|
==== BST vs heap vs hashmap
|
||||||
|
|
||||||
|
TODO: move benchmark graph from link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[] to link:userland/algorithm/set[].
|
||||||
|
|
||||||
|
The following benchmark setup works both:
|
||||||
|
|
||||||
|
* on host through timers + https://stackoverflow.com/questions/51952471/why-do-i-get-a-constant-instead-of-logarithmic-curve-for-an-insert-time-benchmar/51953081#51953081[granule]
|
||||||
|
* gem5 with <<m5ops-instructions,dumpstats>>, which can get more precise results with `granule == 1`
|
||||||
|
|
||||||
|
It has been used to answer:
|
||||||
|
|
||||||
|
* BST vs heap: https://stackoverflow.com/questions/6147243/heap-vs-binary-search-tree-bst/29548834#29548834
|
||||||
|
* `std::set`: https://stackoverflow.com/questions/2558153/what-is-the-underlying-data-structure-of-a-stl-set-in-c/51944661#51944661
|
||||||
|
* `std::map`: https://stackoverflow.com/questions/18414579/what-data-structure-is-inside-stdmap-in-c/51945119#51945119
|
||||||
|
|
||||||
|
To benchmark on the host, we do:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-userland-in-tree \
|
||||||
|
--force-rebuild \
|
||||||
|
--optimization-level 3 \
|
||||||
|
./userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
||||||
|
;
|
||||||
|
./userland/cpp/bst_vs_heap_vs_hashmap.out 10000000 10000 0 | tee bst_vs_heap_vs_hashmap.dat
|
||||||
|
gnuplot \
|
||||||
|
-e 'input_noext="bst_vs_heap_vs_hashmap"' \
|
||||||
|
-e 'heap_zoom_max=50' \
|
||||||
|
-e 'hashmap_zoom_max=400' \
|
||||||
|
./bst-vs-heap-vs-hashmap.gnuplot \
|
||||||
|
;
|
||||||
|
xdg-open bst_vs_heap_vs_hashmap.tmp.png
|
||||||
|
....
|
||||||
|
|
||||||
|
The parameters `heap_zoom_max` and `hashmap_zoom_max` are chosen manually interactively to best showcase the regions of interest in those plots.
|
||||||
|
|
||||||
|
To benchmark on gem5, we first build the benchmark with <<m5ops-instructions>> enabled, and then we run it and extract the stats:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-userland \
|
||||||
|
--arch x86_64 \
|
||||||
|
--ccflags='-DLKMC_M5OPS_ENABLE=1' \
|
||||||
|
--force-rebuild userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
||||||
|
--static \
|
||||||
|
--optimization-level 3 \
|
||||||
|
;
|
||||||
|
./run \
|
||||||
|
--arch x86_64 \
|
||||||
|
--emulator gem5 \
|
||||||
|
--static \
|
||||||
|
--userland userland/cpp/bst_vs_heap_vs_hashmap.cpp \
|
||||||
|
--userland-args='100000 1 0' \
|
||||||
|
-- \
|
||||||
|
--cpu-type=DerivO3CPU \
|
||||||
|
--caches \
|
||||||
|
--l2cache \
|
||||||
|
--l1d_size=32kB \
|
||||||
|
--l1i_size=32kB \
|
||||||
|
--l2_size=256kB \
|
||||||
|
--l3_size=20MB \
|
||||||
|
;
|
||||||
|
./bst-vs-heap-vs-hashmap-gem5-stats --arch x86_64 | tee bst_vs_heap_vs_hashmap_gem5.dat
|
||||||
|
gnuplot \
|
||||||
|
-e 'input_noext="bst_vs_heap_vs_hashmap_gem5"' \
|
||||||
|
-e 'heap_zoom_max=500' \
|
||||||
|
-e 'hashmap_zoom_max=400' \
|
||||||
|
./bst-vs-heap-vs-hashmap.gnuplot \
|
||||||
|
;
|
||||||
|
xdg-open bst_vs_heap_vs_hashmap_gem5.tmp.png
|
||||||
|
....
|
||||||
|
|
||||||
|
TODO: the gem5 simulation blows up on a tcmalloc allocation somewhere near 25k elements as of 3fdd83c2c58327d9714fa2347c724b78d7c05e2b + 1, likely linked to the extreme inefficiency of the stats collection?
|
||||||
|
|
||||||
|
The cache sizes were chosen to match the host <<p51>> to improve the comparison. Ideally we should also use the same standard library.
|
||||||
|
|
||||||
|
Note that this will take a long time, and will produce a humongous ~40Gb stats file as explained at: xref:gem5-only-dump-selected-stats[xrefstyle=full]
|
||||||
|
|
||||||
|
Sources:
|
||||||
|
|
||||||
|
* link:userland/cpp/bst_vs_heap_vs_hashmap.cpp[]
|
||||||
|
* link:bst-vs-heap-vs-hashmap-gem5-stats[]
|
||||||
|
* link:bst-vs-heap-vs-hashmap.gnuplot[]
|
||||||
|
|
||||||
|
==== BLAS
|
||||||
|
|
||||||
|
Buildroot supports it, which makes everything just trivial:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-buildroot --config 'BR2_PACKAGE_OPENBLAS=y'
|
||||||
|
./build-userland --package openblas -- userland/libs/openblas/hello.c
|
||||||
|
./run --eval-after './libs/openblas/hello.out; echo $?'
|
||||||
|
....
|
||||||
|
|
||||||
|
Outcome: the test passes:
|
||||||
|
|
||||||
|
....
|
||||||
|
0
|
||||||
|
....
|
||||||
|
|
||||||
|
Source: link:userland/libs/openblas/hello.c[]
|
||||||
|
|
||||||
|
The test performs a general matrix multiplication:
|
||||||
|
|
||||||
|
....
|
||||||
|
| 1.0 -3.0 | | 1.0 2.0 1.0 | | 0.5 0.5 0.5 | | 11.0 - 9.0 5.0 |
|
||||||
|
1 * | 2.0 4.0 | * | -3.0 4.0 -1.0 | + 2 * | 0.5 0.5 0.5 | = | - 9.0 21.0 -1.0 |
|
||||||
|
| 1.0 -1.0 | | 0.5 0.5 0.5 | | 5.0 - 1.0 3.0 |
|
||||||
|
....
|
||||||
|
|
||||||
|
This can be deduced from the Fortran interfaces at
|
||||||
|
|
||||||
|
....
|
||||||
|
less "$(./getvar buildroot_build_build_dir)"/openblas-*/reference/dgemmf.f
|
||||||
|
....
|
||||||
|
|
||||||
|
which we can map to our call as:
|
||||||
|
|
||||||
|
....
|
||||||
|
C := alpha*op( A )*op( B ) + beta*C,
|
||||||
|
SUBROUTINE DGEMMF( TRANA, TRANB, M,N,K, ALPHA,A,LDA,B,LDB,BETA,C,LDC)
|
||||||
|
cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans,3,3,2 ,1, A,3, B,3, 2 ,C,3 );
|
||||||
|
....
|
||||||
|
|
||||||
|
==== Eigen
|
||||||
|
|
||||||
|
Header only linear algebra library with a mainline Buildroot package:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-buildroot --config 'BR2_PACKAGE_EIGEN=y'
|
||||||
|
./build-userland --package eigen -- userland/libs/eigen/hello.cpp
|
||||||
|
....
|
||||||
|
|
||||||
|
Just create an array and print it:
|
||||||
|
|
||||||
|
....
|
||||||
|
./run --eval-after './libs/eigen/hello.out'
|
||||||
|
....
|
||||||
|
|
||||||
|
Output:
|
||||||
|
|
||||||
|
....
|
||||||
|
3 -1
|
||||||
|
2.5 1.5
|
||||||
|
....
|
||||||
|
|
||||||
|
Source: link:userland/libs/eigen/hello.cpp[]
|
||||||
|
|
||||||
|
This example just creates a matrix and prints it out.
|
||||||
|
|
||||||
|
Tested on: https://github.com/cirosantilli/linux-kernel-module-cheat/commit/a4bdcf102c068762bb1ef26c591fcf71e5907525[a4bdcf102c068762bb1ef26c591fcf71e5907525]
|
||||||
|
|
||||||
|
=== Benchmarks
|
||||||
|
|
||||||
|
These are good targets for <<gem5-run-benchmark,performance analysis with gem5>>.
|
||||||
|
|
||||||
|
TODO also consider the following:
|
||||||
|
|
||||||
|
* https://github.com/kozyraki/stamp transactional memory benchmarks
|
||||||
|
|
||||||
|
==== Dhrystone
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/Dhrystone
|
||||||
|
|
||||||
|
Created in the 80's, it is not a representative measure of performance in modern computers anymore. It has mostly been replaced by https://en.wikipedia.org/wiki/SPECint[SPEC], which is... closed source! Unbelievable.
|
||||||
|
|
||||||
|
<<buildroot>> has a `dhrystone` package, but because it is so interesting to us, we decided to also build it ourselves, which allows things like static and baremetal compilation more easily.
|
||||||
|
|
||||||
|
Build and run on QEMU <<user-mode-simulation>>:
|
||||||
|
|
||||||
|
....
|
||||||
|
git submodule update --init submodules/dhrystone
|
||||||
|
./build-dhrystone --mode userland
|
||||||
|
./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
|
....
|
||||||
|
|
||||||
|
Build and run on gem5 user mode:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-dhrystone --mode userland --static --force-rebuild
|
||||||
|
./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
|
....
|
||||||
|
|
||||||
|
TODO automate run more nicely.
|
||||||
|
|
||||||
|
Build for <<baremetal>> execution and run it in baremetal QEMU:
|
||||||
|
|
||||||
|
....
|
||||||
|
# Build our Newlib stubs.
|
||||||
|
./build-baremetal --arch aarch64
|
||||||
|
./build-dhrystone --arch aarch64 --mode baremetal
|
||||||
|
./run --arch aarch64 --baremetal "$(./getvar baremetal_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
|
....
|
||||||
|
|
||||||
|
TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
|
||||||
|
|
||||||
|
If you really want the Buildroot package for some reason, build it with:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-buildroot --config 'BR2_PACKAGE_DHRYSTONE=y'
|
||||||
|
....
|
||||||
|
|
||||||
|
and run inside the guest from `PATH` with:
|
||||||
|
|
||||||
|
....
|
||||||
|
dhrystone
|
||||||
|
....
|
||||||
|
|
||||||
|
==== PARSEC benchmark
|
||||||
|
|
||||||
|
We have ported parts of the http://parsec.cs.princeton.edu[PARSEC benchmark] for cross compilation at: https://github.com/cirosantilli/parsec-benchmark See the documentation on that repo to find out which benchmarks have been ported. Some of the benchmarks were are segfaulting, they are documented in that repo.
|
||||||
|
|
||||||
|
There are two ways to run PARSEC with this repo:
|
||||||
|
|
||||||
|
* <<parsec-benchmark-without-parsecmgmt,without `pasecmgmt`>>, most likely what you want
|
||||||
|
* <<parsec-benchmark-with-parsecmgmt,with `pasecmgmt`>>
|
||||||
|
|
||||||
|
===== PARSEC benchmark without parsecmgmt
|
||||||
|
|
||||||
|
....
|
||||||
|
./build --arch arm --download-dependencies gem5-buildroot parsec-benchmark
|
||||||
|
./build-buildroot --arch arm --config 'BR2_PACKAGE_PARSEC_BENCHMARK=y'
|
||||||
|
./run --arch arm --emulator gem5
|
||||||
|
....
|
||||||
|
|
||||||
|
Once inside the guest, launch one of the `test` input sized benchmarks manually as in:
|
||||||
|
|
||||||
|
....
|
||||||
|
cd /parsec/ext/splash2x/apps/fmm/run
|
||||||
|
../inst/arm-linux.gcc/bin/fmm 1 < input_1
|
||||||
|
....
|
||||||
|
|
||||||
|
To find run out how to run many of the benchmarks, have a look at the `test.sh` script of the `parse-benchmark` repo.
|
||||||
|
|
||||||
|
From the guest, you can also run it as:
|
||||||
|
|
||||||
|
....
|
||||||
|
cd /parsec
|
||||||
|
./test.sh
|
||||||
|
....
|
||||||
|
|
||||||
|
but this might be a bit time consuming in gem5.
|
||||||
|
|
||||||
|
===== PARSEC change the input size
|
||||||
|
|
||||||
|
Running a benchmark of a size different than `test`, e.g. `simsmall`, requires a rebuild with:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-buildroot \
|
||||||
|
--arch arm \
|
||||||
|
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
||||||
|
--config 'BR2_PACKAGE_PARSEC_BENCHMARK_INPUT_SIZE="simsmall"' \
|
||||||
|
-- parsec_benchmark-reconfigure \
|
||||||
|
;
|
||||||
|
....
|
||||||
|
|
||||||
|
Large input may also require tweaking:
|
||||||
|
|
||||||
|
* <<br2-target-rootfs-ext2-size>> if the unpacked inputs are large
|
||||||
|
* <<memory-size>>, unless you want to meet the OOM killer, which is admittedly kind of fun
|
||||||
|
|
||||||
|
`test.sh` only contains the run commands for the `test` size, and cannot be used for `simsmall`.
|
||||||
|
|
||||||
|
The easiest thing to do, is to https://superuser.com/questions/231002/how-can-i-search-within-the-output-buffer-of-a-tmux-shell/1253137#1253137[scroll up on the host shell] after the build, and look for a line of type:
|
||||||
|
|
||||||
|
....
|
||||||
|
Running /root/linux-kernel-module-cheat/out/aarch64/buildroot/build/parsec-benchmark-custom/ext/splash2x/apps/ocean_ncp/inst/aarch64-linux.gcc/bin/ocean_ncp -n2050 -p1 -e1e-07 -r20000 -t28800
|
||||||
|
....
|
||||||
|
|
||||||
|
and then tweak the command found in `test.sh` accordingly.
|
||||||
|
|
||||||
|
Yes, we do run the benchmarks on host just to unpack / generate inputs. They are expected fail to run since they were build for the guest instead of host, including for x86_64 guest which has a different interpreter than the host's (see `file myexecutable`).
|
||||||
|
|
||||||
|
The rebuild is required because we unpack input files on the host.
|
||||||
|
|
||||||
|
Separating input sizes also allows to create smaller images when only running the smaller benchmarks.
|
||||||
|
|
||||||
|
This limitation exists because `parsecmgmt` generates the input files just before running via the Bash scripts, but we can't run `parsecmgmt` on gem5 as it is too slow!
|
||||||
|
|
||||||
|
One option would be to do that inside the guest with QEMU.
|
||||||
|
|
||||||
|
Also, we can't generate all input sizes at once, because many of them have the same name and would overwrite one another...
|
||||||
|
|
||||||
|
PARSEC simply wasn't designed with non native machines in mind...
|
||||||
|
|
||||||
|
===== PARSEC benchmark with parsecmgmt
|
||||||
|
|
||||||
|
Most users won't want to use this method because:
|
||||||
|
|
||||||
|
* running the `parsecmgmt` Bash scripts takes forever before it ever starts running the actual benchmarks on gem5
|
||||||
|
+
|
||||||
|
Running on QEMU is feasible, but not the main use case, since QEMU cannot be used for performance measurements
|
||||||
|
* it requires putting the full `.tar` inputs on the guest, which makes the image twice as large (1x for the `.tar`, 1x for the unpacked input files)
|
||||||
|
|
||||||
|
It would be awesome if it were possible to use this method, since this is what Parsec supports officially, and so:
|
||||||
|
|
||||||
|
* you don't have to dig into what raw command to run
|
||||||
|
* there is an easy way to run all the benchmarks in one go to test them out
|
||||||
|
* you can just run any of the benchmarks that you want
|
||||||
|
|
||||||
|
but it simply is not feasible in gem5 because it takes too long.
|
||||||
|
|
||||||
|
If you still want to run this, try it out with:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-buildroot \
|
||||||
|
--arch aarch64 \
|
||||||
|
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
||||||
|
--config 'BR2_PACKAGE_PARSEC_BENCHMARK_PARSECMGMT=y' \
|
||||||
|
--config 'BR2_TARGET_ROOTFS_EXT2_SIZE="3G"' \
|
||||||
|
-- parsec_benchmark-reconfigure \
|
||||||
|
;
|
||||||
|
....
|
||||||
|
|
||||||
|
And then you can run it just as you would on the host:
|
||||||
|
|
||||||
|
....
|
||||||
|
cd /parsec/
|
||||||
|
bash
|
||||||
|
. env.sh
|
||||||
|
parsecmgmt -a run -p splash2x.fmm -i test
|
||||||
|
....
|
||||||
|
|
||||||
|
===== PARSEC uninstall
|
||||||
|
|
||||||
|
If you want to remove PARSEC later, Buildroot doesn't provide an automated package removal mechanism as mentioned at: xref:remove-buildroot-packages[xrefstyle=full], but the following procedure should be satisfactory:
|
||||||
|
|
||||||
|
....
|
||||||
|
rm -rf \
|
||||||
|
"$(./getvar buildroot_download_dir)"/parsec-* \
|
||||||
|
"$(./getvar buildroot_build_dir)"/build/parsec-* \
|
||||||
|
"$(./getvar buildroot_build_dir)"/build/packages-file-list.txt \
|
||||||
|
"$(./getvar buildroot_build_dir)"/images/rootfs.* \
|
||||||
|
"$(./getvar buildroot_build_dir)"/target/parsec-* \
|
||||||
|
;
|
||||||
|
./build-buildroot --arch arm
|
||||||
|
....
|
||||||
|
|
||||||
|
===== PARSEC benchmark hacking
|
||||||
|
|
||||||
|
If you end up going inside link:submodules/parsec-benchmark[] to hack up the benchmark (you will!), these tips will be helpful.
|
||||||
|
|
||||||
|
Buildroot was not designed to deal with large images, and currently cross rebuilds are a bit slow, due to some image generation and validation steps.
|
||||||
|
|
||||||
|
A few workarounds are:
|
||||||
|
|
||||||
|
* develop in host first as much as you can. Our PARSEC fork supports it.
|
||||||
|
+
|
||||||
|
If you do this, don't forget to do a:
|
||||||
|
+
|
||||||
|
....
|
||||||
|
cd "$(./getvar parsec_source_dir)"
|
||||||
|
git clean -xdf .
|
||||||
|
....
|
||||||
|
before going for the cross compile build.
|
||||||
|
+
|
||||||
|
* patch Buildroot to work well, and keep cross compiling all the way. This should be totally viable, and we should do it.
|
||||||
|
+
|
||||||
|
Don't forget to explicitly rebuild PARSEC with:
|
||||||
|
+
|
||||||
|
....
|
||||||
|
./build-buildroot \
|
||||||
|
--arch arm \
|
||||||
|
--config 'BR2_PACKAGE_PARSEC_BENCHMARK=y' \
|
||||||
|
-- parsec_benchmark-reconfigure \
|
||||||
|
;
|
||||||
|
....
|
||||||
|
+
|
||||||
|
You may also want to test if your patches are still functionally correct inside of QEMU first, which is a faster emulator.
|
||||||
|
* sell your soul, and compile natively inside the guest. We won't do this, not only because it is evil, but also because Buildroot explicitly does not support it: https://buildroot.org/downloads/manual/manual.html#faq-no-compiler-on-target ARM employees have been known to do this: https://github.com/arm-university/arm-gem5-rsk/blob/aa3b51b175a0f3b6e75c9c856092ae0c8f2a7cdc/parsec_patches/qemu-patch.diff
|
||||||
|
|
||||||
=== Userland content bibliography
|
=== Userland content bibliography
|
||||||
|
|
||||||
* The Linux Programming Interface by Michael Kerrisk https://www.amazon.co.uk/Linux-Programming-Interface-System-Handbook/dp/1593272200 Lots of open source POSIX examples: https://github.com/cirosantilli/linux-programming-interface-kerrisk
|
* The Linux Programming Interface by Michael Kerrisk https://www.amazon.co.uk/Linux-Programming-Interface-System-Handbook/dp/1593272200 Lots of open source POSIX examples: https://github.com/cirosantilli/linux-programming-interface-kerrisk
|
||||||
|
|||||||
33
common.py
33
common.py
@@ -1941,3 +1941,36 @@ class TestCliFunction(LkmcCliFunction):
|
|||||||
self.log_error('A test failed')
|
self.log_error('A test failed')
|
||||||
return 1
|
return 1
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
# IO format.
|
||||||
|
|
||||||
|
class LkmcList(list):
|
||||||
|
'''
|
||||||
|
list with a lightweight serialization format for algorithm IO.
|
||||||
|
'''
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
if 'oneline' in kwargs:
|
||||||
|
self.oneline = kwargs['oneline']
|
||||||
|
del kwargs['oneline']
|
||||||
|
else:
|
||||||
|
self.oneline = False
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
def __str__(self):
|
||||||
|
if self.oneline:
|
||||||
|
sep = ' '
|
||||||
|
else:
|
||||||
|
sep = '\n'
|
||||||
|
return sep.join([str(item) for item in self])
|
||||||
|
|
||||||
|
class LkmcOrderedDict(collections.OrderedDict):
|
||||||
|
'''
|
||||||
|
dict with a lightweight serialization format for algorithm IO.
|
||||||
|
'''
|
||||||
|
def __str__(self):
|
||||||
|
out = []
|
||||||
|
for key in self:
|
||||||
|
out.extend([
|
||||||
|
str(key),
|
||||||
|
str(self[key]) + '\n',
|
||||||
|
])
|
||||||
|
return '\n'.join(out)
|
||||||
|
|||||||
@@ -93,6 +93,10 @@ class PathProperties:
|
|||||||
# We were lazy to properly classify why we are skipping these tests.
|
# We were lazy to properly classify why we are skipping these tests.
|
||||||
# TODO get it done.
|
# TODO get it done.
|
||||||
'skip_run_unclassified': False,
|
'skip_run_unclassified': False,
|
||||||
|
# Look for the given file under test_data/ relative to the file under test,
|
||||||
|
# and pass the given file as the stdin of the program. The .i input extension is
|
||||||
|
# appended implicitly to the test path.
|
||||||
|
'test_stdin_data': None,
|
||||||
# Aruments added automatically to run when running tests,
|
# Aruments added automatically to run when running tests,
|
||||||
# but not on manual running.
|
# but not on manual running.
|
||||||
'test_run_args': {},
|
'test_run_args': {},
|
||||||
@@ -311,6 +315,9 @@ class PrefixTree:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_from_tuples(tuples):
|
def make_from_tuples(tuples):
|
||||||
|
'''
|
||||||
|
TODO check that all paths exist.
|
||||||
|
'''
|
||||||
def tree_from_tuples(tuple_):
|
def tree_from_tuples(tuple_):
|
||||||
if not type(tuple_) is tuple:
|
if not type(tuple_) is tuple:
|
||||||
tuple_ = (tuple_, {})
|
tuple_ = (tuple_, {})
|
||||||
@@ -328,6 +335,9 @@ class PrefixTree:
|
|||||||
return top_tree
|
return top_tree
|
||||||
|
|
||||||
def get(path):
|
def get(path):
|
||||||
|
'''
|
||||||
|
Get the merged path properties of a given path.
|
||||||
|
'''
|
||||||
cur_node = path_properties_tree
|
cur_node = path_properties_tree
|
||||||
path_components = path.split(os.sep)
|
path_components = path.split(os.sep)
|
||||||
path_properties = PathProperties(cur_node.path_properties.properties.copy())
|
path_properties = PathProperties(cur_node.path_properties.properties.copy())
|
||||||
@@ -439,7 +449,9 @@ path_properties_tuples = (
|
|||||||
{},
|
{},
|
||||||
{
|
{
|
||||||
'set': (
|
'set': (
|
||||||
{},
|
{
|
||||||
|
'test_stdin_data': '8',
|
||||||
|
},
|
||||||
{
|
{
|
||||||
'std_priority_queue_gem5.cpp': {'allowed_emulators': {'gem5'}},
|
'std_priority_queue_gem5.cpp': {'allowed_emulators': {'gem5'}},
|
||||||
'std_set_gem5.cpp': {'allowed_emulators': {'gem5'}},
|
'std_set_gem5.cpp': {'allowed_emulators': {'gem5'}},
|
||||||
@@ -606,8 +618,8 @@ path_properties_tuples = (
|
|||||||
{
|
{
|
||||||
'aarch64_add.cpp': {'allowed_archs': {'aarch64'}},
|
'aarch64_add.cpp': {'allowed_archs': {'aarch64'}},
|
||||||
'aarch64_ldadd.cpp': {'allowed_archs': {'aarch64'}},
|
'aarch64_ldadd.cpp': {'allowed_archs': {'aarch64'}},
|
||||||
'x86_64_add.cpp': {'allowed_archs': {'x86_64'}},
|
'x86_64_inc.cpp': {'allowed_archs': {'x86_64'}},
|
||||||
'x86_64_ldadd.cpp': {'allowed_archs': {'x86_64'}},
|
'x86_64_lock_inc.cpp': {'allowed_archs': {'x86_64'}},
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
'count.cpp': {'more_than_1s': True},
|
'count.cpp': {'more_than_1s': True},
|
||||||
|
|||||||
9
run
9
run
@@ -234,6 +234,14 @@ Setup a kernel init parameter that makes the emulator quit immediately after boo
|
|||||||
default=True,
|
default=True,
|
||||||
help='''Show emulator stdout and stderr on the host terminal.'''
|
help='''Show emulator stdout and stderr on the host terminal.'''
|
||||||
)
|
)
|
||||||
|
self.add_argument(
|
||||||
|
'--stdin-file',
|
||||||
|
help='''\
|
||||||
|
Set the given file as the stdin source of the emulator. QEMU and gem5 then
|
||||||
|
forward this to the guest in user mode simulation.
|
||||||
|
https://cirosantilli.com/linux-kernel-module-cheat#syscall-emulation-mode-program-stdin
|
||||||
|
'''
|
||||||
|
)
|
||||||
self.add_argument(
|
self.add_argument(
|
||||||
'--terminal',
|
'--terminal',
|
||||||
default=False,
|
default=False,
|
||||||
@@ -805,6 +813,7 @@ Extra options to append at the end of the emulator command line.
|
|||||||
out_file=out_file,
|
out_file=out_file,
|
||||||
raise_on_failure=False,
|
raise_on_failure=False,
|
||||||
show_stdout=show_stdout,
|
show_stdout=show_stdout,
|
||||||
|
stdin_path=self.env['stdin_file'],
|
||||||
)
|
)
|
||||||
if self.env['debug_vm_rr']:
|
if self.env['debug_vm_rr']:
|
||||||
exit_status = self.sh.run_cmd(
|
exit_status = self.sh.run_cmd(
|
||||||
|
|||||||
@@ -115,6 +115,8 @@ class ShellHelpers:
|
|||||||
extra_env=None,
|
extra_env=None,
|
||||||
extra_paths=None,
|
extra_paths=None,
|
||||||
force_oneline: bool =False,
|
force_oneline: bool =False,
|
||||||
|
*,
|
||||||
|
stdin_path: Union[str,None] =None
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Format a command given as a list of strings so that it can
|
Format a command given as a list of strings so that it can
|
||||||
@@ -156,6 +158,8 @@ class ShellHelpers:
|
|||||||
if not x
|
if not x
|
||||||
]
|
]
|
||||||
out.extend(cmd_quote)
|
out.extend(cmd_quote)
|
||||||
|
if stdin_path is not None:
|
||||||
|
out.append('< {}'.format(shlex.quote(stdin_path)))
|
||||||
if force_oneline or newline_count == 1 and cmd[-1] == LF:
|
if force_oneline or newline_count == 1 and cmd[-1] == LF:
|
||||||
ending = ''
|
ending = ''
|
||||||
else:
|
else:
|
||||||
@@ -241,6 +245,8 @@ class ShellHelpers:
|
|||||||
extra_env=None,
|
extra_env=None,
|
||||||
extra_paths=None,
|
extra_paths=None,
|
||||||
force_oneline=False,
|
force_oneline=False,
|
||||||
|
*,
|
||||||
|
stdin_path: Union[str,None] =None
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Print cmd_to_string to stdout.
|
Print cmd_to_string to stdout.
|
||||||
@@ -257,6 +263,7 @@ class ShellHelpers:
|
|||||||
extra_env=extra_env,
|
extra_env=extra_env,
|
||||||
extra_paths=extra_paths,
|
extra_paths=extra_paths,
|
||||||
force_oneline=force_oneline,
|
force_oneline=force_oneline,
|
||||||
|
stdin_path=stdin_path
|
||||||
)
|
)
|
||||||
if not self.quiet:
|
if not self.quiet:
|
||||||
self._print_thread_safe('+ ' + cmd_string)
|
self._print_thread_safe('+ ' + cmd_string)
|
||||||
@@ -288,6 +295,7 @@ class ShellHelpers:
|
|||||||
raise_on_failure=True,
|
raise_on_failure=True,
|
||||||
*,
|
*,
|
||||||
out_str=None,
|
out_str=None,
|
||||||
|
stdin_path: Union[str,None] =None,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
@@ -350,7 +358,8 @@ class ShellHelpers:
|
|||||||
cwd=cwd,
|
cwd=cwd,
|
||||||
cmd_file=cmd_file,
|
cmd_file=cmd_file,
|
||||||
extra_env=extra_env,
|
extra_env=extra_env,
|
||||||
extra_paths=extra_paths
|
extra_paths=extra_paths,
|
||||||
|
stdin_path=stdin_path
|
||||||
)
|
)
|
||||||
|
|
||||||
# Otherwise, if called from a non-main thread:
|
# Otherwise, if called from a non-main thread:
|
||||||
@@ -372,9 +381,14 @@ class ShellHelpers:
|
|||||||
|
|
||||||
cmd = self.strip_newlines(cmd)
|
cmd = self.strip_newlines(cmd)
|
||||||
if not self.dry_run:
|
if not self.dry_run:
|
||||||
|
if stdin_path is None:
|
||||||
|
stdin = None
|
||||||
|
else:
|
||||||
|
stdin = open(stdin_path, 'r')
|
||||||
# https://stackoverflow.com/questions/15535240/python-popen-write-to-stdout-and-log-file-simultaneously/52090802#52090802
|
# https://stackoverflow.com/questions/15535240/python-popen-write-to-stdout-and-log-file-simultaneously/52090802#52090802
|
||||||
with subprocess.Popen(
|
with subprocess.Popen(
|
||||||
cmd,
|
cmd,
|
||||||
|
stdin=stdin,
|
||||||
stdout=stdout,
|
stdout=stdout,
|
||||||
stderr=stderr,
|
stderr=stderr,
|
||||||
env=env,
|
env=env,
|
||||||
@@ -409,6 +423,8 @@ class ShellHelpers:
|
|||||||
if threading.current_thread() == threading.main_thread():
|
if threading.current_thread() == threading.main_thread():
|
||||||
signal.signal(signal.SIGINT, sigint_old)
|
signal.signal(signal.SIGINT, sigint_old)
|
||||||
#signal.signal(signal.SIGPIPE, sigpipe_old)
|
#signal.signal(signal.SIGPIPE, sigpipe_old)
|
||||||
|
if stdin_path is not None:
|
||||||
|
stdin.close()
|
||||||
returncode = proc.returncode
|
returncode = proc.returncode
|
||||||
if returncode != 0 and raise_on_failure:
|
if returncode != 0 and raise_on_failure:
|
||||||
e = Exception('Command exited with status: {}'.format(returncode))
|
e = Exception('Command exited with status: {}'.format(returncode))
|
||||||
@@ -514,3 +530,7 @@ if __name__ == '__main__':
|
|||||||
assert \
|
assert \
|
||||||
shell_helpers.cmd_to_string(['cmd', LF, 'arg1', LF, 'arg2', LF], force_oneline=True) \
|
shell_helpers.cmd_to_string(['cmd', LF, 'arg1', LF, 'arg2', LF], force_oneline=True) \
|
||||||
== 'cmd arg1 arg2'
|
== 'cmd arg1 arg2'
|
||||||
|
|
||||||
|
# stdin_path
|
||||||
|
assert shell_helpers.cmd_to_string(['cmd'], stdin_path='ab') == "cmd \\\n < ab \\\n;"
|
||||||
|
assert shell_helpers.cmd_to_string(['cmd'], stdin_path='a b') == "cmd \\\n < 'a b' \\\n;"
|
||||||
|
|||||||
@@ -71,6 +71,12 @@ If given, run only the given tests. Otherwise, run all tests.
|
|||||||
),
|
),
|
||||||
})
|
})
|
||||||
cur_run_args.update(my_path_properties['test_run_args'])
|
cur_run_args.update(my_path_properties['test_run_args'])
|
||||||
|
if my_path_properties['test_stdin_data'] is not None:
|
||||||
|
cur_run_args['stdin_file'] = os.path.join(
|
||||||
|
path_abs,
|
||||||
|
'test_data',
|
||||||
|
my_path_properties['test_stdin_data'] + '.i'
|
||||||
|
)
|
||||||
run_test_args = {
|
run_test_args = {
|
||||||
'expected_exit_status': my_path_properties['exit_status'],
|
'expected_exit_status': my_path_properties['exit_status'],
|
||||||
'run_args': cur_run_args,
|
'run_args': cur_run_args,
|
||||||
|
|||||||
30
userland/algorithm/set/generate_io
Executable file
30
userland/algorithm/set/generate_io
Executable file
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
|
||||||
|
import common
|
||||||
|
|
||||||
|
# Handle CLI arguments.
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('--min', type=int, default=0)
|
||||||
|
parser.add_argument('--max', type=int, default=(2**32 - 1))
|
||||||
|
parser.add_argument('--seed', type=int)
|
||||||
|
parser.add_argument('--size', type=int, default=1000000)
|
||||||
|
parser.add_argument('--unique', type=bool, default=True,
|
||||||
|
help='if True, remove duplicates from the expected output')
|
||||||
|
args = parser.parse_args()
|
||||||
|
random.seed(args.seed)
|
||||||
|
input_data = common.LkmcList()
|
||||||
|
for i in range(args.size):
|
||||||
|
input_data.append(random.randint(args.min, args.max))
|
||||||
|
with open('tmp.i', 'w') as i:
|
||||||
|
i.write(str(input_data) + '\n')
|
||||||
|
if args.unique:
|
||||||
|
input_data = common.LkmcList(set(input_data))
|
||||||
|
input_data.sort()
|
||||||
|
with open('tmp.e', 'w') as e:
|
||||||
|
e.write(str(input_data) + '\n')
|
||||||
@@ -18,9 +18,10 @@
|
|||||||
#include <lkmc/m5ops.h>
|
#include <lkmc/m5ops.h>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
// Variables.
|
||||||
typedef uint64_t T;
|
typedef uint64_t T;
|
||||||
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
||||||
std::priority_queue<T> set;
|
std::priority_queue<T, std::vector<T>, std::greater<T>> set;
|
||||||
#endif
|
#endif
|
||||||
#if LKMC_ALGORITHM_SET_STD_SET
|
#if LKMC_ALGORITHM_SET_STD_SET
|
||||||
std::set<T> set;
|
std::set<T> set;
|
||||||
@@ -28,57 +29,59 @@ int main(int argc, char **argv) {
|
|||||||
#if LKMC_ALGORITHM_SET_STD_UNORDERED_SET
|
#if LKMC_ALGORITHM_SET_STD_UNORDERED_SET
|
||||||
std::unordered_set<T> set;
|
std::unordered_set<T> set;
|
||||||
#endif
|
#endif
|
||||||
std::vector<T> randoms;
|
std::vector<T> input;
|
||||||
size_t i, j = 0, n, granule, base;
|
size_t i, j = 0, n, granule, base;
|
||||||
unsigned int seed;
|
|
||||||
#ifndef LKMC_M5OPS_ENABLE
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
std::vector<std::chrono::nanoseconds::rep> dts;
|
std::vector<std::chrono::nanoseconds::rep> dts;
|
||||||
std::vector<decltype(base)> bases;
|
std::vector<decltype(base)> bases;
|
||||||
#endif
|
|
||||||
|
|
||||||
// CLI arguments.
|
|
||||||
if (argc > 1) {
|
|
||||||
n = std::stoi(argv[1]);
|
|
||||||
} else {
|
|
||||||
n = 10;
|
|
||||||
}
|
|
||||||
if (argc > 2) {
|
|
||||||
granule = std::stoi(argv[2]);
|
|
||||||
} else {
|
|
||||||
granule = 1;
|
|
||||||
}
|
|
||||||
if (argc > 3) {
|
|
||||||
seed = std::stoi(argv[3]);
|
|
||||||
} else {
|
|
||||||
seed = std::random_device()();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Action.
|
|
||||||
for (i = 0; i < n; ++i) {
|
|
||||||
randoms.push_back(i);
|
|
||||||
}
|
|
||||||
std::shuffle(randoms.begin(), randoms.end(), std::mt19937(seed));
|
|
||||||
for (i = 0; i < n / granule; ++i) {
|
|
||||||
#ifndef LKMC_M5OPS_ENABLE
|
|
||||||
using clk = std::chrono::high_resolution_clock;
|
using clk = std::chrono::high_resolution_clock;
|
||||||
decltype(clk::now()) start, end;
|
decltype(clk::now()) start, end;
|
||||||
#endif
|
#endif
|
||||||
base = i * granule;
|
|
||||||
#ifdef LKMC_M5OPS_ENABLE
|
// CLI arguments.
|
||||||
LKMC_M5OPS_RESETSTATS;
|
//
|
||||||
#else
|
// Save the clock time every `granule` loops.
|
||||||
start = clk::now();
|
//
|
||||||
for (j = 0; j < granule; ++j) {
|
// The magic value 0 means that a single time for the entire
|
||||||
#endif
|
// run is printed, therefore accounting for the full run time.
|
||||||
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
//
|
||||||
set.emplace(randoms[base + j]);
|
// Otherwise, must be a divisor of the input size.
|
||||||
#else
|
//
|
||||||
set.insert(randoms[base + j]);
|
// Default value: 0
|
||||||
#endif
|
if (argc > 1) {
|
||||||
#ifdef LKMC_M5OPS_ENABLE
|
granule = std::stoll(argv[1]);
|
||||||
LKMC_M5OPS_DUMPSTATS;
|
} else {
|
||||||
#else
|
granule = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read input from stdin.
|
||||||
|
std::string str;
|
||||||
|
while (std::getline(std::cin, str)) {
|
||||||
|
if (str == "")
|
||||||
|
break;
|
||||||
|
input.push_back(std::stoll(str));
|
||||||
|
}
|
||||||
|
n = input.size();
|
||||||
|
if (granule == 0) {
|
||||||
|
granule = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Action.
|
||||||
|
for (i = 0; i < n / granule; ++i) {
|
||||||
|
base = i * granule;
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
|
start = clk::now();
|
||||||
|
#endif
|
||||||
|
for (j = 0; j < granule; ++j) {
|
||||||
|
LKMC_M5OPS_RESETSTATS;
|
||||||
|
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
||||||
|
set.emplace(input[base + j]);
|
||||||
|
#else
|
||||||
|
set.insert(input[base + j]);
|
||||||
|
#endif
|
||||||
|
LKMC_M5OPS_DUMPSTATS;
|
||||||
|
}
|
||||||
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
end = clk::now();
|
end = clk::now();
|
||||||
auto dt = (end - start) / granule;
|
auto dt = (end - start) / granule;
|
||||||
bases.push_back(base);
|
bases.push_back(base);
|
||||||
@@ -87,8 +90,26 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Report results.
|
// Report results.
|
||||||
|
std::cout << "output" << std::endl;
|
||||||
|
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
||||||
|
// Print priority queue without duplicates.
|
||||||
|
T last_val = set.top();
|
||||||
|
std::cout << last_val << std::endl;
|
||||||
|
set.pop();
|
||||||
|
while (!set.empty()) {
|
||||||
|
const auto& val = set.top();
|
||||||
|
if (val != last_val)
|
||||||
|
std::cout << val << std::endl;
|
||||||
|
last_val = val;
|
||||||
|
set.pop();
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (const auto& item : set) {
|
||||||
|
std::cout << item << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
std::cout << std::endl;
|
||||||
#ifndef LKMC_M5OPS_ENABLE
|
#ifndef LKMC_M5OPS_ENABLE
|
||||||
// Output.
|
|
||||||
std::cout << "times" << std::endl;
|
std::cout << "times" << std::endl;
|
||||||
auto bases_it = bases.begin();
|
auto bases_it = bases.begin();
|
||||||
auto dts_it = dts.begin();
|
auto dts_it = dts.begin();
|
||||||
@@ -99,17 +120,5 @@ int main(int argc, char **argv) {
|
|||||||
bases_it++;
|
bases_it++;
|
||||||
dts_it++;
|
dts_it++;
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
|
||||||
std::cout << "output" << std::endl;
|
|
||||||
#if LKMC_ALGORITHM_SET_STD_PRIORITY_QUEUE
|
|
||||||
while (!set.empty()) {
|
|
||||||
std::cout << set.top() << std::endl;
|
|
||||||
set.pop();
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
for (const auto& item : set) {
|
|
||||||
std::cout << item << std::endl;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
40
userland/algorithm/set/parse_output
Executable file
40
userland/algorithm/set/parse_output
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import collections
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
|
||||||
|
import common
|
||||||
|
|
||||||
|
data = common.LkmcOrderedDict()
|
||||||
|
|
||||||
|
# Parse
|
||||||
|
|
||||||
|
output = common.LkmcList()
|
||||||
|
next(sys.stdin)
|
||||||
|
for line in sys.stdin:
|
||||||
|
line = line.rstrip()
|
||||||
|
if line == '':
|
||||||
|
break
|
||||||
|
output.append(int(line))
|
||||||
|
data['output'] = output
|
||||||
|
|
||||||
|
times = common.LkmcList()
|
||||||
|
next(sys.stdin)
|
||||||
|
for line in sys.stdin:
|
||||||
|
line = line.rstrip()
|
||||||
|
if line == '':
|
||||||
|
break
|
||||||
|
times.append(common.LkmcList([int(i) for i in line.split(' ')], oneline=True))
|
||||||
|
data['times'] = times
|
||||||
|
|
||||||
|
# Handle CLI arguments.
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('key', nargs='?')
|
||||||
|
args = parser.parse_args()
|
||||||
|
if args.key:
|
||||||
|
print(data[args.key])
|
||||||
|
else:
|
||||||
|
print(data)
|
||||||
3
userland/algorithm/set/test_data/3.e
Normal file
3
userland/algorithm/set/test_data/3.e
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
3
userland/algorithm/set/test_data/3.i
Normal file
3
userland/algorithm/set/test_data/3.i
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
1
|
||||||
|
2
|
||||||
|
0
|
||||||
4
userland/algorithm/set/test_data/4.e
Normal file
4
userland/algorithm/set/test_data/4.e
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
4
userland/algorithm/set/test_data/4.i
Normal file
4
userland/algorithm/set/test_data/4.i
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
1
|
||||||
|
3
|
||||||
|
2
|
||||||
|
0
|
||||||
5
userland/algorithm/set/test_data/5.e
Normal file
5
userland/algorithm/set/test_data/5.e
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
4
|
||||||
5
userland/algorithm/set/test_data/5.i
Normal file
5
userland/algorithm/set/test_data/5.i
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
1
|
||||||
|
4
|
||||||
|
0
|
||||||
|
2
|
||||||
|
3
|
||||||
8
userland/algorithm/set/test_data/8.e
Normal file
8
userland/algorithm/set/test_data/8.e
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
4
|
||||||
|
5
|
||||||
|
6
|
||||||
|
7
|
||||||
8
userland/algorithm/set/test_data/8.i
Normal file
8
userland/algorithm/set/test_data/8.i
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
4
|
||||||
|
5
|
||||||
|
6
|
||||||
|
2
|
||||||
|
1
|
||||||
|
3
|
||||||
|
0
|
||||||
|
7
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
// https://cirosantilli.com/linux-kernel-module-cheat#cpp-multithreading
|
|
||||||
//
|
|
||||||
// The non-atomic counters have undefined values which get printed:
|
|
||||||
// they are extremely likely to be less than the correct value due to
|
|
||||||
// race conditions on the data read and update of the ++.
|
|
||||||
//
|
|
||||||
// The atomic counters have defined values, and are asserted
|
|
||||||
//
|
|
||||||
// Atomic operations are more restricted than mutex as they can
|
|
||||||
// only protect a few operations on integers.
|
|
||||||
//
|
|
||||||
// But when they can be used, they can be much more efficient than mutexes.
|
|
||||||
//
|
|
||||||
// On GCC 4.8 x86-64, using atomic offered a 5x performance improvement
|
|
||||||
// over the same program with mutexes.
|
|
||||||
|
|
||||||
#if __cplusplus >= 201103L
#include <atomic>
#include <cassert>
#include <iostream>
#include <thread>
#include <vector>

// Counter incremented with std::atomic: final value is always exact.
std::atomic_ulong my_atomic_ulong(0);
// Counter incremented with plain ++: subject to data races across threads.
unsigned long my_non_atomic_ulong = 0;
#if defined(__x86_64__) || defined(__aarch64__)
// Counters incremented with hand-written inline assembly, with and
// without the architecture's atomic instruction form.
unsigned long my_arch_atomic_ulong = 0;
unsigned long my_arch_non_atomic_ulong = 0;
#endif
// Number of increments each thread performs; set from argv in main.
size_t niters;
|
||||||
// Per-thread worker: increment each global counter niters times.
// The plain ++ on my_non_atomic_ulong races across threads; the
// std::atomic counter and the locked/LSE instruction variants do not.
void threadMain() {
    for (size_t i = 0; i < niters; ++i) {
        // Atomic read-modify-write: safe across threads.
        my_atomic_ulong++;
        // Racy read-modify-write: final value is unspecified.
        my_non_atomic_ulong++;
#if defined(__x86_64__)
        // Plain inc without the lock prefix: racy across cores.
        __asm__ __volatile__ (
            "incq %0;"
            : "+m" (my_arch_non_atomic_ulong)
            :
            :
        );
        // https://cirosantilli.com/linux-kernel-module-cheat#x86-lock-prefix
        // lock-prefixed inc: atomic read-modify-write on memory.
        __asm__ __volatile__ (
            "lock;"
            "incq %0;"
            : "+m" (my_arch_atomic_ulong)
            :
            :
        );
#elif defined(__aarch64__)
        // NOTE(review): "+r" increments a register copy of the value;
        // the load/store around the add is a non-atomic sequence, which
        // is the racy behavior being demonstrated — confirm intent.
        __asm__ __volatile__ (
            "add %0, %0, 1;"
            : "+r" (my_arch_non_atomic_ulong)
            :
            :
        );
        // https://cirosantilli.com/linux-kernel-module-cheat#arm-lse
        // LSE atomic add directly on memory; the old value is discarded
        // into xzr.
        __asm__ __volatile__ (
            "ldadd %[inc], xzr, [%[addr]];"
            : "=m" (my_arch_atomic_ulong)
            : [inc] "r" (1),
              [addr] "r" (&my_arch_atomic_ulong)
            :
        );
#endif
    }
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
#if __cplusplus >= 201103L
|
|
||||||
size_t nthreads;
|
|
||||||
if (argc > 1) {
|
|
||||||
nthreads = std::stoull(argv[1], NULL, 0);
|
|
||||||
} else {
|
|
||||||
nthreads = 2;
|
|
||||||
}
|
|
||||||
if (argc > 2) {
|
|
||||||
niters = std::stoull(argv[2], NULL, 0);
|
|
||||||
} else {
|
|
||||||
niters = 10;
|
|
||||||
}
|
|
||||||
std::vector<std::thread> threads(nthreads);
|
|
||||||
for (size_t i = 0; i < nthreads; ++i)
|
|
||||||
threads[i] = std::thread(threadMain);
|
|
||||||
for (size_t i = 0; i < nthreads; ++i)
|
|
||||||
threads[i].join();
|
|
||||||
assert(my_atomic_ulong.load() == nthreads * niters);
|
|
||||||
// We can also use the atomics direclty through `operator T` conversion.
|
|
||||||
assert(my_atomic_ulong == my_atomic_ulong.load());
|
|
||||||
std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
|
|
||||||
#if defined(__x86_64__) || defined(__aarch64__)
|
|
||||||
assert(my_arch_atomic_ulong == nthreads * niters);
|
|
||||||
std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user