diff --git a/.gitignore b/.gitignore
index ec4bc4e..04677c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,8 +25,11 @@ __pycache__
 # Accidents.
 /core
 /m5out
+
+# In-tree userland builds.
 *.o
 *.out
+*.so

 # Kernel modules.
 *.ko
@@ -40,3 +43,7 @@ modules.order

 # node.js
 node_modules
+
+# Performance profiling stuff.
+perf.data
+callgrind.out.*
diff --git a/index.html b/index.html
index 21df1d2..3a7e67f 100644
--- a/index.html
+++ b/index.html
@@ -673,8 +673,8 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
see also: Section 19.18, “gem5 ARM platforms”.
+see also: Section 19.17, “gem5 ARM platforms”.
This generates yet more separate images with new magic constants:
@@ -7457,7 +7482,7 @@ qw er
At 125d14805f769104f93c510bedaa685a52ec025d we moved Buildroot from uClibc to glibc, and caused some user mode pain, which we document here.
glibc has a check for kernel version, likely obtained from the uname syscall, and if the kernel is not new enough, it quits.
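As an illustration (a hedged sketch, not glibc's actual code), the value glibc checks can be inspected from the guest with a plain uname call:

#include <stdio.h>
#include <sys/utsname.h>

int main(void) {
    /* glibc compares the running kernel's release string against the
     * minimum version it was configured with (--enable-kernel) and
     * aborts with "FATAL: kernel too old" if it is older. */
    struct utsname info;
    if (uname(&info) == 0)
        printf("kernel release: %s\n", info.release);
    return 0;
}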
For some reason QEMU / glibc x86_64 picks up the host libc, which breaks things.
QEMU x86_64 guest on x86_64 host was failing with stack smashing detected, but we found a workaround
+QEMU x86_64 guest on x86_64 host was failing with stack smashing detected when using glibc, but we found a workaround
gem5 user only supported static executables in the past, as mentioned at: Section 10.7, “gem5 syscall emulation mode”
@@ -17890,6 +17915,12 @@ root
gem5 however has tended towards horrendously intensive code generation in order to support all its different hardware types
gem5 also has a complex Python interface which is also largely auto-generated, which greatly increases the maintenance complexity of the project: Embedding Python in another application.
+This is done so that platforms can be reconfigured quickly without recompiling, and it is amazing when it works, but the maintenance costs are also very high.
+but the problem is that this method does not make it easy to run a different script without running the boot again. The ./gem5.sh script works around that by using m5 readfile as explained further at: Section 19.5.2, “gem5 checkpoint restore and run a different script”.
but the problem is that this method does not allow to easily run a different script without running the boot again. The ./gem5.sh script works around that by using m5 readfile as explained further at: Section 19.5.3, “gem5 checkpoint restore and run a different script”.
Now you can play a fun little game with your friends:
@@ -18147,10 +18178,13 @@ ps Haux | grep qemu | wc
gem5 user mode multithreading has been particularly flaky compared to QEMU’s.
+gem5 user mode multithreading has been particularly flaky compared to QEMU’s, but work is being put into improving it.
You have the limitation that you must have at least one core per guest thread, otherwise pthread_create fails. For example:
In gem5 syscall emulation, the fork syscall checks if there is a free CPU, and if so, the new thread runs on that CPU. Otherwise, the fork call fails, and therefore higher level interfaces to fork such as pthread_create also fail and return a failure status in the guest.
For example, if we use just one CPU for userland/posix/pthread_self.c which spawns one thread besides main:
fails because that process has a total of 2 threads: one for main and one extra thread spawned: userland/posix/pthread_self.c The error message is:
fails with this error message coming from the guest stderr:
This has to do with the fact that gem5 has a more simplistic threading implementation that does not spawn one host thread per guest thread. Maybe this is required to achieve reproducible runs? What is the task switch algorithm then?
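To make the failure mode concrete, here is a minimal hedged sketch (not the repository's pthread_self.c itself) of how a guest program observes the failure: pthread_create simply returns a nonzero error code when gem5 has no free CPU:

#include <pthread.h>
#include <stdio.h>

static void *routine(void *arg) {
    return arg;
}

int main(void) {
    /* On gem5 syscall emulation with all CPUs busy, the underlying
     * clone fails, so pthread_create returns a nonzero error code
     * instead of 0. Compile with: gcc -pthread */
    pthread_t thread;
    int ret = pthread_create(&thread, NULL, routine, NULL);
    if (ret != 0) {
        fprintf(stderr, "pthread_create failed: %d\n", ret);
        return 1;
    }
    pthread_join(thread, NULL);
    return 0;
}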
+Once threads exit, their CPU is freed and becomes available for new fork calls. For example, the following run spawns a thread, joins it, and then spawns again, and 2 CPUs are enough:
./run --cpus 2 --emulator gem5 --userland userland/posix/pthread_self.c --userland-args '1 2'+
gem5 threading does however show the expected number of cores, e.g.:
+because at each point in time, only up to two threads are running.
+gem5 syscall emulation does show the expected number of cores when queried, e.g.:
outputs 1 and 2 respectively.
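For reference, such a query can boil down to something as simple as the following sketch; glibc typically implements it via interfaces such as sched_getaffinity, which gem5 syscall emulation handles:

#include <stdio.h>
#include <unistd.h>

int main(void) {
    /* Prints the number of online CPUs as seen by the guest,
     * e.g. 1 or 2 depending on the --cpus option. */
    printf("%ld\n", sysconf(_SC_NPROCESSORS_ONLN));
    return 0;
}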
TODO: aarch64 seems to be failing to spawn more than 2 threads at 369a47fc6e5c2f4a7f911c1c058b6088f8824463 + 1:
-./run --arch aarch64 --cpus 3 --emulator gem5 --userland userland/posix/pthread_self.c --userland-args 2-
fails with:
-Exiting @ tick 18446744073709551615 because simulate() limit reached-
Documentation: http://gem5.org/Checkpoints
To see it in action try:
+./run --arch arm --emulator gem5+
./run --arch aarch64 --emulator gem5
since boot has already happened, and the parameters are already in the RAM of the snapshot.
In order to debug checkpoint restore bugs, this minimal setup using userland/freestanding/gem5_checkpoint_restore.S can be handy:
+./build-userland --arch aarch64 --static
+./run --arch aarch64 --emulator gem5 --static --userland userland/freestanding/gem5_checkpoint_restore.S --trace-insts-stdout
+./run --arch aarch64 --emulator gem5 --static --userland userland/freestanding/gem5_checkpoint_restore.S --trace-insts-stdout --gem5-restore 1
+./run --arch aarch64 --emulator gem5 --static --userland userland/freestanding/gem5_checkpoint_restore.S --trace-insts-stdout --gem5-restore 1 -- --cpu-type=DerivO3CPU --restore-with-cpu=DerivO3CPU --caches+
On the initial run, we see that all instructions are executed and the checkpoint is taken:
+0: system.cpu: A0 T0 : @asm_main_after_prologue : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 500: system.cpu: A0 T0 : @asm_main_after_prologue+4 : movz x1, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 1000: system.cpu: A0 T0 : @asm_main_after_prologue+8 : m5checkpoint : IntAlu : flags=(IsInteger|IsNonSpeculative|IsUnverifiable)
+Writing checkpoint
+warn: Checkpoints for file descriptors currently do not work.
+info: Entering event queue @ 1000. Starting simulation...
+ 1500: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 2000: system.cpu: A0 T0 : @asm_main_after_prologue+16 : m5exit : No_OpClass : flags=(IsInteger|IsNonSpeculative)
+Exiting @ tick 2000 because m5_exit instruction encountered+
Then, on the first restore run, the checkpoint is restored, and only instructions after the checkpoint are executed:
+info: Entering event queue @ 1000. Starting simulation...
+ 1500: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 2000: system.cpu: A0 T0 : @asm_main_after_prologue+16 : m5exit : No_OpClass : flags=(IsInteger|IsNonSpeculative)
+Exiting @ tick 2000 because m5_exit instruction encountered+
and a similar thing happens for the restore with a different CPU type:
+info: Entering event queue @ 1000. Starting simulation...
+ 79000: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 FetchSeq=1 CPSeq=1 flags=(IsInteger)
+Exiting @ tick 84500 because m5_exit instruction encountered+
Here we don’t see the last m5exit instruction in the log, but it must just be something to do with the O3 logging.
Checkpoints are stored inside the m5out directory at:
You want to automate running several tests from a single pristine post-boot state.
gem5 can switch to a different CPU model when restoring a checkpoint.
A common combo is to boot Linux with a fast CPU, make a checkpoint and then replay the benchmark of interest with a slower CPU.
An illustrative interactive run:
+This can be observed interactively in full system with:
./run --arch arm --emulator gem5+
./run --arch aarch64 --emulator gem5
In guest:
+Then in the guest terminal after boot ends:
m5 checkpoint+
sh -c 'm5 checkpoint;sh'
+m5 exit
And then restore the checkpoint with a different CPU:
+And then restore the checkpoint with a different slower CPU:
./run --arch arm --emulator gem5 --gem5-restore 1 -- --caches --restore-with-cpu=HPI+
./run --arch arm --emulator gem5 --gem5-restore 1 -- --caches --cpu-type=DerivO3CPU+
And now you will notice that everything happens much slower in the guest terminal!
+One even more direct and minimal way to observe this is with userland/freestanding/gem5_checkpoint_restore.S, which was mentioned at gem5 checkpoint userland minimal example, plus some logging:
+./run \
+ --arch aarch64 \
+ --emulator gem5 \
+ --static \
+ --trace ExecAll,FmtFlag,O3CPU,SimpleCPU \
+ --userland userland/freestanding/gem5_checkpoint_restore.S \
+;
+cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"
+./run \
+ --arch aarch64 \
+ --emulator gem5 \
+ --gem5-restore 1 \
+ --static \
+ --trace ExecAll,FmtFlag,O3CPU,SimpleCPU \
+ --userland userland/freestanding/gem5_checkpoint_restore.S \
+ -- \
+ --caches \
+ --cpu-type DerivO3CPU \
+ --restore-with-cpu DerivO3CPU \
+;
+cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"+
At gem5 2235168b72537535d74c645a70a85479801e0651, the first run does everything in AtomicSimpleCPU:
+...
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1f92 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e40 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e30 WriteReq
+ 0: SimpleCPU: system.cpu: Tick
+ 0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 500: SimpleCPU: system.cpu: Tick
+ 500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4 : movz x1, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 1000: SimpleCPU: system.cpu: Tick
+ 1000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8 : m5checkpoint : IntAlu : flags=(IsInteger|IsNonSpeculative|IsUnverifiable)
+ 1000: SimpleCPU: system.cpu: Resume
+ 1500: SimpleCPU: system.cpu: Tick
+ 1500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 2000: SimpleCPU: system.cpu: Tick
+ 2000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+16 : m5exit : No_OpClass : flags=(IsInteger|IsNonSpeculative)+
and after restore we see as expected a single ExecEnable instruction executed amidst O3CPU noise:
FullO3CPU: Ticking main, FullO3CPU.
+ 79000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 FetchSeq=1 CPSeq=1 flags=(IsInteger)
+ 82500: O3CPU: system.cpu: Removing committed instruction [tid:0] PC (0x400084=>0x400088).(0=>1) [sn:1]
+ 82500: O3CPU: system.cpu: Removing instruction, [tid:0] [sn:1] PC (0x400084=>0x400088).(0=>1)
+ 82500: O3CPU: system.cpu: Scheduling next tick!
+ 83000: O3CPU: system.cpu:+
which is the movz after the checkpoint. The final m5exit does not appear due to DerivO3CPU logging insanity.
Bibliography:
+Besides switching CPUs after a checkpoint restore, fs.py also has the --fast-forward option to automatically run the script from the start on a less detailed CPU, and switch to a more detailed CPU at a given tick.
This is generally useless compared to checkpoint restoring because:
+checkpoint restore allows running multiple contents after the restore, and restoring to multiple different system states, which you almost always want to do
+we generally don’t know the exact tick at which the region of interest will start, especially as the binaries change. It is much easier to just instrument the content with a checkpoint m5op, as sketched below
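For example, a hedged sketch of such instrumentation, assuming the program is linked against gem5's libm5 so that its m5ops.h header is available:

#include <gem5/m5ops.h>

/* Stand-in for the actual region of interest. */
static void benchmark(void) {}

int main(void) {
    /* Take a checkpoint right before the region of interest... */
    m5_checkpoint(0, 0);
    benchmark();
    /* ...and end the simulation right after it. */
    m5_exit(0);
    return 0;
}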
+But let’s give it a try anyways with userland/freestanding/gem5_checkpoint_restore.S, which was mentioned at gem5 checkpoint userland minimal example:
+./run \
+ --arch aarch64 \
+ --emulator gem5 \
+ --static \
+ --trace ExecAll,FmtFlag,O3CPU,SimpleCPU \
+ --userland userland/freestanding/gem5_checkpoint_restore.S \
+ -- \
+ --caches \
+ --cpu-type DerivO3CPU \
+ --fast-forward 1000 \
+;
+cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"+
At gem5 2235168b72537535d74c645a70a85479801e0651 we see something like:
+0: O3CPU: system.switch_cpus: Creating O3CPU object.
+ 0: O3CPU: system.switch_cpus: Workload[0] process is 0
+ 0: SimpleCPU: system.cpu: ActivateContext 0
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x40 WriteReq
+...
+
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1f92 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e40 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e30 WriteReq
+ 0: SimpleCPU: system.cpu: Tick
+ 0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 500: SimpleCPU: system.cpu: Tick
+ 500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4 : movz x1, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 1000: SimpleCPU: system.cpu: Tick
+ 1000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8 : m5checkpoint : IntAlu : flags=(IsInteger|IsNonSpeculative|IsUnverifiable)
+ 1000: O3CPU: system.switch_cpus: [tid:0] Calling activate thread.
+ 1000: O3CPU: system.switch_cpus: [tid:0] Adding to active threads list
+ 1500: O3CPU: system.switch_cpus:
+
+FullO3CPU: Ticking main, FullO3CPU.
+ 1500: O3CPU: system.switch_cpus: Scheduling next tick!
+ 2000: O3CPU: system.switch_cpus:
+
+FullO3CPU: Ticking main, FullO3CPU.
+ 2000: O3CPU: system.switch_cpus: Scheduling next tick!
+ 2500: O3CPU: system.switch_cpus:
+
+...
+
+FullO3CPU: Ticking main, FullO3CPU.
+ 44500: ExecEnable: system.switch_cpus: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x00000000000
+ 48000: O3CPU: system.switch_cpus: Removing committed instruction [tid:0] PC (0x400084=>0x400088).(0=>1) [sn:1]
+ 48000: O3CPU: system.switch_cpus: Removing instruction, [tid:0] [sn:1] PC (0x400084=>0x400088).(0=>1)
+ 48000: O3CPU: system.switch_cpus: Scheduling next tick!
+ 48500: O3CPU: system.switch_cpus:
+
+...+
We can also compare that to the same log but without --fast-forward and other CPU switch options:
0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e40 WriteReq
+ 0: SimpleCPU: system.cpu.dcache_port: received snoop pkt for addr:0x1e30 WriteReq
+ 0: SimpleCPU: system.cpu: Tick
+ 0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 500: SimpleCPU: system.cpu: Tick
+ 500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4 : movz x1, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 1000: SimpleCPU: system.cpu: Tick
+ 1000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8 : m5checkpoint : IntAlu : flags=(IsInteger|IsNonSpeculative|IsUnverifiable)
+ 1000: SimpleCPU: system.cpu: Resume
+ 1500: SimpleCPU: system.cpu: Tick
+ 1500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+12 : movz x0, #0, #0 : IntAlu : D=0x0000000000000000 flags=(IsInteger)
+ 2000: SimpleCPU: system.cpu: Tick
+ 2000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+16 : m5exit : No_OpClass : flags=(IsInteger|IsNonSpeculative)+
Therefore, it is clear that what we wanted happened:
+up until tick 1000, SimpleCPU was ticking
after tick 1000, the O3CPU started ticking
Bibliography:
+because glibc was built to expect a newer Linux kernel as shown at: Section 10.4.1, “FATAL: kernel too old”. Your choices to solve this are:
+because glibc was built to expect a newer Linux kernel as shown at: Section 10.4.1, “FATAL: kernel too old failure in userland simulation”. Your choices to solve this are:
It is obviously not possible to understand what they actually do from their commit message, so let’s explain them one by one here as we understand them:
+It is obviously not possible to understand what the Linux kernel fork commits actually do from their commit message, so let’s explain them one by one here as we understand them:
the type parameter is present on every node, and it maps to a Python object that inherits from SimObject.
For example, AtomicSimpleCPU is defined at src/cpu/simple/AtomicSimpleCPU.py.
All those tests could in theory be added to this repo instead of to gem5, and this is actually the superior setup as it is cross emulator.
+But can the people from the project be convinced of that?
+These are just very small GTest tests that test a single class in isolation; they don’t run any executables.
Note that the command and its corresponding results don’t necessarily show up consecutively on stdout because tests are run in parallel. You just have to match them based on the class name CircleBufTest to the file circlebuf.test.cpp.
This section is about running the gem5 in-tree tests.
+After the first run has downloaded the test binaries for you, you can speed up the process a little bit by skipping an useless scons call:
+After the first run has downloaded the test binaries for you, you can speed up the process a little bit by skipping a useless SCons call:
Note however that --skip-build is required at least once per branch to download the test binaries, because the test interface is bad.
Note however that running without --skip-build is required at least once to download the test binaries, because the test interface is bad.
List available tests instead of running them:
+./gem5-regression --gem5-worktree master --arch aarch64 --cmd list+
You can then pick one suite (it has to be a suite, not an "individual test") from the list and run just that one, e.g. with:
+./gem5-regression --arch aarch64 -- --uid SuiteUID:tests/gem5/cpu_tests/test.py:cpu_test_AtomicSimpleCPU_Bubblesort-ARM-opt+
This error happens when the following instruction limits are reached:
In order to use different build options, you might also want to use gem5 build variants to keep the build outputs separate from one another.
Explained at: Section 18.7, “Debug the emulator”.
./build-gem5 --gem5-build-type fast+
How it goes faster is explained at: https://stackoverflow.com/questions/59860091/how-to-increase-the-simulation-speed-of-a-gem5-run/59861375#59861375
+Benchmarks are present at:
+Profiling builds as of 3cea7d9ce49bda49c50e756339ff1287fd55df77 both use -g -O3 and disable asserts and logging like the gem5 fast build, and:
prof uses -pg for gprof
perf uses -lprofiler for google-pprof
See also: Profiling userland programs.
+TODO test properly, benchmark vs GCC.
If gem5 appears to have a C++ undefined behaviour bug, which is often very difficult to track down, you can try to build it with the following extra SCons options:
Ruby is a system that includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby
+gem5 has two types of memory system:
+the classic memory system, which is used by default
+the Ruby memory system
+The Ruby memory system includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby
It seems to have usage outside of gem5, but the naming overload with the Ruby programming language, which also has domain specific languages as a concept, makes it impossible to google anything about it!
Ruby is activated at compile time with the PROTOCOL flag, which specifies the desired memory system time.
Since it is not the default, Ruby is generally less stable than the classic memory model. However, because it allows describing a wide variety of important coherency protocols, while the classic system only describes a single protocol, Ruby is a very important feature of gem5.
+Ruby support must be enabled at compile time with the scons PROTOCOL= flag, which compiles support for the desired memory system type.
Note however that most ISAs already implicitly set PROTOCOL via the build_opts/ directory, e.g. build_opts/ARM contains:
PROTOCOL = 'MOESI_CMP_directory'+
and therefore ARM already compiles MOESI_CMP_directory by default.
Then, with fs.py and se.py, you can choose to use either the classic or the built-in Ruby system at runtime with the --ruby option:
if --ruby is given, use the ruby memory system
otherwise, use the classic memory system
+For example, to use a two level MESI cache coherence protocol, we can do:
@@ -20173,10 +20578,7 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
which shows that dozens of C++ files are being generated from Ruby SLICC.
TODO observe it doing something during a run.
-The relevant source files live in the source tree under:
+The relevant Ruby source files live in the source tree under:
We already pass the SLICC_HTML flag by default to the build, which generates an HTML summary of each memory protocol under:
We already pass the SLICC_HTML flag by default to the build, which generates an HTML summary of each memory protocol under (TODO broken: https://gem5.atlassian.net/browse/GEM5-357):
A minimized ruby config which was not merged upstream can be found for study at: https://gem5-review.googlesource.com/c/public/gem5/+/13599/1
One easy way to see that Ruby is being used without understanding it in detail is to enable some logging:
+./run \
+ --arch aarch64 \
+ --emulator gem5 \
+ --gem5-worktree master \
+ --userland userland/arch/aarch64/freestanding/linux/hello.S \
+ --static \
+ --trace ExecAll,FmtFlag,Ruby,XBar \
+ -- \
+ --ruby \
+;
+cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"+
Then:
+when the --ruby flag is given, we see a gazillion Ruby related messages prefixed e.g. by RubyPort:.
We also observe from ExecEnable lines that instruction timing is not simple anymore, so the memory system must have latencies
without --ruby, we instead see XBar (Coherent Crossbar) related messages such as CoherentXBar:, which I believe is the more precise name for the memory model that the classic memory system uses
Certain features may not work in Ruby. For example, gem5 checkpoint creation is only possible in Ruby protocols that support flush, which is the case for PROTOCOL=MOESI_hammer but not PROTOCOL=MESI_Three_Level: https://www.mail-archive.com/gem5-users@gem5.org/msg17418.html
Tested in gem5 d7d9bc240615625141cd6feddbadd392457e49eb.
+Python 3 support was mostly added in 2019 Q3 at around a347a1a68b8a6e370334be3a1d2d66675891e0f1 but remained buggy for some time afterwards.
gem5 has a few in tree CPU models for different purposes.
Both of those can be checked with git log and git blame.
Simple abstract CPU without a pipeline.
Generic in-order core that does not model any specific CPU.
Generic out-of-order core. "O3" stands for "Out Of Order"!
The gem5 platform is selectable with the --machine option, which is named after the analogous QEMU -machine option, and which sets the underlying gem5 --machine-type option.
Internals under other sections:
In order to develop complex C++ software such as gem5, a good IDE setup is fundamental.
The interaction uses the Python C extension interface https://docs.python.org/2/extending/extending.html interface through the pybind11 helper library: https://github.com/pybind/pybind11
+The interaction uses the Python C extension interface (https://docs.python.org/2/extending/extending.html) through the pybind11 helper library: https://github.com/pybind/pybind11
The C++ executable both:
@@ -20558,7 +21000,7 @@ cd ..
then gem5 magic simobject class adds some crazy stuff on top of it further… is is a mess. in particular, it auto generates params/ headers. TODO: why is this mess needed at all? pybind11 seems to handle constructor arguments just fine:
then the gem5 magic SimObject class adds some crazy stuff on top of it; it is a mess. In particular, it auto generates params/ headers. TODO: why is this mess needed at all? pybind11 seems to handle constructor arguments just fine:
Since BadDevice has no __init__ method, and neither BasicPioDevice, it all just falls through until the SimObject.init constructor.
Since BadDevice has no __init__ method, and neither does BasicPioDevice, it all just falls through until the SimObject.__init__ constructor.
This constructor will loop through the inheritance chain and give the Python parameters to the C++ BadDeviceParams class as follows.
@@ -20689,11 +21131,17 @@ static EmbeddedPyBind embed_obj("BadDevice", module_init, "BasicPioDevice");It has been found that this usage of pybind11 across hundreds of SimObject files accounted for 50% of the gem5 build time at one point: https://gem5.atlassian.net/browse/GEM5-366
To get a feeling of how SimObject objects are run, see: gem5 event queue AtomicSimpleCPU syscall emulation freestanding example analysis.
Tested on gem5 08c79a194d1a3430801c04f37d13216cc9ec1da3.
The main is at: src/sim/main.cc. It calls:
and that is where doSimLoop the main event loop, doSimLoop gets called and starts kicking off the gem5 event queue.
and that is where the main event loop, doSimLoop, gets called and starts kicking off the gem5 event queue.
Tested at gem5 b4879ae5b0b6644e6836b0881e4da05c64a6550d.
gem5 is an event based simulator, and as such the event queue is one of the crucial elements in the system.
This calls the Event::process method of the event.
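As a rough mental model only (this is not gem5's actual implementation), the event queue can be pictured as a tick-ordered list of callbacks that the simulation loop pops and processes:

#include <stdio.h>

/* Rough mental model, not gem5's code: events ordered by tick,
 * popped and processed one by one until none remain. */
typedef struct Event {
    unsigned long long tick;
    void (*process)(struct Event *self);
    struct Event *next;
} Event;

static Event *queue;

static void schedule(Event *e) {
    /* Insert into the list, keeping it sorted by tick. */
    Event **p = &queue;
    while (*p && (*p)->tick <= e->tick)
        p = &(*p)->next;
    e->next = *p;
    *p = e;
}

static void simulate(void) {
    while (queue) {
        Event *e = queue;
        queue = e->next;
        /* Corresponds conceptually to Event::process in gem5. */
        e->process(e);
    }
}

static void hello(Event *e) {
    printf("tick %llu\n", e->tick);
}

int main(void) {
    Event e1 = {500, hello, NULL}, e2 = {0, hello, NULL};
    schedule(&e1);
    schedule(&e2);
    simulate(); /* prints tick 0, then tick 500 */
    return 0;
}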
Let’s now analyze every single event on a minimal gem5 syscall emulation mode in the simplest CPU that we have:
which calls EventManager::schedule.
AtomicSimpleCPU is an EventManager because SimObject inherits from it.
tickEvent is an EventFunctionWrapper which contains a std::function<void(void)> callback;, and is initialized in the constructor as:
The call stack above ArmLinuxProcess64::initState is pybind11 fuzziness, but if we grep a bit we find the Python call point:
src/python/m5/simulate.py
+def instantiate(ckpt_dir=None):
+
+    ...
+
+    # Create the C++ sim objects and connect ports
+    for obj in root.descendants(): obj.createCCObject()
+    for obj in root.descendants(): obj.connectPorts()
+
+    # Do a second pass to finish initializing the sim objects
+    for obj in root.descendants(): obj.init()
+
+    ...
+
+    # Restore checkpoint (if any)
+    if ckpt_dir:
+        ...
+    else:
+        for obj in root.descendants(): obj.initState()+
As we can see, initState is just one stage of generic SimObject initialization. root.descendants() goes over the entire SimObject tree calling initState().
Finally, we see that initState is part of the SimObject C++ API:
src/sim/sim_object.hh
+class SimObject : public EventManager, public Serializable, public Drainable,
+ public Stats::Group
+{
+
+ ...
+
+ /**
+ * initState() is called on each SimObject when *not* restoring
+ * from a checkpoint. This provides a hook for state
+ * initializations that are only required for a "cold start".
+ */
+ virtual void initState();
+Then, we see that initState is exposed to the Python API at:
build/ARM/python/_m5/param_SimObject.cc
+module_init(py::module &m_internal)
+{
+ py::module m = m_internal.def_submodule("param_SimObject");
+ py::class_<SimObjectParams, std::unique_ptr<SimObjectParams, py::nodelete>>(m, "SimObjectParams")
+ .def_readwrite("name", &SimObjectParams::name)
+ .def_readwrite("eventq_index", &SimObjectParams::eventq_index)
+ ;
+
+ py::class_<SimObject, Drainable, Serializable, Stats::Group, std::unique_ptr<SimObject, py::nodelete>>(m, "SimObject")
+ .def("init", &SimObject::init)
+ .def("initState", &SimObject::initState)
+ .def("memInvalidate", &SimObject::memInvalidate)
+ .def("memWriteback", &SimObject::memWriteback)
+ .def("regProbePoints", &SimObject::regProbePoints)
+ .def("regProbeListeners", &SimObject::regProbeListeners)
+ .def("startup", &SimObject::startup)
+ .def("loadState", &SimObject::loadState, py::arg("cp"))
+ .def("getPort", &SimObject::getPort, pybind11::return_value_policy::reference, py::arg("if_name"), py::arg("idx"))
+ ;
+
+}
+which is more magical than the other param classes since py::class_<SimObject has non-trivial methods; those are auto-generated by the cxx_exports code generation mechanism:
class SimObject(object):
+
+ ...
+
+ cxx_exports = [
+ PyBindMethod("init"),
+ PyBindMethod("initState"),
+ PyBindMethod("memInvalidate"),
+ PyBindMethod("memWriteback"),
+ PyBindMethod("regProbePoints"),
+ PyBindMethod("regProbeListeners"),
+ PyBindMethod("startup"),
+ ]
+So that’s how the main atomic tick loop works, fully understood!
TODO: analyze better what each of the memory event mean. For now, we have just collected a bunch of data there, but needs interpreting. The CPU specifics in this section are already insightful however.
Let’s just add --caches to see if things go any faster:
The events for the Atomic CPU were pretty simple: basically just ticks.
This describes the internals of the gem5 m5out/stats.txt file.
gem5 uses a ton of code generation, which makes the project horrendous:
But it has been widely overused to insanity. It likely also exists partly because when the project started in 2003 C++ compilers weren’t that good, so you couldn’t rely on features like templates that much.
Generated code at: build/<ISA>/config/the_isa.hh which contains amongst other lines:
E.g. src/cpu/decode_cache.hh includes:
Some SCons madness.
It is likely preventing ccache hits when building to different output paths, because it makes the -I includes point to different paths. This is especially important for gem5 Ruby build, which could have the exact same source files as the non-Ruby builds: https://stackoverflow.com/questions/60340271/can-ccache-handle-symlinks-to-the-same-input-source-file-as-hits
when debugging the emulator, it shows you directories inside the build directory rather than in the source tree
Certain ISAs like ARM have bootloaders that are automatically run before the main image to setup basic system state.
libguestfs: https://serverfault.com/questions/246835/convert-directory-to-qemu-kvm-virtual-disk-image/916697#916697, in particular vfs-minimum-size
use methods described at: Section 19.5.2, “gem5 checkpoint restore and run a different script” instead of putting builds on the root filesystem
+use methods described at: Section 19.5.3, “gem5 checkpoint restore and run a different script” instead of putting builds on the root filesystem
Leads to the dreadful "Stack smashing detected" message, which is infinitely better than a silent break in any case.
We had also seen this error in our repository at: stack smashing detected when using glibc.
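For reference, a deliberately buggy program along these lines typically triggers that message when compiled with GCC's stack protector enabled (e.g. -fstack-protector-all); this is just an illustrative sketch, not one of the repository's examples:

#include <string.h>

int main(void) {
    char buf[8];
    /* Deliberately overflow the buffer: the canary that the stack
     * protector placed after buf gets clobbered, so glibc aborts with
     * "*** stack smashing detected ***" when main returns. */
    memset(buf, 'A', 32);
    return 0;
}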
+OK, we have to learn this stuff.
+Examples:
+userland/gcc/profile.c: simple profiling example, where certain calls of a certain function can dominate the runtime; a sketch of the idea follows
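The general shape of such a program (a hypothetical sketch, not the exact contents of profile.c) is a hot function dominating a cheap one, which a profiler like gprof then makes obvious:

#include <stdio.h>
#include <stdlib.h>

/* Cheap function: negligible runtime. */
static unsigned long fast(unsigned long n) {
    return n + 1;
}

/* Hot function: dominates the profile. */
static unsigned long slow(unsigned long n) {
    unsigned long sum = 0;
    for (unsigned long i = 0; i < 100000; i++)
        sum += i % (n + 1);
    return sum;
}

int main(int argc, char **argv) {
    unsigned long iters = argc > 1 ? strtoul(argv[1], NULL, 10) : 1000;
    unsigned long acc = 0;
    for (unsigned long i = 0; i < iters; i++) {
        acc += fast(i);
        acc += slow(i);
    }
    printf("%lu\n", acc);
    return 0;
}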
+Build and install the interpreter on the target:
+Examples:
+rootfs_overlay/lkmc/python/hello.py: hello world
+time
rootfs_overlay/lkmc/python/count.py: count once every second
+Buildroot has a Python package that can be added to the guest image:
LKMC 50ac89b779363774325c81157ec8b9a6bdb50a2f gem5 390a74f59934b85d91489f8a563450d8321b602da:
+At LKMC 50ac89b779363774325c81157ec8b9a6bdb50a2f gem5 390a74f59934b85d91489f8a563450d8321b602da:
Examples:
+Here we will add some better examples and explanations for: https://docs.python.org/3/extending/embedding.html#very-high-level-embedding
+"Embedding Python" basically means calling the Python interpreter from C, and possibly passing values between the two.
+These examples show how to embed the Python interpreter into a C/C++ application and how to interface between the two
rootfs_overlay/lkmc/python/hello.py: hello world
+userland/libs/python_embed/eval.c: this example simply evals a Python string from C, and doesn’t communicate any values between the two.
It could be used to call external commands that have external side effects, but it is not very exciting.
+userland/libs/python_embed/pure.c: this example actually defines some Python classes and functions from C, implementing them entirely in C; see the sketch after this list.
+The C program that defines those classes then instantiates the interpreter and calls some regular Python code from it: userland/libs/python_embed/pure.py
+The regular Python code can then use the native C classes as if they were defined in Python.
+Finally, the Python code returns values back to the C code that called the interpreter.
+userland/libs/python_embed/pure_cpp.cpp: C++ version of the above; the main goal of this example is to show how to interface with C++ classes.
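A minimal hedged sketch of the core mechanism (hypothetical, not the exact contents of pure.c): define a native function in C, register it as a module before interpreter startup, and call it from embedded Python code:

#include <Python.h>

/* A native function implemented in C, callable from Python. */
static PyObject *native_add(PyObject *self, PyObject *args) {
    long a, b;
    if (!PyArg_ParseTuple(args, "ll", &a, &b))
        return NULL;
    return PyLong_FromLong(a + b);
}

static PyMethodDef methods[] = {
    {"add", native_add, METH_VARARGS, "Add two integers in C."},
    {NULL, NULL, 0, NULL}
};

static struct PyModuleDef module = {
    PyModuleDef_HEAD_INIT, "native", NULL, -1, methods,
    NULL, NULL, NULL, NULL
};

static PyObject *make_module(void) {
    return PyModule_Create(&module);
}

int main(void) {
    /* The module must be registered before Py_Initialize. */
    PyImport_AppendInittab("native", make_module);
    Py_Initialize();
    /* Regular Python code using the native C function. */
    PyRun_SimpleString(
        "import native\n"
        "print('from C:', native.add(1, 2))\n");
    return Py_FinalizeEx() ? 1 : 0;
}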
One notable user of Python embedding is the gem5 simulator, see also: gem5 vs QEMU. gem5 embeds the Python interpreter in order to interpret scripts as seen from the CLI:
+build/ARM/gem5.opt configs/example/fs.py+
gem5 then runs that Python script, which instantiates C classes defined from Python, and then finally hands back control to the C runtime to run the actual simulation faster.
+See for example BLAS.
Binary format to store data. TODO vs databases, notably SQLite: https://datascience.stackexchange.com/questions/262/hierarchical-data-format-what-are-the-advantages-compared-to-alternative-format
+Examples: userland/libs/hdf5
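For a flavor of the C API those examples use, here is a minimal hedged sketch (assuming libhdf5 is installed) that writes a small 1D integer dataset:

#include <hdf5.h>

int main(void) {
    /* Write the array {1, 2, 3, 4} as dataset "numbers" in out.h5. */
    int data[4] = {1, 2, 3, 4};
    hsize_t dims[1] = {4};
    hid_t file = H5Fcreate("out.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    hid_t space = H5Screate_simple(1, dims, NULL);
    hid_t dset = H5Dcreate2(file, "numbers", H5T_NATIVE_INT, space,
                            H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
    H5Dclose(dset);
    H5Sclose(space);
    H5Fclose(file);
    return 0;
}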
+The following basenames should always refer to programs that do the same thing, but in different languages:
+count: count to infinity, sleep one second between each number; a C sketch follows this list
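For instance, a C version of count might look like this minimal sketch:

#include <stdio.h>
#include <unistd.h>

int main(void) {
    /* Count to infinity, sleeping one second between each number. */
    for (unsigned long i = 0; ; i++) {
        printf("%lu\n", i);
        fflush(stdout);
        sleep(1);
    }
    return 0;
}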
| Comment | LKMC | Benchmark build | Emulator command |
@@ -32689,60 +33391,87 @@ instructions 124346081
|||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
QEMU busy loop |
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
+`./run --arch aarch64 --userland userland/gcc/busy_loop.c ` |
10^10 |
68 |
1.1 * 10^11 (approx) |
2000 |
||||
gem5 busy loop |
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
-10^7 |
-100 |
-1.10018162 * 10^8 |
-1 |
+
|
+10^6 |
+18 |
+2.4005699 * 10^7 |
+1.3 |
gem5 busy loop for a debug build |
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
+
|
10^5 |
-32 |
-2.528728 * 10^6 |
-0.08 |
+33 |
+2.405682 * 10^6 |
+0.07 |
|
gem5 busy loop for a fast build |
+0d5a41a3f88fcd7ed40fc19474fe5aed0463663f + 1 |
+userland/gcc/busy_loop.c |
+
|
+10^6 |
+15 |
+2.4005699 * 10^7 |
+1.6 |
+|||||
gem5 busy loop for a TimingSimpleCPU |
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
+
|
+10^6 |
+26 |
+2.4005699 * 10^7 |
+0.9 |
+||||
gem5 busy loop for a MinorCPU |
+a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
++ |
|
10^6 |
31 |
1.1018152 * 10^7 |
0.4 |
|||||
gem5 busy loop for a DerivO3CPU |
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
+
|
10^6 |
52 |
1.1018128 * 10^7 |
0.2 |
||||
a18f28e263c91362519ef550150b5c9d75fa3679 + 1 |
- |
|
+
|
1 * 1000000 = 10^6 |
63 |
1.1005150 * 10^7 |
0.2 |
|||||
605448f07e6380634b1aa7e9732d111759f69fd |
Dhrystone |
|
@@ -32752,17 +33481,19 @@ instructions 124346081
1.6 |
|||||||||
5d233f2664a78789f9907d27e2a40e86cefad595 |
STREAM benchmark |
-
|
+
|
3 * 10^5 * 2 |
64 |
9.9674773 * 10^7 |
1.6 |
|||||
glibc C pre-main effects |
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
-glibc C pre-main userland/c/m5ops.c |
+
|
1 |
2 |
@@ -32770,6 +33501,7 @@ instructions 124346081
0.05 |
||||||
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
glibc C pre-main userland/c/m5ops.c |
|
@@ -32779,6 +33511,7 @@ instructions 124346081
0.05 |
|||||||||
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
glibc C++ pre-main userland/cpp/m5ops.cpp |
|
@@ -32788,6 +33521,7 @@ instructions 124346081
1 |
|||||||||
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
glibc C++ pre-main userland/cpp/m5ops.cpp |
|
@@ -32797,22 +33531,44 @@ instructions 124346081
0.1 |
|||||||||
gem5 optimized build immediate exit on first instruction to benchmark the simulator startup time |
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
-immediate exit userland/arch/aarch64/freestanding/linux/gem5_exit.S |
+immediate exit userland/freestanding/gem5_exit.S |
|
1 |
1 |
-+ | 1 |
||||
same as above but debug build |
ab6f7331406b22f8ab6e2df5f8b8e464fb35b611 |
-immediate exit userland/arch/aarch64/freestanding/linux/gem5_exit.S |
+
|
1 |
1 |
+1 |
- | + | ||||
Check the effect of an ExecAll log (log every instruction) on execution time, compare to analogous run without it. |
+d29a07ddad499f273cc90dd66e40f8474b5dfc40 |
++ |
|
+10^6 |
+2.4106774 * 10^7 |
+136 |
+0.2 |
+|||||
Same as above but with run command manually hacked to output to a ramfs. Slightly faster, but the bulk was still just in log format operations! |
+d29a07ddad499f273cc90dd66e40f8474b5dfc40 |
++ |
|
+10^6 |
+2.4106774 * 10^7 |
+107 |
+0.2 |
We will update this whenever the gem5 submodule is updated.
All benchmarks done on P51.
+Sample results at gem5 2a9573f5942b5416fb0570cf5cb6cdecba733392: 10 to 12 minutes.
Ubuntu 19.10, GCC 9.2.1, LKMC 7c6bb29bc89ec3f1056c0680c3f08bd64018a7bc, gem5 d7d9bc240615625141cd6feddbadd392457e49eb (18-02-2020), ./build --arch aarch64 --gem5-worktree master --no-cache: 19:33 TODO must investigate why it got so much worse.
Lenovo ThinkPad P51 laptop:
+Lenovo ThinkPad P51 laptop with the latest stable Ubuntu.
2500 USD in 2018 (high end)
-Intel Core i7-7820HQ Processor (8MB Cache, up to 3.90GHz) (4 cores 8 threads)
-32GB(16+16) DDR4 2400MHz SODIMM
-512GB SSD PCIe TLC OPAL2
-NVIDIA Quadro M1200 Mobile, latest Ubuntu supported proprietary driver
-Latest Ubuntu
-Full specs and benchmark scores will be maintained at the latest version of: https://github.com/cirosantilli/notes/blob/0c038b0e430d0017f12d028c6a0e7c0b99ec957f/my-hardware.adoc#thinkpad-p51
ccache can be disabled with the --no-ccache option as in:
./build-gem5 --no-ccache+
This can be useful to benchmark builds.