From 37627d90aaf9068798d1e67212df0f6c12c8a3b5 Mon Sep 17 00:00:00 2001
From: Ciro Santilli 六四事件 法轮功
Date: Thu, 29 Oct 2020 00:00:00 +0000
Subject: [PATCH] 1cf5222769851522454e241f3270d34c5ee55951
---
 index.html | 1419 ++++++++++++----------------------------------------
 1 file changed, 320 insertions(+), 1099 deletions(-)

diff --git a/index.html b/index.html
index 64d5a65..8e158c1 100644
--- a/index.html
+++ b/index.html
@@ -477,10 +477,9 @@ pre{ white-space:pre }
  • 1.2.3. About the QEMU Buildroot setup
@@ -1069,19 +1068,17 @@ pre{ white-space:pre }
  • 22.7. QEMU monitor
@@ -1188,8 +1185,9 @@ pre{ white-space:pre }
  • 23.8.2. m5ops instructions
@@ -1492,11 +1490,7 @@ pre{ white-space:pre }
  • 26.2.3. GCC C extensions
@@ -1520,7 +1514,17 @@ pre{ white-space:pre }
  • 26.3.3.1. C++17 N4659 standards draft
-  • 26.3.4. C++ type casting
+  • 26.3.4. C++ templates
+  • 26.3.5. C++ type casting
+  • 26.3.6. C++ compile time magic
  • 26.4. POSIX
@@ -1561,10 +1565,16 @@ pre{ white-space:pre }
-  • 26.9.2. Microbenchmarks
+  • 26.9.2. Microbenchmarks
  • 26.10. userland/libs directory
@@ -2105,11 +2108,7 @@ pre{ white-space:pre }

    added in our fork of QEMU:

@@ -17428,77 +17352,7 @@ Format specific information:

    Only tested in x86.

22.6.1.1. pci_min

    PCI driver for our minimal pci_min.c QEMU fork device:

./run -- -device lkmc_pci_min

then:

insmod pci_min.ko

    Sources:


    Outcome:

<4>[   10.608241] pci_min: loading out-of-tree module taints kernel.
-<6>[   10.609935] probe
-<6>[   10.651881] dev->irq = 11
-lkmc_pci_min mmio_write addr = 0 val = 12345678 size = 4
-<6>[   10.668515] irq_handler irq = 11 dev = 251
-lkmc_pci_min mmio_write addr = 4 val = 0 size = 4

    What happened:

  • right at probe time, we write to a register

  • our hardware model is coded such that it generates an interrupt when written to

  • the Linux kernel interrupt handler writes to another register, which tells the hardware to stop sending interrupts

Kernel messages and printks from inside QEMU are shown all together; to see that more clearly, run in QEMU graphic mode instead.


    We don’t enable the device by default because it does not work for vanilla QEMU, which we often want to test with this repository.


Probe already does an MMIO write, which generates an IRQ and tests everything.
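The write-IRQ-acknowledge handshake above can be sketched as a toy model; the class and function names are ours for illustration, not from the actual C module or the QEMU device model, but the register offsets (0 = trigger, 4 = acknowledge) match the mmio_write log lines:

```python
class PciMinDevice:
    """Toy model: a write to register 0 raises the IRQ line, a write to
    register 4 acknowledges it and lowers the line again."""
    def __init__(self):
        self.irq_raised = False
        self.log = []

    def mmio_write(self, addr, val):
        self.log.append((addr, val))
        if addr == 0:
            self.irq_raised = True   # hardware fires an interrupt on write
        elif addr == 4:
            self.irq_raised = False  # driver acknowledged: stop interrupting

def irq_handler(dev):
    # Like the module's handler: write another register to silence the IRQ.
    dev.mmio_write(4, 0)

def probe(dev):
    # Like the module's probe: a single MMIO write exercises everything.
    dev.mmio_write(0, 0x12345678)
    if dev.irq_raised:
        irq_handler(dev)

dev = PciMinDevice()
probe(dev)
print(dev.log)         # [(0, 305419896), (4, 0)]
print(dev.irq_raised)  # False
```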

-22.6.1.2. QEMU edu PCI device
+22.6.1.1. QEMU edu PCI device

    Small upstream educational PCI device:

    @@ -17564,11 +17418,14 @@ lkmc_pci_min mmio_write addr = 4 val = 0 size = 4
  • https://stackoverflow.com/questions/62831327/add-memory-device-to-qemu


    https://stackoverflow.com/questions/64539528/qemu-pci-dma-read-and-pci-dma-write-does-not-work

-22.6.1.3. Manipulate PCI registers directly
+22.6.1.2. Manipulate PCI registers directly

    In this section we will try to interact with PCI devices directly from userland without kernel modules.

    @@ -17590,7 +17447,7 @@ lkmc_pci_min mmio_write addr = 4 val = 0 size = 4
-which we identify as being edu and pci_min respectively by the magic numbers: 1234:11e?
+which we identify as being the QEMU edu PCI device by the magic number: 1234:11e8.

Alternatively, we can also use the QEMU monitor:

    @@ -17605,17 +17462,7 @@ lkmc_pci_min mmio_write addr = 4 val = 0 size = 4
    -
          dev: lkmc_pci_min, id ""
    -        addr = 07.0
    -        romfile = ""
    -        rombar = 1 (0x1)
    -        multifunction = false
    -        command_serr_enable = true
    -        x-pcie-lnksta-dllla = true
    -        x-pcie-extcap-init = true
    -        class Class 00ff, addr 00:07.0, pci id 1234:11e9 (sub 1af4:1100)
    -        bar 0: mem at 0xfeb54000 [0xfeb54007]
    -      dev: edu, id ""
    +
          dev: edu, id ""
             addr = 06.0
             romfile = ""
             rombar = 1 (0x1)
    @@ -17652,7 +17499,7 @@ lkmc_pci_min mmio_write addr = 4 val = 0 size = 4
    setpci -s 0000:00:06.0 BASE_ADDRESS_0
    -setpci -d 1234:11e9 BASE_ADDRESS_0
    +setpci -d 1234:11e8 BASE_ADDRESS_0
    @@ -17680,16 +17527,14 @@ setpci -d 1234:11e9 BASE_ADDRESS_0
-which writes to the first register of our pci_min device.
+which writes to the first register of the edu device.

-The device then fires an interrupt at irq 11, which is unhandled, which leads the kernel to say you are a bad boy:
+The device then fires an interrupt at irq 11, which is unhandled, which leads the kernel to say you are a bad person:

    -
    lkmc_pci_min mmio_write addr = 0 val = 12345678 size = 4
    -<5>[ 1064.042435] random: crng init done
    -<3>[ 1065.567742] irq 11: nobody cared (try booting with the "irqpoll" option)
    +
    <3>[ 1065.567742] irq 11: nobody cared (try booting with the "irqpoll" option)
    @@ -17705,7 +17550,7 @@ devmem 0xfeb54000 w 0x12345678
-Our kernel module handles the interrupt, but does not acknowledge it like our proper pci_min kernel module, and so it keeps firing, which leads to infinitely many messages being printed:
+Our kernel module handles the interrupt, but does not acknowledge it like our proper edu kernel module, and so it keeps firing, which leads to infinitely many messages being printed:

    @@ -17714,7 +17559,7 @@ devmem 0xfeb54000 w 0x12345678
-22.6.1.4. pciutils
+22.6.1.3. pciutils

    There are two versions of setpci and lspci:

    @@ -17730,7 +17575,7 @@ devmem 0xfeb54000 w 0x12345678
-22.6.1.5. Introduction to PCI
+22.6.1.4. Introduction to PCI

    The PCI standard is non-free, obviously like everything in low level: https://pcisig.com/specifications but Google gives several illegal PDF hits :-)

    @@ -17790,7 +17635,7 @@ devmem 0xfeb54000 w 0x12345678
-22.6.1.6. PCI BFD
+22.6.1.5. PCI BFD

    lspci -k shows something like:

    @@ -17844,7 +17689,7 @@ devmem 0xfeb54000 w 0x12345678
-22.6.1.7. PCI BAR
+22.6.1.6. PCI BAR
    @@ -18001,85 +17846,7 @@ echo 255 >brightness
22.6.4. platform_device

    Minimal platform device example coded into the -M versatilepb SoC of our QEMU fork.


Using this device now requires checking out the branch:

git checkout platform-device
-git submodule sync

    before building, it does not work on master.


    Rationale: we found out that the kernels that build for qemu -M versatilepb don’t work on gem5 because versatilepb is an old pre-v7 platform, and gem5 requires armv7. So we migrated over to -M virt to have a single kernel for both gem5 and QEMU, and broke this since the single kernel was more important. TODO port to -M virt.


    Uses:


    Expected outcome after insmod:

  • QEMU reports MMIO with printfs

  • IRQs are generated and handled by this module, which logs to dmesg

    Without insmoding this module, try writing to the register with /dev/mem:

devmem 0x101e9000 w 0x12345678

    We can also observe the interrupt with dummy-irq:

modprobe dummy-irq irq=34
-insmod platform_device.ko

The IRQ number 34 was found on the dmesg after:

    insmod platform_device.ko
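A sketch of fishing the IRQ number out of dmesg. The exact log line format is an assumption modeled on the "dev->irq = 11" line from the pci_min example; adapt the regex to whatever the module actually prints:

```python
import re

def find_irq(dmesg):
    # Grab the first "irq = N" occurrence; returns None when absent.
    m = re.search(r'irq = (\d+)', dmesg)
    return int(m.group(1)) if m else None

sample = '<6>[   12.345678] dev->irq = 34'  # hypothetical dmesg line
print(find_irq(sample))         # 34
print(find_irq('nothing here')) # None
```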

-22.6.5. gem5 educational hardware models
+22.6.4. gem5 educational hardware models

    TODO get some working!

    @@ -19434,7 +19201,7 @@ root

    OK, this is why we used gem5 in the first place, performance measurements!

-Let’s see how many cycles Dhrystone, which Buildroot provides, takes for a few different input parameters.
+Let’s see how many cycles dhrystone, which Buildroot provides, takes for a few different input parameters.

We will do that for various input parameters on full system by taking a checkpoint after a fast atomic CPU boot finishes, and then restoring in a more detailed mode and running the benchmark:

    @@ -21309,10 +21076,33 @@ m5 execfile

    23.8.2. m5ops instructions

-gem5 allocates some magic instructions on unused instruction encodings for convenient guest instrumentation.
+There are a few different possible instructions that can be used to implement identical m5ops:

-Those instructions are exposed through the gem5 m5 executable in tree executable.
+All of those methods are exposed through the in-tree gem5 m5 executable. You can select which method to use when calling the executable, e.g.:

    m5 exit
    +# Same as the above.
    +m5 --inst exit
    +# The address is mandatory if not configured at build time.
    +m5 --addr 0x10010000 exit
    +m5 --semi exit
    +

    To make things simpler to understand, you can play around with our own minimized educational m5 subset:

    @@ -21400,7 +21190,45 @@ m5 execfile
-23.8.2.1. m5ops instructions interface
+23.8.2.1. m5ops magic addresses

    These are magic addresses that when accessed lead to an m5op.


The base address is given by system.m5ops_base, and then each m5op happens at a different address offset from that base.


    If system.m5ops_base is 0, then the memory m5ops are disabled.
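The addressing scheme can be sketched as follows; the helper name is ours, and 0x2100 is just an illustrative offset chosen to reproduce the 0x10012100 access seen in the failing trace in this section, not a documented m5op offset:

```python
def m5op_addr(m5ops_base, op_offset):
    # A base of 0 means the memory m5ops are disabled.
    if m5ops_base == 0:
        return None
    # Each m5op lives at a fixed offset from system.m5ops_base.
    return m5ops_base + op_offset

print(hex(m5op_addr(0x10010000, 0x2100)))  # 0x10012100
print(m5op_addr(0, 0x2100))                # None
```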


    Note that the address is physical, and therefore when running in full system on top of the Linux kernel, you must first map a virtual to physical address with /dev/mem as mentioned at: Userland physical address experiments.
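Since mmap on /dev/mem can only map at page granularity, the physical address has to be split into a page-aligned base plus an in-page offset; a minimal sketch of that arithmetic (the helper name is ours):

```python
def split_phys(phys, page_size=4096):
    # mmap needs a page-aligned offset into /dev/mem, so split the
    # physical address into an aligned base plus an in-page offset.
    base = phys & ~(page_size - 1)
    offset = phys & (page_size - 1)
    return base, offset

base, offset = split_phys(0x10012100)
print(hex(base), hex(offset))  # 0x10012000 0x100
```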


    One advantage of this method is that it can work with gem5 KVM, whereas the magic instructions don’t, since the host cannot handle them and it is hard to hook into that.


    A Baremetal example of that can be found at: baremetal/arch/aarch64/no_bootloader/m5_exit_addr.S.


As of gem5 0d5a80cb469f515b95e03f23ddaf70c9fd2ecbf2, fs.py --baremetal however disables the memory m5ops for some reason, so you should run that program as:

    ./run --arch aarch64 --baremetal baremetal/arch/aarch64/no_bootloader/m5_exit_addr.S --emulator gem5 --trace-insts-stdout -- --param 'system.m5ops_base=0x10010000'

    TODO failing with:

    info: Entering event queue @ 0.  Starting simulation...
    +fatal: Unable to find destination for [0x10012100:0x10012108] on system.iobus
    23.8.2.2. m5ops instructions interface

    Let’s study how the gem5 m5 executable uses them:

    @@ -21426,14 +21254,11 @@ m5 execfile

    magic instructions, which don’t exist in the corresponding arch

-magic memory addresses on a given page
+magic memory addresses on a given page: m5ops magic addresses

TODO: what is the advantage of magic memory addresses, given that you have to do more setup work by telling the kernel never to touch the magic page? For the magic instructions, the only thing that could go wrong is if you run some crazy kind of fuzzing workload that generates random instructions.

    -
    -

    Then, in aarch64 magic instructions for example, the lines:

    @@ -21514,7 +21339,7 @@ m5_fail(ints[1], ints[0]);
-23.8.2.2. m5op annotations
+23.8.2.3. m5op annotations

    include/gem5/asm/generic/m5ops.h also describes some annotation instructions.

    @@ -24208,6 +24033,9 @@ type=SimpleMemory

    and configure it into Eclipse as usual.

    +
    +

One downside of this setup is that if you want to nuke your build directory to get a clean build, then the Eclipse configuration files present in it might get deleted. Maybe it is possible to store configuration files outside of the directory, but for now we mitigate that by making a backup copy of those configuration files before removing the directory, and restoring them when you do ./build-gem5 --clean.
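The backup-and-restore mitigation can be sketched as follows; the function and the Eclipse file names (.project, .cproject) are hypothetical examples, not what the build script necessarily uses:

```python
import os
import shutil
import tempfile

ECLIPSE_FILES = ['.project', '.cproject']  # hypothetical config file names

def clean_build_dir(build_dir, backup_dir):
    # Back up the config files, nuke the build directory, recreate it,
    # then restore the saved configuration.
    os.makedirs(backup_dir, exist_ok=True)
    for name in ECLIPSE_FILES:
        src = os.path.join(build_dir, name)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(backup_dir, name))
    shutil.rmtree(build_dir)
    os.makedirs(build_dir)
    for name in ECLIPSE_FILES:
        src = os.path.join(backup_dir, name)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(build_dir, name))

with tempfile.TemporaryDirectory() as tmp:
    build = os.path.join(tmp, 'build')
    os.makedirs(build)
    with open(os.path.join(build, '.project'), 'w') as f:
        f.write('eclipse config')
    with open(os.path.join(build, 'stale.o'), 'w') as f:
        f.write('build junk')
    clean_build_dir(build, os.path.join(tmp, 'backup'))
    survivors = sorted(os.listdir(build))

print(survivors)  # ['.project']
```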

    +

    23.22.2. gem5 Python C++ interaction

    @@ -28879,6 +28707,9 @@ class O3ThreadContext : public ThreadContext
    +

    see also: https://stackoverflow.com/questions/64420547/in-gem5-how-do-i-know-the-specific-location-of-the-class/64423633#64423633

    +
    +

    Unlike in SimpleThread however, O3ThreadContext does not contain the register data itself, e.g. O3ThreadContext::readIntRegFlat instead forwards to cpu:

    @@ -30043,9 +29874,9 @@ build/ARM/config/the_isa.hh
-git submodule update --init submodules/gensim-simulator
+git submodule update --init submodules/gensim
     sudo apt install libantlr3c-dev
    -cd submodule/gensim-simulator
    +cd submodule/gensim
     make
    @@ -30091,12 +29922,12 @@ Aborted (core dumped)
    -
    cd /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/models/armv8 && \
    -  /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/build/dist/bin/gensim \
    -  -a /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/models/armv8/aarch64.ac \
    +
    cd /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/models/armv8 && \
    +  /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/build/dist/bin/gensim \
    +  -a /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/models/armv8/aarch64.ac \
       -s module,arch,decode,disasm,ee_interp,ee_blockjit,jumpinfo,function,makefile \
    -  -o decode.GenerateDotGraph=1,makefile.libtrace_path=/home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/support/libtrace/inc,makefile.archsim_path=/home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/archsim/inc,makefile.llvm_path=,makefile.Optimise=2,makefile.Debug=1 \
    -  -t /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim-simulator/build/models/armv8/output-aarch64/
+  -o decode.GenerateDotGraph=1,makefile.libtrace_path=/home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/support/libtrace/inc,makefile.archsim_path=/home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/archsim/inc,makefile.llvm_path=,makefile.Optimise=2,makefile.Debug=1 \
+  -t /home/ciro/bak/git/linux-kernel-module-cheat/submodules/gensim/build/models/armv8/output-aarch64/
    @@ -31500,101 +31331,6 @@ echo 1 > /proc/sys/vm/overcommit_memory

    strace shows that OpenMP makes clone() syscalls in Linux. TODO: does it actually call pthread_ functions, or does it make syscalls directly? Or in other words, can it work on Freestanding programs? A quick grep shows many references to pthreads.

26.2.3.2.1. OpenMP validation

    Host build on Ubuntu 20.04:

git submodule update --init submodules/omp-validation
-cd submodules/omp-validation
-PERL5LIB="${PERL5LIB}:." make -j `nproc` ctest

This both builds and runs, took about 5 minutes on a 2017 Lenovo ThinkPad P51, but had a few test failures for some reason:

    Summary:
    -S Number of tested Open MP constructs: 62
    -S Number of used tests:                123
    -S Number of failed tests:              4
    -S Number of successful tests:          119
    -S + from this were verified:           115
    -
    -Normal tests:
    -N Number of failed tests:              2
    -N + from this fail compilation:        0
    -N + from this timed out                0
    -N Number of successful tests:          60
    -N + from this were verified:           58
    -
    -Orphaned tests:
    -O Number of failed tests:              2
    -O + from this fail compilation:        0
    -O + from this timed out                0
    -O Number of successful tests:          59
    -O + from this were verified:           57
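The counts in the summary above are internally consistent; a quick check of the arithmetic, with the numbers copied from the report:

```python
# Numbers copied from the OpenMP validation summary above.
normal_failed, normal_ok = 2, 60
orphaned_failed, orphaned_ok = 2, 59

failed = normal_failed + orphaned_failed
successful = normal_ok + orphaned_ok
used = failed + successful
print(failed, successful, used)  # 4 119 123
```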

The tests and run results are placed under bin/c/, e.g.:

test_omp_threadprivate
-test_omp_threadprivate.c
-test_omp_threadprivate.log
-test_omp_threadprivate.out
-test_omp_threadprivate_compile.log

C files are also present as some kind of code generation is used.


Build only, then run one of them manually:

make -j`nproc` omp_my_sleep omp_testsuite
-PERL5LIB="${PERL5LIB}:." ./runtest.pl --lang=c --norun testlist-c.txt
-./bin/c/test_omp_barrier

    The bin/c directory is hardcoded in the executable, so to run it you must ensure that it exists relative to CWD, e.g.:

cd bin/c
-mkdir -p bin/c
-./test_omp_barrier

    Manually cross compile all tests and optionally add some extra options, e.g. -static to more conveniently run in gem5:

PERL5LIB="${PERL5LIB}:." ./runtest.pl --lang=c --makeopts 'CC=aarch64-linux-gnu-gcc CFLAGS_EXTRA=-static' --norun testlist-c.txt
-./../../run --arch aarch64 --emulator gem5 --userland submodules/omp-validation/bin/c/test_omp_parallel_reduction --cpus 8 --memory 8G

    Build a single test:

make bin/c/test_omp_sections_reduction
    @@ -31652,22 +31388,6 @@ mkdir -p bin/c
  • -

    templates

    - -
  • -
  • iostream

      @@ -32153,7 +31873,29 @@ non-atomic 19
26.3.6. C++ compile time magic

26.3.6.1. C++ decltype

    C++11 keyword.


The compiler replaces decltype(expression) with the type of that expression at compile time.


    More powerful than auto as you can use it in more places.

    26.7.1.2. Build and install the interpreter

    Buildroot has a Python package that can be added to the guest image:

@@ -32655,7 +32469,7 @@ There are no non-locking atomic types or atomic primitives in POSIX:
-26.7.1.2. Python gem5 user mode simulation
+26.7.1.3. Python gem5 user mode simulation

    At LKMC 50ac89b779363774325c81157ec8b9a6bdb50a2f gem5 390a74f59934b85d91489f8a563450d8321b602da:

@@ -32715,7 +32529,7 @@ There are no non-locking atomic types or atomic primitives in POSIX:
-26.7.1.3. Embedding Python in another application
+26.7.1.4. Embedding Python in another application

    Here we will add some better examples and explanations for: https://docs.python.org/3/extending/embedding.html#very-high-level-embedding

@@ -32766,7 +32580,7 @@ There are no non-locking atomic types or atomic primitives in POSIX:
-26.7.1.4. pybind11
+26.7.1.5. pybind11
@@ -32859,26 +32673,9 @@ There are no non-locking atomic types or atomic primitives in POSIX:
rootfs_overlay/lkmc/nodejs/object_to_string.js: util.inspect.custom and toString override experiment: https://stackoverflow.com/questions/24902061/is-there-an-repr-equivalent-for-javascript/26698403#26698403

    -
    -

    Output:

    util.inspect
    -my type is MyClassUtilInspectCustom and a is 1 and b is 2
    -console.log
    -my type is MyClassUtilInspectCustom and a is 1 and b is 2
    -toString
    -[object Object]
    -
    -util.inspect
    -MyClassToString { a: 1, b: 2 }
    -console.log
    -MyClassToString { a: 1, b: 2 }
    -toString
    -my type is MyClassToString and a is 1 and b is 2

    rootfs_overlay/lkmc/nodejs/object_to_json.js: toJSON examples

@@ -33570,218 +33367,6 @@ git clean -xdf .

26.9.1.6. Coremark

    Part of EEMBC.


    They have two versions:


    Both have a custom license, so yeah, no patience to read this stuff.


    Coremark-pro build and run on Ubuntu 20.04:

git submodule update --init submodules/coremark-pro
-cd submodules/coremark-pro
-make TARGET=linux64 build
-make TARGET=linux64 XCMD='-c4' certify-all

This uses 4 contexts. TODO: what are contexts? Are they the same as threads? You likely want to use -c$(nproc) in practice instead.


    Finishes in a few seconds, 2017 Lenovo ThinkPad P51 results:

    Workload Name                                     (iter/s)   (iter/s)    Scaling
    ------------------------------------------------ ---------- ---------- ----------
    -cjpeg-rose7-preset                                  526.32     178.57       2.95
    -core                                                  7.39       2.16       3.42
    -linear_alg-mid-100x100-sp                           684.93     238.10       2.88
    -loops-all-mid-10k-sp                                 27.65       7.80       3.54
    -nnet_test                                            32.79      10.57       3.10
    -parser-125k                                          71.43      25.00       2.86
    -radix2-big-64k                                     2320.19     623.44       3.72
    -sha-test                                            555.56     227.27       2.44
    -zip-test                                            363.64     166.67       2.18
    -
    -MARK RESULTS TABLE
    -
    -Mark Name                                        MultiCore SingleCore    Scaling
    ------------------------------------------------ ---------- ---------- ----------
    -CoreMark-PRO                                      18743.79    6306.76       2.97

    More sample results: P51 CoreMark-Pro.


And scaling appears to be the ratio between multicore (4 due to -c4) and single core performance; each benchmark gets run twice, once multicore and once single core.


    The tester script also outputs test commands, some of which are:

    builds/linux64/gcc64/bin/zip-test.exe -c1 -w1 -c4 -v1
    -builds/linux64/gcc64/bin/zip-test.exe -c1 -w1 -c4 -v0
    -builds/linux64/gcc64/bin/zip-test.exe -c4 -v1
    -builds/linux64/gcc64/bin/zip-test.exe -c4 -v0

-v1 appears to be a fast verification run, and both -c1 and -c4 get run for the single vs multicore performance.


    Sample -c4 -v0 output:

    -  Info: Starting Run...
    --- Workload:zip-test=946108807
    --- zip-test:time(ns)=11
    --- zip-test:contexts=4
    --- zip-test:iterations=4
    --- zip-test:time(secs)=   0.011
    --- zip-test:secs/workload= 0.00275
    --- zip-test:workloads/sec= 363.636
    --- Done:zip-test=946108807

and so we see that the zip-test:workloads/sec= 363.636 output is the key value, which matches the zip-test 363.64 entry in the earlier full summarized result.
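A quick check of the arithmetic behind those fields, with the numbers copied from the -c4 -v0 output above: secs/workload is time(secs) divided by iterations, and workloads/sec is its inverse:

```python
# Numbers copied from the zip-test -c4 -v0 output above.
time_secs = 0.011
iterations = 4

secs_per_workload = time_secs / iterations
workloads_per_sec = 1 / secs_per_workload
print(secs_per_workload)            # 0.00275
print(round(workloads_per_sec, 3))  # 363.636
```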


    Cross compile statically for aarch64. From LKMC toplevel:

    make \
    -  -C submodules/coremark-pro \
    -  LINKER_FLAGS='-static' \
    -  LINKER_LAST='-lm -lpthread -lrt' \
    -  TARGET=gcc-cross-linux \
    -  TOOLCHAIN=gcc-cross-linux \
    -  TOOLS="$(./getvar --arch aarch64 buildroot_host_usr_dir)" \
    -  TPREF="$(./getvar --arch aarch64 buildroot_toolchain_prefix)-" \
    -  build \
    -;
    -

    Run a single executable on QEMU:

./run --arch aarch64 --userland submodules/coremark-pro/builds/gcc-cross-linux/bin/zip-test.exe --cli-args='-c4 -v0'

Finishes in about 1 second, and gives zip-test:workloads/sec= 74.0741, so we see that it ran about 5x slower than the native host.


    Run a single executable on gem5 in a verification run:

    ./run \
    -  --arch aarch64 \
    -  --cli-args='-c1 -v1' \
    -  --emulator gem5 \
    -  --userland submodules/coremark-pro/builds/gcc-cross-linux/bin/zip-test.exe \
    -;

    TODO: hangs for at least 15 minutes, there must be something wrong. Stuck on an evolving strlen loop:

    7837834500: system.cpu: A0 T0 : @__strlen_generic+112    : ldp
    -7837834500: system.cpu: A0 T0 : @__strlen_generic+112. 0 :   addxi_uop   ureg0, x1, #16 : IntAlu :  D=0x0000003ffff07170  flags=(IsInteger|IsMicroop|IsDelayedCommit|IsFirstMicroop)
    -7837835000: system.cpu: A0 T0 : @__strlen_generic+112. 1 :   ldp_uop   x2, x3, [ureg0] : MemRead :  D=0x20703c0a3e702f3c A=0x3ffff07170  flags=(IsInteger|IsMemRef|IsLoad|IsMicroop|IsLastMicroop)
    -7837835500: system.cpu: A0 T0 : @__strlen_generic+116    :   sub   x4, x2, x8         : IntAlu :  D=0x3d607360632e3b34  flags=(IsInteger)
    -7837836000: system.cpu: A0 T0 : @__strlen_generic+120    :   sub   x6, x3, x8         : IntAlu :  D=0x1f6f3b093d6f2e3b  flags=(IsInteger)
    -7837836500: system.cpu: A0 T0 : @__strlen_generic+124    :   orr   x5, x4, x6         : IntAlu :  D=0x3f6f7b697f6f3f3f  flags=(IsInteger)
    -7837837000: system.cpu: A0 T0 : @__strlen_generic+128    :   ands   x5, x8, LSL #7    : IntAlu :  D=0x0000000000000000  flags=(IsInteger)
    -7837837500: system.cpu: A0 T0 : @__strlen_generic+132    :   b.eq   <__strlen_generic+88> : IntAlu :   flags=(IsControl|IsDirectControl|IsCondControl)
    -7837838000: system.cpu: A0 T0 : @__strlen_generic+88    : ldp
    -7837838000: system.cpu: A0 T0 : @__strlen_generic+88. 0 :   addxi_uop   ureg0, x1, #32 : IntAlu :  D=0x0000003ffff07180  flags=(IsInteger|IsMicroop|IsDelayedCommit|IsFirstMicroop)
    -7837838500: system.cpu: A0 T0 : @__strlen_generic+88. 1 :   ldp_uop   x2, x3, [ureg0] : MemRead :  D=0x6565686b636f4c27 A=0x3ffff07180  flags=(IsInteger|IsMemRef|IsLoad|IsMicroop|IsDelayedCommit)
    -7837839000: system.cpu: A0 T0 : @__strlen_generic+88. 2 :   addxi_uop   x1, ureg0, #0 : IntAlu :  D=0x0000003ffff07180  flags=(IsInteger|IsMicroop|IsLastMicroop)
    -7837839500: system.cpu: A0 T0 : @__strlen_generic+92    :   sub   x4, x2, x8         : IntAlu :  D=0x3c786d606f6c6e62  flags=(IsInteger)
    -7837840000: system.cpu: A0 T0 : @__strlen_generic+96    :   sub   x6, x3, x8         : IntAlu :  D=0x6464676a626e4b26  flags=(IsInteger)
    -7837840500: system.cpu: A0 T0 : @__strlen_generic+100    :   orr   x5, x4, x6         : IntAlu :  D=0x7c7c6f6a6f6e6f66  flags=(IsInteger)
    -7837841000: system.cpu: A0 T0 : @__strlen_generic+104    :   ands   x5, x8, LSL #7    : IntAlu :  D=0x0000000000000000  flags=(IsInteger)

    Instructions before __strlen_generic starts:

    7831019000: system.cpu: A0 T0 : @define_params_zip+664    :   add   x1, sp, #168       : IntAlu :  D=0x0000007ffffef988  flags=(IsInteger)
    -7831019500: system.cpu: A0 T0 : @define_params_zip+668    :   orr   x0, xzr, x24       : IntAlu :  D=0x0000003ffff00010  flags=(IsInteger)
    -7831020000: system.cpu: A0 T0 : @define_params_zip+672    :   bl   <th_strcat>         : IntAlu :  D=0x000000000040a4c4  flags=(IsInteger|IsControl|IsDirectControl|IsUncondControl|IsCall)
    -7831020500: system.cpu: A0 T0 : @th_strcat    :   b   <strcat>             : IntAlu :   flags=(IsControl|IsDirectControl|IsUncondControl)
    -7831021000: system.cpu: A0 T0 : @strcat    : stp
    -7831021000: system.cpu: A0 T0 : @strcat. 0 :   addxi_uop   ureg0, sp, #-48 : IntAlu :  D=0x0000007ffffef8b0  flags=(IsInteger|IsMicroop|IsDelayedCommit|IsFirstMicroop)
    -7831021500: system.cpu: A0 T0 : @strcat. 1 :   strxi_uop   x29, [ureg0] : MemWrite :  D=0x0000007ffffef8e0 A=0x7ffffef8b0  flags=(IsInteger|IsMemRef|IsStore|IsMicroop|IsDelayedCommit)
    -7831022000: system.cpu: A0 T0 : @strcat. 2 :   strxi_uop   x30, [ureg0, #8] : MemWrite :  D=0x000000000040a4c4 A=0x7ffffef8b8  flags=(IsInteger|IsMemRef|IsStore|IsMicroop|IsDelayedCommit)
    -7831022500: system.cpu: A0 T0 : @strcat. 3 :   addxi_uop   sp, ureg0, #0 : IntAlu :  D=0x0000007ffffef8b0  flags=(IsInteger|IsMicroop|IsLastMicroop)
    -7831023000: system.cpu: A0 T0 : @strcat+4    :   add   x29, sp, #0        : IntAlu :  D=0x0000007ffffef8b0  flags=(IsInteger)
    -7831023500: system.cpu: A0 T0 : @strcat+8    :   str   x19, [sp, #16]     : MemWrite :  D=0x00000000004d6560 A=0x7ffffef8c0  flags=(IsInteger|IsMemRef|IsStore)
    -7831024000: system.cpu: A0 T0 : @strcat+12    :   orr   x19, xzr, x0       : IntAlu :  D=0x0000003ffff00010  flags=(IsInteger)
    -7831024500: system.cpu: A0 T0 : @strcat+16    :   str   x1, [sp, #40]      : MemWrite :  D=0x0000007ffffef988 A=0x7ffffef8d8  flags=(IsInteger|IsMemRef|IsStore)
    -7831025000: system.cpu: A0 T0 : @strcat+20    :   bl   <_init+120>         : IntAlu :  D=0x00000000004464c8  flags=(IsInteger|IsControl|IsDirectControl|IsUncondControl|IsCall)
    -7831025500: system.cpu: A0 T0 : @_init+120    :   adrp   x16, #835584      : IntAlu :  D=0x00000000004cc000  flags=(IsInteger)
    -7831026000: system.cpu: A0 T0 : @_init+124    :   ldr   x17, [x16, #48]    : MemRead :  D=0x0000000000449680 A=0x4cc030  flags=(IsInteger|IsMemRef|IsLoad)
    -7831026500: system.cpu: A0 T0 : @_init+128    :   add   x16, x16, #48      : IntAlu :  D=0x00000000004cc030  flags=(IsInteger)
    -7831027000: system.cpu: A0 T0 : @_init+132    :   br   x17                 : IntAlu :   flags=(IsInteger|IsControl|IsIndirectControl|IsUncondControl)

Their build/run system is nice, it even supports user mode simulators out-of-the-box! TODO give it a shot. See:

    RUN =
    -RUN_FLAGS =

    under util/make/linux64.mak.


    Tested on a7ae8e6a8e29ef46d79eb9178d8599d1faeea0e5 + 1.


    26.9.2. Microbenchmarks

    @@ -33820,365 +33405,6 @@ RUN_FLAGS =
26.9.2.1. Dhrystone

    Created in the 80’s, it is not a representative measure of performance in modern computers anymore. It has mostly been replaced by SPEC, which is…​ closed source! Unbelievable.


    Dhrystone is very simple:

  • there is one loop in the dhry_1.c main function that gets executed N times

  • that loop calls 9 short functions called Proc_0 to Proc_9, most of which are defined in dhry_1.c, and a few others in dhry_2.c

    The benchmark is single-threaded.


After a quick look at it, Dhrystone in -O3 is very likely completely CPU bound, as there are no loops over variable sized arrays, except for some dummy ones that only run once. It just does a bunch of operations on local and global C variables, which are very likely to be inlined and treated fully in registers until the final write back, or to fit entirely in cache. TODO confirm with some kind of measurement. The benchmark also makes no syscalls except for measuring time and reporting results.


    Buildroot has a dhrystone package, but because it is so interesting to us, we decided to also build it ourselves, which allows things like static and baremetal compilation more easily.


    Build and run on QEMU User mode simulation:

git submodule update --init submodules/dhrystone
-./build-dhrystone --optimization-level 3
-./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"

    TODO automate the run more nicely to dispense with getvar.


    Increase the number of loops to try and reach more meaningful results:

    ./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone" --cli-args 100000000

    Build and run on gem5 user mode:

    ./build-dhrystone --optimization-level 3
    ./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"

    Run natively on the host:

    ./build-dhrystone --host
    "$(./getvar --host userland_build_dir)/submodules/dhrystone/dhrystone" 1000000000

    Sample output for 2017 Lenovo ThinkPad P51 Ubuntu 20.04:

    Microseconds for one run through Dhrystone:    0.1
    Dhrystones per Second:                      16152479.0

    Build Dhrystone for baremetal and run it on QEMU:

    # Build our Newlib stubs.
    ./build-baremetal --arch aarch64
    ./build-dhrystone --arch aarch64 --mode baremetal
    ./run --arch aarch64 --baremetal "$(./getvar --arch aarch64 baremetal_build_dir)/submodules/dhrystone/dhrystone" --cli-args 10000

    or with gem5:

    # Build our Newlib stubs.
    ./build-baremetal --arch aarch64
    ./build-dhrystone --arch aarch64 --emulator gem5 --mode baremetal
    ./run --arch aarch64 --baremetal "$(./getvar --arch aarch64 --emulator gem5 baremetal_build_dir)/submodules/dhrystone/dhrystone" --cli-args 10000 --emulator gem5

    If you really want the Buildroot package for some reason, build it with:

    ./build-buildroot --config 'BR2_PACKAGE_DHRYSTONE=y'

    and run inside the guest from PATH with:

    dhrystone
    26.9.2.2. LMbench

    Canonical source at https://sourceforge.net/projects/lmbench/, but Intel has a fork at https://github.com/intel/lmbench which has more recent build updates, so I think that’s the one I’d put my money on as of 2020.


    Feels old; I’m guessing it is not representative anymore, like Dhrystone. But hey, history!


    Ubuntu 20.04 AMD64 native build and run:

    git submodule update --init submodules/lmbench
    cd submodules/lmbench
    cd src
    make results

    TODO it hangs for a long time at:

    Hang on, we are calculating your cache line size.

    If I kill it, the configuration process continues:

    Killed
    OK, it looks like your cache line is  bytes.

    and continues with a few more interactive questions until finally:

    Confguration done, thanks.

    where it again hangs for at least 2 hours, so I lost patience and killed it.


    TODO: how to do a non-interactive config? After the above procedure, bin/x86_64-linux-gnu/CONFIG.ciro-p51 contains:

    DISKS=""
    DISK_DESC=""
    OUTPUT=/dev/null
    ENOUGH=50000
    FASTMEM="NO"
    FILE=/var/tmp/XXX
    FSDIR=/var/tmp
    INFO=INFO.ciro-p51
    LINE_SIZE=
    LOOP_O=0.00000000
    MAIL=no
    TOTAL_MEM=31903
    MB=22332
    MHZ="-1 System too busy"
    MOTHERBOARD=""
    NETWORKS=""
    OS="x86_64-linux-gnu"
    PROCESSORS="8"
    REMOTE=""
    SLOWFS="NO"
    SYNC_MAX="1"
    LMBENCH_SCHED="DEFAULT"
    TIMING_O=0
    RSH=rsh
    RCP=rcp
    VERSION=lmbench-3alpha4
    BENCHMARK_HARDWARE=YES
    BENCHMARK_OS=YES
    BENCHMARK_SYSCALL=
    BENCHMARK_SELECT=
    BENCHMARK_PROC=
    BENCHMARK_CTX=
    BENCHMARK_PAGEFAULT=
    BENCHMARK_FILE=
    BENCHMARK_MMAP=
    BENCHMARK_PIPE=
    BENCHMARK_UNIX=
    BENCHMARK_UDP=
    BENCHMARK_TCP=
    BENCHMARK_CONNECT=
    BENCHMARK_RPC=
    BENCHMARK_HTTP=
    BENCHMARK_BCOPY=
    BENCHMARK_MEM=
    BENCHMARK_OPS=

    Native build only without running tests:

    cd src
    make

    Interestingly, one of the creators of LMbench, Larry McVoy (https://www.linkedin.com/in/larrymcvoy/, https://en.wikipedia.org/wiki/Larry_McVoy), is also a co-founder of BitKeeper. Their SCM must be blazingly fast!!! Also his LinkedIn says Intel uses it. But they will forever be remembered as "the closed source Git precursor that died N years ago", RIP.

    26.9.2.3. STREAM benchmark

    Very simple memory bandwidth benchmark with one C and one Fortran version, originally published in 1991; the latest version at the time of writing is from 2013.


    Its operation is very simple: fork one thread for each CPU in the system (using OpenMP) and do the following four array operations (4 separate loops of individual operations):

    /* Copy. */
    times[0 * ntimes + k] = mysecond();
    #pragma omp parallel for
    for (j=0; j<stream_array_size; j++)
        c[j] = a[j];
    times[0 * ntimes + k] = mysecond() - times[0 * ntimes + k];

    /* Scale. */
    times[1 * ntimes + k] = mysecond();
    #pragma omp parallel for
    for (j=0; j<stream_array_size; j++)
        b[j] = scalar*c[j];
    times[1 * ntimes + k] = mysecond() - times[1 * ntimes + k];

    /* Add. */
    times[2 * ntimes + k] = mysecond();
    #pragma omp parallel for
    for (j=0; j<stream_array_size; j++)
        c[j] = a[j]+b[j];
    times[2 * ntimes + k] = mysecond() - times[2 * ntimes + k];

    /* Triad. */
    times[3 * ntimes + k] = mysecond();
    #pragma omp parallel for
    for (j=0; j<stream_array_size; j++)
        a[j] = b[j]+scalar*c[j];
    times[3 * ntimes + k] = mysecond() - times[3 * ntimes + k];
    }

    Ubuntu 20.04 native build and run:

    git submodule update --init submodules/stream-benchmark
    cd submodules/stream-benchmark
    make
    ./stream_c.exe

    Sample output:

    -------------------------------------------------------------
    STREAM version $Revision: 5.10 $
    -------------------------------------------------------------
    This system uses 8 bytes per array element.
    -------------------------------------------------------------
    Array size = 10000000 (elements), Offset = 0 (elements)
    Memory per array = 76.3 MiB (= 0.1 GiB).
    Total memory required = 228.9 MiB (= 0.2 GiB).
    Each kernel will be executed 10 times.
     The *best* time for each kernel (excluding the first iteration)
     will be used to compute the reported bandwidth.
    -------------------------------------------------------------
    Number of Threads requested = 8
    Number of Threads counted = 8
    -------------------------------------------------------------
    Your clock granularity/precision appears to be 1 microseconds.
    Each test below will take on the order of 7027 microseconds.
       (= 7027 clock ticks)
    Increase the size of the arrays if this shows that
    you are not getting at least 20 clock ticks per test.
    -------------------------------------------------------------
    WARNING -- The above is only a rough guideline.
    For best results, please be sure you know the
    precision of your system timer.
    -------------------------------------------------------------
    Function    Best Rate MB/s  Avg time     Min time     Max time
    Copy:           20123.2     0.008055     0.007951     0.008267
    Scale:          20130.4     0.008032     0.007948     0.008177
    Add:            22528.8     0.010728     0.010653     0.010867
    Triad:          22448.4     0.010826     0.010691     0.011352
    -------------------------------------------------------------
    Solution Validates: avg error less than 1.000000e-13 on all three arrays
    -------------------------------------------------------------

    The LKMC usage of STREAM is analogous to that of Dhrystone. Build and run on QEMU User mode simulation:

    ./build-stream --optimization-level 3
    ./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe"

    Decrease the benchmark size and the retry count to finish the simulation faster, at the cost of a possibly less representative result:

    ./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --cli-args '100 2'

    Build and run on gem5 user mode:

    ./build-stream --optimization-level 3
    ./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --cli-args '1000 2'

    This is analogous to step debugging baremetal examples.


    Related:

    27.5.1.1. nostartfiles programs

    605448f07e6380634b1aa7e9732d111759f69fd

    Dhrystone -O3

    gem5 --arch aarch64

    4 * 10^5

    68

    9.2034139 * 10^7

    1.6

    5d233f2664a78789f9907d27e2a40e86cefad595

    STREAM benchmark -O3

    ./run --arch aarch64 --emulator gem5 --userland userland/gcc/busy_loop.c --cli-args 1000000 --trace ExecAll

    3 * 10^5 * 2

    64

    9.9674773 * 10^7

    1.6

    glibc C pre-main effects

    ab6f7331406b22f8ab6e2df5f8b8e464fb35b611

    userland/c/m5ops.c -O0


    Let’s see if user mode runs considerably faster than full system or not, ignoring the kernel boot.

    TODO: move this section to our new custom dhrystone setup: Section 26.9.2.1, “Dhrystone”.

    First we build Dhrystone manually statically, since dynamic linking is broken in gem5 as explained in Section 10.7, “gem5 syscall emulation mode”.

    gem5 user mode:


    and then copy the link command to a separate Bash file. Then you can time and modify it easily.

    Some approximate reference values on a 2017 Lenovo ThinkPad P51, LKMC d4b3e064adeeace3c3e7d106801f95c14637c12f + 1 (doing multiple runs to warm up disk caches):


    Tested at: d4b3e064adeeace3c3e7d106801f95c14637c12f + 1.


    On LKMC 220c3a434499e4713664d4a47c246cb81ee0a06a gem5 63e96992568d8a8a0dccac477b8b7f1370ac7e98 (Sep 2020):

    • opt

      • default link: 18.32user 3.99system 0:22.33elapsed 99%CPU (0avgtext+0avgdata 4622908maxresident)k

      • LDFLAGS_EXTRA=-fuse-ld=lld (after a build with default linker): 6.74user 1.81system 0:03.85elapsed 222%CPU (0avgtext+0avgdata 7025292maxresident)k

      • LDFLAGS_EXTRA=-fuse-ld=gold: 7.70user 1.36system 0:09.44elapsed 95%CPU (0avgtext+0avgdata 5959152maxresident)k
    34.3.1.1. P51 benchmarks

    Dhrystone on Ubuntu 20.04 results at Dhrystone.


    STREAM benchmark on Ubuntu 20.04 results at STREAM benchmark.

    34.3.1.1.1. P51 CoreMark-Pro

    CoreMark-Pro d5b4f2ba7ba31e37a5aa93423831e7d5eb933868 on Ubuntu 20.04 with XCMD="-c$(nproc)":

                                                    MultiCore SingleCore
    Workload Name                                     (iter/s)   (iter/s)    Scaling
    ----------------------------------------------- ---------- ---------- ----------
    cjpeg-rose7-preset                                  769.23     175.44       4.38
    core                                                  7.98       2.11       3.78
    linear_alg-mid-100x100-sp                           892.86     233.64       3.82
    loops-all-mid-10k-sp                                 35.84       7.58       4.73
    nnet_test                                            35.09      10.05       3.49
    parser-125k                                         125.00      20.41       6.12
    radix2-big-64k                                     3278.69     630.91       5.20
    sha-test                                            625.00     227.27       2.75
    zip-test                                            615.38     166.67       3.69

    MARK RESULTS TABLE

    Mark Name                                        MultiCore SingleCore    Scaling
    ----------------------------------------------- ---------- ---------- ----------
    CoreMark-PRO                                      25016.00    6079.70       4.11
    34.3.1.2. P51 maintenance history