diff --git a/index.html b/index.html
index 2f75bd3..1b5bc67 100644
--- a/index.html
+++ b/index.html
@@ -1185,7 +1185,11 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
  • 19.15.3. gem5 prof and perf builds
  • 19.15.4. gem5 clang build
  • 19.15.5. gem5 sanitation build
  • -
  • 19.15.6. gem5 Ruby build
  • +
  • 19.15.6. gem5 Ruby build + +
  • 19.15.7. gem5 Python 3 build
  • @@ -1210,10 +1214,52 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
  • 19.19.3. gem5 entry point
  • 19.19.4. gem5 event queue
  • 19.19.5. gem5 stats internals
  • @@ -1224,8 +1270,9 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
  • 19.19.7. gem5 build system
  • @@ -1889,10 +1936,11 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
@@ -2884,12 +2932,7 @@ j = 0

    Remember that the gem5 boot is considerably slower than QEMU since the simulation is more detailed.

    -

    gem5 moves a bit slowly, and if your host compiler is very new, the build might be broken for it, e.g. this was the case for Ubuntu 19.10 with GCC 9 and gem5 62d75e7105fe172eb906d4f80f360ff8591d4178 from Dec 2019. You can work around that by installing an older compiler and using it with something like:

    -
    -
    -
    -
    ./build-gem5 -- CC=gcc-8 CXX=g++-8
    -
    +

    If you have a relatively new GCC version and the gem5 build fails on your machine, see: gem5 build broken on recent compiler version.

    To get a terminal, either open a new shell and run:

    @@ -20526,13 +20569,13 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
    -

    The Ruby memory system includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby

    +

    The Ruby memory system includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby. SLICC transpiles to auto-generated C++ files under build/<isa>/mem/ruby/protocol/.

    -

    It seems to have usage outside of gem5, but the naming overload with the Ruby programming language, which also has domain specific languages as a concept, makes it impossible to google anything about it!

    +

    Ruby seems to have usage outside of gem5, but the naming overload with the Ruby programming language, which also has domain specific languages as a concept, makes it impossible to google anything about it!

    -

    Since it is not the default, Ruby is generally less stable than the classic memory model. However, because it allows describing a wide variety of important coherency protocols, while the classic system only describes a single protocol, Ruby is a very important feature of gem5.

    +

    Since it is not the default, Ruby is generally less stable than the classic memory model. However, because it allows describing a wide variety of important cache coherency protocols, while the classic system only describes a single protocol, Ruby is a very important feature of gem5.

    Ruby support must be enabled at compile time with the scons PROTOCOL= flag, which compiles support for the desired memory system type.

    @@ -20554,15 +20597,15 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
    -

    For example, to use a two level MESI cache coherence protocol, we can do:

    +

    For example, to use a two level MESI protocol, we can do:

    @@ -20630,7 +20673,7 @@ cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"
  • -

    without --ruby, we instead see XBar (Coherent Crossbar) related messages such as CoherentXBar:, which I believe is the more precise name for the memory model that the classic memory system uses

    +

    without --ruby, we instead see XBar (Coherent Crossbar) related messages such as CoherentXBar:, which I believe is the more precise name for the memory model that the classic memory system uses: gem5 crossbar interconnect.

  • @@ -20640,6 +20683,50 @@ cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"

    Tested in gem5 d7d9bc240615625141cd6feddbadd392457e49eb.

    +
    +
    19.15.6.1. gem5 crossbar interconnect
    +
    +

    Crossbar, or XBar in the code, is the default CPU interconnect that gets used by fs.py if --ruby is not given.

    +
    + + +
    +

    TODO: describe it in more detail. It appears to be a very simple mechanism.

    +
    +
    +

    Under src/mem/ we see that there is both a coherent and a non-coherent XBar.

    +
    +
    +

    In se.py it is set at:

    +
    +
    +
    +
    if options.ruby:
    +    ...
    +else:
    +    MemClass = Simulation.setMemClass(options)
    +    system.membus = SystemXBar()
    +
    +
    +
    +

    and SystemXBar is defined at src/mem/XBar.py with a nice comment:

    +
    +
    +
    +
    # One of the key coherent crossbar instances is the system
    +# interconnect, tying together the CPU clusters, GPUs, and any I/O
    +# coherent masters, and DRAM controllers.
    +class SystemXBar(CoherentXBar):
    +
    +
    +
    +

    Tested in gem5 12c917de54145d2d50260035ba7fa614e25317a3.

    +
    +

    19.15.7. gem5 Python 3 build

    @@ -20749,6 +20836,9 @@ cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"

    Its 4 stage pipeline is described at the "MinorCPU" section of gem5 ARM RSK.

    +

    There is also an in-tree doxygen at: src/doc/inside-minor.doxygen and rendered at: http://pages.cs.wisc.edu/~swilson/gem5-docs/minor.html

    +
    +

    As of 2019, in-order cores are mostly present in low power / cost contexts, for example the little cores of ARM big.LITTLE.

    @@ -21256,12 +21346,12 @@ exec filecode in scope

    CPU ticks

  • -

    TODO peripherals and memory

    +

    peripherals and memory

  • -

    At the beginning of simulation, gem5 sets up exactly two events:

    +

    At gem5 event queue AtomicSimpleCPU syscall emulation freestanding example analysis we see, for example, that at the beginning of an AtomicSimpleCPU simulation, gem5 sets up exactly two events:

    -

    Tick events then get triggered one by one as simulation progresses, in addition to any other system events.

    +

    Then, at the end of the callback of one tick event, another tick is scheduled.

    +
    +
    +

    And so the simulation progresses tick by tick, until an exit event happens.
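    To make the pattern concrete, here is a minimal self-contained Python sketch of the same schedule / execute / reschedule idea: a tick callback that, at the end of its own execution, schedules the next tick. This is a toy model for illustration only, not gem5 code, and all names in it (ToyEventQueue, etc.) are made up.

import heapq
import itertools

# Toy discrete event simulation illustrating the pattern described above.
# Purely illustrative, not gem5 code.
class ToyEventQueue:
    def __init__(self):
        self._heap = []
        self._order = itertools.count()  # tie-breaker for events at the same tick

    def schedule(self, when, name, callback):
        print(f'schedule {name} @ {when}')
        heapq.heappush(self._heap, (when, next(self._order), name, callback))

    def run(self):
        while self._heap:
            when, _, name, callback = heapq.heappop(self._heap)
            print(f'execute  {name} @ {when}')
            if callback(when) == 'exit':
                break  # the equivalent of an exit event ending the simulation

CLOCK_PERIOD = 500  # ticks, i.e. picoseconds for the 2 GHz clock used in the examples
queue = ToyEventQueue()
instructions = iter(['movz', 'adr', 'ldr', 'movz', 'svc'])

def tick(now):
    insn = next(instructions, None)
    if insn is None:
        return 'exit'
    print(f'  run instruction: {insn}')
    # the end of one tick callback schedules the next tick
    queue.schedule(now + CLOCK_PERIOD, 'tick', tick)

queue.schedule(0, 'tick', tick)  # the single initial event
queue.run()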

    The EventQueue class has one awesome dump() function that prints a human friendly representation of the queue, and can be easily called from GDB. TODO example.

    @@ -21293,6 +21386,46 @@ exec filecode in scope

    This calls the Event::process method of the event.

    +
    +

    Another important technique is to use GDB and break at interesting points such as:

    +
    +
    +
    +
    b Trace::OstreamLogger::logMessage()
    +b EventManager::schedule
    +b EventFunctionWrapper::process
    +
    +
    +
    +

    although stepping into EventFunctionWrapper::process, which goes through std::function, is a bit of a pain: https://stackoverflow.com/questions/59429401/how-to-step-into-stdfunction-user-code-from-c-functional-with-gdb

    +
    +
    +

    Another potentially useful technique is to use:

    +
    +
    +
    +
    --trace Event,ExecAll,FmtFlag,FmtStackTrace --trace-stdout
    +
    +
    +
    +

    which automates the logging of Trace::OstreamLogger::logMessage() backtraces.

    +
    +
    +

    But alas, it misses which function callback is being scheduled, which is the awesome thing we actually want.

    +
    +
    + +
    +
    +

    Then, once we had that, the most perfect thing ever would be to make the full event graph containing which events schedule which events!
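    Here is a rough sketch of how such a graph could be scraped from the --trace Event output alone, using the same heuristic applied manually in the analysis below: every schedule logged between one "executed" line and the next "executed" line is attributed to the currently executing event. This is a hypothetical helper, not part of LKMC or gem5, and the regex is only a guess at the "NAME ID scheduled/executed @ TICK" line format shown in the logs below.

import re
import sys
from collections import defaultdict

# Hypothetical sketch: build a "which event scheduled which" graph from a
# gem5 --trace Event log. Heuristic: schedules printed while event X is
# executing are attributed to X. Not part of gem5 or LKMC.
line_re = re.compile(
    r'^\s*(?P<tick>\d+): Event: (?P<name>.*?) (?P<id>\d+) '
    r'(?P<action>scheduled|rescheduled|executed) @ (?P<when>\d+)')

def build_event_graph(lines):
    children = defaultdict(list)  # executing event id -> scheduled event ids
    current = None                # id of the event currently executing
    for line in lines:
        m = line_re.match(line)
        if not m:
            continue
        event_id, action = m.group('id'), m.group('action')
        if action == 'executed':
            current = event_id
        elif current is not None:  # scheduled or rescheduled
            children[current].append(event_id)
    return children

if __name__ == '__main__':
    graph = build_event_graph(sys.stdin)
    for parent, kids in graph.items():
        print(parent, '->', ' '.join(kids))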

    +
    19.19.4.1. gem5 event queue AtomicSimpleCPU syscall emulation freestanding example analysis
    @@ -21304,79 +21437,149 @@ exec filecode in scope
   --arch aarch64 \
   --emulator gem5 \
   --userland userland/arch/aarch64/freestanding/linux/hello.S \
-  --trace Event \
+  --trace Event,ExecAll,FmtFlag \
   --trace-stdout \
   ;
    -

    At LKMC a0ea29835b9bacc6aa1cceb24c79d895315991d4 + 1 this outputs:

    +

    which gives:

    -
          0: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    +
          0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 scheduled @ 0
     **** REAL SIMULATION ****
    -      0: Event_70: generic event scheduled @ 0
    +      0: Event: Event_70: generic 70 scheduled @ 0
     info: Entering event queue @ 0.  Starting simulation...
    -      0: Event_70: generic event rescheduled @ 18446744073709551615
    -      0: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 500
    -    500: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 1000
    -   1000: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 1500
    -   1500: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 2000
    +      0: Event: Event_70: generic 70 rescheduled @ 18446744073709551615
    +      0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 0
    +      0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +      0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 500
    +    500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 500
    +    500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
    +    500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 1000
    +   1000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 1000
    +   1000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8    :   ldr   w2, #4194464       : MemRead :  D=0x0000000000000006 A=0x4000a0  flags=(IsInteger|IsMemRef|IsLoad)
    +   1000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 1500
    +   1500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 1500
    +   1500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+12    :   movz   x8, #64, #0       : IntAlu :  D=0x0000000000000040  flags=(IsInteger)
    +   1500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 2000
    +   2000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 2000
    +   2000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+16    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
     hello
    -   2000: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 2500
    -   2500: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 3000
    -   3000: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped event rescheduled @ 3500
    -   3500: Event_71: generic event scheduled @ 3500
+   2000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 2500
+   2500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 2500
+   2500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+20    :   movz   x0, #0, #0        : IntAlu :  D=0x0000000000000000  flags=(IsInteger)
+   2500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 3000
+   3000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 3000
+   3000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+24    :   movz   x8, #93, #0       : IntAlu :  D=0x000000000000005d  flags=(IsInteger)
+   3000: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 3500
+   3500: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 3500
+   3500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
+   3500: Event: Event_71: generic 71 scheduled @ 3500
+   3500: Event: Event_71: generic 71 executed @ 3500
    -

    which we immediately guess means that there is one event per tick when comparing to the ExecAll trace:

    +

    On the event trace, we can first see:

    -
          0: system.cpu A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    -    500: system.cpu A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
    -   1000: system.cpu A0 T0 : @asm_main_after_prologue+8    :   ldr   w2, #4194464       : MemRead :  D=0x0000000000000006 A=0x4000a0  flags=(IsInteger|IsMemRef|IsLoad)
    -   1500: system.cpu A0 T0 : @asm_main_after_prologue+12    :   movz   x8, #64, #0       : IntAlu :  D=0x0000000000000040  flags=(IsInteger)
    -   2000: system.cpu A0 T0 : @asm_main_after_prologue+16    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    -hello
    -   2500: system.cpu A0 T0 : @asm_main_after_prologue+20    :   movz   x0, #0, #0        : IntAlu :  D=0x0000000000000000  flags=(IsInteger)
    -   3000: system.cpu A0 T0 : @asm_main_after_prologue+24    :   movz   x8, #93, #0       : IntAlu :  D=0x000000000000005d  flags=(IsInteger)
    -   3500: system.cpu A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    -Exiting @ tick 3500 because exiting with last active thread context
    +
    0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 scheduled @ 0
    -

    On the event trace, we can see:

    +

    This schedules a tick event for time 0, and leads to the first clock tick.

    +
    +
    +

    Then:

    +
    +
    +
    +
    0: Event: Event_70: generic 70 scheduled @ 0
    +0: Event: Event_70: generic 70 rescheduled @ 18446744073709551615
    +
    +
    +
    +

    schedules the end of time event for time 0, which is later rescheduled to the actual end of time: 18446744073709551615 is 2^64 - 1, i.e. the maximum possible tick value.

    +
    +
    +

    At:

    +
    +
    +
    +
    0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 executed @ 0
    +0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 rescheduled @ 500
    +
    +
    +
    +

    the tick event happens, the instruction runs, and then the tick event is rescheduled 500 time units later. This is done at the end of AtomicSimpleCPU::tick():

    -
    -
    -

    Let’s study the first event. From GDB, let’s break at the point that prints messages: Trace::OstreamLogger::logMessage() to see where events are being scheduled from:

    +

    the exit system call is called, and then it schedules an exit event, which gets executed and the simulation ends.

    +
    +
    +

    We then guess that Event_71 comes from the SE implementation of the exit syscall. To confirm, the trace contains:

    +
    +
    +
    +
    exitSimLoop() at sim_events.cc:97 0x5555594746e0
    +exitImpl() at syscall_emul.cc:215 0x55555948c046
    +exitFunc() at syscall_emul.cc:225 0x55555948c147
    +SyscallDesc::doSyscall() at syscall_desc.cc:72 0x5555594949b6
    +Process::syscall() at process.cc:401 0x555559484717
    +SimpleThread::syscall() at 0x555559558059
    +ArmISA::SupervisorCall::invoke() at faults.cc:856 0x5555572950d7
    +BaseSimpleCPU::advancePC() at base.cc:681 0x555559083133
    +AtomicSimpleCPU::tick() at atomic.cc:757 0x55555907834c
    +
    +
    +
    +

    and exitSimLoop() does:

    +
    +
    +
    +
    new GlobalSimLoopExitEvent(when + simQuantum, message, exit_code, repeat);
    +
    +
    +
    +

    Tested in gem5 12c917de54145d2d50260035ba7fa614e25317a3.

    +
    +
    +
    19.19.4.1.1. AtomicSimpleCPU initial events
    +
    +

    Let’s have a closer look at the initial magically scheduled events of the simulation.

    +
    +
    +

    Most events come from other events, but at least one initial event must be scheduled somehow from elsewhere to kick things off.

    +
    +
    +

    The initial tick event:

    +
    +
    +
    +
    0: Event: AtomicSimpleCPU tick.wrapped_function_event: EventFunctionWrapped 39 scheduled @ 0
    +
    +
    +
    +

    we’ll study by breaking at the point that prints messages, b Trace::OstreamLogger::logMessage(), to see where events are being scheduled from:

    @@ -21445,6 +21648,14 @@ ArmLinuxProcess64::initState() at process.cc:1,777 0x5555572d5e5e
    +

    and this gets called from the toplevel Python scripts, e.g. for se.py, configs/common/Simulation.py does:

    +
    +
    +
    +
    m5.instantiate(checkpoint_dir)
    +
    +
    +

    As we can see, initState is just one stage of generic SimObject initialization. root.descendants() goes over the entire SimObject tree calling initState().
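    The following is a minimal Python sketch of that pattern: a tree of objects whose instantiation step walks all descendants and calls initState() on each, which is the rough shape of what m5.instantiate does. The class and function names here are made up to mirror the gem5 ones; this is not the real m5 code.

# Toy sketch of the "walk the SimObject tree and call initState" pattern.
# Mirrors the shape of m5.instantiate, but is not the real m5 code.
class ToySimObject:
    def __init__(self, name, children=()):
        self.name = name
        self.children = list(children)

    def descendants(self):
        yield self
        for child in self.children:
            yield from child.descendants()

    def initState(self):
        print(f'initState: {self.name}')

def instantiate(root):
    # the real m5.instantiate also does port binding, checkpoint restore, etc.
    for obj in root.descendants():
        obj.initState()

root = ToySimObject('root', [
    ToySimObject('system', [
        ToySimObject('system.cpu'),
        ToySimObject('system.membus'),
        ToySimObject('system.mem_ctrls'),
    ]),
])
instantiate(root)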

    @@ -21521,10 +21732,16 @@ ArmLinuxProcess64::initState() at process.cc:1,777 0x5555572d5e5e
    -

    So that’s how the main atomic tick loop works, fully understood!

    +

    And the second magically scheduled event is the exit event:

    +
    +
    +
    +
    0: Event: Event_70: generic 70 scheduled @ 0
    +0: Event: Event_70: generic 70 rescheduled @ 18446744073709551615
    +
    -

    The second event has backtrace:

    +

    which is scheduled with backtrace:

    @@ -21540,7 +21757,7 @@ simulate() at simulate.cc:104 0x555559476d6f
    -

    so gets scheduled automatically at object creation simulate() through the GlobalEvent() constructor:

    +

    which gets scheduled at object creation inside simulate() through the GlobalEvent() constructor:

    @@ -21561,36 +21778,165 @@ simulate() at simulate.cc:104 0x555559476d6f
    -

    And at last, we can guess without reading the code that Event_71 is comes from the SE implementation of the exit syscall, so let’s just confirm, the trace contains:

    +

    Tested in gem5 12c917de54145d2d50260035ba7fa614e25317a3.

    +
    +
    +
    +
    19.19.4.1.2. AtomicSimpleCPU tick reschedule timing
    +
    +

    Inside AtomicSimpleCPU::tick() we saw previously that the reschedule happens at:

    -
    exitSimLoop() at sim_events.cc:97 0x5555594746e0
    -exitImpl() at syscall_emul.cc:215 0x55555948c046
    -exitFunc() at syscall_emul.cc:225 0x55555948c147
    -SyscallDesc::doSyscall() at syscall_desc.cc:72 0x5555594949b6
    -Process::syscall() at process.cc:401 0x555559484717
    -SimpleThread::syscall() at 0x555559558059
    -ArmISA::SupervisorCall::invoke() at faults.cc:856 0x5555572950d7
    -BaseSimpleCPU::advancePC() at base.cc:681 0x555559083133
    -AtomicSimpleCPU::tick() at atomic.cc:757 0x55555907834c
    +
        if (latency < clockPeriod())
    +        latency = clockPeriod();
    +
    +    if (_status != Idle)
    +        reschedule(tickEvent, curTick() + latency, true);
    -

    and exitSimLoop() does:

    +

    so it is interesting to learn where that latency comes from.

    +
    +
    +

    From our logs, we see that all events happened with a 500 time unit interval between them, so that must be the value for all instructions of our simple example.

    +
    +
    +

    By GDBing it a bit, we see that none of our instructions incremented latency, and so it got set to clockPeriod(), which comes from ClockDomain::clockPeriod(), which then likely comes from:

    -
    new GlobalSimLoopExitEvent(when + simQuantum, message, exit_code, repeat);
    +
        parser.add_option("--cpu-clock", action="store", type="string",
    +                      default='2GHz',
    -

    Tested at b4879ae5b0b6644e6836b0881e4da05c64a6550d.

    +

    because the time unit is picoseconds, so the default 2 GHz clock corresponds to a period of 500 ticks. This then shows up in config.ini as:

    +
    +
    +
    +
    [system.cpu_clk_domain]
    +type=SrcClockDomain
    +clock=500
    +
    +
    +
    +
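    As a quick sanity check of that number, assuming 1 tick = 1 picosecond as stated above:

# 2 GHz clock with picosecond ticks gives a 500 tick period, matching
# clock=500 in config.ini and the 500 tick gaps between events in the logs.
ticks_per_second = 10**12          # 1 tick == 1 picosecond
cpu_clock_hz = 2 * 10**9           # the --cpu-clock default of 2GHz
print(ticks_per_second // cpu_clock_hz)  # 500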
    +
    19.19.4.1.3. AtomicSimpleCPU memory access
    +
    +

    It will be interesting to see how AtomicSimpleCPU makes memory accesses under GDB and to compare that with TimingSimpleCPU.

    +
    +
    +

    We assume that the memory access still goes through the CoherentXBar, but instead of generating an event to model delayed response, it must be doing the access directly.

    +
    +
    +

    Inside AtomicSimpleCPU::tick, we track ifetch_req and see:

    +
    +
    +
    +
            fault = thread->itb->translateAtomic(ifetch_req, thread->getTC(),
    +                                                BaseTLB::Execute);
    +
    +
    +
    +

    We can compare that with what happens in TimingSimpleCPU:

    +
    +
    +
    +
            thread->itb->translateTiming(ifetch_req, thread->getTC(),
    +                &fetchTranslation, BaseTLB::Execute);
    +
    +
    +
    +

    and so there it is: the ITB classes are the same, but there are separate Atomic and Timing methods!

    +
    +
    +

    The Timing one calls ArmISA::TLB::translateComplete.
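    The difference between the two calling conventions can be sketched like this: the atomic variant returns its result immediately, while the timing variant takes a continuation object and calls finish() on it later. This is a made-up Python illustration of the pattern, not the ArmISA::TLB code.

# Sketch of the atomic vs timing calling conventions: atomic returns now,
# timing takes a "translation" object and calls finish() on it later.
class ToyTLB:
    def translate_atomic(self, vaddr):
        return vaddr - 0x400000                 # result available immediately

    def translate_timing(self, vaddr, translation):
        paddr = vaddr - 0x400000
        # real code: finish() may be invoked from a later event instead
        translation.finish(paddr)

class FetchTranslation:
    def finish(self, paddr):
        print(f'fetch can now be sent for paddr {hex(paddr)}')

tlb = ToyTLB()
print(hex(tlb.translate_atomic(0x400078)))      # 0x78
tlb.translate_timing(0x400078, FetchTranslation())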

    +
    +
    +

    Tested in gem5 b4879ae5b0b6644e6836b0881e4da05c64a6550d.

    +
    +
    +
    +
    19.19.4.1.4. gem5 se.py page translation
    +
    +

    Happens on EmulationPageTable, and seems to happen atomically without making any extra memory requests.

    +
    +
    +

    TODO confirm from code, notably by seeing where the translation table is set.

    +
    +
    +

    But we can confirm it by logging with:

    +
    +
    +
    +
    --trace DRAM,ExecAll,FmtFlag
    +
    +
    +
    +

    which gives:

    +
    +
    +
    +
          0: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x78
    +      0: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +    500: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x7c
    +    500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
    +   1000: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x80
    +   1000: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0xa0
    +   1000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8    :   ldr   w2, #4194464       : MemRead :  D=0x0000000000000006 A=0x4000a0  flags=(IsInteger|IsMemRef|IsLoad)
    +   1500: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x84
    +   1500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+12    :   movz   x8, #64, #0       : IntAlu :  D=0x0000000000000040  flags=(IsInteger)
    +   2000: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x88
    +   2000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+16    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    +hello
    +   2500: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x8c
    +   2500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+20    :   movz   x0, #0, #0        : IntAlu :  D=0x0000000000000000  flags=(IsInteger)
    +   3000: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x90
    +   3000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+24    :   movz   x8, #93, #0       : IntAlu :  D=0x000000000000005d  flags=(IsInteger)
    +   3500: DRAM: system.mem_ctrls: recvAtomic: ReadReq 0x94
    +   3500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    +Exiting @ tick 3500 because exiting with last active thread context
    +   3500: DRAM: system.mem_ctrls_0: Computing stats due to a dump callback
    +   3500: DRAM: system.mem_ctrls_1: Computing stats due to a dump callback
    +
    +
    +
    +

    So we see that before every instruction execution there was a DRAM event! Also, each read happens 4 bytes after the previous one, which is consistent with instruction fetches.

    +
    +
    +

    The DRAM addresses are very close to zero, e.g. 0x78 for the first instruction, and therefore we guess that they are physical addresses, since the ELF entry point is much higher:

    +
    +
    +
    +
    ./run-toolchain --arch aarch64 readelf -- -h "$(./getvar --arch aarch64 userland_build_dir)/arch/aarch64/freestanding/linux/hello.out"
    +
    +
    +
    +

    at:

    +
    +
    +
    +
      Entry point address:               0x400078
    +
    +
    +
    +

    For LDR, we see that there was an extra DRAM read as well after the fetch read, as expected.
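    A tiny sanity check of those observations, using only values copied from the traces above and the guessed mapping physical = virtual - 0x400000:

# Check the guessed virtual -> physical mapping and the 4-byte fetch stride
# against the values seen in the DRAM and ExecAll traces above.
entry_vaddr = 0x400078            # ELF entry point from readelf
first_fetch_paddr = 0x78          # first DRAM ReadReq in the trace
offset = entry_vaddr - first_fetch_paddr
print(hex(offset))                # 0x400000

# instruction fetches: one 4-byte read per instruction
fetch_paddrs = [0x78, 0x7c, 0x80, 0x84, 0x88, 0x8c, 0x90, 0x94]
assert all(b - a == 4 for a, b in zip(fetch_paddrs, fetch_paddrs[1:]))

# the LDR data access: vaddr 0x4000a0 in ExecAll, paddr 0xa0 in the DRAM trace
assert 0x4000a0 - offset == 0xa0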

    +
    +
    +

    Tested in gem5 b4879ae5b0b6644e6836b0881e4da05c64a6550d.

    +
    19.19.4.2. gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis
    +

    Now, let’s move on to TimingSimpleCPU, which is just like AtomicSimpleCPU internally, except that now the memory requests don’t actually finish immediately: see gem5 CPU types.

    +
    +
    +

    This means that simulation will be much more accurate, and the DRAM memory will be modelled.

    +
    +

    TODO: analyze better what each of the memory events means. For now, we have just collected a bunch of data there, but it needs interpreting. The CPU specifics in this section are already insightful, however.

    @@ -21601,8 +21947,9 @@ AtomicSimpleCPU::tick() at atomic.cc:757 0x55555907834c
    ./run \
       --arch aarch64 \
       --emulator gem5 \
    +  --gem5-build-type gem5 \
       --userland userland/arch/aarch64/freestanding/linux/hello.S \
    -  --trace Event,ExecAll \
    +  --trace Event,ExecAll,FmtFlag \
       --trace-stdout \
       -- \
       --cpu-type TimingSimpleCPU \
    @@ -21610,7 +21957,7 @@ AtomicSimpleCPU::tick() at atomic.cc:757 0x55555907834c
    -

    As of LKMC 9bfbff244d713de40e5686bd370eadb20cf78c7b + 1 the log is now much more complex.

    +

    As of LKMC 78ce2dabe18ef1d87dc435e5bc9369ce82e8d6d2 and gem5 12c917de54145d2d50260035ba7fa614e25317a3 the log is now much more complex.

    Here is an abridged version with:

    @@ -21626,43 +21973,224 @@ AtomicSimpleCPU::tick() at atomic.cc:757 0x55555907834c
    -

    because all that happens in between is exactly the same as the first two instructions and therefore boring:

    +

    because all that happens in between is exactly the same as the first two instructions and therefore boring.

    +
    +
    +

    We have also manually added #N markers to the log lines, so that individual events can be referred to from the analysis sections below:

    +
    +
    +
    -
          0: system.cpu.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    +
    #0       0: Event: system.cpu.wrapped_function_event: EventFunctionWrapped 43 scheduled @ 0
     **** REAL SIMULATION ****
    -      0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 7786250
    -      0: system.mem_ctrls_1.wrapped_function_event: EventFunctionWrapped event scheduled @ 7786250
    -      0: Event_74: generic event scheduled @ 0
    +#1       0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 14 scheduled @ 7786250
    +#2       0: Event: system.mem_ctrls_1.wrapped_function_event: EventFunctionWrapped 20 scheduled @ 7786250
    +#3       0: Event: Event_74: generic 74 scheduled @ 0
     info: Entering event queue @ 0.  Starting simulation...
    -      0: Event_74: generic event rescheduled @ 18446744073709551615
    -      0: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    -      0: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    -      0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    -      0: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 46250
    -      0: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 5000
    -      0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    -  46250: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 74250
    -  74250: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 77000
    -  74250: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped event scheduled @ 77000
    -  77000: Event_40: Timing CPU icache tick event scheduled @ 77000
    -  77000: system.cpu A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    -  77000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 77000
    -  77000: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped event scheduled @ 78000
    -  77000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 95750
    -  77000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 77000
    -  95750: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 123750
    - 123750: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 126000
    - 123750: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped event scheduled @ 126000
    - 126000: Event_40: Timing CPU icache tick event scheduled @ 126000
    -      [...]
    - 469000: system.cpu A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    - 469000: Event_75: generic event scheduled @ 469000
+#4       0: Event: Event_74: generic 74 rescheduled @ 18446744073709551615
+
+#5       0: Event: system.cpu.wrapped_function_event: EventFunctionWrapped 43 executed @ 0
+#6       0: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 0
+#7       0: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 scheduled @ 1000
+
+#8       0: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 0
+#9       0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 12 scheduled @ 0
+#10      0: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 scheduled @ 46250
+#11      0: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 5000
+
+#12      0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 12 executed @ 0
+#13      0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 15 scheduled @ 0
+
+#14      0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 15 executed @ 0
+
+#15   1000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 executed @ 1000
+
+#16   5000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 5000
+
+#17  46250: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 executed @ 46250
+#18  46250: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 scheduled @ 74250
+
+#19  74250: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 executed @ 74250
+#20  74250: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 64 scheduled @ 77000
+#21  74250: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 65 scheduled @ 77000
+
+#22  77000: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 65 executed @ 77000
+
+#23  77000: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 64 executed @ 77000
+#24  77000: Event: Event_40: Timing CPU icache tick 40 scheduled @ 77000
+
+#25  77000: Event: Event_40: Timing CPU icache tick 40 executed @ 77000
+  77000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
+#26  77000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 77000
+#27  77000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 scheduled @ 78000
+
+#28  77000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 77000
+#29  77000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 scheduled @ 95750
+#30  77000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 77000
+
+#31  77000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 77000
+
+#32  78000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 executed @ 78000
+
+#33  95750: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 executed @ 95750
+#34  95750: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 scheduled @ 123750
+
+#35 123750: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 executed @ 123750
+#36 123750: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 64 scheduled @ 126000
+#37 123750: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 65 scheduled @ 126000
+
+#38 126000: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 65 executed @ 126000
+
+#39 126000: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 64 executed @ 126000
+#40 126000: Event: Event_40: Timing CPU icache tick 40 scheduled @ 126000
+
+#41 126000: Event: Event_40: Timing CPU icache tick 40 executed @ 126000
+ 126000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
+#42 126000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 126000
+#43 126000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 scheduled @ 127000
+
+ [...]
+
+ 469000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
+ 469000: Event: Event_75: generic 75 scheduled @ 469000
+ 469000: Event: Event_75: generic 75 executed @ 469000
    -

    0: system.cpu.wrapped_function_event schedule the initial tick, much like for for AtomicSimpleCPU. This time however, it is not a tick, but rather a fetch event that gets scheduled:

    +

    Looking at the generated config.dot.svg gives a better intuition of the shape of the memory system: Figure 2, “config.dot.svg for a TimingSimpleCPU without caches.”, so it is good to keep it in mind.

    +
    +
    +
    +gem5 config TimingSimpleCPU 12c917de54145d2d50260035ba7fa614e25317a3 +
    +
    Figure 2. config.dot.svg for a TimingSimpleCPU without caches.
    +
    +
    +

    It is also helpful to see this as a tree of events where one execute event schedules other events:

    +
    +
    +
    +
        | | | | |
    +    0 1 2 3 4   0 TimingSimpleCPU::fetch
    +    5
    +    |
    +    +---+
    +    |   |
    +    6   7       6 DRAMCtrl::processNextReqEvent
    +    8   15      7 BaseXBar::Layer::releaseLayer
    +    |
    ++---+---+
    +|   |   |
    +9   10  11      9 DRAMCtrl::Rank::processActivateEvent
    +12  17  16     10 DRAMCtrl::processRespondEvent
    +|   |          11 DRAMCtrl::processNextReqEvent
    +|   |
    +13  18         13 DRAMCtrl::Rank::processPowerEvent
    +14  19         18 PacketQueue::processSendEvent
    +    |
    +    +---+
    +    |   |
    +    20  21     20 PacketQueue::processSendEvent
    +    23  22     21 BaseXBar::Layer<SrcType, DstType>::releaseLayer
    +    |
    +    24         24 TimingSimpleCPU::IcachePort::ITickEvent::process
    +    25
    +    |
    +    +---+
    +    |   |
    +    26  27     26 DRAMCtrl::processNextReqEvent
    +    28  32     27 BaseXBar::Layer<SrcType, DstType>::releaseLayer
    +    |
    +    +---+
    +    |   |
    +    29  30     29 DRAMCtrl::processRespondEvent
    +    33  31     30 DRAMCtrl::processNextReqEvent
    +    |
    +    34         34 PacketQueue::processSendEvent
    +    35
    +    |
    +    +---+
    +    |   |
    +    36  37     36 PacketQueue::processSendEvent
    +    39  38     37 BaseXBar::Layer<SrcType, DstType>::releaseLayer
    +    |
    +    40         40 TimingSimpleCPU::IcachePort::ITickEvent::process
    +    41
    +    |
    +    +---+
    +    |   |
    +    42  43     42 DRAMCtrl::processNextReqEvent
    +               43 BaseXBar::Layer<SrcType, DstType>::releaseLayer
    +
    +
    +
    +

    Note that every schedule is followed by an execution, so we put them together, for example:

    +
    +
    +
    +
        |   |
    +    6   7    6 DRAMCtrl::processNextReqEvent
    +    8   15   7 BaseXBar::Layer::releaseLayer
    +    |
    +
    +
    +
    +

    means:

    +
    +
    + +
    +
    +

    With this, we can focus on going up the event tree from an event of interest until we see what originally caused it!

    +
    +
    +

    Notably, the above tree contains the execution of the first two instructions.

    +
    +
    +

    Observe how the events leading up to the second instruction are basically a copy of those of the first one; this is the basic TimingSimpleCPU event loop in action.
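    One way to automate walking up the tree, building on the hypothetical parsing sketch shown earlier: invert the children map into a scheduled_by map (event id -> id of the event whose execution scheduled it) and follow it upwards from the event of interest. The helper below is made up for illustration, and the ids in the example map are hand-extracted from the log above.

# Hypothetical helper: given "which executing event scheduled me" links
# (event ids from the Event trace), print the chain of causes for one event.
def ancestors(scheduled_by, event_id):
    chain = [event_id]
    while chain[-1] in scheduled_by:
        chain.append(scheduled_by[chain[-1]])
    return chain

# Hand-extracted from the log above for the first instruction's icache tick:
# 40 was scheduled while 64 executed, 64 while 8 executed, and so on back to
# the initial fetch event 43.
scheduled_by = {'40': '64', '64': '8', '8': '10', '10': '9', '9': '43'}
print(' <- '.join(ancestors(scheduled_by, '40')))  # 40 <- 64 <- 8 <- 10 <- 9 <- 43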

    +
    +
    +
    19.19.4.2.1. TimingSimpleCPU analysis #0
    +
    +

    Schedules TimingSimpleCPU::fetch through:

    +
    +
    +
    +
    EventManager::schedule
    +TimingSimpleCPU::activateContext
    +SimpleThread::activate
    +Process::initState
    +ArmProcess64::initState
    +ArmLinuxProcess64::initState
    +
    +
    +
    +

    This schedules the initial tick, much like for AtomicSimpleCPU.

    +
    +
    +

    This time, however, it is not a tick as in AtomicSimpleCPU, but rather a fetch event that gets scheduled for later on, since reading DRAM memory now takes time:

    @@ -21682,13 +22210,26 @@ info: Entering event queue @ 0. Starting simulation...
    -

    We have a fetch instead of a tick here compared to AtomicSimpleCPU, because in the timing CPU we must first get the instruction opcode from DRAM, which takes some cycles to return!

    +

    By looking at the source, we see that fetchEvent runs TimingSimpleCPU::fetch.

    -

    By looking at the source, we see that fetchEvent runs TimingSimpleCPU::fetch.

    +

    Just like for AtomicSimpleCPU, this call comes from the initState call, which is exposed on SimObject and ultimately comes from Python.

    +
    +
    +
    +
    19.19.4.2.2. TimingSimpleCPU analysis #1
    +
    +

    Backtrace:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::Rank::startup
    +DRAMCtrl::startup
    +
    -

    0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 7786250: from GDB we see that it comes from DRAMCtrl::startup in mem/dram_ctrl.cc which contains:

    +

    Snippets:

    @@ -21736,12 +22277,35 @@ DRAMCtrl::Rank::startup(Tick ref_tick)
    +

    DRAMCtrl::startup is itself a SimObject method exposed to Python and called from simulate in src/python/m5/simulate.py:

    +
    +
    +
    +
    def simulate(*args, **kwargs):
    +    global need_startup
    +
    +    if need_startup:
    +        root = objects.Root.getInstance()
    +        for obj in root.descendants(): obj.startup()
    +
    +
    +
    +

    where simulate happens after m5.instantiate, and both are called directly from the toplevel scripts, e.g. for se.py in configs/common/Simulation.py:

    +
    +
    +
    +
    def run(options, root, testsys, cpu_class):
    +    ...
    +            exit_event = m5.simulate()
    +
    +
    +

    By looking up some variable definitions in the source, we can now see some memory parameters clearly:

    +
    +
    19.19.4.2.3. TimingSimpleCPU analysis #2
    -

    0: Event_74: generic event scheduled @ 0 and 0: Event_74: generic event rescheduled @ 18446744073709551615: schedule the final exit event, same as for AtomicSimpleCPU

    +

    This is just the startup of the second rank, see: TimingSimpleCPU analysis #1.

    -

    The next interesting event is:

    +

    se.py allocates the memory controller at configs/common/MemConfig.py:

    -
    system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    +
    def config_mem(options, system):
    +
    +    ...
    +
    +    opt_mem_channels = options.mem_channels
    +
    +    ...
    +
    +    nbr_mem_ctrls = opt_mem_channels
    +
    +    ...
    +
    +    for r in system.mem_ranges:
    +        for i in range(nbr_mem_ctrls):
    +            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
    +                                       intlv_size)
    +
    +            ...
    +
    +            mem_ctrls.append(mem_ctrl)
    +
    +
    +
    19.19.4.2.4. TimingSimpleCPU analysis #3 and #4
    +
    +

    From the timestamps we know what these are: the scheduling and then rescheduling of the end of time exit event, like for AtomicSimpleCPU.

    +
    +
    +
    +
    19.19.4.2.5. TimingSimpleCPU analysis #5
    +
    +

    Executes TimingSimpleCPU::fetch().

    +
    -

    which comes from:

    +

    The log shows that event ID 43 is now executing: we had previously seen event 43 get scheduled and had analyzed it to be the initial fetch.

    +
    +
    +

    We can step into TimingSimpleCPU::fetch() to confirm that the expected ELF entry point is being fetched. We can inspect the ELF with:

    -
    #0  Trace::OstreamLogger::logMessage
    -#1  void Trace::Logger::dprintf
    -#2  Event::trace
    -#3  EventQueue::schedule
    -#4  EventManager::schedule
    -#5  DRAMCtrl::addToReadQueue
    -#6  DRAMCtrl::recvTimingReq
    -#7  DRAMCtrl::MemoryPort::recvTimingReq
    -#8  TimingRequestProtocol::sendReq
    -#9  MasterPort::sendTimingReq
    -#10 CoherentXBar::recvTimingReq
    -#11 CoherentXBar::CoherentXBarSlavePort::recvTimingReq(Packet*))
    -#12 TimingRequestProtocol::sendReq
    -#13 MasterPort::sendTimingReq
    -#14 TimingSimpleCPU::sendFetch
    -#15 TimingSimpleCPU::FetchTranslation::finish
    -#16 ArmISA::TLB::translateComplete
    -#17 ArmISA::TLB::translateTiming
    -#18 ArmISA::TLB::translateTiming
    -#19 TimingSimpleCPU::fetch
    -#20 TimingSimpleCPU::<lambda()>::operator()(void)
    -#21 std::_Function_handler<void(), TimingSimpleCPU::TimingSimpleCPU(TimingSimpleCPUParams*)::<lambda()> >
    -#22 std::function<void)>::operator()() const
    -#23 EventFunctionWrapper::process
    -#24 EventQueue::serviceOne
    -#25 doSimLoop
    -#26 simulate
    +
    ./run-toolchain --arch aarch64 readelf -- -h "$(./getvar --arch aarch64 userland_build_dir)/arch/aarch64/freestanding/linux/hello.out"
    -

    From the trace, we see that we are already running from the event queue. Therefore, we must have been running a previously scheduled event, and the previous event logs, the only such event is 0: system.cpu.wrapped_function_event: EventFunctionWrapped event scheduled @ 0 which scheduled a memory fetch!

    +

    which contains:

    +
    +
    +
    +
      Entry point address:               0x400078
    +
    +
    +
    +

    and by the time we go past:

    +
    +
    +
    +
    TimingSimpleCPU::fetch()
    +{
    +    ...
    +    if (needToFetch) {
    +        ...
    +        setupFetchRequest(ifetch_req);
    +        DPRINTF(SimpleCPU, "Translating address %#x\n", ifetch_req->getVaddr());
    +        thread->itb->translateTiming(ifetch_req, thread->getTC(),
    +                &fetchTranslation, BaseTLB::Execute);
    +
    +
    +
    +

    BaseSimpleCPU::setupFetchRequest sets up the fetch of the expected entry point by reading the PC:

    +
    +
    +
    +
    p/x ifetch_req->getVaddr()
    +
    +
    +
    +

    Still during the execution of the fetch, execution then moves into the address translation ArmISA::TLB::translateTiming, and after a call to:

    +
    +
    +
    +
    TLB::translateSe
    +
    +
    +
    +

    the packet now contains the physical address:

    +
    +
    +
    +
    _paddr = 0x78
    +
    +
    +
    +
    +
    19.19.4.2.6. TimingSimpleCPU analysis #6
    +
    +

    Schedules DRAMCtrl::processNextReqEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::addToReadQueue
    +DRAMCtrl::recvTimingReq
    +DRAMCtrl::MemoryPort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +CoherentXBar::recvTimingReq
    +CoherentXBar::CoherentXBarSlavePort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +TimingSimpleCPU::sendFetch
    +TimingSimpleCPU::FetchTranslation::finish
    +ArmISA::TLB::translateComplete
    +ArmISA::TLB::translateTiming
    +ArmISA::TLB::translateTiming
    +TimingSimpleCPU::fetch
    +
    +
    +
    +

    The event loop has started, and the magic initialization scheduling is not happening anymore: now every event is being scheduled from another event.

    +
    +
    +

    From the trace, we see that we are already running from the event queue under TimingSimpleCPU::fetch as expected.

    From the backtrace we see the tortuous path that the data request takes, going through:

    @@ -21847,7 +22499,10 @@ DRAMCtrl::Rank::startup(Tick ref_tick)
    -

    The scheduling happens at frame #5:

    +

    This matches the config.ini system image, since we see that the request goes through the CoherentXBar before reaching memory, like all other CPU memory accesses; see also: gem5 crossbar interconnect.

    +
    +
    +

    The scheduling happens at frame DRAMCtrl::addToReadQueue:

    @@ -21860,94 +22515,387 @@ DRAMCtrl::Rank::startup(Tick ref_tick)
    -

    and from a quick source grep we see that nextReqEvent is a DRAMCtrl::processNextReqEvent.

    +

    From this we deduce that the DRAM has a request queue of some sort (a toy sketch of this pattern follows the list below), and that the fetch:

    +
    +
    +
      +
    • +

      has added a read request to that queue

      +
    • +
    • +

      and has made a future request to read from the queue

      +
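    Here is a toy Python sketch of that deduced pattern: receiving a timing request enqueues the packet and schedules a "process next request" event, whose execution in turn schedules the response later. This is illustrative only, not the real DRAMCtrl code; the method names imitate gem5 ones and the latency number is made up in the spirit of the log timestamps.

import heapq
import itertools

# Toy model of the deduced DRAM controller behaviour. Not gem5 code.
events, order = [], itertools.count()

def schedule(tick, desc, callback):
    print(f'{tick}: {desc} scheduled')
    heapq.heappush(events, (tick, next(order), desc, callback))

class ToyDRAMCtrl:
    RESPOND_LATENCY = 46250  # assumed, roughly matching the log above

    def __init__(self):
        self.read_queue = []

    def recv_timing_req(self, now, addr):
        self.read_queue.append(addr)
        schedule(now, 'processNextReqEvent', self.process_next_req)

    def process_next_req(self, now):
        addr = self.read_queue.pop(0)
        schedule(now + self.RESPOND_LATENCY, 'processRespondEvent',
                 lambda tick: print(f'{tick}: respond with data for {hex(addr)}'))

dram = ToyDRAMCtrl()
dram.recv_timing_req(0, 0x78)  # the fetch of the first instruction
while events:
    tick, _, desc, callback = heapq.heappop(events)
    print(f'{tick}: {desc} executed')
    callback(tick)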
    • +
    -

    The next schedule:

    +

    The signature of the function is:

    -
    0: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    +
    DRAMCtrl::addToReadQueue(PacketPtr pkt, unsigned int pktCount)
    -

    and does a BaseXBar::Layer::releaseLayer event.

    +

    where PacketPtr is of class Packet, and so clearly the packet is coming from above.

    -

    This one is also coming from the request queue at TimingSimpleCPU::fetch. We deduce therefore that the single previous fetch event scheduled not one, but two events!

    -
    -
    -

    Now:

    +

    From:

    -
          0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    +
    p/x *pkt
    -

    comes from the previously scheduled DRAMCtrl::processNextReqEvent and schedules DRAMCtrl::Rank::processPrechargeEvent.

    -
    -
    -

    Now:

    +

    we see:

    -
          0: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 46250
    +
    addr = 0x78
    -

    also runs from DRAMCtrl::processNextReqEvent and schedules a DRAMCtrl::processRespondEvent.

    +

    which from TimingSimpleCPU analysis #5 we know is the physical address of the ELF entry point.

    -

    I’m getting bored, let’s skip to the line that appears to matter for the first instruction:

    +

    Communication goes through certain components via the class Port interface, e.g. at TimingSimpleCPU::sendFetch a call is made to send the packet forward:

    -
      46250: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 74250
    +
    icachePort.sendTimingReq(ifetch_pkt)
    -

    But I got even more bored, and I will now skip to the first event before the instruction:

    +

    which ends up calling:

    -
      77000: Event_40: Timing CPU icache tick event scheduled @ 77000
    -  77000: system.cpu A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +
    peer->recvTimingReq(pkt);
    -

    This event comes from PacketQueue::processSendEvent and schedules itself:

    +

    to reach the receiving side:

    -
    void
    -TimingSimpleCPU::TimingCPUPort::TickEvent::schedule(PacketPtr _pkt, Tick t)
    -{
    -    pkt = _pkt;
    -    cpu->schedule(this, t);
    -}
    +
    CoherentXBar::CoherentXBarSlavePort::recvTimingReq
    -

    which polymorphically resolves to:

    +

    Ports are also used to connect the XBar and the DRAM.

    +
    +
    +

    We will then see that at TimingSimpleCPU analysis #20 a reply packet will come back through the port interface down to the icache port, and only then does the decoding and execution happen.
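    Here is a tiny Python sketch of that port peering pattern: two port objects are bound to each other, and sending a timing request on one side simply calls the receive method on its peer. The class names imitate gem5's master/slave ports, but this is not the real Port code.

# Toy sketch of gem5-style port peering: send on one side calls recv on the
# peer. Names imitate gem5 (MasterPort/SlavePort) but this is not gem5 code.
class ToyPort:
    def __init__(self, name, owner):
        self.name = name
        self.owner = owner
        self.peer = None

    def bind(self, peer):
        self.peer = peer
        peer.peer = self

class ToyMasterPort(ToyPort):
    def send_timing_req(self, pkt):
        return self.peer.recv_timing_req(pkt)   # e.g. CPU -> XBar

class ToySlavePort(ToyPort):
    def recv_timing_req(self, pkt):
        print(f'{self.owner} received request {pkt} on {self.name}')
        return True

icache_port = ToyMasterPort('icache_port', 'cpu')
xbar_slave = ToySlavePort('slave[1]', 'membus')
icache_port.bind(xbar_slave)
icache_port.send_timing_req({'addr': 0x78, 'cmd': 'ReadReq'})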

    +
    +
    +
    +
    19.19.4.2.7. TimingSimpleCPU analysis #7
    +
    +

    Schedules BaseXBar::Layer::releaseLayer through:

    -
    void
    -TimingSimpleCPU::IcachePort::ITickEvent::process()
    -{
    -    cpu->completeIfetch(pkt);
    -}
    +
    EventManager::schedule
    +BaseXBar::Layer<SlavePort, MasterPort>::occupyLayer
    +BaseXBar::Layer<SlavePort, MasterPort>::succeededTiming
    +CoherentXBar::recvTimingReq
    +CoherentXBar::CoherentXBarSlavePort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +TimingSimpleCPU::sendFetch
    +TimingSimpleCPU::FetchTranslation::finish
    +ArmISA::TLB::translateComplete
    +ArmISA::TLB::translateTiming
    +ArmISA::TLB::translateTiming
    +TimingSimpleCPU::fetch
    -

    and so TimingSimpleCPU::completeIfetch is, at last, the interesting TimingSimpleCPU function!

    +

    which schedules a SimpleMemory::release.

    +
    +
    +
    +
    19.19.4.2.8. TimingSimpleCPU analysis #8
    +
    +

    Executes DRAMCtrl::processNextReqEvent.

    +
    +
    +
    +
    19.19.4.2.9. TimingSimpleCPU analysis #9
    +
    +

    Schedules DRAMCtrl::Rank::processActivateEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::activateBank
    +DRAMCtrl::doDRAMAccess
    +DRAMCtrl::processNextReqEvent
    +
    +
    +
    +
    +
    19.19.4.2.10. TimingSimpleCPU analysis #10
    +
    +

    Schedules DRAMCtrl::processRespondEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::processNextReqEvent
    +
    +
    +
    +
    +
    19.19.4.2.11. TimingSimpleCPU analysis #11
    +
    +

    Schedules DRAMCtrl::processNextReqEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::processNextReqEvent
    +
    +
    +
    +
    +
    19.19.4.2.12. TimingSimpleCPU analysis #12
    +
    +

    Executes DRAMCtrl::Rank::processActivateEvent.

    -

    The end of this instruction must be setting things up in a way that can continue the PC walk loop, and by looking at the source and traces, it is clearly from: TimingSimpleCPU::advanceInst which calls TimingSimpleCPU::fetch, which is the very thing we did in this simulation!!! OMG, that’s the loop.

    +

    which schedules:

    +
    +
    +
    +
    19.19.4.2.13. TimingSimpleCPU analysis #13
    +
    +

    Schedules DRAMCtrl::Rank::processPowerEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::Rank::schedulePowerEvent
    +DRAMCtrl::Rank::processActivateEvent
    +
    +
    +
    +
    +
    19.19.4.2.14. TimingSimpleCPU analysis #14
    +
    +

    Executes DRAMCtrl::Rank::processPowerEvent.

    -

    One final thing to check, is how the memory reads are going to make the processor stall in the middle of an instruction.

    +

    This must just be some power statistics stuff, as it does not schedule anything else.

    +
    +
    +
    +
    19.19.4.2.15. TimingSimpleCPU analysis #15
    +
    +

    Executes BaseXBar::Layer<SrcType, DstType>::releaseLayer.

    +
    +
    +
    +
    19.19.4.2.16. TimingSimpleCPU analysis #16
    +
    +

    Executes DRAMCtrl::processNextReqEvent().

    +
    +
    +
    +
    19.19.4.2.17. TimingSimpleCPU analysis #17
    +
    +

    Executes DRAMCtrl::processRespondEvent().

    +
    +
    +
    +
    19.19.4.2.18. TimingSimpleCPU analysis #18
    +
    +

    Schedules PacketQueue::processSendEvent() through:

    +
    +
    +
    +
    PacketQueue::schedSendEvent
    +PacketQueue::schedSendTiming
    +QueuedSlavePort::schedTimingResp
    +DRAMCtrl::accessAndRespond
    +DRAMCtrl::processRespondEvent
    +
    +
    +
    +
    +
    19.19.4.2.19. TimingSimpleCPU analysis #19
    +
    +

    Executes PacketQueue::processSendEvent().

    +
    +
    +
    +
    19.19.4.2.20. TimingSimpleCPU analysis #20
    +
    +

    Schedules PacketQueue::processSendEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +PacketQueue::schedSendEvent
    +PacketQueue::schedSendTiming
    +QueuedSlavePort::schedTimingResp
    +CoherentXBar::recvTimingResp
    +CoherentXBar::CoherentXBarMasterPort::recvTimingResp
    +TimingResponseProtocol::sendResp
    +SlavePort::sendTimingResp
    +RespPacketQueue::sendTiming
    +PacketQueue::sendDeferredPacket
    +PacketQueue::processSendEvent
    +
    +
    +
    +

    From this backtrace, we see that this event is happening as the fetch reply packet finally comes back from DRAM.

    +
    +
    +
    +
    19.19.4.2.21. TimingSimpleCPU analysis #21
    +
    +

    Schedules BaseXBar::Layer<SrcType, DstType>::releaseLayer through:

    +
    +
    +
    +
    EventManager::schedule
    +BaseXBar::Layer<MasterPort, SlavePort>::occupyLayer
    +BaseXBar::Layer<MasterPort, SlavePort>::succeededTiming
    +CoherentXBar::recvTimingResp
    +CoherentXBar::CoherentXBarMasterPort::recvTimingResp
    +TimingResponseProtocol::sendResp
    +SlavePort::sendTimingResp
    +RespPacketQueue::sendTiming
    +PacketQueue::sendDeferredPacket
    +PacketQueue::processSendEvent
    +
    +
    +
    +
    +
    19.19.4.2.22. TimingSimpleCPU analysis #22
    +
    +

    Executes BaseXBar::Layer<SrcType, DstType>::releaseLayer.

    +
    +
    +
    +
    19.19.4.2.23. TimingSimpleCPU analysis #23
    +
    +

    Executes PacketQueue::processSendEvent.

    +
    +
    +
    +
    19.19.4.2.24. TimingSimpleCPU analysis #24
    +
    +

    Schedules TimingSimpleCPU::IcachePort::ITickEvent::process() through:

    +
    +
    +
    +
    EventManager::schedule
    +TimingSimpleCPU::TimingCPUPort::TickEvent::schedule
    +TimingSimpleCPU::IcachePort::recvTimingResp
    +TimingResponseProtocol::sendResp
    +SlavePort::sendTimingResp
    +RespPacketQueue::sendTiming
    +PacketQueue::sendDeferredPacket
    +PacketQueue::processSendEvent
    +
    +
    +
    +
    +
    19.19.4.2.25. TimingSimpleCPU analysis #25
    +
    +

    Executes TimingSimpleCPU::IcachePort::ITickEvent::process().

    +
    +
    +

    This custom process then calls TimingSimpleCPU::completeIfetch(PacketPtr pkt), and that finally executes the very first instruction:

    +
    +
    +
    +
    77000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +
    +
    +
    +

    The end of this instruction must be setting things up in a way that can continue the PC walk loop, and by looking at the source and traces, it clearly comes from TimingSimpleCPU::advanceInst, which calls TimingSimpleCPU::fetch.

    +
    +
    +

    And TimingSimpleCPU::fetch is the very thing we did in this simulation at TimingSimpleCPU analysis #0!!! OMG, that’s the loop.
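    The loop can be caricatured in a few lines of Python: fetch sends a request and returns, and when the response eventually comes back, completeIfetch executes the instruction and advanceInst starts the next fetch. This is a deliberately simplified sketch with the whole memory round trip collapsed into a direct callback; it is not the real TimingSimpleCPU code.

# Caricature of the TimingSimpleCPU main loop:
# fetch -> (memory responds later) -> completeIfetch -> advanceInst -> fetch.
class ToyTimingCPU:
    def __init__(self, program):
        self.program = program
        self.pc = 0

    def fetch(self):
        if self.pc >= len(self.program):
            print('no more instructions, exiting')
            return
        # real code: translate, build a packet, icachePort.sendTimingReq(...)
        self.memory_respond(self.program[self.pc])

    def memory_respond(self, insn):
        # real code: this happens much later, via the port/event machinery
        self.complete_ifetch(insn)

    def complete_ifetch(self, insn):
        print(f'executing {insn} at pc={self.pc}')
        self.advance_inst()

    def advance_inst(self):
        self.pc += 1
        self.fetch()          # and that is the loop

ToyTimingCPU(['movz x0, #1', 'adr x1, #28', 'svc #0']).fetch()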

    +
    +
    +
    +
    19.19.4.2.26. TimingSimpleCPU analysis #26
    +
    +

    Schedules DRAMCtrl::processNextReqEvent through:

    +
    +
    +
    +
    EventManager::schedule
    +DRAMCtrl::addToReadQueue
    +DRAMCtrl::recvTimingReq
    +DRAMCtrl::MemoryPort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +CoherentXBar::recvTimingReq
    +CoherentXBar::CoherentXBarSlavePort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +TimingSimpleCPU::sendFetch
    +TimingSimpleCPU::FetchTranslation::finish
    +ArmISA::TLB::translateComplete
    +ArmISA::TLB::translateTiming
    +ArmISA::TLB::translateTiming
    +TimingSimpleCPU::fetch
    +TimingSimpleCPU::advanceInst
    +TimingSimpleCPU::completeIfetch
    +TimingSimpleCPU::IcachePort::ITickEvent::process
    +
    +
    +
    +
    +
    19.19.4.2.27. TimingSimpleCPU analysis #27
    +
    +

    Schedules BaseXBar::Layer<SrcType, DstType>::releaseLayer through:

    +
    +
    +
    +
    EventManager::schedule
    +BaseXBar::Layer<SlavePort, MasterPort>::occupyLayer
    +BaseXBar::Layer<SlavePort, MasterPort>::succeededTiming
    +CoherentXBar::recvTimingReq
    +CoherentXBar::CoherentXBarSlavePort::recvTimingReq
    +TimingRequestProtocol::sendReq
    +MasterPort::sendTimingReq
    +TimingSimpleCPU::sendFetch
    +TimingSimpleCPU::FetchTranslation::finish
    +ArmISA::TLB::translateComplete
    +ArmISA::TLB::translateTiming
    +ArmISA::TLB::translateTiming
    +TimingSimpleCPU::fetch
    +TimingSimpleCPU::advanceInst
    +TimingSimpleCPU::completeIfetch
    +TimingSimpleCPU::IcachePort::ITickEvent::process
    +
    +
    +
    +
    +
    19.19.4.2.28. TimingSimpleCPU analysis #28
    +
    +

Executes DRAMCtrl::processNextReqEvent.

    +
    +
    +
    +
    19.19.4.2.29. TimingSimpleCPU analysis #29
    +
    +

Schedules DRAMCtrl::processRespondEvent().

    +
    +
    +
    +
    19.19.4.2.30. TimingSimpleCPU analysis: LDR stall
    +
    +

One important thing we want to check now is how memory reads are going to make the processor stall in the middle of an instruction.

    +
    +
    +

Since we are using a simple CPU without a pipeline, a data memory access stalls everything: there is no further progress until memory comes back.

For that, we can use GDB to break at the TimingSimpleCPU::completeIfetch of the first LDR done in our test program.

    @@ -21994,48 +22942,168 @@ TimingSimpleCPU::IcachePort::ITickEvent::process()
    +
    +

    The following is the region of interest of the event log:

    +
    +
    +
    +
     175000: Event: Event_40: Timing CPU icache tick 40 executed @ 175000
    + 175000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 175000
    + 175000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 scheduled @ 176000
    +
    + 175000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 175000
    + 175000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 scheduled @ 193750
    + 175000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 175000
    +
    + 175000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 175000
    +
    + 176000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 60 executed @ 176000
    +
    + 193750: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 executed @ 193750
    + 193750: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 scheduled @ 221750
    +
    + 221750: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 executed @ 221750
    + 221750: Event: system.membus.slave[2]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 66 scheduled @ 224000
    + 221750: Event: system.membus.respLayer2.wrapped_function_event: EventFunctionWrapped 67 scheduled @ 224000
    +
    + 224000: Event: system.membus.respLayer2.wrapped_function_event: EventFunctionWrapped 67 executed @ 224000
    +
    + 224000: Event: system.membus.slave[2]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 66 executed @ 224000
    + 224000: Event: Event_42: Timing CPU dcache tick 42 scheduled @ 224000
    +
    + 224000: Event: Event_42: Timing CPU dcache tick 42 executed @ 224000
    + 175000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8    :   ldr   w2, #4194464       : MemRead :  D=0x0000000000000006 A=0x4000a0  flags=(IsInteger|IsMemRef|IsLoad)
    +
    +
    +
    +

    We first find it by looking for the ExecEnable of LDR.

    +
    +
    +

Then we go up to the previous Timing CPU icache tick event, which, from the analysis of the previous instructions, we know is where the instruction execution starts: the LDR instruction fetch is done by then!

    +
    +
    +

Next, several events happen as the data request percolates through the memory system; it must be very similar to the instruction fetches. TODO: analyze the event function names.

    +
    +
    +

Finally, we reach:

    +
    +
    +
    +
     224000: Event: Event_42: Timing CPU dcache tick 42 executed @ 224000
    + 175000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+8    :   ldr   w2, #4194464       : MemRead :  D=0x0000000000000006 A=0x4000a0  flags=(IsInteger|IsMemRef|IsLoad)
    +
    +
    +
    +

    from which we guess:

    +
    +
    +
      +
    • +

      224000: this is the time that the data request finally returned, and at which execute gets called

      +
    • +
    • +

175000: the log line is only printed at the end of execution, but it shows not the actual time at which things finished, but rather the time at which the ifetch finished, which happened in the past

      +
    • +
    +
    +
    19.19.4.3. gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis with caches
    +

    Let’s just add --caches to see if things go any faster:

    -
          0: system.cpu.wrapped_function_event: EventFunctionWrapped event scheduled @ 0
    +
          0: Event: system.cpu.wrapped_function_event: EventFunctionWrapped 43 scheduled @ 0
     **** REAL SIMULATION ****
    -      0: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 7786250
    -      0: system.mem_ctrls_1.wrapped_function_event: EventFunctionWrapped event scheduled @ 7786250
    -      0: Event_84: generic event scheduled @ 0
    +      0: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 14 scheduled @ 7786250
    +      0: Event: system.mem_ctrls_1.wrapped_function_event: EventFunctionWrapped 20 scheduled @ 7786250
    +      0: Event: Event_84: generic 84 scheduled @ 0
     info: Entering event queue @ 0.  Starting simulation...
    -      0: Event_84: generic event rescheduled @ 18446744073709551615
    -      0: system.cpu.icache.mem_side-MemSidePort.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    -   1000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    -   1000: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped event scheduled @ 2000
    -   1000: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    -   1000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 46250
    -   1000: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped event scheduled @ 5000
    -   1000: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped event scheduled @ 1000
    -  46250: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 74250
    -  74250: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped event scheduled @ 77000
    -  74250: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped event scheduled @ 80000
    -  77000: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped event scheduled @ 78000
    -  78000: Event_40: Timing CPU icache tick event scheduled @ 78000
    -  78000: system.cpu A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    -  78000: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped event scheduled @ 83000
    -  83000: Event_40: Timing CPU icache tick event scheduled @ 83000
    -  83000: system.cpu A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
    +      0: Event: Event_84: generic 84 rescheduled @ 18446744073709551615
    +      0: Event: system.cpu.wrapped_function_event: EventFunctionWrapped 43 executed @ 0
    +      0: Event: system.cpu.icache.mem_side-MemSidePort.wrapped_function_event: EventFunctionWrapped 59 scheduled @ 1000
    +   1000: Event: system.cpu.icache.mem_side-MemSidePort.wrapped_function_event: EventFunctionWrapped 59 executed @ 1000
    +   1000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 1000
    +   1000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 70 scheduled @ 2000
    +   1000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 1000
    +   1000: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 12 scheduled @ 1000
    +   1000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 scheduled @ 46250
    +   1000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 scheduled @ 5000
    +   1000: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 12 executed @ 1000
    +   1000: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 15 scheduled @ 1000
    +   1000: Event: system.mem_ctrls_0.wrapped_function_event: EventFunctionWrapped 15 executed @ 1000
    +   2000: Event: system.membus.reqLayer0.wrapped_function_event: EventFunctionWrapped 70 executed @ 2000
    +   5000: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 9 executed @ 5000
    +  46250: Event: system.mem_ctrls.wrapped_function_event: EventFunctionWrapped 10 executed @ 46250
    +  46250: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 scheduled @ 74250
    +  74250: Event: system.mem_ctrls.port-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 8 executed @ 74250
    +  74250: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 74 scheduled @ 77000
    +  74250: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 75 scheduled @ 80000
    +  77000: Event: system.membus.slave[1]-RespPacketQueue.wrapped_function_event: EventFunctionWrapped 74 executed @ 77000
    +  77000: Event: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped 57 scheduled @ 78000
    +  78000: Event: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped 57 executed @ 78000
    +  78000: Event: Event_40: Timing CPU icache tick 40 scheduled @ 78000
    +  78000: Event: Event_40: Timing CPU icache tick 40 executed @ 78000
    +  78000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue    :   movz   x0, #1, #0        : IntAlu :  D=0x0000000000000001  flags=(IsInteger)
    +  78000: Event: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped 57 scheduled @ 83000
    +  80000: Event: system.membus.respLayer1.wrapped_function_event: EventFunctionWrapped 75 executed @ 80000
    +  83000: Event: system.cpu.icache.cpu_side-CpuSidePort.wrapped_function_event: EventFunctionWrapped 57 executed @ 83000
    +  83000: Event: Event_40: Timing CPU icache tick 40 scheduled @ 83000
    +  83000: Event: Event_40: Timing CPU icache tick 40 executed @ 83000
    +  83000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+4    :   adr   x1, #28            : IntAlu :  D=0x0000000000400098  flags=(IsInteger)
    +  83000: Event: system.cpu.icache.mem_side-MemSidePort.wrapped_function_event: EventFunctionWrapped 59 scheduled @ 84000
       [...]
    - 191000: system.cpu A0 T0 : @asm_main_after_prologue+28    :   svc   #0x0               : IntAlu :   flags=(IsSerializeAfter|IsNonSpeculative|IsSyscall)
    - 191000: Event_85: generic event scheduled @ 191000
+ 191000: Event: Event_85: generic 85 scheduled @ 191000
+ 191000: Event: Event_85: generic 85 executed @ 191000

    So yes, --caches does work here, leading to a runtime of 191000 rather than 469000 without caches!

    +
    +

Notably, we now see that very little time passed between the first and second instructions, presumably because, rather than going out all the way to the DRAM system, the event chain stops right at the icache.cpu_side when a hit happens, which must have been the case for the second instruction, since it is adjacent to the first one.

    +
    +
    +

It is also interesting to look into the generated config.dot.svg to compare it to the one without caches: Figure 2, “config.dot.svg for a TimingSimpleCPU without caches.”. With caches: Figure 3, “config.dot.svg for a TimingSimpleCPU with caches.”.

    +
    +
    +

    We can see from there, that we now have icache and dcache elements inside the CPU block, and that the CPU icache and dcache ports go through the caches to the SystemXBar rather than being directly connected as before.

    +
    +
    +

    It is worth noting that the caches do not affect the ArmITB and ArmDTB TLBs, since those are already caches themselves.

    +
    +
    +
    +
    Figure 3. config.dot.svg for a TimingSimpleCPU with caches.
    +
    -
    19.19.4.4. gem5 event queue MinorCPU syscall emulation freestanding example analysis
    +
    19.19.4.4. gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis with caches and multiple CPUs
    +
    +

    TODO is this the minimal setup that allows us to see the gem5 crossbar interconnect? Can we see anything in AtomicSimpleCPU?

    +
    +
    +

    It would be amazing to analyze a simple example with interconnect packets possibly invalidating caches of other CPUs.

    +
    +
    +

To observe it, we could create a well controlled workload with instructions that flush memory, and run it on two CPUs.

    +
    +
    +

If we don’t use such memory flushing instructions, we would only see the interconnect at work when the caches run out of space.

    +
    +
    +
    +
    Figure 4. config.dot.svg for a system with two TimingSimpleCPU with caches.
    +
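For a concrete starting point, the following is a hypothetical sketch of such a workload (the file name, build command and the choice of DC CIVAC as the memory flushing instruction are my assumptions, not something taken from this repo): two threads keep writing to the same cache line and explicitly cleaning and invalidating it, which should force coherency traffic on every iteration rather than only on natural evictions. It assumes an aarch64 Linux-like target where EL0 cache maintenance is allowed, and that the simulated CPU model implements the operation.

// Hypothetical workload sketch, e.g. flush_pingpong.cpp (names made up).
// Build e.g. with: aarch64-linux-gnu-g++ -O2 -static -pthread flush_pingpong.cpp
#include <atomic>
#include <cstdint>
#include <thread>

// One cache-line-aligned shared location that both CPUs fight over.
alignas(64) static std::atomic<std::uint64_t> shared_line[8];

static void flush_line(void *p) {
    // Clean and invalidate the line to the point of coherency, then barrier.
    asm volatile("dc civac, %0" : : "r"(p) : "memory");
    asm volatile("dsb sy" : : : "memory");
}

static void worker(std::uint64_t value) {
    for (int i = 0; i < 1000; ++i) {
        shared_line[0].store(value, std::memory_order_relaxed);
        flush_line(&shared_line[0]);
    }
}

int main() {
    std::thread t0(worker, 0);
    std::thread t1(worker, 1);
    t0.join();
    t1.join();
}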
    +
    +
    +
    19.19.4.5. gem5 event queue MinorCPU syscall emulation freestanding example analysis

    The events for the Atomic CPU were pretty simple: basically just ticks.

    @@ -22043,6 +23111,12 @@ info: Entering event queue @ 0. Starting simulation...

    But as we venture into more complex CPU models such as MinorCPU, the events get much more complex and interesting.

    +

The memory system part must be similar to that of the TimingSimpleCPU that we previously studied in gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis: the main thing we want to see is how the CPU pipeline speeds up execution by preventing some memory stalls.

    +
    +
    +

The config.dot.svg also indicates that everything is exactly as in gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis with caches, except that the CPU is a MinorCPU instead of a TimingSimpleCPU, and the --caches are now mandatory.

    +
    +

    TODO: analyze the trace for:

    @@ -22060,6 +23134,12 @@ info: Entering event queue @ 0. Starting simulation...
    +
    +
    19.19.4.6. gem5 event queue DerivO3CPU syscall emulation freestanding example analysis
    +
    +

    TODO: like gem5 event queue MinorCPU syscall emulation freestanding example analysis but even more complex!

    +
    +

    19.19.5. gem5 stats internals

    @@ -22209,7 +23289,32 @@ enum class Arch {

    19.19.7. gem5 build system

    -
    19.19.7.1. gem5 polymorphic ISA includes
    +
    19.19.7.1. gem5 build broken on recent compiler version
    +
    +

    gem5 moves a bit slowly, and if your host compiler is very new, the gem5 build might be broken for it, e.g. this was the case for Ubuntu 19.10 with GCC 9 and gem5 62d75e7105fe172eb906d4f80f360ff8591d4178 from Dec 2019.

    +
    +
    +

    This happens mostly because GCC keeps getting more strict with warnings and gem5 uses -Werror.

    +
    +
    +

The specific problem mentioned above was later fixed, but if it ever happens again, you can work around it either by disabling -Werror:

    +
    +
    +
    +
    ./build-gem5 -- CCFLAGS=-Wno-error
    +
    +
    +
    +

    or by installing an older compiler and using it with something like:

    +
    +
    +
    +
    ./build-gem5 -- CC=gcc-8 CXX=g++-8
    +
    +
    +
    +
    +
    19.19.7.2. gem5 polymorphic ISA includes

    E.g. src/cpu/decode_cache.hh includes:

    @@ -22288,7 +23393,7 @@ build/ARM/config/the_isa.hh
    -
    19.19.7.2. Why are all C++ symlinked into the gem5 build dir?
    +
    19.19.7.3. Why are all C++ symlinked into the gem5 build dir?

    Some scons madness.

    @@ -23518,6 +24623,36 @@ echo 1 > /proc/sys/vm/overcommit_memory

    userland/cpp/hello.cpp

  • +

    classes

    +
    + +
    +
  • +
  • templates

      @@ -23586,26 +24721,6 @@ echo 1 > /proc/sys/vm/overcommit_memory
  • -
  • -

    Language madness

    -
    - -
    -
@@ -26166,24 +27281,27 @@ One of the ways I often want to do that, especially when doing [user mode simula

Here is a tiny example that calls just `exit` from the C standard library:

main.S

.global _start
_start:
    mov $0, %rdi
    call exit

Compile and run with:

gcc -ggdb3 -nostartfiles -static -o exit.out exit.S
qemu-x86_64 -d in_asm exit.out

However, for programming convenience, and to potentially keep my examples more OS portable, I would like to avoid making raw system calls, which would of course work, by using C standard library functions instead.
     
     But I'm afraid that some of those C standard library functions will fail in subtle ways because I have skipped required initialization steps that would normally happen before `main`.
     
    @@ -34170,10 +35288,263 @@ west build -b qemu_aarch64 samples/hello_world

    Algorithms to keep the caches of different cores of a system coherent.

    +

The main goal of such systems is to reduce the number of messages that have to be sent on the coherency bus, and most importantly to memory (accesses to which first pass through the coherency bus).

    +
    +

E.g.: if one processor writes to its cache, other processors have to know about it before they read from that address.

    +
    +

    The main software use case example to have in mind is that of multiple threads incrementing an atomic counter as in userland/cpp/atomic/std_atomic.cpp, see also: atomic.cpp.

    +
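To make the use case concrete, here is a minimal standalone counter in the spirit of that example (written from scratch here, not copied from the repo): several threads increment the same std::atomic, so its cache line keeps migrating between the cores' private caches, and it is the coherency protocol that guarantees the final count is exact.

// Minimal atomic counter: the interesting part for this chapter is not the
// C++ semantics, but the cache line of `counter` bouncing between cores.
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

int main() {
    std::atomic<unsigned long> counter{0};
    const int nthreads = 4;
    const unsigned long iters = 100000;
    std::vector<std::thread> threads;
    for (int i = 0; i < nthreads; ++i) {
        threads.emplace_back([&counter, iters] {
            for (unsigned long j = 0; j < iters; ++j)
                counter.fetch_add(1, std::memory_order_relaxed);
        });
    }
    for (auto &t : threads)
        t.join();
    // Always prints nthreads * iters, i.e. 400000.
    std::cout << counter.load() << std::endl;
}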
    -

    32.1.1. MSI protocol

    +

    32.1.1. VI protocol

    +
    +

    Mentioned at:

    +
    + +
    +

This is the most trivial protocol, but it is likely so bad that most sources don’t even mention it.

    +
    +
    +

In what follows I make up some of the design choice comparisons; this needs confirmation.

    +
    +
    +

In this protocol, every cache line needs only a single bit of state: validity.

    +
    +
    +

    At the start, everything is invalid.

    +
    +
    +

Then, when you need to read and are invalid, you send a read on the bus. If there is another valid cache in another CPU, it services the request. Otherwise, the request goes to memory. After the read you become valid.

    +
    +
    +

A read while valid generates no bus requests, which is good.

    +
    +
    +

    When you write, if you are invalid, you must first read to get the full cache line, like for any other protocol.

    +
    +
    +

    Then, there are two possible design choices, either:

    +
    +
    +
      +
    • +

      that read is marked as exclusive, and all caches that had it snoop and become invalid.

      +
      +

      Upside: no need to send the new data to the bus.

      +
      +
      +

      Downside: more invalidations. But those are not too serious, because future invalid reads tend to just hit the remaining valid cache.

      +
      +
    • +
    • +

      after the read and write, you send the data on the bus, and those that had it update and become valid.

      +
      +

Downside: much more data on the bus, so likely this is not going to be the best choice.

      +
      +
    • +
    +
    +
    +

    So we take the first option.

    +
    +
    +

When you write and are valid, you don’t need to read. But you still have to invalidate everyone else, because multiple reads can lead to multiple valid holders, and otherwise the other valid holders would keep reading old values.

    +
    +
    +

We could either do this with an exclusive read, and ignore the return, or with a new Invalidate request that has no reply. This invalidation is called BusUpgr to match Wikipedia’s naming.

    +
    +
    +

    Write also has two other possible design choices, either:

    +
    +
    +
      +
    • +

      every write writes through to memory. This is likely never the best option.

      +
    • +
    • +

      when the cache is full, eviction leads to a write to memory.

      +
      +

If multiple valid holders may exist, then this may lead to multiple writebacks of the same line to memory.

      +
      +
    • +
    +
    +
    +

    So we take the second option.

    +
    +
    +

    With this we would have:

• V
  • PrRd: stay in V, no bus message
  • PrWr: stay in V, send BusUpgr
  • BusRd: stay in V, send BusData
  • BusRdX: go to I, send BusData
  • BusUpgr: go to I, no bus message
• I
  • PrRd: go to V, send BusRd
  • PrWr: go to V, send BusRdX
  • BusRd: stay in I, no bus message
  • BusRdX: stay in I, no bus message
  • BusUpgr: stay in I, no bus message

    Here Flush and BusData replies are omitted since those never lead to a change of state, nor to the sending of further messages.

    +
    +
    +
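The list above can also be written down directly as a state machine. Below is a minimal standalone C++ sketch of it (the names and types are made up for this illustration; this is not gem5 or SLICC code): given the current state of a line and an observed processor or bus event, it returns the next state plus the bus message to send, if any.

// Standalone sketch of the VI state machine from the list above.
#include <cstdio>

enum class State { I, V };
enum class Event { PrRd, PrWr, BusRd, BusRdX, BusUpgr };
enum class Msg { None, BusRd, BusRdX, BusUpgr, BusData };

struct Transition { State next; Msg send; };

static Transition vi(State s, Event e) {
    if (s == State::V) {
        switch (e) {
        case Event::PrRd:    return {State::V, Msg::None};
        case Event::PrWr:    return {State::V, Msg::BusUpgr};
        case Event::BusRd:   return {State::V, Msg::BusData};
        case Event::BusRdX:  return {State::I, Msg::BusData};
        case Event::BusUpgr: return {State::I, Msg::None};
        }
    } else {
        switch (e) {
        case Event::PrRd:    return {State::V, Msg::BusRd};
        case Event::PrWr:    return {State::V, Msg::BusRdX};
        case Event::BusRd:
        case Event::BusRdX:
        case Event::BusUpgr: return {State::I, Msg::None};
        }
    }
    return {s, Msg::None};
}

int main() {
    // A line starts invalid, the local CPU writes it (read for ownership),
    // then a remote CPU's BusRdX is snooped and we fall back to invalid.
    Transition t = vi(State::I, Event::PrWr);
    std::printf("after PrWr:   next=%d send=%d\n", (int)t.next, (int)t.send);
    t = vi(t.next, Event::BusRdX);
    std::printf("after BusRdX: next=%d send=%d\n", (int)t.next, (int)t.send);
}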

    TODO at:

    +
    + +
    +

Why does PrWr stay in invalid? Why do writes always go to memory? Why not wait until eviction?

    +
    +
    +
    +

    32.1.2. MSI protocol

    @@ -34399,6 +35770,9 @@ CACHE2 S nyy

Since the writer will become the new sole owner of the line, it can get the cache line directly from us without going to DRAM at all! This is fine because DRAM can remain out of date without problems: the writer now holds the only up-to-date copy.

    +
    • @@ -34464,7 +35838,7 @@ CACHE2 S nyy
    -

    32.1.2. MESI protocol

    +

    32.1.3. MESI protocol

    @@ -34499,7 +35873,7 @@ CACHE2 S nyy
    -

    32.1.3. MOSI protocol

    +

    32.1.4. MOSI protocol

    https://en.wikipedia.org/wiki/MOSI_protocol

    @@ -34508,7 +35882,7 @@ CACHE2 S nyy
    -

    32.1.4. MOESI protocol

    +

    32.1.5. MOESI protocol

    https://en.wikipedia.org/wiki/MOESI_protocol