From bcdc72552d9df7d8550335977f485dbee61c08f8 Mon Sep 17 00:00:00 2001
From: Ciro Santilli 六四事件 法轮功
Date: Thu, 7 May 2020 01:00:00 +0000
Subject: [PATCH] 252dd80500cd3aa36cffc1ea6474f32a1900ec91

---
 index.html | 1305 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 1245 insertions(+), 60 deletions(-)

diff --git a/index.html b/index.html
index 558cf84..bf1647e 100644
--- a/index.html
+++ b/index.html
@@ -1226,7 +1226,11 @@ body.book #toc,body.book #preamble,body.book h1.sect0,body.book .sect1>h2{page-b
@@ -7863,6 +7890,24 @@ qemu-x86_64: /path/to/linux-kernel-module-cheat/submodules/qemu/accel/tcg/cpu-ex

And a native Ubuntu 18.04 AMD64 run with static compilation segfaults.

+

As of LKMC f5d4998ff51a548ed3f5153aacb0411d22022058 the aarch64 error:

+
+
+
+
./run --arch aarch64 --userland userland/cpp/atomic/fail.cpp --static
+
+
+
+

is:

+
+
+
+
terminate called after throwing an instance of 'std::system_error'
+  what():  Unknown error 16781344
+qemu: uncaught target signal 6 (Aborted) - core dumped
+
+
+

The workaround:

@@ -7871,7 +7916,7 @@ qemu-x86_64: /path/to/linux-kernel-module-cheat/submodules/qemu/accel/tcg/cpu-ex
-

fixes some of the problems, but not all, so we are just skipping those tests for now.

+

fixes some of the problems, but not all (TODO: which ones were still failing?), so we are just skipping those tests for now.

@@ -21153,6 +21198,9 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
+

Note that the --ruby option has some crazy side effects besides enabling Ruby, e.g. it sets the default --cpu-type to TimingSimpleCPU instead of the otherwise default AtomicSimpleCPU. But why?

+
+

It is not possible to build more than one Ruby system into a single build, and this is a major pain point for testing Ruby: https://gem5.atlassian.net/browse/GEM5-467

@@ -21239,6 +21287,36 @@ cat "$(./getvar --arch aarch64 --emulator gem5 trace_txt_file)"

This is the simplest of all protocols, and therefore the first one you should study to learn how Ruby works.

+ +
+

Our full command line will be something like

+
+
+
+
./build-gem5 --arch aarch64 --gem5-build-id MI_example
+./run \
+  --arch aarch64 \
+  --cli-args '2 100' \
+  --cpus 3 \
+  --emulator gem5 \
+  --userland userland/cpp/atomic/aarch64_add.cpp \
+  --gem5-build-id MI_example \
+  -- \
+  --ruby \
+;
+
+
+
+

which produces a config.dot.svg like the following, but with 3 CPUs instead of 2:

+
+
+
gem5 config TimingSimpleCPU 3 CPUs MI example b1623cb2087873f64197e503ab8894b5e4d4c7b4
+
Figure 2. config.dot.svg for a system with three TimingSimpleCPU CPUs with the Ruby MI_example protocol.
+
19.15.6.2. gem5 crossbar interconnect
@@ -21965,6 +22043,170 @@ exec filecode in scope

Tested at gem5 b4879ae5b0b6644e6836b0881e4da05c64a6550d.

+
+
19.20.3.1. gem5 m5.objects module
+
+

All SimObjects seem to be automatically added to the m5.objects namespace, and this is done in a very convoluted way, let’s try to understand a bit:

+
+
+
+
src/python/m5/objects/__init__.py
+
+
+
+

contains:

+
+
+
+
modules = __loader__.modules
+
+for module in modules.keys():
+    if module.startswith('m5.objects.'):
+        exec("from %s import *" % module)
+
+
+
+

And from IPDB we see that this appears to loop over every module name string of the form m5.objects.modulename.

+
+
+

This init gets called from src/python/importer.py at the exec:

+
+
+
+
class CodeImporter(object):
+    def load_module(self, fullname):
+            override = os.environ.get('M5_OVERRIDE_PY_SOURCE', 'false').lower()
+            if override in ('true', 'yes') and  os.path.exists(abspath):
+                src = open(abspath, 'r').read()
+                code = compile(src, abspath, 'exec')
+
+            if os.path.basename(srcfile) == '__init__.py':
+                mod.__path__ = fullname.split('.')
+                mod.__package__ = fullname
+            else:
+                mod.__package__ = fullname.rpartition('.')[0]
+            mod.__file__ = srcfile
+
+            exec(code, mod.__dict__)
+
+import sys
+importer = CodeImporter()
+add_module = importer.add_module
+sys.meta_path.append(importer)
+
+
+
+

Here, as a bonus, we also see how M5_OVERRIDE_PY_SOURCE works.

+
+
+

In src/SConscript we see that SimObject is just a PySource with module equal to m5.objects:

+
+
+
+
class SimObject(PySource):
+    def __init__(self, source, tags=None, add_tags=None):
+        '''Specify the source file and any tags (automatically in
+        the m5.objects package)'''
+        super(SimObject, self).__init__('m5.objects', source, tags, add_tags)
+
+
+
+

The add_module method seems to be doing the magic and is called from src/sim/init.cc:

+
+
+
+
bool
+EmbeddedPython::addModule() const
+{
+    PyObject *code = getCode();
+    PyObject *result = PyObject_CallMethod(importerModule, PyCC("add_module"),
+
+
+
+

which is called from:

+
+
+
+
int
+EmbeddedPython::initAll()
+{
+    // Load the importer module
+    PyObject *code = importer->getCode();
+    importerModule = PyImport_ExecCodeModule(PyCC("importer"), code);
+    if (!importerModule) {
+        PyErr_Print();
+        return 1;
+    }
+
+    // Load the rest of the embedded python files into the embedded
+    // python importer
+    list<EmbeddedPython *>::iterator i = getList().begin();
+    list<EmbeddedPython *>::iterator end = getList().end();
+    for (; i != end; ++i)
+        if (!(*i)->addModule())
+
+
+
+

and getList comes from:

+
+
+
+
EmbeddedPython::EmbeddedPython(const char *filename, const char *abspath,
+    const char *modpath, const unsigned char *code, int zlen, int len)
+    : filename(filename), abspath(abspath), modpath(modpath), code(code),
+      zlen(zlen), len(len)
+{
+    // if we've added the importer keep track of it because we need it
+    // to bootstrap.
+    if (string(modpath) == string("importer"))
+        importer = this;
+    else
+        getList().push_back(this);
+}
+
+list<EmbeddedPython *> &
+EmbeddedPython::getList()
+{
+    static list<EmbeddedPython *> the_list;
+    return the_list;
+}
+
+
+
+

and the constructor in turn gets called from per-SimObject autogenerated files such as dev/storage/Ide.py.cc for src/dev/storage/Ide.py:

+
+
+
+
EmbeddedPython embedded_m5_objects_Ide(
+    "m5/objects/Ide.py",
+    "/home/ciro/bak/git/linux-kernel-module-cheat/data/gem5/master4/src/dev/storage/Ide.py",
+    "m5.objects.Ide",
+    data_m5_objects_Ide,
+    947,
+    2099);
+
+} // anonymous namespace
+
+
+
+

which get autogenerated at src/SConscript:

+
+
+
+
def embedPyFile(target, source, env):
+
+for source in PySource.all:
+    base_py_env.Command(source.cpp, [ py_marshal, source.tnode ],
+                        MakeAction(embedPyFile, Transform("EMBED PY")))
+
+
+
+

where PySource.all, as you might expect, is a static list of all PySource source files, which gets updated in the constructor.
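
This is the same self-registration idiom as EmbeddedPython::getList above. A minimal standalone C++ rendering of the pattern (my own sketch, not actual gem5 or SCons code):

#include <list>

// Every constructed instance appends itself to a function-local static
// list, so all instances ever created can be iterated later, just like
// PySource.all and EmbeddedPython::getList().
struct SelfRegistering {
    SelfRegistering() { all().push_back(this); }
    static std::list<SelfRegistering *> &all() {
        static std::list<SelfRegistering *> the_list; // created on first use
        return the_list;
    }
};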

+
+
+

Tested in gem5 d9cb548d83fa81858599807f54b52e5be35a6b03.

+
+

19.20.4. gem5 event queue

@@ -21972,6 +22214,9 @@ exec filecode in scope

gem5 is an event based simulator, and as such the event queue is one of the crucial elements in the system.

+

Every single action that takes time (e.g. notably reading from memory) models that time delay by scheduling an event in the future.

+
+

The gem5 event queue stores one callback event for each future point in time.
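
To make the idea concrete, here is a minimal self-contained C++ sketch of an event-driven loop (hypothetical code, not gem5's actual EventQueue classes): callbacks are keyed by a future tick and run in time order.

#include <cstdint>
#include <cstdio>
#include <functional>
#include <map>

int main() {
    using Tick = std::uint64_t;
    // The event queue: callbacks keyed by the tick at which they fire.
    std::multimap<Tick, std::function<void()>> queue;
    Tick now = 0;

    // Model "memory answers 100 ticks from now" by scheduling an event.
    queue.emplace(now + 100, [] { std::puts("memory response arrives"); });

    // Main loop: jump straight to the next scheduled tick and run it.
    while (!queue.empty()) {
        auto it = queue.begin();
        now = it->first;
        it->second();
        queue.erase(it);
    }
}

gem5's real queue adds more machinery on top of this basic shape, e.g. event priorities for events scheduled at the same tick.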

@@ -22477,8 +22722,26 @@ clock=500
-
        fault = thread->itb->translateAtomic(ifetch_req, thread->getTC(),
-                                                BaseTLB::Execute);
+
fault = thread->itb->translateAtomic(ifetch_req, thread->getTC(),
+                                        BaseTLB::Execute);
+
+
+
+

and later on after translation the memory is obtained at:

+
+
+
+
icache_latency = sendPacket(icachePort, &ifetch_pkt);
+
+
+
+

which sends the packet atomically through the port:

+
+
+
+
AtomicSimpleCPU::sendPacket(MasterPort &port, const PacketPtr &pkt) {
+    return port.sendAtomic(pkt);
+}
@@ -22486,15 +22749,15 @@ clock=500
-
        thread->itb->translateTiming(ifetch_req, thread->getTC(),
-                &fetchTranslation, BaseTLB::Execute);
+
thread->itb->translateTiming(ifetch_req, thread->getTC(),
+        &fetchTranslation, BaseTLB::Execute);

and so there it is: the ITB classes are the same, but there are separate Atomic and Timing methods!

-

The Timing one calls ArmISA::TLB::translateComplete

+

The timing request is shown further at: sends the packet atomically.

Tested in gem5 b4879ae5b0b6644e6836b0881e4da05c64a6550d.

@@ -22708,13 +22971,13 @@ info: Entering event queue @ 0. Starting simulation...
-

Looking into the generated config.dot.svg can give a better intuition on the shape of the memory system: Figure 2, “config.dot.svg for a TimingSimpleCPU without caches.”, so it is good to keep that in mind.

+

Looking into the generated config.dot.svg can give a better intuition on the shape of the memory system: Figure 3, “config.dot.svg for a TimingSimpleCPU without caches.”, so it is good to keep that in mind.

gem5 config TimingSimpleCPU 12c917de54145d2d50260035ba7fa614e25317a3
-
Figure 2. config.dot.svg for a TimingSimpleCPU without caches.
+
Figure 3. config.dot.svg for a TimingSimpleCPU without caches.

It is also helpful to see this as a tree of events where one execute event schedules other events:

@@ -23621,6 +23884,9 @@ TimingSimpleCPU::IcachePort::ITickEvent::process

One important thing we want to check now, is how the memory reads are going to make the processor stall in the middle of an instruction.

+

This is also discussed at: gem5 execute vs initiateAcc vs completeAcc.

+
+

Since we were using a simple CPU without a pipeline, the data memory access stalls everything: there is no further progress until memory comes back.

@@ -23653,20 +23919,16 @@ TimingSimpleCPU::IcachePort::ITickEvent::process
-

So, where is the execute happening? Well, I’ll satisfy myself with a quick source grep and guess:

+

We can verify that execute never happens by putting a breakpoint on ArmISAInst::LDRXL64_LIT::execute which never gets called.

-
- +
+

Later on, when the memory access completes the event calls TimingSimpleCPU::completeDataAccess which calls ArmISAInst::LDRXL64_LIT::completeAcc, which sets the register value to what was read from memory.

+
+
+

More memory event details can be seen at: gem5 functional vs atomic vs timing memory requests.

The following is the region of interest of the event log:

@@ -23805,7 +24067,7 @@ TimingSimpleCPU::IcachePort::ITickEvent::process

Notably, we now see that very little time passed between the first and second instructions which are marked with ExecEnable in #39 and #47, presumably because rather than going out all the way to the DRAM system the event chain stops right at the icache.cpu_side when a hit happens, which must have been the case for the second instruction, which is just adjacent to the first one.

-

It is also interested to look into the generated config.dot.svg to compare it to the one without caches: Figure 2, “config.dot.svg for a TimingSimpleCPU without caches.”. With caches: Figure 3, “config.dot.svg for a TimingSimpleCPU with caches.”.

+

It is also interesting to look into the generated config.dot.svg to compare it to the one without caches: Figure 3, “config.dot.svg for a TimingSimpleCPU without caches.”. With caches: Figure 4, “config.dot.svg for a TimingSimpleCPU with caches.”.

We can see from there, that we now have icache and dcache elements inside the CPU block, and that the CPU icache and dcache ports go through the caches to the SystemXBar rather than being directly connected as before.

@@ -23817,7 +24079,7 @@ TimingSimpleCPU::IcachePort::ITickEvent::process
gem5 config TimingSimpleCPU caches 12c917de54145d2d50260035ba7fa614e25317a3
-
Figure 3. config.dot.svg for a TimingSimpleCPU with caches.
+
Figure 4. config.dot.svg for a TimingSimpleCPU with caches.

We can break down the events between the instructions as follows.

@@ -24036,14 +24298,32 @@ type=SetAssociative

If we don’t use such instructions that flush memory, we would only see the interconnect at work when caches run out.

+
+

For this study, we will use the same CLI as gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis but with multiple CPUs and a program like atomic.cpp which shares a variable across threads.

+
+
+

We will then focus on the behaviour of the memory of the shared variable to see if we can observe cache coherency on the crossbar:

+
+
+
+
./run \
+  --arch aarch64 \
+  --cli-args '2 100' \
+  --cpus 3 \
+  --emulator gem5 \
+  --gem5-worktree master3 \
+  --userland userland/cpp/atomic/aarch64_add.cpp \
+;
+
+
+
+

The config.dot.svg now looks like this but with 3 CPUs instead of 2:

+
gem5 config TimingSimpleCPU caches 2 CPUs 12c917de54145d2d50260035ba7fa614e25317a3
-
Figure 4. config.dot.svg for a system with two TimingSimpleCPU with caches.
-
-
-

The simplest setup to understand will be to use gem5 syscall emulation multiple executables.

+
Figure 5. config.dot.svg for a system with two TimingSimpleCPU with caches.
@@ -24086,7 +24366,601 @@ type=SetAssociative
-

19.20.5. gem5 ThreadContext vs ThreadState vs ExecContext vs Process

+

19.20.5. gem5 instruction definitions

+
+

This is one of the parts of gem5 that rely on semi-useless code generation inside the .isa sublanguage.

+
+
+

Which is mostly Python, with some magic letters thrown in for good measure.

+
+
+

The class definitions get all dumped into one humongous C++ include file:

+
+
+
+
build/ARM/arch/arm/generated/exec-ns.cc.inc
+
+
+
+

That file defines the key methods of each instruction, e.g. the ARM immediate ADD instruction has its execute method defined there:

+
+
+
+
    Fault AddImm::execute(
+        ExecContext *xc, Trace::InstRecord *traceData) const
+
+
+
+

or for example the key methods of an ARM 64-bit (X) STR with an immediate offset (STR <Xt>, [<Xn|SP>], #<simm>):

+
+
+
+
    Fault STRX64_IMM::execute(ExecContext *xc,
+                                  Trace::InstRecord *traceData) const
+
+    Fault STRX64_IMM::initiateAcc(ExecContext *xc,
+                                      Trace::InstRecord *traceData) const
+
+    Fault STRX64_IMM::completeAcc(PacketPtr pkt, ExecContext *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+
+
+
+

We also notice that the key argument passed to those instructions is of type ExecContext, which is discussed further at: Section 19.20.7.3, “gem5 ExecContext”.

+
+
+

The file is an include so that compilation can be split up into chunks by the autogenerated includers:

+
+
+
+
build/ARM/arch/arm/generated/generic_cpu_1.cc
+build/ARM/arch/arm/generated/generic_cpu_2.cc
+...
+
+
+
+

via the __SPLIT macro as in:

+
+
+
+
#include "exec-g.cc.inc"
+#include "cpu/exec_context.hh"
+#include "decoder.hh"
+namespace ArmISAInst {
+#define __SPLIT 1
+#include "exec-ns.cc.inc"
+}
+
+
+
+

This is likely done to not overload the C++ compiler. But it sure enough overloads IDEs and GDB, which take forever to load the source of any frames going through it.

+
+
+

We should split that file into one per class for the love of God.

+
+
+

The autogenerated instruction class declarations can be found at:

+
+
+
+
build/ARM/arch/arm/generated/decoder-ns.hh.inc
+
+
+
+

and the autogenerated bulk of the decoder:

+
+
+
+
build/ARM/arch/arm/generated/decoder-ns.cc.inc
+
+
+
+

which also happens to contain the constructor definitions of the instruction classes, e.g. for the ADD immediate because why not:

+
+
+
+
    AddImm::AddImm(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          uint32_t _imm,
+                                          bool _rotC)
+
+
+
+

The above files get tied together in the autogenerated:

+
+
+
+
build/ARM/arch/arm/generated/decoder.hh
+
+
+
+

which contains:

+
+
+
+
#include "decoder-g.hh.inc"
+namespace ArmISAInst {
+#include "decoder-ns.hh.inc"
+}
+
+
+
+

Different instructions inherit from different classes, e.g. the ARM immediate ADD instruction is a DataImmOp:

+
+
+
+
class AddImm : public DataImmOp
+{
+    public:
+        // Constructor
+        AddImm(ExtMachInst machInst, IntRegIndex _dest,
+                IntRegIndex _op1, uint32_t _imm, bool _rotC=true);
+        Fault execute(ExecContext *, Trace::InstRecord *) const override;
+};
+
+
+
+

and STRX64_IMM is an ArmISA::MemoryImm64:

+
+
+
+
    class STRX64_IMM : public ArmISA::MemoryImm64
+    {
+      public:
+
+        /// Constructor.
+        STRX64_IMM(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _base, int64_t _imm);
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const override;
+        Fault completeAcc(PacketPtr, ExecContext *,
+                          Trace::InstRecord *) const override;
+
+        void
+        annotateFault(ArmFault *fault) override
+        {
+                    fault->annotate(ArmFault::SAS, 3);
+                    fault->annotate(ArmFault::SSE, false);
+                    fault->annotate(ArmFault::SRT, dest);
+                    fault->annotate(ArmFault::SF, true);
+                    fault->annotate(ArmFault::AR, false);
+        }
+    };
+
+
+
+

but different memory instructions can have different base classes too e.g. STXR:

+
+
+
+
class STXRX64 : public ArmISA::MemoryEx64
+
+
+
+

A summarized class hierarchy for the above is:

+
+
+
• StaticInst
    • ArmISA::ArmStaticInst
        • ArmISA::PredOp
            • ArmISA::DataImmOp
                • ArmISA::AddImm
            • ArmISA::MightBeMicro64
                • ArmISA::Memory64
                    • ArmISA::MemoryImm64
                        • ArmISA::MemoryEx64
                            • ArmISA::STXRX64
+
+
+

Tested in gem5 b1623cb2087873f64197e503ab8894b5e4d4c7b4.

+
+
+
19.20.5.1. gem5 execute vs initiateAcc vs completeAcc
+
+

These are the key methods defined in instruction definitions, so let’s see when each one gets called and what they do, more or less.

+
+
+

execute is the only one of the three that gets defined by "non-memory" instructions.

+
+
+

Memory instructions define all three.

+
+
+

The three methods are present in the base class StaticInst:

+
+
+
+
    virtual Fault execute(ExecContext *xc,
+                          Trace::InstRecord *traceData) const = 0;
+
+    virtual Fault initiateAcc(ExecContext *xc,
+                              Trace::InstRecord *traceData) const
+    {
+        panic("initiateAcc not defined!");
+    }
+
+    virtual Fault completeAcc(Packet *pkt, ExecContext *xc,
+                              Trace::InstRecord *traceData) const
+    {
+        panic("completeAcc not defined!");
+    }
+
+
+
+

so we see that all instructions must implement execute, while overriding initiateAcc and completeAcc is optional and only done by classes for which those might get called: memory instructions.

+
+
+

execute is what does the actual job for non-memory instructions (obviously, since it is the only one of the three methods that is defined as not panic for those).

+
+
+

Memory instructions however run either:

+
+
+
    +
  • +

    execute in AtomicSimpleCPU: this does the entire memory access in one go

    +
  • +
  • +

    initiateAcc + completeAcc in timing CPUs. initiateAcc is called when the instruction starts executing, and completeAcc is called when the memory fetch returns from the memory system.

    +
  • +
+
+
+

This can be seen concretely in GDB from the analysis done at: TimingSimpleCPU analysis: LDR stall, and for more memory details see gem5 functional vs atomic vs timing memory requests.
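
The control flow difference can be summarized with a minimal C++ sketch (hypothetical helper names; only execute, initiateAcc and completeAcc mirror gem5):

struct ExecContext {};
struct Packet {};

struct StaticInst {
    virtual ~StaticInst() = default;
    virtual void execute(ExecContext *xc) const = 0;                 // atomic path
    virtual void initiateAcc(ExecContext *xc) const {}               // timing: start access
    virtual void completeAcc(Packet *pkt, ExecContext *xc) const {}  // timing: finish access
};

// Atomic-style CPU: the whole access happens synchronously in execute().
void atomicTick(const StaticInst &inst, ExecContext &xc) {
    inst.execute(&xc);
}

// Timing-style CPU: initiateAcc() issues the request, and when the
// memory system's response event later fires, completeAcc() is called
// with the response packet.
void timingInitiate(const StaticInst &inst, ExecContext &xc) {
    inst.initiateAcc(&xc);
}
void timingMemoryResponse(const StaticInst &inst, ExecContext &xc, Packet &pkt) {
    inst.completeAcc(&pkt, &xc);
}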

+
+
+
19.20.5.1.1. gem5 completeAcc
+
+

completeAcc is boring on most simple store memory instructions, e.g. a simple STR:

+
+
+
+
    Fault STRX64_IMM::completeAcc(PacketPtr pkt, ExecContext *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+
+
+
+

This is because the store basically does all of its job in initiateAcc, creating the memory write request.

+
+
+

Loads however have non-trivial completeAcc, because now we have at the very least, to save the value read from memory into a CPU register.

+
+
+

Things are much more interesting however on more complex instructions, for example STXR (hand formatted here):

+
+
+
+
Fault STXRX64::completeAcc(PacketPtr pkt, ExecContext *xc,
+                                    Trace::InstRecord *traceData) const {
+    Fault fault = NoFault;
+    uint64_t XResult = 0;
+    uint32_t SevMailbox = 0;
+    uint32_t LLSCLock = 0;
+    uint64_t writeResult = pkt->req->getExtraData();
+    XResult = !writeResult; SevMailbox = 1; LLSCLock = 0;
+    if (fault == NoFault) {
+        {
+            uint64_t final_val = XResult;
+            xc->setIntRegOperand(this, 0, (XResult) & mask(aarch64 ? 64 : 32));
+            if (traceData) { traceData->setData(final_val); }
+        }
+        xc->setMiscRegOperand(this, 1, SevMailbox);
+        if (traceData) { traceData->setData(SevMailbox); }
+        xc->setMiscRegOperand(this, 2, LLSCLock);
+        if (traceData) { traceData->setData(LLSCLock); }
+    }
+    return fault;
+}
+
+
+
+

From GDB on TimingSimpleCPU analysis: LDR stall we see that completeAcc gets called from TimingSimpleCPU::completeDataAccess.

+
+
+
+
+
+

19.20.6. gem5 port system

+
+

The gem5 memory system is connected in a very flexible way through the port system.

+
+
+

This system exists to allow seamlessly connecting any combination of CPU, caches, interconnects, DRAM and peripherals.
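
The core idea can be shown with a toy self-contained C++ sketch (made-up names, not gem5's Port classes): two ports are bound as peers, and a send on one side becomes a receive callback on the other, no matter which components sit on each side.

#include <cassert>
#include <cstdio>

struct SketchPort {
    SketchPort *peer = nullptr;
    void bind(SketchPort &other) { peer = &other; other.peer = this; }
    void send(int packet) { assert(peer); peer->recv(packet); }
    virtual void recv(int packet) = 0;
    virtual ~SketchPort() = default;
};

struct SketchCpuPort : SketchPort {
    void recv(int packet) override { std::printf("cpu got %d\n", packet); }
};

struct SketchMemPort : SketchPort {
    void recv(int packet) override { std::printf("mem got %d\n", packet); }
};

int main() {
    SketchCpuPort cpu;
    SketchMemPort mem;
    cpu.bind(mem); // any component pair can be wired the same way
    cpu.send(42);  // arrives at mem.recv()
    mem.send(7);   // arrives at cpu.recv()
}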

+
+
+
19.20.6.1. gem5 functional vs atomic vs timing memory requests
+
+

gem5 memory requests can be classified in the following broad categories:

+
+
+
    +
  • +

    functional: get the value magically, do not update caches, see also: gem5 functional requests

    +
  • +
  • +

    atomic: get the value now without making a separate event, but do not update caches

    +
  • +
  • +

    timing: get the value simulating delays and updating caches

    +
  • +
+
+
+

This trichotomy can be notably seen in the definition of the MasterPort class:

+
+
+
+
class MasterPort : public Port, public AtomicRequestProtocol,
+    public TimingRequestProtocol, public FunctionalRequestProtocol
+
+
+
+

and the base classes are defined under src/mem/protocol/.

+
+
+

Then, by reading the rest of the class, we see that the send methods are all boring, and just forward to some polymorphic receiver that does the actual interesting activity:

+
+
+
+
    Tick
+    sendAtomicSnoop(PacketPtr pkt)
+    {
+        return AtomicResponseProtocol::sendSnoop(_masterPort, pkt);
+    }
+
+    Tick
+    AtomicResponseProtocol::sendSnoop(AtomicRequestProtocol *peer, PacketPtr pkt)
+    {
+        assert(pkt->isRequest());
+        return peer->recvAtomicSnoop(pkt);
+    }
+
+
+
+

The receive methods are therefore the interesting ones, and must be overridden on derived classes if they ever expect to receive such requests:

+
+
+
+
    Tick
+    recvAtomicSnoop(PacketPtr pkt) override
+    {
+        panic("%s was not expecting an atomic snoop request\n", name());
+        return 0;
+    }
+
+    void
+    recvFunctionalSnoop(PacketPtr pkt) override
+    {
+        panic("%s was not expecting a functional snoop request\n", name());
+    }
+
+    void
+    recvTimingSnoopReq(PacketPtr pkt) override
+    {
+        panic("%s was not expecting a timing snoop request.\n", name());
+    }
+
+
+
+

One question that comes up now is: but why do CPUs need to care about snoop requests?

+
+
+

And one big answer is: to be able to implement LLSC atomicity as mentioned at: ARM LDXR and STXR instructions, since when other cores update memory, they could invalidate the lock of the current core.
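
A conceptual sketch of why that matters (made-up names; gem5's actual LLSC tracking lives in the CPU and cache models): a snooped write to the monitored address clears the local exclusivity monitor, making a subsequent STXR fail.

#include <cstdint>

struct SketchLLSCMonitor {
    bool valid = false;
    std::uint64_t addr = 0;

    // LDXR: start monitoring the loaded address.
    void onLoadExclusive(std::uint64_t a) { valid = true; addr = a; }

    // Snooped write from another core: lose exclusivity on a hit.
    void onSnoopWrite(std::uint64_t a) { if (valid && a == addr) valid = false; }

    // STXR: succeeds only if the monitor is still valid.
    bool onStoreExclusive(std::uint64_t a) { return valid && a == addr; }
};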

+
+
+

Then, as you might expect, we can see that for example AtomicSimpleCPU does not override recvTimingSnoopReq.

+
+
+

Now let’s see which requests are generated by an ordinary ARM LDR instruction. We run:

+
+
+
+
./run \
+  --arch aarch64 \
+  --debug-vm \
+  --emulator gem5 \
+  --gem5-build-type debug \
+  --userland userland/arch/aarch64/freestanding/linux/hello.S \
+;
+
+
+
+

and then break at the methods of the LDR class LDRXL64_LIT: gem5 execute vs initiateAcc vs completeAcc.

+
+
+

Before starting, we of course guess that:

+
+
+
    +
  • +

    AtomicSimpleCPU will be making atomic accesses from execute

    +
  • +
  • +

    TimingSimpleCPU will be making timing accesses from initiateAcc, which must generate the event which leads to completeAcc

    +
  • +
+
+
+

so let’s confirm it.

+
+
+

We break on ArmISAInst::LDRXL64_LIT::execute which is what AtomicSimpleCPU uses, and that leads as expected to:

+
+
+
+
MasterPort::sendAtomic
+AtomicSimpleCPU::sendPacket
+AtomicSimpleCPU::readMem
+SimpleExecContext::readMem
+readMemAtomic<(ByteOrder)1, ExecContext, unsigned long>
+readMemAtomicLE<ExecContext, unsigned long>
+ArmISAInst::LDRXL64_LIT::execute
+AtomicSimpleCPU::tick
+
+
+
+

Notably, AtomicSimpleCPU::readMem immediately translates the address, creates a packet, sends the atomic request, and gets the response back without any events.

+
+
+

And now if we do the same with --cpu-type TimingSimpleCPU and break at ArmISAInst::LDRXL64_LIT::initiateAcc, and then add another break for the next event schedule with b EventManager::schedule (which we imagine is the memory read), we reach:

+
+
+
+
EventManager::schedule
+DRAMCtrl::addToReadQueue
+DRAMCtrl::recvTimingReq
+DRAMCtrl::MemoryPort::recvTimingReq
+TimingRequestProtocol::sendReq
+MasterPort::sendTimingReq
+CoherentXBar::recvTimingReq
+CoherentXBar::CoherentXBarSlavePort::recvTimingReq
+TimingRequestProtocol::sendReq
+MasterPort::sendTimingReq
+TimingSimpleCPU::handleReadPacket
+TimingSimpleCPU::sendData
+TimingSimpleCPU::finishTranslation
+DataTranslation<TimingSimpleCPU*>::finish
+ArmISA::TLB::translateComplete
+ArmISA::TLB::translateTiming
+ArmISA::TLB::translateTiming
+TimingSimpleCPU::initiateMemRead
+SimpleExecContext::initiateMemRead
+initiateMemRead<ExecContext, unsigned long>
+ArmISAInst::LDRXL64_LIT::initiateAcc
+TimingSimpleCPU::completeIfetch
+TimingSimpleCPU::IcachePort::ITickEvent::process
+EventQueue::serviceOne
+
+
+
+

so as expected we have TimingRequestProtocol::sendReq.

+
+
+

Remember however that timing requests are a bit more complicated due to paging, since the page table walk can itself lead to further memory requests.

+
+
+

In this particular instance, the address being read with the ldr x2, =len ARM LDR pseudo-instruction is likely placed just after the text section, and therefore its page table entry is already in the TLB due to previous instruction fetches, which is why the translation finishes immediately, going through TimingSimpleCPU::finishTranslation. Some key snippets are:

+
+
+
+
TLB::translateComplete(const RequestPtr &req, ThreadContext *tc,
+        Translation *translation, Mode mode, TLB::ArmTranslationType tranType,
+        bool callFromS2)
+{
+    bool delay = false;
+    Fault fault;
+    if (FullSystem)
+        fault = translateFs(req, tc, mode, translation, delay, true, tranType);
+    else
+        fault = translateSe(req, tc, mode, translation, delay, true);
+    if (!delay)
+        translation->finish(fault, req, tc, mode);
+    else
+        translation->markDelayed();
+
+
+
+

and then translateSe does not use delay at all, so we learn that in syscall emulation, delay is always false and things progress immediately there. And then further down TimingSimpleCPU::finishTranslation does some more fault checking:

+
+
+
+
void
+TimingSimpleCPU::finishTranslation(WholeTranslationState *state)
+{
+    if (state->getFault() != NoFault) {
+        translationFault(state->getFault());
+    } else {
+        if (!state->isSplit) {
+            sendData(state->mainReq, state->data, state->res,
+                     state->mode == BaseTLB::Read);
+
+
+
+

Tested in gem5 b1623cb2087873f64197e503ab8894b5e4d4c7b4.

+
+
+
19.20.6.1.1. gem5 functional requests
+
+

As seen at gem5 functional vs atomic vs timing memory requests, functional requests are not used in common simulation, since the core must always go through caches.

+
+
+

Functional accesses are therefore only used for more magic simulation functionalities.

+
+
+

One such functionality is the gem5 syscall emulation mode implementation of the futex system call, which is done at futexFunc in src/sim/syscall_emul.hh.

+
+
+

As seen from man futex, the Linux kernel reads the value from an address that is given as the first argument of the call.

+
+
+

Therefore, here it makes sense for the gem5 syscall implementation, which does not actually have a real kernel running, to just make a functional request and be done with it, since the impact of cache changes done by this read would be insignificant compared to the cost of the actual full context switch that would happen on a real syscall.

+
+
+
+
+
+

19.20.7. gem5 ThreadContext vs ThreadState vs ExecContext vs Process

These classes get used everywhere, and they have a somewhat convoluted relation with one another, so let’s figure out this mess.

@@ -24097,7 +24971,7 @@ type=SetAssociative

This section and all children tested at gem5 b1623cb2087873f64197e503ab8894b5e4d4c7b4.

-
19.20.5.1. gem5 ThreadContext
+
19.20.7.1. gem5 ThreadContext

As we delve into more details below, we will reach the following conclusion: a ThreadContext represents one thread of a CPU with multiple hardware threads.

@@ -24147,7 +25021,7 @@ typedef SimpleThread MinorThread;

Essentially all methods of the base ThreadContext are pure virtual.

-
19.20.5.1.1. gem5 SimpleThread
+
19.20.7.1.1. gem5 SimpleThread

SimpleThread storage defined on BaseSimpleCPU for simple CPUs like AtomicSimpleCPU:

@@ -24242,7 +25116,7 @@ typedef SimpleThread MinorThread;
-
19.20.5.1.2. gem5 O3ThreadContext
+
19.20.7.1.2. gem5 O3ThreadContext

Instantiation happens in the FullO3CPU constructor:

@@ -24343,7 +25217,7 @@ FullO3CPU<Impl>::readArchIntReg(int reg_idx, ThreadID tid)
-
19.20.5.2. gem5 ThreadState
+
19.20.7.2. gem5 ThreadState

Owned one per ThreadContext.

@@ -24389,12 +25263,17 @@ class O3ThreadContext : public ThreadContext
-
19.20.5.3. gem5 ExecContext
+
19.20.7.3. gem5 ExecContext
-

ExecContext gets used in instruction definitions, e.g.:

+

ExecContext gets used in gem5 instruction definitions, e.g.:

+
+
+
+
build/ARM/arch/arm/generated/exec-ns.cc.inc
+
-

build/ARM/arch/arm/generated/exec-ns.cc.inc

+

contains:

@@ -24543,9 +25422,281 @@ class O3ThreadContext : public ThreadContext

This makes sense, since each ThreadContext represents one CPU register set, and therefore needs a separate ExecContext which allows instruction implementations to access those registers.

+
+
19.20.7.3.1. gem5 ExecContext::readIntRegOperand register resolution
+
+

Let’s have a look at how ExecContext::readIntRegOperand actually matches operands to decoded register IDs, since it is not obvious.

+
+
+

Let’s study a simple aarch64 register register addition:

+
+
+
+
add x0, x1, x2
+
+
+
+

which corresponds to the AddXSReg instruction (formatted and simplified):

+
+
+
+
Fault AddXSReg::execute(ExecContext *xc, Trace::InstRecord *traceData) const {
+    uint64_t Op264 = 0;
+    uint64_t Dest64 = 0;
+    uint64_t Op164 = 0;
+    Op264 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth));
+    Op164 = ((xc->readIntRegOperand(this, 1)) & mask(intWidth));
+    uint64_t secOp = shiftReg64(Op264, shiftAmt, shiftType, intWidth);
+    Dest64 = Op164 + secOp;
+    uint64_t final_val = Dest64;
+    xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth));
+    if (traceData) { traceData->setData(final_val); }
+    return NoFault;
+}
+
+
+
+

So what are those magic 0 and 1 constants on xc->readIntRegOperand(this, 0) and xc->readIntRegOperand(this, 1)?

+
+
+

First, we guess that they must be related to the reading of x1 and x2, which are the inputs of the addition.

+
+
+

Next, we also guess that the 0 read must correspond to x2, since it later gets potentially shifted as mentioned at Section 24.4.4.1, “ARM shift suffixes”.

+
+
+

Let’s also have a look at the decoder code that builds the instruction instance in build/ARM/arch/arm/generated/decoder-ns.cc.inc:

+
+
+
+
ArmShiftType type =
+    (ArmShiftType)(uint8_t)bits(machInst, 23, 22);
+if (type == ROR)
+    return new Unknown64(machInst);
+uint8_t imm6 = bits(machInst, 15, 10);
+if (!bits(machInst, 31) && bits(imm6, 5))
+    return new Unknown64(machInst);
+IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+IntRegIndex rdzr = makeZero(rd);
+IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+
+return new AddXSReg(machInst, rdzr, rn, rm, imm6, type);
+
+
+
+

and the ARM assembly syntax from the ARMv8 architecture reference manual:

+
+
+
+
ADD <Xd>, <Xn>, <Xm>{, <shift> #<amount>}
+
+
+
+

and the constructor:

+
+
+
+
AddXSReg::AddXSReg(ExtMachInst machInst,
+    IntRegIndex _dest,
+    IntRegIndex _op1,
+    IntRegIndex _op2,
+    int32_t _shiftAmt,
+    ArmShiftType _shiftType
+) : DataXSRegOp("add", machInst, IntAluOp,
+                _dest, _op1, _op2, _shiftAmt, _shiftType) {
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    _numFPDestRegs = 0;
+    _numVecDestRegs = 0;
+    _numVecElemDestRegs = 0;
+    _numVecPredDestRegs = 0;
+    _numIntDestRegs = 0;
+    _numCCDestRegs = 0;
+    _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2);
+    _destRegIdx[_numDestRegs++] = RegId(IntRegClass, dest);
+    _numIntDestRegs++;
+    _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1);
+    flags[IsInteger] = true;;
+}
+
+
+
+

where RegId is just a container class, and so the lines that we care about for now are:

+
+
+
+
_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2);
+_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1);
+
+
+
+

which matches the guess we made earlier: op2 is 0 and op1 is 1 (op1 and op2 are the same as _op1 and _op2 which are set in the base constructor DataXSRegOp).
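
To make the mapping concrete, here is a standalone worked example for add x0, x1, x2 that mirrors the constructor's assignments with plain ints (my own sketch, not gem5 code):

#include <cassert>

int main() {
    // Registers decoded from the instruction: dest = x0, op1 = x1, op2 = x2.
    const int dest = 0, op1 = 1, op2 = 2;

    // Mirror the constructor's operand table assignments.
    int srcRegIdx[2], destRegIdx[1];
    int numSrcRegs = 0, numDestRegs = 0;
    srcRegIdx[numSrcRegs++] = op2;    // operand index 0 -> x2
    destRegIdx[numDestRegs++] = dest; // dest index 0 -> x0
    srcRegIdx[numSrcRegs++] = op1;    // operand index 1 -> x1

    // So readIntRegOperand(this, 0) reads x2 and readIntRegOperand(this, 1) reads x1.
    assert(srcRegIdx[0] == 2 && srcRegIdx[1] == 1 && destRegIdx[0] == 0);
}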

+
+
+

We also note that the register decodings (which the ARM spec says are 1 for x1 and 2 for x2) are actually passed as enum IntRegIndex:

+
+
+
+
    IntRegIndex _op1,
+    IntRegIndex _op2,
+
+
+
+

which are defined at src/arch/arm/intregs.hh:

+
+
+
+
enum IntRegIndex
+{
+    /* All the unique register indices. */
+    INTREG_R0,
+    INTREG_R1,
+    INTREG_R2,
+
+
+
+

Then SimpleExecContext::readIntRegOperand does:

+
+
+
+
    /** Reads an integer register. */
+    RegVal
+    readIntRegOperand(const StaticInst *si, int idx) override
+    {
+        numIntRegReads++;
+        const RegId& reg = si->srcRegIdx(idx);
+        assert(reg.isIntReg());
+        return thread->readIntReg(reg.index());
+    }
+
+
+
+

and:

+
+
+
+
const RegId& srcRegIdx(int i)  const { return _srcRegIdx[i]; }
+
+
+
+

which is what is populated in the constructor.

+
+
+

Then, RegIndex::index() { return regIdx; } just returns the decoded register index, and now SimpleThread::readIntReg:

+
+
+
+
RegVal readIntReg(RegIndex reg_idx) const override {
+    int flatIndex = isa->flattenIntIndex(reg_idx);
+    return readIntRegFlat(flatIndex);
+}
+
+
+
+

readIntRegFlat is what finally reads from the int register array:

+
+
+
+
RegVal SimpleThreadContext::readIntRegFlat(RegIndex idx) const override { return intRegs[idx]; }
+
+std::array<RegVal, TheISA::NumIntRegs> SimpleThreadContext::intRegs;
+
+
+
+

and then there is the flattening magic at:

+
+
+
+
int
+flattenIntIndex(int reg) const
+{
+    assert(reg >= 0);
+    if (reg < NUM_ARCH_INTREGS) {
+        return intRegMap[reg];
+    } else if (reg < NUM_INTREGS) {
+        return reg;
+    } else if (reg == INTREG_SPX) {
+        CPSR cpsr = miscRegs[MISCREG_CPSR];
+        ExceptionLevel el = opModeToEL(
+            (OperatingMode) (uint8_t) cpsr.mode);
+        if (!cpsr.sp && el != EL0)
+            return INTREG_SP0;
+        switch (el) {
+            case EL3:
+            return INTREG_SP3;
+            case EL2:
+            return INTREG_SP2;
+            case EL1:
+            return INTREG_SP1;
+            case EL0:
+            return INTREG_SP0;
+            default:
+            panic("Invalid exception level");
+            return 0;  // Never happens.
+        }
+    } else {
+        return flattenIntRegModeIndex(reg);
+    }
+}
+
+
+
+

Then:

+
+
+
+
    NUM_ARCH_INTREGS = 32,
+
+
+
+

so we understand that this covers x0 to x31. NUM_INTREGS is also 32, so I’m a bit confused: that case is never reached.

+
+
+
+
    INTREG_SPX = NUM_INTREGS,
+
+
+
+

SP is 32, but it is a bit more magic, since in ARM there is one SP per exception level as mentioned at ARM SP0 vs SPx.
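
Hand-evaluating the flattening code above for a couple of cases (my own worked examples, not gem5 output):

// flattenIntIndex(1)          == intRegMap[1]  // x1: plain architectural register
// flattenIntIndex(INTREG_SPX) == INTREG_SP1    // when cpsr.sp == 1 and the current
//                                              // exception level is EL1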


We can also have a quick look at the AddXImm instruction which corresponds to a simple addition of an immediate as shown in userland/arch/aarch64/add.S:

+
+
+
+
add x0, x1, 2
+
+
+
+

Its execute method in build/ARM/arch/arm/generated/exec-ns.cc.inc contains (hand formatted and slightly simplified):

+
+
+
+
Fault AddXImm::execute(ExecContext *xc, Trace::InstRecord *traceData) const {
+    uint64_t Dest64 = 0;
+    uint64_t Op164 = 0;
+    Op164 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth));
+    Dest64 = Op164 + imm;
+    uint64_t final_val = Dest64;
+    xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth));
+    if (traceData) { traceData->setData(final_val); }
+    return NoFault;
+}
+
+
+
+

and imm is set directly in the constructor.

+
+
-
19.20.5.4. gem5 Process
+
19.20.7.4. gem5 Process

The Process class is used only for gem5 syscall emulation mode, and it represents a process like a Linux userland process, in addition to any further gem5 specific data needed to represent the process.

@@ -24633,7 +25784,7 @@ readFunc(SyscallDesc *desc, ThreadContext *tc,
-

19.20.6. gem5 code generation

+

19.20.8. gem5 code generation

gem5 uses a ton of code generation, which makes the project horrendous:

@@ -24643,7 +25794,7 @@ readFunc(SyscallDesc *desc, ThreadContext *tc,

lots of magic happen on top of pybind11, which is already magic, to more automatically glue the C++ and Python worlds: gem5 Python C++ interaction

  • -

    .isa code which describes most of the instructions

    +

    .isa code which describes most of the instructions: gem5 instruction definitions

  • Ruby for memory systems

    @@ -24678,7 +25829,7 @@ readFunc(SyscallDesc *desc, ThreadContext *tc,

    But it has been widely overused to insanity. It likely also exists partly because when the project started in 2003 C++ compilers weren’t that good, so you couldn’t rely on features like templates that much.

  • -
    19.20.6.1. gem5 THE_ISA
    +
    19.20.8.1. gem5 THE_ISA

    Generated code at: build/<ISA>/config/the_isa.hh which e.g. for ARM contains:

    @@ -24724,9 +25875,24 @@ enum class Arch {
    -

    19.20.7. gem5 build system

    +

    19.20.9. gem5 build system

    -
    19.20.7.1. gem5 build broken on recent compiler version
    +
    19.20.9.1. M5_OVERRIDE_PY_SOURCE
    + +
    +

Running gem5 with the M5_OVERRIDE_PY_SOURCE=true environment variable allows you to modify a file under src/python and run it without rebuilding gem5.

    +
    +
    +

    We set this environment variable by default in our run script.

    +
    +
    +

How M5_OVERRIDE_PY_SOURCE works is shown at: gem5 m5.objects module.

    +
    +
    +
    +
    19.20.9.2. gem5 build broken on recent compiler version

    gem5 moves a bit slowly, and if your host compiler is very new, the gem5 build might be broken for it, e.g. this was the case for Ubuntu 19.10 with GCC 9 and gem5 62d75e7105fe172eb906d4f80f360ff8591d4178 from Dec 2019.

    @@ -24751,7 +25917,7 @@ enum class Arch {
    -
    19.20.7.2. gem5 polymorphic ISA includes
    +
    19.20.9.3. gem5 polymorphic ISA includes

    E.g. src/cpu/decode_cache.hh includes:

    @@ -24830,7 +25996,7 @@ build/ARM/config/the_isa.hh
    -
    19.20.7.3. Why are all C++ symlinked into the gem5 build dir?
    +
    19.20.9.4. Why are all C++ symlinked into the gem5 build dir?
    @@ -26436,7 +27602,7 @@ global 10000

    so we clearly see that basically a lock addq is used to do an atomic read and write to memory every single time, just like in our other example userland/cpp/atomic/x86_64_lock_inc.cpp.

    -

    This setup can also be used to benchmark different synchronization mechanisms. For example, std::mutex was about 1.5x slower with two cores than std::atomic, presumably because it relies on the futex system call as can be seen from strace -f -s999 -v logs, while std::atomic uses just userland instructions: https://www.quora.com/How-does-std-atomic-work-in-C++11/answer/Ciro-Santilli Tested in -O3 with:

    +

    This setup can also be used to benchmark different synchronization mechanisms. For example, std::mutex was about 1.5x slower with two cores than std::atomic, presumably because it relies on the futex system call as can be seen from strace -f -s999 -v logs, while std::atomic uses just userland instructions: https://www.quora.com/How-does-std-atomic-work-in-C++11/answer/Ciro-Santilli Tested in -O3 with:

@@ -26626,13 +27792,16 @@ fork() return = 13039
@@ -29345,7 +30514,17 @@ child after parent sleep

    22.7.2. getcpu system call and the sched_getaffinity glibc wrapper

    -

    Example: userland/linux/sched_getcpu.c

    +

    Examples:

    +
    +
    +

    Returns the CPU that the process/thread is currently running on:

    @@ -34000,6 +35179,12 @@ CurrentEL.EL 0x3

    See ARMv8 architecture reference manual db D1.6.2 "The stack pointer registers".

    +

    There is one SP per exception level.

    +
    +
    +

This can also be seen clearly in the analysis at gem5 ExecContext::readIntRegOperand register resolution.

    +
    +

    TODO create a minimal runnable example.