diff --git a/.gitmodules b/.gitmodules
index 834b2a4..c577874 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -39,3 +39,6 @@
 [submodule "submodules/xen"]
 	path = submodules/xen
 	url = https://github.com/cirosantilli/xen
+[submodule "submodules/stream-benchmark"]
+	path = submodules/stream-benchmark
+	url = https://github.com/cirosantilli/stream-benchmark
diff --git a/README.adoc b/README.adoc
index a539e56..0a895a9 100644
--- a/README.adoc
+++ b/README.adoc
@@ -12115,6 +12115,8 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
 
 From the message, this appears however to be a Python / pyenv11 bug however and not in gem5 specifically. I think it worked when I tried it in the past in an older gem5 / Ubuntu.
 
+`--without-tcmalloc` is needed / a good idea when using `--with-asan`: https://stackoverflow.com/questions/42712555/address-sanitizer-fsanitize-address-works-with-tcmalloc since both do more or less similar jobs, see also <<memory-leaks>>.
+
 ==== gem5 Ruby build
 
 Ruby is a system that includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby
@@ -12666,6 +12668,8 @@ Tested at b4879ae5b0b6644e6836b0881e4da05c64a6550d.
 
 ===== gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis
 
+TODO: analyze better what each of the memory event mean. For now, we have just collected a bunch of data there, but needs interpreting. The CPU specifics in this section are already insightful however.
+
 <<gem5-basesimplecpu,TimingSimpleCPU>> should be the second simplest CPU to analyze, so let's give it a try:
 
 ....
@@ -14506,20 +14510,26 @@ Build and run on QEMU <<user-mode-simulation>>:
 
 ....
 git submodule update --init submodules/dhrystone
-./build-dhrystone --mode userland
+./build-dhrystone --optimization-level 3
 ./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
 ....
 
+Increase the number of loops to try and reach more meaningful results:
+
+....
+./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone" --userland-args 100000000
+....
+
 Build and run on gem5 user mode:
 
 ....
-./build-dhrystone --mode userland --static --force-rebuild
-./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
+./build-dhrystone --optimization-level 3 --static
+./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/dhrystone/dhrystone"
 ....
 
 TODO automate run more nicely.
 
-Build for <<baremetal>> execution and run it in baremetal QEMU:
+Build for <<baremetal>> execution and run it in baremetal QEMU. TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
 
 ....
 # Build our Newlib stubs.
@@ -14528,8 +14538,6 @@ Build for <<baremetal>> execution and run it in baremetal QEMU:
 ./run --arch aarch64 --baremetal "$(./getvar baremetal_build_dir)/submodules/dhrystone/dhrystone"
 ....
 
-TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
-
 If you really want the Buildroot package for some reason, build it with:
 
 ....
@@ -14542,6 +14550,68 @@ and run inside the guest from `PATH` with:
 dhrystone
 ....
 
+==== STREAM benchmark
+
+http://www.cs.virginia.edu/stream/ref.html
+
+Very simple memory width benchmark with one C and one Fortran version, originally published in 1991, and the latest version at the time of writing is from 2013.
+
+Its operation is very simple: fork one thread for each CPU in the system (using OpenMP) and do the following four array operations (4 separate loops of individual operations):
+
+....
+/* Copy. */
+times[0 * ntimes + k] = mysecond();
+#pragma omp parallel for
+for (j=0; j<stream_array_size; j++)
+    c[j] = a[j];
+times[0 * ntimes + k] = mysecond() - times[0 * ntimes + k];
+
+/* Scale. */
+times[1 * ntimes + k] = mysecond();
+#pragma omp parallel for
+for (j=0; j<stream_array_size; j++)
+    b[j] = scalar*c[j];
+times[1 * ntimes + k] = mysecond() - times[1 * ntimes + k];
+
+/* Add. */
+times[2 * ntimes + k] = mysecond();
+#pragma omp parallel for
+for (j=0; j<stream_array_size; j++)
+    c[j] = a[j]+b[j];
+times[2 * ntimes + k] = mysecond() - times[2 * ntimes + k];
+
+/* Triad. */
+times[3 * ntimes + k] = mysecond();
+#pragma omp parallel for
+for (j=0; j<stream_array_size; j++)
+    a[j] = b[j]+scalar*c[j];
+times[3 * ntimes + k] = mysecond() - times[3 * ntimes + k];
+}
+....
+
+See also: https://stackoverflow.com/questions/56086993/what-does-stream-memory-bandwidth-benchmark-really-measure
+
+The LKMC usage of STREAM is analogous to that of <<dhrystone>>. Build and run on QEMU <<user-mode-simulation>>:
+
+....
+git submodule update --init submodules/stream-benchmark
+./build-stream --optimization-level 3
+./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe"
+....
+
+Decrease the benchmark size and the retry count to finish simulation faster, but possibly have a less representative result:
+
+....
+./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '100 2'
+....
+
+Build and run on gem5 user mode:
+
+....
+./build-stream --optimization-level 3 --static
+./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '1000 2'
+....
+
 ==== PARSEC benchmark
 
 We have ported parts of the http://parsec.cs.princeton.edu[PARSEC benchmark] for cross compilation at: https://github.com/cirosantilli/parsec-benchmark See the documentation on that repo to find out which benchmarks have been ported. Some of the benchmarks were are segfaulting, they are documented in that repo.
diff --git a/build-dhrystone b/build-dhrystone
index c32f8e9..9273b65 100755
--- a/build-dhrystone
+++ b/build-dhrystone
@@ -42,13 +42,13 @@ https://cirosantilli.com/linux-kernel-module-cheat#dhrystone
                 '-j', str(self.env['nproc']), LF,
                 '-C', os.path.join(self.env['submodules_dir'], 'dhrystone'), LF,
                 'CC={}'.format(self.env['gcc_path']), LF,
-                'CFLAGS={}'.format(' '.join(cflags)), LF,
+                'CFLAGS_EXTRA={}'.format(' '.join(cflags)), LF,
                 'EXTRA_OBJS={}'.format(' '.join(extra_objs)), LF,
                 'OUT_DIR={}'.format(build_dir), LF,
             ]
             + extra_flags
         )
-        if ret == 0 and env['copy_overlay']:
+        if ret == 0 and self.env['copy_overlay']:
             self.sh.copy_file_if_update(
                 os.path.join(build_dir, 'dhrystone'),
                 os.path.join(self.env['out_rootfs_overlay_lkmc_dir'], self.root_relpath, 'dhrystone'),
diff --git a/build-stream b/build-stream
new file mode 100755
index 0000000..72a5ff9
--- /dev/null
+++ b/build-stream
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import os
+import shutil
+
+import common
+import shlex
+from shell_helpers import LF
+
+class Main(common.BuildCliFunction):
+    def __init__(self):
+        super().__init__(
+            description='''\
+https://cirosantilli.com/linux-kernel-module-cheat#stream-benchmark
+'''
+        )
+        self._add_argument('--ccflags')
+        self._add_argument('--force-rebuild')
+        self._add_argument('--optimization-level')
+
+    def setup(self, env):
+        self.root_relpath = os.path.join('submodules', 'stream-benchmark')
+
+    def build(self):
+        build_dir = self.get_build_dir()
+        cflags = ['-O{}'.format(self.env['optimization_level'])]
+        extra_flags = []
+        if self.env['static']:
+            cflags.extend(['-static'])
+        if self.env['force_rebuild']:
+            extra_flags.extend(['-B', LF])
+        if self.env['mode'] == 'baremetal':
+            extra_objs = [
+                self.env['baremetal_syscalls_obj'],
+                self.env['baremetal_syscalls_asm_obj']
+            ]
+        else:
+            extra_objs = []
+        ret = self.sh.run_cmd(
+            [
+                'make', LF,
+                '-j', str(self.env['nproc']), LF,
+                '-C', os.path.join(self.env['submodules_dir'], 'stream-benchmark'), LF,
+                'CC={}'.format(self.env['gcc_path']), LF,
+                'CFLAGS_EXTRA={}'.format(' '.join(cflags)), LF,
+                'EXTRA_OBJS={}'.format(' '.join(extra_objs)), LF,
+                'FC={}'.format(self.env['gfortran_path']), LF,
+                'OUT_DIR={}'.format(build_dir), LF,
+            ]
+            + extra_flags
+        )
+        if ret == 0 and self.env['copy_overlay']:
+            self.sh.copy_file_if_update(
+                os.path.join(build_dir, 'stream_c.exe'),
+                os.path.join(self.env['out_rootfs_overlay_lkmc_dir'], self.root_relpath, 'stream-benchmark'),
+            )
+        return ret
+
+    def get_build_dir(self):
+        return os.path.join(self.env['build_dir'], self.root_relpath)
+
+if __name__ == '__main__':
+    Main().cli()
diff --git a/common.py b/common.py
index 93f3f3b..c3ffc77 100644
--- a/common.py
+++ b/common.py
@@ -1011,7 +1011,7 @@ Incompatible archs are skipped.
         )
         if env['mode'] == 'baremetal':
             env['build_dir'] = env['baremetal_build_dir']
-        elif env['mode'] == 'userland':
+        else:
             env['build_dir'] = env['userland_build_dir']
 
         # Docker
@@ -1119,6 +1119,7 @@ lunch aosp_{}-eng
             env['toolchain_prefix_dash'] = ''
         else:
             env['toolchain_prefix_dash'] = '{}-'.format(env['toolchain_prefix'])
+        env['gfortran_path'] = self.get_toolchain_tool('gfortran')
         env['gcc_path'] = self.get_toolchain_tool('gcc')
         env['gxx_path'] = self.get_toolchain_tool('g++')
         env['ld_path'] = self.get_toolchain_tool('ld')
diff --git a/submodules/stream-benchmark b/submodules/stream-benchmark
new file mode 160000
index 0000000..eaa0a90
--- /dev/null
+++ b/submodules/stream-benchmark
@@ -0,0 +1 @@
+Subproject commit eaa0a90ded985a1cb738f1b811ba0531bfa46a03
diff --git a/userland/c/multidimentional_array.c b/userland/c/multidimentional_array.c
new file mode 100644
index 0000000..b784c86
--- /dev/null
+++ b/userland/c/multidimentional_array.c
@@ -0,0 +1,34 @@
+/* https://cirosantilli.com/linux-kernel-module-cheat#c
+ *
+ * Multidimentional arrays are generally a bad idea that confuses
+ * everyone, use single dimentional arrays + indexing if possible.
+ * But here goes nothing.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(void) {
+    /* Initialized in the code. */
+    {
+        /* We can skip the first dimension as it is inferred. */
+        int is[][3] = {
+            {1, 2, 3,},
+            {4, 5, 5,},
+        };
+        assert(is[0][0] == 1);
+        assert(is[0][1] == 2);
+        assert(is[1][0] == 4);
+
+        /* We can get the total sizes of either the entire array,
+         * or of just on row. */
+        assert(sizeof(is) == 6 * sizeof(is[0][0]));
+        assert(sizeof(is[0]) == 3 * sizeof(is[0][0]));
+
+        /* Multi dimentional arrays are contiguous and row major. */
+        assert(&is[0][1] - &is[0][0] == 1);
+        assert(&is[1][0] - &is[0][0] == 3);
+    }
+    return EXIT_SUCCESS;
+}