diff --git a/.travis.yml b/.travis.yml
index 6d23f03..bb24335 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,3 @@
-# This was a noble attempt, but it hits the current 50 minute job timeout :-)
-# https://travis-ci.org/cirosantilli/linux-kernel-module-cheat/builds/296454523
-# I bet it would likely hit a disk maxout either way if it went on.
-
 language: cpp
 
 sudo: required
diff --git a/README.adoc b/README.adoc
index fe8b233..b4db5b2 100644
--- a/README.adoc
+++ b/README.adoc
@@ -27,7 +27,7 @@ cd linux-kernel-module-cheat
 ./configure && ./build && ./run
 ....
 
-The first configure will take a while (30 minutes to 2 hours) to clone and build, see <<benchmark-initial-build>> for more details.
+The first configure will take a while (30 minutes to 2 hours) to clone and build, see <<benchmark-builds>> for more details.
 
 If you don't want to wait, you could also try to compile the examples and run them on your host computer as explained on at <<run-on-host>>, but as explained on that section, that is dangerous, limited, and will likely not work.
 
@@ -818,7 +818,19 @@ If you are using gem5 instead of QEMU, `-u` has a different effect: it opens the
 ./run -gu
 ....
 
-If you also want to use the debugger with gem5, you will need to create your own panes or windows.
+If you also want to use the debugger with gem5, you will need to create your own panes or windows, or to see the debugger instead of the terminal:
+
+....
+./tmu ./rungdb;./run -dg
+....
+
+TODO: there is a problem with our method however: if you do something like:
+
+....
+./build && ./run -du
+....
+
+and the build takes a while, and you move to another tmux window, then the split happens on the current window, not where you ran the command: https://unix.stackexchange.com/questions/439031/how-to-split-the-window-that-ran-the-tmux-split-window-command-instead-of-the
 
 === GDB step debug kernel module
 
@@ -3683,7 +3695,6 @@ But keep in mind that it only affects benchmark performance of the most detailed
 {empty}*: couldn't test because of:
 
 * https://stackoverflow.com/questions/49011096/how-to-switch-cpu-models-in-gem5-after-restoring-a-checkpoint-and-then-observe-t
-* https://github.com/gem5/gem5/issues/16
 
 Cache sizes can in theory be checked with the methods described at: link:https://superuser.com/questions/55776/finding-l2-cache-size-in-linux[]:
 
@@ -3753,7 +3764,7 @@ instructions 80899588
 We make the following conclusions:
 
 * the number of instructions almost does not change: the CPU is waiting for memory all the extra time. TODO: why does it change at all?
-* the wall clock execution time is not directionally proportional to the number of cycles: here we had a 10x cycle increase, but only 2x time increase
+* the wall clock execution time is not directionally proportional to the number of cycles: here we had a 10x cycle increase, but only 2x time increase. This suggests that the simulation of cycles in which the CPU is waiting for memory to come back is faster.
 
 ===== gem5 memory latency
 
@@ -4745,15 +4756,126 @@ Finally, do a clone of the relevant repository out of tree and reproduce the bug
 
 == Benchmark this repo
 
-In this section document how fast the build and clone are, and how to investigate them.
-
-This is to give an idea to people of what they should expect.
-
-Send a pull request if you try it out on something significantly different.
+In this section document how benchmark builds and runs of this repo, and how to investigate what the bottleneck is.
 
 Ideally, we should setup an automated build server that benchmarks those things continuously for us.
 
-=== Find which packages are making the build slow
+We tried to automate it on Travis with link:.travis.yml[] but it hits the current 50 minute job timeout: https://travis-ci.org/cirosantilli/linux-kernel-module-cheat/builds/296454523 And I bet it would likely hit a disk maxout either way if it went on.
+
+So currently, we are running benchmarks manually when it seems reasonable and uploading them to: https://github.com/cirosantilli/linux-kernel-module-cheat-regression
+
+All benchmarks were run on the <<p51>> machine, unless stated otherwise.
+
+Run all benchmarks and upload the results:
+
+....
+./bench-all -A
+....
+
+=== Benchmark this repo benchmarks
+
+==== Benchmark Linux kernel boot
+
+....
+./bench-boot
+cat out/bench-boot.txt
+....
+
+Sample results at 2bddcc2891b7e5ac38c10d509bdfc1c8fe347b94:
+
+....
+cmd ./run -a x86_64 -E '/poweroff.out'
+time 3.58
+exit_status 0
+cmd ./run -a x86_64 -E '/poweroff.out' -K
+time 0.89
+exit_status 0
+cmd ./run -a x86_64 -E '/poweroff.out' -T exec_tb
+time 4.12
+exit_status 0
+instructions 2343768
+cmd ./run -a x86_64 -E 'm5 exit' -g
+time 451.10
+exit_status 0
+instructions 706187020
+cmd ./run -a arm -E '/poweroff.out'
+time 1.85
+exit_status 0
+cmd ./run -a arm -E '/poweroff.out' -T exec_tb
+time 1.92
+exit_status 0
+instructions 681000
+cmd ./run -a arm -E 'm5 exit' -g
+time 94.85
+exit_status 0
+instructions 139895210
+cmd ./run -a aarch64 -E '/poweroff.out'
+time 1.36
+exit_status 0
+cmd ./run -a aarch64 -E '/poweroff.out' -T exec_tb
+time 1.37
+exit_status 0
+instructions 178879
+cmd ./run -a aarch64 -E 'm5 exit' -g
+time 72.50
+exit_status 0
+instructions 115754212
+cmd ./run -a aarch64 -E 'm5 exit' -g -- --cpu-type=HPI --caches --l2cache --l1d_size=1024kB --l1i_size=1024kB --l2_size=1024kB --l3_size=1024kB
+time 369.13
+exit_status 0
+instructions 115774177
+....
+
+TODO: aarch64 gem5 and QEMU use the same kernel, so why is the gem5 instruction count so much much higher?
+
+===== gem5 arm HPI boot takes much longer than aarch64
+
+TODO 62f6870e4e0b384c4bd2d514116247e81b241251 takes 33 minutes to finish at 62f6870e4e0b384c4bd2d514116247e81b241251:
+
+....
+cmd ./run -a arm -E 'm5 exit' -g -- --caches --cpu-type=HPI
+....
+
+while aarch64 only 7 minutes.
+
+I had previously documented on README 10 minutes at: 2eff007f7c3458be240c673c32bb33892a45d3a0 found with `git log` search for `10 minutes`. But then I checked out there, run it, and kernel panics before any messages come out. Lol?
+
+Logs of the runs can be found at: https://github.com/cirosantilli-work/gem5-issues/tree/0df13e862b50ae20fcd10bae1a9a53e55d01caac/arm-hpi-slow
+
+The cycle count is higher for `arm`, 350M vs 250M for `aarch64`, not nowhere near the 5x runtime time increase.
+
+A quick look at the boot logs show that they are basically identical in structure: the same operations appear more ore less on both, and there isn't one specific huge time pit in arm: it is just that every individual operation seems to be taking a lot longer.
+
+===== gem5 x86_64 DerivO3CPU boot panics
+
+https://github.com/cirosantilli-work/gem5-issues/issues/2
+
+....
+Kernel panic - not syncing: Attempted to kill the idle task!
+....
+
+==== Benchmark builds
+
+The build times are calculated after doing `./configure` and link:https://buildroot.org/downloads/manual/manual.html#_offline_builds[`make source`], which downloads the sources, and basically benchmarks the <<benchmark-internets,Internet>>.
+
+Sample build time at 2c12b21b304178a81c9912817b782ead0286d282: 28 minutes, 15 with full ccache hits. Breakdown: 19% GCC, 13% Linux kernel, 7% uclibc, 6% host-python, 5% host-qemu, 5% host-gdb, 2% host-binutils
+
+Single file change on `./build kernel_module-reconfigure`: 7 seconds.
+
+Buildroot automatically stores build timestamps as milliseconds since Epoch. Convert to minutes:
+
+....
+awk -F: 'NR==1{start=$1}; END{print ($1 - start)/(60000.0)}' out/x86_64/buildroot/build/build-time.log
+....
+
+Or to conveniently do a clean build without affecting your current one:
+
+....
+./bench-all -b
+cat ../linux-kernel-module-cheat-regression/*/build-time.log
+....
+
+===== Find which packages are making the build slow
 
 ....
 cd out/x86_64/buildroot
@@ -4783,59 +4905,19 @@ We do our best to reduce the instruction and feature count to the bare minimum n
 +
 One possibility we could play with is to build loadable modules instead of built-in modules to reduce runtime, but make it easier to get started with the modules.
 
-=== Benchmark this repo benchmarks
-
-==== Benchmark Linux kernel boot
-
-....
-./bench-boot
-cat out/bench-boot.txt
-....
-
-Benchmark results will be kept at: https://github.com/cirosantilli/linux-kernel-module-cheat-regression
-
-Output  fb317f4778633692b91c9174224dccc6a3a02893:
-
-TODO the following takes more than 1 hour to finish on the <<p51>>:
-
-....
-cmd ./run -a arm -E 'm5 exit' -g -- --caches --cpu-type=HPI
-....
-
-Why so long? I had previously documented on README 10 minutes at: 2eff007f7c3458be240c673c32bb33892a45d3a0 found with `git log` search for `10 minutes`. But then I checked out there, run it, and kernel panics before any messages come out. Lol? Things that did not happen:
-
-* update gem5 to master 2a9573f5942b5416fb0570cf5cb6cdecba733392
-* larger caches: `--l2cache --l1d_size=1024kB --l1i_size=1024kB --l2_size=1024kB --l3_size=1024kB`
-
-But TODO: on aarch64 both use the same kernel build already, and the gem5 instruction count is much higher, why?
-
-==== Benchmark initial build
-
-The build times are calculated after doing `./configure` and link:https://buildroot.org/downloads/manual/manual.html#_offline_builds[`make source`], which downloads the sources, and basically benchmarks the Internet.
-
-Build time on <<p51>> at 2c12b21b304178a81c9912817b782ead0286d282: 28 minutes, 15 with full ccache hits. Breakdown: 19% GCC, 13% Linux kernel, 7% uclibc, 6% host-python, 5% host-qemu, 5% host-gdb, 2% host-binutils
-
-Single file change on `./build kernel_module-reconfigure`: 7 seconds.
-
-==== Benchmark Buildroot build baseline
+===== Benchmark Buildroot build baseline
 
 This is the minimal build we could expect to get away with.
 
-On the upstream Buildroot repo at 7d43534625ac06ae01987113e912ffaf1aec2302 we run:
+We will run this whenever the Buildroot submodule is updated.
+
+On the upstream Buildroot repo at :
 
 ....
-make qemu_x86_64_defconfig
-printf '
-BR2_CCACHE=y
-BR2_TARGET_ROOTFS_CPIO=y
-BR2_TARGET_ROOTFS_EXT2=n
-' >>.config
-make olddefconfig
-time env -u LD_LIBRARY_PATH make BR2_JLEVEL="$(nproc)"
-ls -l output/images
+./bench-all -B
 ....
 
-Time: 11 minutes, 7 with full ccache hits. Breakdown: 47% GCC, 15% Linux kernel, 9% uclibc, 5% host-binutils. Conclusions:
+Sample time on 2017.08: 11 minutes, 7 with full ccache hits. Breakdown: 47% GCC, 15% Linux kernel, 9% uclibc, 5% host-binutils. Conclusions:
 
 * we have bloated our kernel build 3x with all those delicious features :-)
 * GCC time increased 1.5x by our bloat, but its percentage of the total was greatly reduced, due to new packages being introduced.
@@ -4858,11 +4940,20 @@ Size:
 
 Zipped: 4.9M, `rootfs.cpio` deflates 50%, `bzImage` almost nothing.
 
-==== Benchmark gem5 build
+===== Benchmark gem5 build
 
-How long it takes to build gem5 itself on <<P51>>
+How long it takes to build gem5 itself.
 
-* x86 at 68af229490fc811aebddf68b3e2e09e63a5fa475: 9m40s
+We will update this whenever the gem5 submoule is updated.
+
+Sample results at gem5 2a9573f5942b5416fb0570cf5cb6cdecba733392: 10 to 12 minutes.
+
+Get results with:
+
+....
+./bench-all -g
+tail -n+1 ../linux-kernel-module-cheat-regression/*/gem5-bench-build-*.txt
+....
 
 === Benchmark machines
 
diff --git a/bench-all b/bench-all
index 2e8b336..df3fc01 100755
--- a/bench-all
+++ b/bench-all
@@ -1,19 +1,54 @@
 #!/usr/bin/env bash
 # Run all benchmarks for this repo, and save the results to the
 # benchmark repo, which should be cloned at ../linux-kernel-module-cheat-benchmarks.
-#
-# We will only include here things which are likely to change due to Buildroot / Linux
-# kernel configuration changes. This excludes for example:
-#
-# * ./gem5-bench-caches: basically only benchmarks gem5 itself,
-#   since it makes no syscalls on the main loop (checked with strace).
-# * gem5 built time
 set -eu
 . common
-./build-all
-./bench-boot
+
+bench_build=false
+bench_buildroot_baseline=false
+bench_gem5_build=false
+bench_linux_boot=false
+default_arch=x86_64
+update_repo=true
+set -- ${cli_bench_all:-} "$@"
+while getopts Aa:Bbglu OPT; do
+  case "$OPT" in
+    A)
+      bench_build=true
+      bench_buildroot_baseline=true
+      bench_gem5_build=true
+      bench_linux_boot=true
+      ;;
+    a)
+      default_arch="$OPTARG"
+      ;;
+    b)
+      bench_build=true
+      ;;
+    B)
+      bench_buildroot_baseline=true
+      ;;
+    g)
+      bench_gem5_build=true
+      ;;
+    l)
+      bench_linux_boot=true
+      ;;
+    u)
+      update_repo=false
+      ;;
+    ?)
+      exit 2
+      ;;
+  esac
+done
+shift "$(($OPTIND - 1))"
+comment="${1:-}"
+
+# Create output directory.
 sha="$(git log -1 --format="%H")"
-benchmark_repo=../linux-kernel-module-cheat-regression
+benchmark_repo="${root_dir}/../linux-kernel-module-cheat-regression"
+mkdir -p "$benchmark_repo"
 last_dir="$(ls "$benchmark_repo" | grep -E '^[0-9]' | tail -n 1)"
 if [ -n "$last_dir" ]; then
   seq_id="$(("$(echo "$last_dir" | sed -E 's/_.*//')" + 1))"
@@ -22,10 +57,67 @@ else
 fi
 seq_id="$(printf '%0.4d' "$seq_id")"
 sha="$(git log -1 --format="%H")"
-new_dir="${benchmark_repo}/${seq_id}_${sha}"
+dir_basename="${seq_id}_${sha}"
+new_dir="${benchmark_repo}/${dir_basename}"
 mkdir "$new_dir"
-cp "$common_bench_boot" "$new_dir"
-cd "$benchmark_repo"
-git add .
-git commit -m "$new_dir"
-git push
+
+if "$bench_build"; then
+  arch="$default_arch"
+  suffix=bench
+  set_common_vars "$arch" false "$suffix"
+  rm -rf "$out_arch_dir"
+  ./build -a "$arch" -B 'BR2_CCACHE=n' -s "$suffix"
+  cp "${build_dir}/build-time.log" "${new_dir}/build-time-${arch}.log"
+  rm -rf "$out_arch_dir"
+fi
+
+if "$bench_buildroot_baseline"; then
+  cd "${root_dir}/buildroot"
+  git clean -xdf
+  make "qemu_${default_arch}_defconfig"
+  printf '
+BR2_CCACHE=y
+BR2_TARGET_ROOTFS_CPIO=y
+BR2_TARGET_ROOTFS_EXT2=n
+' >>.config
+  make olddefconfig
+  make source
+  time env -u LD_LIBRARY_PATH make BR2_JLEVEL="$(nproc)"
+  cp output/build/build-time.log "${new_dir}/baseline-build-time-${default_arch}.log"
+  wc -c output/images/* > "${new_dir}/baseline-image-size-${default_arch}.log"
+  git clean -xdf
+fi
+
+if "$bench_gem5_build"; then
+  arches='x86_64 arm'
+  for arch in $arches; do
+    set_common_vars "$arch"
+    cd "${root_dir}/gem5/gem5"
+    git clean -xdf
+    cd "${root_dir}/gem5"
+    results_file="${gem5_out_dir}/bench-build.txt"
+    rm "$results_file"
+    common_bench_cmd "timeout 900 ./build -a '$arch'" "$results_file"
+    cp "$results_file" "${new_dir}/gem5-bench-build-${arch}.txt"
+    cd "${root_dir}/gem5/gem5"
+    git clean -xdf
+  done
+fi
+
+if "$bench_linux_boot"; then
+  cd "${root_dir}"
+  ./build-all
+  ./bench-boot
+  cp "$common_bench_boot" "$new_dir"
+fi
+
+if "$update_repo"; then
+  if [ -n "$comment" ]; then
+    echo "$comment" > "${new_dir}/README.adoc"
+  fi
+  echo ""
+  cd "$benchmark_repo"
+  git add .
+  git commit -m "$dir_basename"
+  git push
+fi
diff --git a/bench-boot b/bench-boot
index bf14c79..7f20b45 100755
--- a/bench-boot
+++ b/bench-boot
@@ -23,7 +23,6 @@ bench "$arch -E '/poweroff.out' -T exec_tb"
 qemu_insts "$arch"
 bench "$arch -E 'm5 exit' -g"
 gem5_insts "$arch"
-# Was taking more than one hour.
 #bench "$arch -E 'm5 exit' -g -- --cpu-type=DerivO3CPU ${caches}"
 #gem5_insts "$arch"
 
@@ -33,7 +32,6 @@ bench "$arch -E '/poweroff.out' -T exec_tb"
 qemu_insts "$arch"
 bench "$arch -E 'm5 exit' -g"
 gem5_insts "$arch"
-# Was taking more than one hour.
 #bench "$arch -E 'm5 exit' -g -- --cpu-type=HPI ${caches}"
 #gem5_insts "$arch"
 
diff --git a/build b/build
index 042dcbd..3241cb4 100755
--- a/build
+++ b/build
@@ -14,8 +14,9 @@ linux_reconfigure=false
 linux_kernel_custom_config_file=
 post_script_args=
 qemu_sdl='--enable-sdl --with-sdlabi=2.0'
+suffix=
 v=0
-while getopts 'a:B:b:CGgj:hIiK:klp:qSv' OPT; do
+while getopts 'a:B:b:CGgj:hIiK:klp:qSs:v' OPT; do
   case "$OPT" in
     a)
       arch="$OPTARG"
@@ -77,6 +78,9 @@ BR2_TARGET_ROOTFS_INITRAMFS=n
     S)
       qemu_sdl=
       ;;
+    s)
+      suffix="$OPTARG"
+      ;;
     v)
       v=1
       ;;
@@ -87,7 +91,7 @@ BR2_TARGET_ROOTFS_INITRAMFS=n
 done
 shift $(($OPTIND - 1))
 extra_make_args="${extra_make_args} $@"
-set_common_vars "$arch" "$gem5"
+set_common_vars "$arch" "$gem5" "$suffix"
 config_file="${buildroot_out_dir}/.config"
 case "$arch" in
   x86_64)
diff --git a/cli.example b/cli.example
index bfcddae..f2a4f40 100644
--- a/cli.example
+++ b/cli.example
@@ -5,10 +5,12 @@ gem5=
 printf '
 #BR2_TARGET_ROOTFS_EXT2_SIZE="1024M"
 ' > br2.gitignore
+cli_bench_all=""
 cli_build="-a ${arch} -b br2.gitignore -b br2_x11 ${gem5}"
 cli_run="-a ${arch} ${gem5}"
 cli_rungdb="-a ${arch} ${gem5}"
-cli_rungdbserver="-a ${arch} ${gem5}"
 cli_rungdb_user="-a ${arch} ${gem5}"
-cli_trace_boot="-a ${arch}"
+cli_rungdbserver="-a ${arch} ${gem5}"
+cli_runtc="-a ${arch}"
 cli_trace2line="-a ${arch}"
+cli_trace_boot="-a ${arch}"
diff --git a/common b/common
index 0a3c53e..c9dc064 100644
--- a/common
+++ b/common
@@ -6,7 +6,7 @@ common_bench_cmd() (
   # Benchmark a command.
   #
   # $1: command to benchmark
-  # $2: where to write results to
+  # $2: where to append write results to. Default: /dev/null.
   #
   # Result format:
   #
@@ -14,16 +14,20 @@ common_bench_cmd() (
   # time <time in seconds to finish>
   # exit_status <exit status>
   cmd="$1"
-  results_file="$2"
+  results_file="${2:-/dev/null}"
   printf 'cmd ' >> "$results_file"
-  env time --append -f 'time %e' --output="$results_file" ./eeval -a "$cmd" "$results_file"
+  env time --append -f 'time %e' --output="$results_file" "${root_dir}/eeval" -a "$cmd" "$results_file"
   printf "exit_status $?\n" >> "$results_file"
 )
 set_common_vars() {
   arch="$1"
   gem5="${2:-false}"
+  common_suffix="${3:-}"
   buildroot_dir="${root_dir}/buildroot"
   arch_dir="$arch"
+  if [ -n "$common_suffix" ]; then
+    arch_dir="${arch_dir}-${common_suffix}"
+  fi
   out_arch_dir="${out_dir}/${arch_dir}"
   buildroot_out_dir="${out_arch_dir}/buildroot"
   build_dir="${buildroot_out_dir}/build"
diff --git a/parsec-benchmark/parsec-benchmark b/parsec-benchmark/parsec-benchmark
index f1b8a70..c56acac 160000
--- a/parsec-benchmark/parsec-benchmark
+++ b/parsec-benchmark/parsec-benchmark
@@ -1 +1 @@
-Subproject commit f1b8a70c7930fdd150649dfe43f0ea3b27f7937b
+Subproject commit c56acac70e53763a03f4e4ff1b5440e5b3ef96c5
diff --git a/run b/run
index 0535020..4a6ba65 100755
--- a/run
+++ b/run
@@ -281,10 +281,10 @@ ${extra_flags} \
   esac
 fi
 if "$tmux"; then
-  if "$debug"; then
-    eval "./tmu ./rungdb -a "${arch}" ${tmux_args}"
-  elif "$gem5"; then
+  if "$gem5"; then
     eval "./tmu 'sleep 2;./gem5-shell'"
+  elif  "$debug"; then
+    eval "./tmu ./rungdb -a '${arch}' ${tmux_args}"
   fi
 fi
 "${root_dir}/eeval" "$cmd" "${common_out_run_dir}/run.sh"
diff --git a/runtc b/runtc
index a53d983..22fb324 100755
--- a/runtc
+++ b/runtc
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -eu
 . common
-set -- ${cli_tc:-} "$@"
+set -- ${cli_runtc:-} "$@"
 while getopts a:gh OPT; do
   case "$OPT" in
     a)