userland: add assembly support

Move arm assembly cheat here, and start some work on x86 cheat as well.
2026-01-25 19:21:35 +01:00 · 2019-05-05 00:00:00 +00:00
parent 7a5ca339a3
commit 0263c21557
117 changed files with 3870 additions and 547 deletions
--- a/README.adoc
+++ b/README.adoc
@@ -422,7 +422,7 @@ index af583ce578..3cc341f303 100644
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 ....

-Finally, rebuild Binutils, userland and test our program with <<user-mode-setup>>:
+Finally, rebuild Binutils, userland and test our program with <<user-mode-simulation>>:

 ....
 ./build-buildroot -- host-binutils-rebuild
@@ -438,7 +438,7 @@ Tested on b60784d59bee993bf0de5cde6c6380dd69420dda + 1.

 OK, now time to hack GCC.

-For convenience, let's use the <<user-mode-setup>>.
+For convenience, let's use the <<user-mode-simulation>>.

 If we run the program link:userland/gcc_hack.c[]:

@@ -929,6 +929,115 @@ sudo rmmod hello.ko
 dmesg
 ....

+=== Userland setup
+
+==== About the userland setup
+
+In order to test the kernel and emulators, userland content in the form of executables and scripts is of course required, and we store it mostly under:
+
+* link:userland/[]
+* <<rootfs_overlay>>
+* <<add-new-buildroot-packages>>
+
+When we started this repository, it only contained content that interacted very closely with the kernel, or that had required performance analysis.
+
+However, we soon started to notice that this had an increasing overlap with other userland test repositories: we were duplicating build and test infrastructure and even some examples.
+
+Therefore, we decided to consolidate other userland tutorials that we had scattered around into this repository.
+
+Notable userland content included / moving into this repository includes:
+
+* <<arm-userland>>
+* <<x86-userland>>
+* <<c>>
+* <<cpp>>
+* <<posix>>
+* https://github.com/cirosantilli/algorithm-cheat will be good to move here for performance analysis
+
+==== Userland setup getting started
+
+There are several ways to run our userland content, notably:
+
+* natively on the host as shown at: <<userland-setup-getting-started-natively>>
+
+Can only run examples compatible with your host architecture and OS, but has the fastest setup and runtimes.
+* from user mode simulation as shown at: <<qemu-user-mode-getting-started>>
+
+Can run most examples, with the notable exception of examples that rely on kernel modules.
+* from full system simulation as shown at: <<qemu-buildroot-setup-getting-started>>.
+
+This is the most reproducible and controlled environment, and all examples work there. But it is also the slowest one to set up.
+
+===== Userland setup getting started natively
+
+With this setup, we will use the host toolchain and execute executables directly on the host.
+
+No installation or toolchain build is required, so you can just jump straight into it.
+
+Build, run an example, and clean it in-tree with:
+
+....
+cd userland
+./build
+./c/hello.out
+./build --clean
+....
+
+Source: link:userland/c/hello.c[].
+
+Or build just one directory:
+
+....
+./build c
+....
+
+or just one executable:
+
+....
+./build c/hello
+....
+
+Do a cleaner out-of-tree build and run the program instead:
+
+....
+./build-userland --gcc-which host --userland-build-id host
+"$(./getvar --userland-build-id host userland_build_dir)/hello.out"
+....
+
+===== Userland setup getting started full system
+
+First ensure that <<qemu-buildroot-setup>> is working.
+
+After doing that setup, you can already execute your userland programs from inside QEMU: the only missing step is how to rebuild executables and run them.
+
+And the answer is exactly analogous to what is shown at: <<your-first-kernel-module-hack>>
+
+For example, if we modify link:userland/c/hello.c[] to print out something different, we can just rebuild it with:
+
+....
+./build-userland
+....
+
+Source: link:build-userland[]. `./build` calls that script automatically for us when doing the initial full build.
+
+Now, either run the program without rebooting by using the <<9p>> mount:
+
+....
+/mnt/9p/out_rootfs_overlay/c/hello.out
+....
+
+or shutdown QEMU, add the executable to the root filesystem:
+
+....
+./build-buildroot
+....
+
+reboot and use the root filesystem as usual:
+
+....
+/c/hello.out
+....
+
 === Baremetal setup

 ==== About the baremetal setup
@@ -1076,8 +1185,8 @@ But just stick to newer and better `VExpress_GEM5_V1` unless you have a good rea

 When doing bare metal programming, it is likely that you will want to learn assembly language basics. Have a look at these tutorials for the userland part:

-* https://github.com/cirosantilli/x86-assembly-cheat
-* https://github.com/cirosantilli/arm-assembly-cheat
+* <<x86-userland>>
+* <<arm-userland>>

 For more information on baremetal, see the section: <<baremetal>>.

@@ -1086,14 +1195,6 @@ The following subjects are particularly important:
 * <<tracing>>
 * <<baremetal-gdb-step-debug>>

-=== User mode setup
-
-Much like <<baremetal-setup>>, this is another fun setup that does not require Buildroot or the Linux kernel.
-
-Getting started at: <<qemu-user-mode-getting-started>>.
-
-Introduction at: <<user-mode-simulation>>.
-
 [[gdb]]
 == GDB step debug

@@ -1700,7 +1801,7 @@ since GDB does not know that libc is loaded.

 This is the userland debug setup most likely to work, since at init time there is only one userland executable running.

-For executables from the <<userland-directory>> such as link:userland/count.c[]:
+For executables from the link:userland/[] directory such as link:userland/count.c[]:

 * Shell 1:
 +
@@ -3288,7 +3389,7 @@ qw er

 `./run --userland` path resolution is analogous to <<baremetal-setup-getting-started,that of `./run --baremetal`>>.

-`./build user-mode-qemu` first builds Buildroot, and then runs `./build-userland`, which is further documented at: <<userland-directory>>. It also builds QEMU. If you ahve already done a <<qemu-buildroot-setup>> previously, this will be very fast.
+`./build user-mode-qemu` first builds Buildroot, and then runs `./build-userland`, which is further documented at: <<userland-setup>>. It also builds QEMU. If you have already done a <<qemu-buildroot-setup>> previously, this will be very fast.

 If you modify the userland programs, rebuild simply with:

@@ -12033,6 +12134,295 @@ make CROSS_COMPILE_DIR=/usr/bin
 ;
 ....

+== C
+
+Programs under link:userland/c/[] are examples of link:https://en.wikipedia.org/wiki/ANSI_C[ANSI C] programming.
+
+[[cpp]]
+== C++
+
+Programs under link:userland/cpp/[] are examples of link:https://en.wikipedia.org/wiki/C%2B%2B#Standardization[ISO C++] programming.
+
+== POSIX
+
+Programs under link:userland/posix/[] are examples of POSIX C programming.
+
+What is POSIX:
+
+* https://stackoverflow.com/questions/1780599/what-is-the-meaning-of-posix/31865755#31865755
+* https://unix.stackexchange.com/questions/11983/what-exactly-is-posix/220877#220877
+
+== x86 userland
+
+Programs under link:userland/arch/x86_64/[] are examples of x86 userland assembly programming.
+
+Those examples are progressively being moved out of: https://github.com/cirosantilli/x86-assembly-cheat
+
+== arm userland
+
+Programs under:
+
+* link:userland/arch/arm/[]
+* link:userland/arch/aarch64/[]
+
+are examples of ARM userland assembly programming.
+
+== Android
+
+Remember: Android AOSP is a huge undocumented piece of bloatware. Its integration into this repo will likely never be super good.
+
+Verbose setup description: https://stackoverflow.com/questions/1809774/how-to-compile-the-android-aosp-kernel-and-test-it-with-the-android-emulator/48310014#48310014
+
+Download, build and run with the prebuilt AOSP QEMU emulator and the AOSP kernel:
+
+....
+./build-android \
+  --android-base-dir /path/to/your/hd \
+  --android-version 8.1.0_r60 \
+  download \
+  build \
+;
+./run-android \
+  --android-base-dir /path/to/your/hd \
+  --android-version 8.1.0_r60 \
+;
+....
+
+Sources:
+
+* link:build-android[]
+* link:run-android[]
+
+TODO how to hack the AOSP kernel, userland and emulator?
+
+Other archs work as well as usual with `--arch` parameter. However, running in non-x86 is very slow due to the lack of KVM.
+
+Tested on: `8.1.0_r60`.
+
+=== Android image structure
+
+https://source.android.com/devices/bootloader/partitions-images
+
+The messy AOSP generates a ton of images instead of just one.
+
+When the emulator launches, we can see them through QEMU `-drive` arguments:
+
+....
+emulator: argv[21] = "-initrd"
+emulator: argv[22] = "/data/aosp/8.1.0_r60/out/target/product/generic_x86_64/ramdisk.img"
+emulator: argv[23] = "-drive"
+emulator: argv[24] = "if=none,index=0,id=system,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/system-qemu.img,read-only"
+emulator: argv[25] = "-device"
+emulator: argv[26] = "virtio-blk-pci,drive=system,iothread=disk-iothread,modern-pio-notify"
+emulator: argv[27] = "-drive"
+emulator: argv[28] = "if=none,index=1,id=cache,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/cache.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
+emulator: argv[29] = "-device"
+emulator: argv[30] = "virtio-blk-pci,drive=cache,iothread=disk-iothread,modern-pio-notify"
+emulator: argv[31] = "-drive"
+emulator: argv[32] = "if=none,index=2,id=userdata,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/userdata-qemu.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
+emulator: argv[33] = "-device"
+emulator: argv[34] = "virtio-blk-pci,drive=userdata,iothread=disk-iothread,modern-pio-notify"
+emulator: argv[35] = "-drive"
+emulator: argv[36] = "if=none,index=3,id=encrypt,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/encryptionkey.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
+emulator: argv[37] = "-device"
+emulator: argv[38] = "virtio-blk-pci,drive=encrypt,iothread=disk-iothread,modern-pio-notify"
+emulator: argv[39] = "-drive"
+emulator: argv[40] = "if=none,index=4,id=vendor,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/vendor-qemu.img,read-only"
+emulator: argv[41] = "-device"
+emulator: argv[42] = "virtio-blk-pci,drive=vendor,iothread=disk-iothread,modern-pio-notify"
+....
+
+The root directory is the <<initrd>> given on the QEMU CLI, which `/proc/mounts` reports at:
+
+....
+rootfs on / type rootfs (ro,seclabel,size=886392k,nr_inodes=221598)
+....
+
+This contains the <<android-init>>, which through `.rc` files must be mounting the drives into the right places. TODO find exact point.
+
+The drive order is:
+
+....
+system
+cache
+userdata
+encryptionkey
+vendor-qemu
+....
+
+Then, on the terminal:
+
+....
+mount | grep vd
+....
+
+gives:
+
+....
+/dev/block/vda1 on /system type ext4 (ro,seclabel,relatime,data=ordered)
+/dev/block/vde1 on /vendor type ext4 (ro,seclabel,relatime,data=ordered)
+/dev/block/vdb on /cache type ext4 (rw,seclabel,nosuid,nodev,noatime,errors=panic,data=ordered)
+....
+
+and we see that the order of `vda`, `vdb`, etc. matches that in which `-drive` were given to QEMU.
+
+Tested on: `8.1.0_r60`.
+
+==== Android images read-only
+
+From `mount`, we can see that some of the mounted images are `ro`.
+
+Basically, every image that was given to QEMU as qcow2 is writable, and that qcow2 is an overlay over the actual original image.
+
+In order to make `/system` and `/vendor` writable by using qcow2 for them as well, we must use the `-writable-system` option:
+
+....
+./run-android -- -writable-system
+....
+
+* https://android.stackexchange.com/questions/110927/how-to-mount-system-rewritable-or-read-only-rw-ro/207200#207200
+* https://stackoverflow.com/questions/13089694/adb-remount-permission-denied-but-able-to-access-super-user-in-shell-android/43163693#43163693
+
+then:
+
+....
+su
+mount -o rw,remount /system
+date >/system/a
+....
+
+Now reboot, and relaunch with `-writable-system` once again to pick up the modified qcow2 images:
+
+....
+./run-android -- -writable-system
+....
+
+and the newly created file is still there:
+
+....
+cat /system/a
+....
+
+`/system` and `/vendor` can be nuked quickly with:
+
+....
+./build-android --extra-args snod
+./build-android --extra-args vnod
+....
+
+as mentioned at: https://stackoverflow.com/questions/29023406/how-to-just-build-android-system-image and on:
+
+....
+./build-android --extra-args help
+....
+
+Tested on: `8.1.0_r60`.
+
+==== Android /data partition
+
+When I install an app like F-Droid, it goes under `/data` according to:
+
+....
+find / -iname '*fdroid*'
+....
+
+and it <<disk-persistency,persists across boots>>.
+
+`/data` is behind a RW LVM device:
+
+....
+/dev/block/dm-0 on /data type ext4 (rw,seclabel,nosuid,nodev,noatime,errors=panic,data=ordered)
+....
+
+but TODO I can't find where it comes from since I don't have the CLI tools mentioned at:
+
+* https://superuser.com/questions/131519/what-is-this-dm-0-device
+* https://unix.stackexchange.com/questions/185057/where-does-lvm-store-its-configuration
+
+However, by looking at:
+
+....
+./run-android -- -help
+....
+
+we see:
+
+....
+-data <file>                   data image (default <datadir>/userdata-qemu.img
+....
+
+which confirms the suspicion that this data goes in `userdata-qemu.img`.
+
+To reset images to their original state, just remove the qcow2 overlay and regenerate it: https://stackoverflow.com/questions/54446680/how-to-reset-the-userdata-image-when-building-android-aosp-and-running-it-on-the
+
+Tested on: `8.1.0_r60`.
+
+=== Install Android apps
+
+I don't know how to download files from the web on Vanilla android, the default browser does not download anything, and there is no `wget`:
+
+* https://android.stackexchange.com/questions/6984/how-to-download-files-from-the-web-in-the-android-browser
+* https://stackoverflow.com/questions/26775079/wget-in-android-terminal
+
+Installing with `adb install` does however work: https://stackoverflow.com/questions/7076240/install-an-apk-file-from-command-prompt
+
+link:https://f-droid.org[F-Droid] installed fine like that, however it does not have permission to install apps: https://www.maketecheasier.com/install-apps-from-unknown-sources-android/
+
+And the `Settings` app crashes so I can't change it, logcat contains:
+
+....
+No service published for: wifip2p
+....
+
+which is mentioned at: https://stackoverflow.com/questions/47839955/android-8-settings-app-crashes-on-emulator-with-clean-aosp-build
+
+We also tried to enable it from the command line with:
+
+....
+settings put secure install_non_market_apps 1
+....
+
+as mentioned at: https://android.stackexchange.com/questions/77280/allow-unknown-sources-from-terminal-without-going-to-settings-app but it didn't work either.
+
+No person alive seems to know how to pre-install apps on AOSP: https://stackoverflow.com/questions/6249458/pre-installing-android-application
+
+Tested on: `8.1.0_r60`.
+
+=== Android init
+
+For Linux in general, see: <<init>>.
+
+The `/init` executable interprets the `/init.rc` file, which is written in a custom Android init system language: https://android.googlesource.com/platform/system/core/+/ee0e63f71d90537bb0570e77aa8a699cc222cfaf/init/README.md
+
+The top of that file then sources other `.rc` files present on the root directory:
+
+....
+import /init.environ.rc
+import /init.usb.rc
+import /init.${ro.hardware}.rc
+import /vendor/etc/init/hw/init.${ro.hardware}.rc
+import /init.usb.configfs.rc
+import /init.${ro.zygote}.rc
+....
+
+TODO: how is `ro.hardware` determined? https://stackoverflow.com/questions/20572781/android-boot-where-is-the-init-hardware-rc-read-in-init-c-where-are-servic It is a system property and can be obtained with:
+
+....
+getprop ro.hardware
+....
+
+This gives:
+
+....
+ranchu
+....
+
+which is the codename for the QEMU virtual platform we are running on: https://www.oreilly.com/library/view/android-system-programming/9781787125360/9736a97c-cd09-40c3-b14d-955717648302.xhtml
+
+TODO: is it possible to add a custom `.rc` file without modifying the initrd that <<android-image-structure,gets mounted on root>>? https://stackoverflow.com/questions/9768103/make-persistent-changes-to-init-rc
+
+Tested on: `8.1.0_r60`.
+
 == Benchmark this repo

 TODO: didn't fully port during refactor after 3b0a343647bed577586989fb702b760bd280844a. Reimplementing should not be hard.
@@ -12316,266 +12706,6 @@ gem5:
 ** https://stackoverflow.com/questions/47997565/gem5-system-requirements-for-decent-performance/48941793#48941793
 ** https://github.com/gem5/gem5/issues/25

-== WIP
-
-Big new features that are not yet working.
-
-=== Android
-
-Remember: Android AOSP is a huge undocumented piece of bloatware. It's integration into this repo will likely never be super good.
-
-Verbose setup description: https://stackoverflow.com/questions/1809774/how-to-compile-the-android-aosp-kernel-and-test-it-with-the-android-emulator/48310014#48310014
-
-Download, build and run with the prebuilt AOSP QEMU emulator and the AOSP kernel:
-
-....
-./build-android \
-  --android-base-dir /path/to/your/hd \
-  --android-version 8.1.0_r60 \
-  download \
-  build \
-;
-./run-android \
-  --android-base-dir /path/to/your/hd \
-  --android-version 8.1.0_r60 \
-;
-....
-
-Sources:
-
-* link:build-android[]
-* link:run-android[]
-
-TODO how to hack the AOSP kernel, userland and emulator?
-
-Other archs work as well as usual with `--arch` parameter. However, running in non-x86 is very slow due to the lack of KVM.
-
-Tested on: `8.1.0_r60`.
-
-==== Android image structure
-
-https://source.android.com/devices/bootloader/partitions-images
-
-The messy AOSP generates a ton of images instead of just one.
-
-When the emulator launches, we can see them through QEMU `-drive` arguments:
-
-....
-emulator: argv[21] = "-initrd"
-emulator: argv[22] = "/data/aosp/8.1.0_r60/out/target/product/generic_x86_64/ramdisk.img"
-emulator: argv[23] = "-drive"
-emulator: argv[24] = "if=none,index=0,id=system,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/system-qemu.img,read-only"
-emulator: argv[25] = "-device"
-emulator: argv[26] = "virtio-blk-pci,drive=system,iothread=disk-iothread,modern-pio-notify"
-emulator: argv[27] = "-drive"
-emulator: argv[28] = "if=none,index=1,id=cache,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/cache.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
-emulator: argv[29] = "-device"
-emulator: argv[30] = "virtio-blk-pci,drive=cache,iothread=disk-iothread,modern-pio-notify"
-emulator: argv[31] = "-drive"
-emulator: argv[32] = "if=none,index=2,id=userdata,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/userdata-qemu.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
-emulator: argv[33] = "-device"
-emulator: argv[34] = "virtio-blk-pci,drive=userdata,iothread=disk-iothread,modern-pio-notify"
-emulator: argv[35] = "-drive"
-emulator: argv[36] = "if=none,index=3,id=encrypt,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/encryptionkey.img.qcow2,overlap-check=none,cache=unsafe,l2-cache-size=1048576"
-emulator: argv[37] = "-device"
-emulator: argv[38] = "virtio-blk-pci,drive=encrypt,iothread=disk-iothread,modern-pio-notify"
-emulator: argv[39] = "-drive"
-emulator: argv[40] = "if=none,index=4,id=vendor,file=/path/to/aosp/8.1.0_r60/out/target/product/generic_x86_64/vendor-qemu.img,read-only"
-emulator: argv[41] = "-device"
-emulator: argv[42] = "virtio-blk-pci,drive=vendor,iothread=disk-iothread,modern-pio-notify"
-....
-
-The root directory is the <<initrd>> given on the QEMU CLI, which `/proc/mounts` reports at:
-
-....
-rootfs on / type rootfs (ro,seclabel,size=886392k,nr_inodes=221598)
-....
-
-This contains the <<android-init>>, which through `.rc` must be mounting mounts the drives int o the right places TODO find exact point.
-
-The drive order is:
-
-....
-system
-cache
-userdata
-encryptionkey
-vendor-qemu
-....
-
-Then, on the terminal:
-
-....
-mount | grep vd
-....
-
-gives:
-
-....
-/dev/block/vda1 on /system type ext4 (ro,seclabel,relatime,data=ordered)
-/dev/block/vde1 on /vendor type ext4 (ro,seclabel,relatime,data=ordered)
-/dev/block/vdb on /cache type ext4 (rw,seclabel,nosuid,nodev,noatime,errors=panic,data=ordered)
-....
-
-and we see that the order of `vda`, `vdb`, etc. matches that in which `-drive` were given to QEMU.
-
-Tested on: `8.1.0_r60`.
-
-===== Android images read-only
-
-From `mount`, we can see that some of the mounted images are `ro`.
-
-Basically, every image that was given to QEMU as qcow2 is writable, and that qcow2 is an overlay over the actual original image.
-
-In order to make `/system` and `/vendor` writable by using qcow2 for them as well, we must use the `-writable-system` option:
-
-....
-./run-android -- -writable-system
-....
-
-* https://android.stackexchange.com/questions/110927/how-to-mount-system-rewritable-or-read-only-rw-ro/207200#207200
-* https://stackoverflow.com/questions/13089694/adb-remount-permission-denied-but-able-to-access-super-user-in-shell-android/43163693#43163693
-
-then:
-
-....
-su
-mount -o rw,remount /system
-date >/system/a
-....
-
-Now reboot, and relaunch with `-writable-system` once again to pick up the modified qcow2 images:
-
-....
-./run-android -- -writable-system
-....
-
-and the newly created file is still there:
-
-....
-date >/system/a
-....
-
-`/system` and `/vendor` can be nuked quickly with:
-
-....
-./build-android --extra-args snod
-./build-android --extra-args vnod
-....
-
-as mentioned at: https://stackoverflow.com/questions/29023406/how-to-just-build-android-system-image and on:
-
-....
-./build-android --extra-args help
-....
-
-Tested on: `8.1.0_r60`.
-
-===== Android /data partition
-
-When I install an app like F-Droid, it goes under `/data` according to:
-
-....
-find / -iname '*fdroid*'
-....
-
-and it <<disk-persistency,persists across boots>>.
-
-`/data` is behind a RW LVM device:
-
-....
-/dev/block/dm-0 on /data type ext4 (rw,seclabel,nosuid,nodev,noatime,errors=panic,data=ordered)
-....
-
-but TODO I can't find where it comes from since I don't have the CLI tools mentioned at:
-
-* https://superuser.com/questions/131519/what-is-this-dm-0-device
-* https://unix.stackexchange.com/questions/185057/where-does-lvm-store-its-configuration
-
-However, by looking at:
-
-....
-./run-android -- -help
-....
-
-we see:
-
-....
-data <file>                   data image (default <datadir>/userdata-qemu.img
-....
-
-which confirms the suspicion that this data goes in `userdata-qemu.img`.
-
-To reset images to their original state, just remove the qcow2 overlay and regenerate it: https://stackoverflow.com/questions/54446680/how-to-reset-the-userdata-image-when-building-android-aosp-and-running-it-on-the
-
-Tested on: `8.1.0_r60`.
-
-==== Install Android apps
-
-I don't know how to download files from the web on Vanilla android, the default browser does not download anything, and there is no `wget`:
-
-* https://android.stackexchange.com/questions/6984/how-to-download-files-from-the-web-in-the-android-browser
-* https://stackoverflow.com/questions/26775079/wget-in-android-terminal
-
-Installing with `adb install` does however work: https://stackoverflow.com/questions/7076240/install-an-apk-file-from-command-prompt
-
-link:https://f-droid.org[F-Droid] installed fine like that, however it does not have permission to install apps: https://www.maketecheasier.com/install-apps-from-unknown-sources-android/
-
-And the `Settings` app crashes so I can't change it, logcat contains:
-
-....
-No service published for: wifip2p
-....
-
-which is mentioned at: https://stackoverflow.com/questions/47839955/android-8-settings-app-crashes-on-emulator-with-clean-aosp-build
-
-We also tried to enable it from the command line with:
-
-....
-settings put secure install_non_market_apps 1
-....
-
-as mentioned at: https://android.stackexchange.com/questions/77280/allow-unknown-sources-from-terminal-without-going-to-settings-app but it didn't work either.
-
-No person alive seems to know how to pre-install apps on AOSP: https://stackoverflow.com/questions/6249458/pre-installing-android-application
-
-Tested on: `8.1.0_r60`.
-
-=== Android init
-
-For Linux in general, see: <<init>>.
-
-The `/init` executable interprets the `/init.rc` files, which is in a custom Android init system language: https://android.googlesource.com/platform/system/core/+/ee0e63f71d90537bb0570e77aa8a699cc222cfaf/init/README.md
-
-The top of that file then sources other `.rc` files present on the root directory:
-
-....
-import /init.environ.rc
-import /init.usb.rc
-import /init.${ro.hardware}.rc
-import /vendor/etc/init/hw/init.${ro.hardware}.rc
-import /init.usb.configfs.rc
-import /init.${ro.zygote}.rc
-....
-
-TODO: how is `ro.hardware` determined? https://stackoverflow.com/questions/20572781/android-boot-where-is-the-init-hardware-rc-read-in-init-c-where-are-servic It is a system property and can be obtained with:
-
-....
-getprop ro.hardware
-....
-
-This gives:
-
-....
-ranchu
-....
-
-which is the codename for the QEMU virtual platform we are running on: https://www.oreilly.com/library/view/android-system-programming/9781787125360/9736a97c-cd09-40c3-b14d-955717648302.xhtml
-
-TODO: is it possible to add a custom `.rc` file without modifying the initrd that <<android-image-structure,gets mounted on root>>? https://stackoverflow.com/questions/9768103/make-persistent-changes-to-init-rc
-
-Tested on: `8.1.0_r60`.
-
 == About this repo

 === Supported hosts
@@ -13057,88 +13187,6 @@ link:include/[] contains headers that are shared across both kernel modules and

 They contain data structs and magic constant for kernel to userland communication.

-==== userland directory
-
-Userland test programs. They can be used in the following ways:
-
-* inside a full system simulation, e.g.: <<qemu-buildroot-setup>>
-* inside <<user-mode-simulation>>
-* directly on the host: <<userland-directory-host-build>>
-
-For usage inside full system simulation, first ensure that Buildroot has been built for the toolchain, and then build the examples with:
-
-....
-./build-userland
-....
-
-Source: link:build-userland[].
-
-This makes them visible immediately on the <<9p>> mount of a running simulator.
-
-In order to place them in the root filesystem image itself, you must also run:
-
-....
-./build-buildroot
-....
-
-===== userland directory host build
-
-It is possible to build and run some of the userland examples directly on your host:
-
-....
-cd userland
-make
-./hello.out
-make clean
-....
-
-or more cleanly out of tree:
-
-....
-./build-userland --gcc-which host --userland-build-id host
-"$(./getvar --userland-build-id host userland_build_dir)/hello.out"
-....
-
-Extra make flags may be passed as:
-
-....
-./build-userland --gcc-which host --userland-build-id host-static --make-args='-B CFLAGS_EXTRA=-static'
-"$(./getvar --userland-build-id host-static userland_build_dir)/hello.out"
-....
-
-This for example would both force a rebuild due to `-B` and link statically due to `CFLAGS_EXTRA=-static`.
-
-TODO: OpenMP does not like `-static`:
-
-....
-/usr/lib/gcc/x86_64-linux-gnu/5/libgomp.a(target.o): In function `gomp_target_init':
-(.text+0xba): warning: Using 'dlopen' in statically linked applications requires at runtime the shared libraries from the glibc version used for linking
-....
-
-See: https://stackoverflow.com/questions/23869981/linking-openmp-statically-with-gcc
-
-===== userland cheats
-
-We have accumulated considerable material in the following userland subjects.
-
-====== C
-
-Programs under link:userland/c/[] are examples of link:https://en.wikipedia.org/wiki/ANSI_C[ANSI C] programming.
-
-[[cpp]]
-====== C++
-
-Programs under link:userland/cpp/[] are examples of link:https://en.wikipedia.org/wiki/C%2B%2B#Standardization[ISO C] programming.
-
-====== POSIX
-
-Programs under link:userland/posix/[] are examples of POSIX C programming.
-
-What is POSIX:
-
-* https://stackoverflow.com/questions/1780599/what-is-the-meaning-of-posix/31865755#31865755
-* https://unix.stackexchange.com/questions/11983/what-exactly-is-posix/220877#220877
-
 ==== buildroot_packages directory

 Source: link:buildroot_packages/[]
@@ -13171,7 +13219,7 @@ You can force a rebuild with:
 ./build-buildroot --config 'BR2_PACKAGE_SAMPLE_PACKAGE=y' -- sample_package-reconfigure
 ....

-Buildroot packages are convenient, but in general, if a package if very important to you, but not really mergeable back to Buildroot, you might want to just use a custom build script for it, and point it to the Buildroot toolchain, and then use `BR2_ROOTFS_OVERLAY`, much like we do for <<userland-directory>>.
+Buildroot packages are convenient, but in general, if a package is very important to you, but not really mergeable back to Buildroot, you might want to just use a custom build script for it, and point it to the Buildroot toolchain, and then use `BR2_ROOTFS_OVERLAY`, much like we do for <<userland-setup>>.

 A custom build script can give you more flexibility: e.g. the package can be made work with other root filesystems more easily, have better <<9p>> support, and rebuild faster as it evades some Buildroot boilerplate.

--- a/375
+++ b/375
@@ -2,13 +2,12 @@

 import os
 import shlex
-
-import common
-import threading
 import subprocess
-from shell_helpers import LF
+import threading

-error = False
+from shell_helpers import LF
+import common
+from thread_pool import ThreadPool

 class Main(common.BuildCliFunction):
    def __init__(self):
@@ -32,10 +31,15 @@ allows us to build examples that rely on it.
            '--in-tree',
            default=False,
            help='''\
-Magic build mode tailored to build from within the source tree:
-
-* place build output inside soure tree to conveniently run it
-* if not targets are given, build use the current working directory
+Place build output inside soure tree to conveniently run it, especially when
+building with the host toolchain.
+''',
+        )
+        self.add_argument(
+            '--target-cwd',
+            default=False,
+            help='''\
+Treat targets as relative to the current working directory.
 ''',
        )
        self.add_argument(
@@ -63,85 +67,76 @@ Default: build all examples that have their package dependencies met, e.g.:
        extra_deps=None,
        extra_objs=None,
        link=True,
-        raise_on_failure=True,
-        thread_limiter=None,
    ):
-        try:
-            if extra_deps is None:
-                extra_deps = []
-            if extra_objs is None:
-                extra_objs = []
-            if ccflags_after is None:
-                ccflags_after = []
-            ret = 0
-            if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
-                ccflags = ccflags.copy()
-                if not link:
-                    ccflags.extend(['-c', LF])
-                in_ext = os.path.splitext(in_path)[1]
-                do_compile = True
-                if in_ext == self.env['c_ext']:
-                    cc = self.env['gcc']
-                    if cstd is None:
-                        std = self.default_cstd
-                    else:
-                        std = cstd
-                    ccflags.extend([
-                        '-fopenmp', LF,
-                    ])
-                elif in_ext == self.env['cxx_ext']:
-                    cc = self.env['gxx']
-                    if cxxstd is None:
-                        std = self.default_cxxstd
-                    else:
-                        std = cxxstd
+        if extra_deps is None:
+            extra_deps = []
+        if extra_objs is None:
+            extra_objs = []
+        if ccflags_after is None:
+            ccflags_after = []
+        ret = 0
+        if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
+            ccflags = ccflags.copy()
+            if not link:
+                ccflags.extend(['-c', LF])
+            in_ext = os.path.splitext(in_path)[1]
+            do_compile = True
+            if in_ext in (self.env['c_ext'], self.env['asm_ext']):
+                cc = self.env['gcc']
+                if cstd is None:
+                    std = self.default_cstd
                else:
-                    do_compile = False
-                if do_compile:
-                    ret = self.sh.run_cmd(
-                        (
-                            [
-                                cc, LF,
-                            ] +
-                            ccflags +
-                            [
-                                '-std={}'.format(std), LF,
-                                '-o', out_path, LF,
-                                in_path, LF,
-                            ] +
-                            self.sh.add_newlines(extra_objs) +
-                            [
-                                '-lm', LF,
-                                '-pthread', LF,
-                            ] +
-                            ccflags_after
-                        ),
-                        extra_paths=[self.env['ccache_dir']],
-                        raise_on_failure=raise_on_failure,
-                    )
-        finally:
-            if thread_limiter is not None:
-                thread_limiter.release()
-        if ret != 0:
-            self.error = True
+                    std = cstd
+                ccflags.extend([
+                    '-fopenmp', LF,
+                ])
+            elif in_ext == self.env['cxx_ext']:
+                cc = self.env['gxx']
+                if cxxstd is None:
+                    std = self.default_cxxstd
+                else:
+                    std = cxxstd
+            else:
+                do_compile = False
+            if do_compile:
+                os.makedirs(os.path.dirname(out_path), exist_ok=True)
+                ret = self.sh.run_cmd(
+                    (
+                        [
+                            cc, LF,
+                        ] +
+                        ccflags +
+                        [
+                            '-std={}'.format(std), LF,
+                            '-o', out_path, LF,
+                            in_path, LF,
+                        ] +
+                        self.sh.add_newlines(extra_objs) +
+                        [
+                            '-lm', LF,
+                            '-pthread', LF,
+                        ] +
+                        ccflags_after
+                    ),
+                    extra_paths=[self.env['ccache_dir']],
+                )
        return ret

    def _get_targets(self):
        if self.env['_args_given']['targets']:
            targets = self.env['targets']
-            if self.env['in_tree']:
+            if self.env['target_cwd']:
                cwd = os.getcwd()
                targets = [os.path.join(cwd, target) for target in targets]
            return targets
        else:
-            if self.env['in_tree']:
+            if self.env['target_cwd']:
                return [os.getcwd()]
            else:
                return [self.env['userland_source_dir']]

    def build(self):
        build_dir = self.get_build_dir()
-        os.makedirs(build_dir, exist_ok=True)
        has_packages = set(self.env['has_package'])
        ccflags = [
            '-I', self.env['root_dir'], LF,
@@ -166,6 +161,25 @@ Default: build all examples that have their package dependencies met, e.g.:
            extra_deps=[self.env['common_h']],
            link=False,
        )
+        common_obj_asm = os.path.join(
+            build_dir,
+            'arch',
+            'main' + self.env['obj_ext']
+        )
+        common_obj_asm_relpath = os.path.join(
+            'arch',
+            'main' + self.env['c_ext']
+        )
+        self._build_one(
+            in_path=os.path.join(
+                self.env['userland_source_dir'],
+                common_obj_asm_relpath
+            ),
+            out_path=common_obj_asm,
+            ccflags=ccflags,
+            extra_deps=[self.env['common_h']],
+            link=False,
+        )
        pkgs = {
            'eigen': {
                # TODO: was failing with:
@@ -189,84 +203,136 @@ Default: build all examples that have their package dependencies met, e.g.:
            'openblas': {},
        }
        rootdir_abs_len = len(self.env['userland_source_dir'])
-        thread_limiter = threading.BoundedSemaphore(self.env['nproc'])
-        self.error = False
-        for target in self._get_targets():
-            target = self.resolve_userland_source(target)
-            for path, in_dirnames, in_filenames in self.sh.walk(target):
-                in_dirnames.sort()
-                path_abs = os.path.abspath(path)
-                dirpath_relative_root = path_abs[rootdir_abs_len + 1:]
-                dirpath_relative_root_components = dirpath_relative_root.split(os.sep)
-                if (
-                    len(dirpath_relative_root_components) < 2 or
-                    dirpath_relative_root_components[0] != 'arch' or
-                    dirpath_relative_root_components[1] == self.env['arch']
-                ):
-                    out_dir = os.path.join(
-                        build_dir,
-                        dirpath_relative_root
-                    )
-                    os.makedirs(out_dir, exist_ok=True)
-                    ccflags_dir = ccflags.copy()
-                    if dirpath_relative_root_components == ['gcc']:
-                        cstd = 'gnu11'
-                        cxxstd = 'gnu++17'
-                    else:
-                        cstd = self.default_cstd
-                        cxxstd = self.default_cxxstd
-                        # -pedantic complains even if we use -std=gnu11.
-                        ccflags_dir.extend(['-pedantic', LF])
-                    for in_filename in in_filenames:
-                        in_path = os.path.join(path, in_filename)
-                        in_name, in_ext = os.path.splitext(in_filename)
-                        out_path = os.path.join(
-                            out_dir,
-                            in_name + self.env['userland_build_ext']
+        thread_pool = ThreadPool(
+            self._build_one,
+            nthreads=self.env['nproc'],
+        )
+        class ExitLoop(Exception): pass
+        try:
+            for target in self._get_targets():
+                target = self.resolve_userland_source(target)
+                for path, in_dirnames, in_filenames in self.sh.walk(target):
+                    in_dirnames.sort()
+                    in_filenames.sort()
+                    path_abs = os.path.abspath(path)
+                    dirpath_relative_root = path_abs[rootdir_abs_len + 1:]
+                    dirpath_relative_root_components = dirpath_relative_root.split(os.sep)
+                    dirpath_relative_root_components_len = len(dirpath_relative_root_components)
+                    do_build_dir = True
+                    in_arch = False
+                    if dirpath_relative_root_components_len > 0:
+                        if dirpath_relative_root_components[0] == 'arch':
+                            if dirpath_relative_root_components_len > 1:
+                                if dirpath_relative_root_components[1] == self.env['arch']:
+                                    in_arch = True
+                                else:
+                                    do_build_dir = False
+                            else:
+                                do_build_dir = False
+                    if do_build_dir:
+                        out_dir = os.path.join(
+                            build_dir,
+                            dirpath_relative_root
                        )
-                        pkg_key = in_name.split('_')[0]
-                        ccflags_file = ccflags_dir.copy()
-                        ccflags_after = []
-                        if pkg_key in pkgs:
-                            if pkg_key not in has_packages:
+                        common_objs_dir = [common_obj]
+                        ccflags_dir = ccflags.copy()
+                        if dirpath_relative_root_components == ['gcc']:
+                            cstd = 'gnu11'
+                            cxxstd = 'gnu++17'
+                        else:
+                            cstd = self.default_cstd
+                            cxxstd = self.default_cxxstd
+                            # -pedantic complains even if we use -std=gnu11.
+                            ccflags_dir.extend(['-pedantic', LF])
+                            if in_arch:
+                                ccflags_dir.extend([
+                                    '-I', os.path.join(self.env['userland_source_arch_arch_dir']), LF,
+                                    '-I', os.path.join(self.env['userland_source_arch_dir']), LF,
+                                    '-fno-pie', LF,
+                                    '-no-pie', LF,
+                                ])
+                                if 'freestanding' in dirpath_relative_root_components:
+                                    common_objs_dir = []
+                                    ccflags_dir.extend([
+                                        '-ffreestanding', LF,
+                                        '-nostdlib', LF,
+                                        '-static', LF,
+                                    ])
+                                else:
+                                    if 'c' in dirpath_relative_root_components:
+                                        common_objs_dir = []
+                                    else:
+                                        common_objs_dir = [common_obj_asm]
+                                if self.env['arch'] == 'arm':
+                                    ccflags_dir.extend([
+                                        '-Xassembler', '-mcpu=cortex-a72', LF,
+                                        # To prevent:
+                                        # > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode
+                                        # https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732
+                                        # We aim to take the most extended mode currently available that works on QEMU.
+                                        '-Xassembler', '-mfpu=crypto-neon-fp-armv8.1', LF,
+                                        '-Xassembler', '-meabi=5', LF,
+                                        # Treat inline assembly as arm instead of thumb
+                                        # The opposite of -mthumb.
+                                        '-marm', LF,
+                                        # Make gcc generate .syntax unified for inline assembly.
+                                        # However, it gets ignored if -marm is given, which is a GCC bug that was recently fixed:
+                                        # https://stackoverflow.com/questions/54078112/how-to-write-syntax-unified-ual-armv7-inline-assembly-in-gcc/54132097#54132097
+                                        # So we just write divided inline assembly for now.
+                                        '-masm-syntax-unified', LF,
+                                    ])
+                        for in_filename in in_filenames:
+                            path_relative_root = os.path.join(dirpath_relative_root, in_filename)
+                            if path_relative_root == common_obj_asm_relpath:
                                continue
-                            pkg = pkgs[pkg_key]
-                            if 'ccflags' in pkg:
-                                ccflags_file.extend(pkg['ccflags'])
-                            else:
-                                pkg_config_output = subprocess.check_output([
-                                    self.env['buildroot_pkg_config'],
-                                    '--cflags',
-                                    pkg_key
-                                ]).decode()
-                                ccflags_file.extend(self.sh.shlex_split(pkg_config_output))
-                            if 'ccflags_after' in pkg:
-                                ccflags_file.extend(pkg['ccflags_after'])
-                            else:
-                                pkg_config_output = subprocess.check_output([
-                                    self.env['buildroot_pkg_config'],
-                                    '--libs',
-                                    pkg_key
-                                ]).decode()
-                                ccflags_after.extend(self.sh.shlex_split(pkg_config_output))
-                        thread_limiter.acquire()
-                        if self.error:
-                            return 1
-                        thread = threading.Thread(
-                            target=self._build_one,
-                            kwargs={
-                                'in_path': in_path,
-                                'out_path': out_path,
-                                'ccflags': ccflags_file,
-                                'cstd': cstd,
-                                'cxxstd': cxxstd,
-                                'extra_objs': [common_obj],
-                                'ccflags_after': ccflags_after,
-                                'raise_on_failure': False,
-                                'thread_limiter': thread_limiter,
-                            }
-                        )
-                        thread.start()
+                            in_path = os.path.join(path, in_filename)
+                            in_name, in_ext = os.path.splitext(in_filename)
+                            out_path = os.path.join(
+                                out_dir,
+                                in_name + self.env['userland_build_ext']
+                            )
+                            pkg_key = in_name.split('_')[0]
+                            ccflags_file = ccflags_dir.copy()
+                            ccflags_after = []
+                            if pkg_key in pkgs:
+                                if pkg_key not in has_packages:
+                                    continue
+                                pkg = pkgs[pkg_key]
+                                if 'ccflags' in pkg:
+                                    ccflags_file.extend(pkg['ccflags'])
+                                else:
+                                    pkg_config_output = subprocess.check_output([
+                                        self.env['buildroot_pkg_config'],
+                                        '--cflags',
+                                        pkg_key
+                                    ]).decode()
+                                    ccflags_file.extend(self.sh.shlex_split(pkg_config_output))
+                                if 'ccflags_after' in pkg:
+                                    ccflags_after.extend(pkg['ccflags_after'])  # link-time flags must follow the inputs
+                                else:
+                                    pkg_config_output = subprocess.check_output([
+                                        self.env['buildroot_pkg_config'],
+                                        '--libs',
+                                        pkg_key
+                                    ]).decode()
+                                    ccflags_after.extend(self.sh.shlex_split(pkg_config_output))
+                            error = thread_pool.submit({
+                                    'in_path': in_path,
+                                    'out_path': out_path,
+                                    'ccflags': ccflags_file,
+                                    'cstd': cstd,
+                                    'cxxstd': cxxstd,
+                                    'extra_objs': common_objs_dir,
+                                    'ccflags_after': ccflags_after,
+                            })
+                            if error is not None:
+                                raise ExitLoop()
+        except ExitLoop: pass
+        finally:  # always stop the non-daemon workers, or an unexpected exception would hang interpreter exit
+            error = thread_pool.join()
+        if error is not None:
+            print(error)
+            return 1
        self.sh.copy_dir_if_update(
            srcdir=build_dir,
            destdir=self.env['out_rootfs_overlay_dir'],
@@ -277,12 +343,15 @@ Default: build all examples that have their package dependencies met, e.g.:
    def clean(self):
        if self.env['in_tree']:
            for target in self._get_targets():
-                for path, dirnames, filenames in os.walk(target):
-                    filenames.sort()
-                    dirnames.sort()
-                    for filename in filenames:
-                        if os.path.splitext(filename)[1] in self.env['userland_out_exts']:
-                            self.sh.rmrf(os.path.join(path, filename))
+                if os.path.exists(target):
+                    for path, dirnames, filenames in os.walk(target):
+                        filenames.sort()
+                        dirnames.sort()
+                        for filename in filenames:
+                            if os.path.splitext(filename)[1] in self.env['userland_out_exts']:
+                                self.sh.rmrf(os.path.join(path, filename))
+                else:
+                    raise Exception('Path does not exist: ' + target)
        else:
            self.sh.rmrf(self.get_build_dir())

--- a/7
+++ b/7
@@ -1,2 +1,7 @@
 #!/usr/bin/env bash
-"$(git rev-parse --show-toplevel)/build-userland" --gcc-which host --in-tree "$@"
+"$(git rev-parse --show-toplevel)/build-userland" \
+  --gcc-which host \
+  --in-tree \
+  --target-cwd \
+  "$@" \
+;
--- a/common.py
+++ b/common.py
@@ -56,6 +56,7 @@ consts['kernel_modules_subdir'] = 'kernel_modules'
 consts['kernel_modules_source_dir'] = os.path.join(consts['root_dir'], consts['kernel_modules_subdir'])
 consts['userland_subdir'] = 'userland'
 consts['userland_source_dir'] = os.path.join(consts['root_dir'], consts['userland_subdir'])
+consts['userland_source_arch_dir'] = os.path.join(consts['userland_source_dir'], 'arch')
 consts['userland_build_ext'] = '.out'
 consts['include_subdir'] = 'include'
 consts['include_source_dir'] = os.path.join(consts['root_dir'], consts['include_subdir'])
@@ -714,12 +715,15 @@ Valid emulators: {}
            env['initarg'] = 'init'
        env['quit_init'] = '{}={}'.format(env['initarg'], env['userland_quit_cmd'])

+        # Userland
+        env['userland_source_arch_arch_dir'] = join(env['userland_source_arch_dir'], env['arch'])
+        env['userland_build_dir'] = join(env['out_dir'], 'userland', env['userland_build_id'], env['arch'])
+
        # Kernel modules.
        env['kernel_modules_build_dir'] = join(env['kernel_modules_build_base_dir'], env['arch'])
        env['kernel_modules_build_subdir'] = join(env['kernel_modules_build_dir'], env['kernel_modules_subdir'])
        env['kernel_modules_build_host_dir'] = join(env['kernel_modules_build_base_dir'], 'host')
        env['kernel_modules_build_host_subdir'] = join(env['kernel_modules_build_host_dir'], env['kernel_modules_subdir'])
-        env['userland_build_dir'] = join(env['out_dir'], 'userland', env['userland_build_id'], env['arch'])
        env['out_rootfs_overlay_dir'] = join(env['out_dir'], 'rootfs_overlay', env['arch'])
        env['out_rootfs_overlay_bin_dir'] = join(env['out_rootfs_overlay_dir'], 'bin')

--- a/shell_helpers.py
+++ b/shell_helpers.py
@@ -126,12 +126,12 @@ class ShellHelpers:
            src = os.path.join(srcdir, basename)
            if os.path.isfile(src):
                noext, ext = os.path.splitext(basename)
-                if filter_ext is not None and ext == filter_ext:
-                    distutils.file_util.copy_file(
-                        src,
-                        os.path.join(destdir, basename),
-                        update=1,
-                    )
+                dest = os.path.join(destdir, basename)
+                # Copy when dest is missing or older than src (what copy_file(update=1) did).
+                if filter_ext is not None and ext == filter_ext and (
+                    not os.path.exists(dest) or os.path.getmtime(src) > os.path.getmtime(dest)
+                ):
+                    self.cp(src, dest)

    def copy_dir_if_update(self, srcdir, destdir, filter_ext=None):
        self.copy_dir_if_update_non_recursive(srcdir, destdir, filter_ext)
@@ -283,7 +283,9 @@ class ShellHelpers:
                #signal.signal(signal.SIGPIPE, sigpipe_old)
            returncode = proc.returncode
            if returncode != 0 and raise_on_failure:
-                raise Exception('Command exited with status: {}'.format(returncode))
+                e = Exception('Command exited with status: {}'.format(returncode))
+                e.returncode = returncode
+                raise e
            return returncode
        else:
            return 0
--- a/thread_pool.py
+++ b/thread_pool.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+
+from typing import Any, Callable, Dict, Iterable, Union
+import multiprocessing
+import queue
+import sys
+import threading
+import time
+
+class ThreadPool:
+    '''
+    Start a pool of a limited number of threads to do some work.
+
+    This is similar to the stdlib concurrent, but I could not find
+    how to reach all my design goals with that implementation:
+
+    - the input function does not need to be modified
+    - limit the number of threads
+    - queue sizes closely follow number of threads
+    - if an exception happens, optionally stop soon afterwards
+
+    Functional form and further discussion at:
+    https://stackoverflow.com/questions/19369724/the-right-way-to-limit-maximum-number-of-threads-running-at-once/55263676#55263676
+
+    This class form allows to use your own while loops with submit().
+
+    Quick test with:
+
+        ./thread_pool.py 2 -10 20 0
+        ./thread_pool.py 2 -10 20 1
+        ./thread_pool.py 2 -10 20 2
+        ./thread_pool.py 2 -10 20 3
+
+    These ensure that execution stops neatly on error.
+    '''
+    def __init__(
+        self,
+        func: Callable,
+        handle_output: Union[Callable[[Any,Any,Exception],Any],None] = None,
+        nthreads: Union[int,None] = None
+    ):
+        '''
+        Start in a thread pool immediately.
+
+        join() must be called afterwards at some point.
+
+        :param func: main work function to be evaluated.
+        :param handle_output: called on func return values as they
+            are returned.
+
+            Signature is: handle_output(input, output, exception) where:
+
+            - input: input given to func
+            - output: return value of func
+            - exception: the exception that func raised, or None otherwise
+
+            If this function returns non-None or raises, stop feeding
+            new input and exit ASAP when all currently running threads
+            have finished.
+
+            Default: a handler that does nothing and just exits on exception.
+        :param nthreads: number of threads to use. Default: nproc.
+        '''
+        self.func = func
+        if handle_output is None:
+            handle_output = lambda input, output, exception: exception
+        self.handle_output = handle_output
+        if nthreads is None:
+            nthreads = multiprocessing.cpu_count()
+        self.nthreads = nthreads
+        self.error_output = None
+        self.error_output_lock = threading.Lock()
+        self.in_queue = queue.Queue(maxsize=nthreads)
+        self.threads = []
+        for i in range(self.nthreads):
+            thread = threading.Thread(
+                target=self._func_runner,
+            )
+            self.threads.append(thread)
+            thread.start()
+
+    def submit(self, work):
+        '''
+        Submit work. Block if there is already enough work scheduled (~nthreads).
+
+        :return: if an error occurred in some previously executed thread, the error.
+                 Otherwise, None. This allows the caller to stop submitting further
+                 work if desired.
+        '''
+        self.in_queue.put(work)
+        return self.error_output
+
+    def join(self):
+        '''
+        Request all threads to stop after they finish currently submitted work.
+
+        :return: same as submit()
+        '''
+        for thread in range(self.nthreads):
+            self.in_queue.put(None)
+        for thread in self.threads:
+            thread.join()
+        return self.error_output
+
+    def _func_runner(self):
+        while True:
+            work = self.in_queue.get(block=True)
+            if work is None:
+                break
+            # Reset per work item: if func raises, handle_output must receive
+            # out=None instead of an unbound name or a stale previous result.
+            out = exception = None
+            try:
+                out = self.func(**work)
+            except Exception as e:
+                exception = e
+            try:
+                handle_output_return = self.handle_output(work, out, exception)
+            except Exception as e:
+                with self.error_output_lock:
+                    self.error_output = (work, out, e)
+            else:
+                if handle_output_return is not None:
+                    with self.error_output_lock:
+                        self.error_output = handle_output_return
+            finally:
+                self.in_queue.task_done()
+
+if __name__ == '__main__':
+    def my_func(i):
+        '''
+        The main function that will be evaluated.
+
+        It sleeps to simulate an IO operation.
+        '''
+        time.sleep((abs(i) % 4) / 10.0)
+        return 10.0 / i
+
+    def get_work(min_, max_):
+        '''
+        Generate simple range work for my_func.
+        '''
+        for i in range(min_, max_):
+            yield {'i': i}
+
+    def handle_output_print(input, output, exception):
+        '''
+        Print outputs and exit immediately on failure.
+        '''
+        print('{!r} {!r} {!r}'.format(input, output, exception))
+        return exception
+
+    def handle_output_print_no_exit(input, output, exception):
+        '''
+        Print outputs, don't exit on failure.
+        '''
+        print('{!r} {!r} {!r}'.format(input, output, exception))
+
+    out_queue = queue.Queue()
+    def handle_output_queue(input, output, exception):
+        '''
+        Store outputs in a queue for later usage.
+        '''
+        global out_queue
+        out_queue.put((input, output, exception))
+        return exception
+
+    def handle_output_raise(input, output, exception):
+        '''
+        Raise if input == 10, to test that execution
+        stops nicely if this raises.
+        '''
+        print('{!r} {!r} {!r}'.format(input, output, exception))
+        if input['i'] == 10:
+            raise Exception
+
+    # CLI arguments.
+    argv_len = len(sys.argv)
+    if argv_len > 1:
+        nthreads = int(sys.argv[1])
+        if nthreads == 0:
+            nthreads = None
+    else:
+        nthreads = None
+    if argv_len > 2:
+        min_ = int(sys.argv[2])
+    else:
+        min_ = 1
+    if argv_len > 3:
+        max_ = int(sys.argv[3])
+    else:
+        max_ = 100
+    if argv_len > 4:
+        c = sys.argv[4][0]
+    else:
+        c = '0'
+    if c == '1':
+        handle_output = handle_output_print_no_exit
+    elif c == '2':
+        handle_output = handle_output_queue
+    elif c == '3':
+        handle_output = handle_output_raise
+    else:
+        handle_output = handle_output_print
+
+    # Action.
+    thread_pool = ThreadPool(
+        my_func,
+        handle_output,
+        nthreads
+    )
+    for work in get_work(min_, max_):
+        error = thread_pool.submit(work)
+        if error is not None:
+            break
+    error = thread_pool.join()
+    if error is not None:
+        print('error: {!r}'.format(error))
+    if handle_output == handle_output_queue:
+        while not out_queue.empty():
+            print(out_queue.get())
--- a/userland/arch/aarch64/add.S
+++ b/userland/arch/aarch64/add.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    add x1, x0, 2
+    ASSERT_EQ(x1, 3)
+EXIT
--- a/userland/arch/aarch64/adr.S
+++ b/userland/arch/aarch64/adr.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+.data
+data_label:
+    .word 0x1234678
+ENTRY
+    /* This is not possible in v7 because the label is in another section.
+     * objdump says that this generates an R_AARCH64_ADR_PREL_LO21 relocation,
+     * which looks specific to ADR, and therefore makes it more likely
+     * that there was no such relocation in v7.
+     *
+     * This relocation is particularly important because str does not have a
+     * pc-relative mode in ARMv8.
+     */
+    adr x0, data_label
+    ldr x1, =data_label
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/adrp.S
+++ b/userland/arch/aarch64/adrp.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+ENTRY
+    adrp x0, label
+    adr x1, label
+label:
+    /* Clear the lower 12 bits. */
+    bic x1, x1, 0xFF
+    bic x1, x1, 0xF00
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/asm_hello.c
+++ b/userland/arch/aarch64/asm_hello.c
@@ -1,13 +0,0 @@
-#include <assert.h>
-#include <inttypes.h>
-
-int main(void) {
-    uint32_t myvar = 1;
-    __asm__ (
-        "add %[myvar], %[myvar], 1;"
-        : [myvar] "=r" (myvar)
-        :
-        :
-    );
-    assert(myvar == 2);
-}
--- a/userland/arch/aarch64/beq.S
+++ b/userland/arch/aarch64/beq.S
@@ -0,0 +1,33 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+    /* cbz == 0 */
+    mov x0, 0
+    cbz x0, 1f
+    FAIL
+1:
+
+    /* cbz != 0 */
+    mov x0, 1
+    cbz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+
+    /* cbnz != 0 */
+    mov x0, 1
+    cbnz x0, 1f
+    FAIL
+1:
+
+    /* cbnz == 0 */
+    mov x0, 0
+    cbnz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+EXIT
--- a/userland/arch/aarch64/bfi.S
+++ b/userland/arch/aarch64/bfi.S
@@ -0,0 +1,11 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    bfi x1, x0, 16, 32
+    ASSERT_EQ(x1, 0xFFFF55667788FFFF)
+EXIT
--- a/userland/arch/aarch64/c/asm_from_c.c
+++ b/userland/arch/aarch64/c/asm_from_c.c
@@ -0,0 +1,39 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
+
+#include <assert.h>
+#include <inttypes.h>
+
+uint64_t my_asm_func(void);
+/* { return 42; } */
+__asm__(
+    ".global my_asm_func;"
+    "my_asm_func:"
+    "mov x0, 42;"
+    "ret;"
+);
+
+/* Now a more complex example that also calls a C function.
+ * We have to save the return address held in x30 for later because bl overwrites it.
+ * https://stackoverflow.com/questions/27941220/push-lr-and-pop-lr-in-arm-arch64/34504752#34504752
+ * We are not modifying any other callee saved register in this function,
+ * and my_c_func is not either (unless GCC has a bug ;-)), so everything else is fine.
+ */
+uint64_t my_asm_func_2(void);
+/* { return my_c_func(); } */
+__asm__(
+    ".global my_asm_func_2;"
+    "my_asm_func_2:"
+    "str x30, [sp, -16]!;"
+    "bl my_c_func;"
+    "ldr x30, [sp], 16;"
+    "ret;"
+);
+
+uint64_t my_c_func(void) {
+    return 42;
+}
+
+int main(void) {
+    assert(my_asm_func() == 42);
+    assert(my_asm_func_2() == 42);
+}
--- a/userland/arch/aarch64/c/build
+++ b/userland/arch/aarch64/c/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/earlyclobber.c
+++ b/userland/arch/aarch64/c/earlyclobber.c
@@ -0,0 +1,21 @@
+/* An example of using the '&' earlyclobber modifier.
+ * https://stackoverflow.com/questions/15819794/when-to-use-earlyclobber-constraint-in-extended-gcc-inline-assembly/54853663#54853663
+ * The assertion may fail without it. It actually does fail in GCC 8.2.0 at
+ * 34017bcd0bc96a3cf77f6acba4d58350e67c2694 + 1.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t in = 1;
+    uint64_t out;
+    __asm__ (
+        "add %[out], %[in], 1;"
+        "add %[out], %[in], 1;"
+        : [out] "=&r" (out)
+        : [in] "r" (in)
+        :
+    );
+    assert(out == 2);
+}
--- a/userland/arch/aarch64/c/freestanding/build
+++ b/userland/arch/aarch64/c/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/freestanding/hello.c
+++ b/userland/arch/aarch64/c/freestanding/hello.c
@@ -0,0 +1,37 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#freestanding-linux-inline-assembly-system-calls */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write: sizeof(msg) - 1 leaves out the string terminator, so only "hello\n" is printed. */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
+        register char *x1 __asm__ ("x1") = msg;
+        register uint64_t x2 __asm__ ("x2") = sizeof(msg) - 1;
+        register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x1), "r" (x2), "r" (x8)
+            : "memory"
+        );
+        syscall_return = x0;
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit: status is 0 only if write reported the full message length. */
+    {
+        register uint64_t x0 __asm__ ("x0") = exit_status;
+        register uint64_t x8 __asm__ ("x8") = 93; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x8)
+            :
+        );
+    }
+}
--- a/userland/arch/aarch64/c/freestanding/hello_clobbers.c
+++ b/userland/arch/aarch64/c/freestanding/hello_clobbers.c
@@ -0,0 +1,40 @@
+/* Like hello.c trying to do it without named register variables.
+ * The code is more complicated, and I was not able to get as efficient,
+ * so better just stick to named register variables.
+ */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write: sizeof(msg) - 1 leaves out the string terminator, so only "hello\n" is printed. */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        __asm__ (
+            "mov x0, 1;" /* stdout */
+            "mov x1, %[msg];"
+            "mov x2, %[len];"
+            "mov x8, 64;" /* syscall number */
+            "svc 0;"
+            "mov %[syscall_return], x0;"
+            : [syscall_return] "=r" (syscall_return)
+            : [msg] "p" (msg),
+            [len] "i" (sizeof(msg) - 1)
+            : "x0", "x1", "x2", "x8", "memory"
+        );
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit: status is 0 only if write reported the full message length. */
+    __asm__ (
+        "mov x0, %[exit_status];"
+        "mov x8, 93;" /* syscall number */
+        "svc 0;"
+        :
+        : [exit_status] "r" (exit_status)
+        : "x0", "x8"
+    );
+}
+
--- a/userland/arch/aarch64/c/inc.c
+++ b/userland/arch/aarch64/c/inc.c
@@ -0,0 +1,13 @@
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 1;
+    __asm__ (
+        "add %[io], %[io], 1;"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
--- a/userland/arch/aarch64/c/inc_float.c
+++ b/userland/arch/aarch64/c/inc_float.c
@@ -0,0 +1,28 @@
+/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly
+ *
+ * We use the undocumented %s and %d modifiers!
+ */
+
+#include <assert.h>
+
+int main(void) {
+    float my_float = 1.5;
+    __asm__ (
+        "fmov s0, 1.0;"
+        "fadd %s[my_float], %s[my_float], s0;"
+        : [my_float] "+w" (my_float)
+        :
+        : "s0"
+    );
+    assert(my_float == 2.5);
+
+    double my_double = 1.5;
+    __asm__ (
+        "fmov d0, 1.0;"
+        "fadd %d[my_double], %d[my_double], d0;"
+        : [my_double] "+w" (my_double)
+        :
+        : "d0"
+    );
+    assert(my_double == 2.5);
+}
--- a/userland/arch/aarch64/c/multiline.cpp
+++ b/userland/arch/aarch64/c/multiline.cpp
@@ -0,0 +1,18 @@
+// https://stackoverflow.com/questions/3666013/how-to-write-multiline-inline-assembly-code-in-gcc-c/54575948#54575948
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 0;
+    __asm__ (
+        R"(
+add %[io], %[io], #1
+add %[io], %[io], #1
+)"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
--- a/userland/arch/aarch64/c/reg_var.c
+++ b/userland/arch/aarch64/c/reg_var.c
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register uint32_t x0 __asm__ ("x0");
+    register uint32_t x1 __asm__ ("x1");
+    uint32_t new_x0;
+    uint32_t new_x1;
+    {
+        x0 = 1;
+        x1 = 2;
+        __asm__ (
+            "add %[x0], x0, #1;"
+            "add %[x1], x1, #1;"
+            : [x0] "+r" (x0),
+              [x1] "+r" (x1)
+            :
+            :
+        );
+        new_x0 = x0;
+        new_x1 = x1;
+    }
+    assert(new_x0 == 2);
+    assert(new_x1 == 3);
+}
--- a/userland/arch/aarch64/c/reg_var_float.c
+++ b/userland/arch/aarch64/c/reg_var_float.c
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register double d0 __asm__ ("d0");
+    register double d1 __asm__ ("d1");
+    double new_d0;
+    double new_d1;
+    {
+        d0 = 1.5;
+        d1 = 2.5;
+        __asm__ (
+            "fmov d2, 1.5;"
+            "fadd %d[d0], d0, d2;"
+            "fadd %d[d1], d1, d2;"
+            : [d0] "+w" (d0),
+              [d1] "+w" (d1)
+            :
+            : "d2"
+        );
+        new_d0 = d0;
+        new_d1 = d1;
+    }
+    assert(new_d0 == 3.0);
+    assert(new_d1 == 4.0);
+}
--- a/userland/arch/aarch64/cbz.S
+++ b/userland/arch/aarch64/cbz.S
@@ -0,0 +1,19 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+
+    /* Branch. */
+    mov x0, 0x0
+    cbz x0, ok
+    FAIL
+ok:
+
+    /* Don't branch. */
+    mov x0, 0x1
+    cbz x0, ko
+
+EXIT
+ko:
+    FAIL
--- a/userland/arch/aarch64/comments.S
+++ b/userland/arch/aarch64/comments.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
+
+#include "common.h"
+ENTRY
+    # mycomment
+
+    /* ARMv8 uses // instead of @ as the comment marker. */
+    // mycomment
+    nop // mycomment
+
+    /* All these fail. Lol, different than v7, no consistency. */
+#if 0
+    nop # mycomment
+    @ mycomment
+    nop @ mycomment
+#endif
+EXIT
--- a/userland/arch/aarch64/common_arch.h
+++ b/userland/arch/aarch64/common_arch.h
@@ -0,0 +1,64 @@
+#ifndef COMMON_ARCH_H
+#define COMMON_ARCH_H
+
+#define ASSERT_EQ(reg, const) /* Compare reg with const and ASSERT equality; overwrites x11. */ \
+    ldr x11, =const; \
+	cmp reg, x11; \
+	ASSERT(beq); \
+;
+
+#define ASSERT_MEMCMP(s1, s2, n) /* ASSERT that the memcmp of labels s1 and s2 over n bytes left 0 in x0. */ \
+	MEMCMP(s1, s2, n); \
+	ASSERT_EQ(x0, 0); \
+;
+
+#define ENTRY /* Open asm_main in .text and spill registers into a 0xA0 byte stack frame. */ \
+.text; \
+.global asm_main; \
+asm_main: \
+    sub  sp, sp, 0xA0; \
+    stp  x29, x30, [sp]; \
+    stp  x27, x28, [sp, 0x10]; \
+    stp  x25, x26, [sp, 0x20]; \
+    stp  x23, x24, [sp, 0x30]; \
+    stp  x21, x22, [sp, 0x40]; \
+    stp  x19, x20, [sp, 0x50]; \
+    stp  x6, x7, [sp, 0x60]; \
+    stp  x4, x5, [sp, 0x70]; \
+    stp  x2, x3, [sp, 0x80]; \
+    stp  x0, x1, [sp, 0x90]; /* the fail path of EXIT reloads the incoming x0 from this slot */ \
+asm_main_after_prologue: \
+;
+
+#define EXIT /* Pass path returns 0 in w0; fail path writes w0 through the saved x0 and returns 1. */ \
+    mov w0, 0; \
+    mov w1, 0; \
+    b pass; \
+fail: \
+    ldr x1, [sp, 0x90]; /* x1 = value x0 had on entry, as stored by ENTRY */ \
+    str w0, [x1]; /* w0 holds __LINE__ when arriving from FAIL */ \
+    mov w0, 1; \
+pass: \
+    ldp x19, x20, [sp, 0x50]; /* reload x19-x30 only; the x0-x7 slots are not restored */ \
+    ldp x21, x22, [sp, 0x40]; \
+    ldp x23, x24, [sp, 0x30]; \
+    ldp x25, x26, [sp, 0x20]; \
+    ldp x27, x28, [sp, 0x10]; \
+    ldp x29, x30, [sp]; \
+    add sp, sp, 0xA0; \
+    ret; \
+;
+
+#define FAIL /* Load the current source line into w0 and branch to the fail label of EXIT. */ \
+    ldr w0, =__LINE__; \
+    b fail; \
+;
+
+#define MEMCMP(s1, s2, n) /* Call memcmp(&s1, &s2, n); overwrites x0-x2 and x30. */ \
+    adr x0, s1; \
+    adr x1, s2; \
+    ldr x2, =n; \
+    bl memcmp; \
+;
+
+#endif
--- a/userland/arch/aarch64/cset.S
+++ b/userland/arch/aarch64/cset.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cset */
+
+#include "common.h"
+
+ENTRY
+    /* Test values. */
+    mov x0, 0
+    mov x1, 1
+
+    /* eq is true, set x2 = 1. */
+    cmp x0, x0
+    cset x2, eq
+    ASSERT_EQ(x2, 1)
+
+    /* eq is false, set x2 = 0. */
+    cmp x0, x1
+    cset x2, eq
+    ASSERT_EQ(x2, 0)
+
+    /* Same for ne. */
+    cmp x0, x0
+    cset x2, ne
+    ASSERT_EQ(x2, 0)
+
+    cmp x0, x1
+    cset x2, ne
+    ASSERT_EQ(x2, 1)
+EXIT
--- a/userland/arch/aarch64/empty.S
+++ b/userland/arch/aarch64/empty.S
@@ -0,0 +1 @@
+../empty.S
--- a/userland/arch/aarch64/fail.S
+++ b/userland/arch/aarch64/fail.S
@@ -0,0 +1 @@
+../fail.S
--- a/userland/arch/aarch64/floating_point.S
+++ b/userland/arch/aarch64/floating_point.S
@@ -0,0 +1,60 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 1.5 + 2.5 == 4.0
+     * using 64-bit double immediates.
+     */
+    fmov d0, 1.5
+    fmov d1, 2.5
+    fadd d2, d0, d1
+    fmov d3, 4.0
+    /* Unlike VFP vcmp, this stores the status
+     * automatically in the main NZCV condition flags.
+     */
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now with a memory stored value. */
+.data
+my_double_0:
+    .double 1.5
+my_double_1:
+    .double 2.5
+my_double_sum_expect:
+    .double 4.0
+.text
+    ldr d0, my_double_0
+    ldr d1, my_double_1
+    fadd d2, d0, d1
+    ldr d3, my_double_sum_expect
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now in 32-bit. */
+    fmov s0, 1.5
+    fmov s1, 2.5
+    fadd s2, s0, s1
+    fmov s3, 4.0
+    fcmp s2, s3
+    ASSERT(beq)
+
+    /* TODO why? What's the point of q then?
+     * Error: operand mismatch -- `fmov q0,1.5'
+     */
+#if 0
+    fmov q0, 1.5
+#endif
+
+    /* Much like integers, immediates are constrained to
+     * fit in 32-bit instructions. TODO exact rules.
+     *
+     * Assembly here would fail with:
+     *
+     * Error: invalid floating-point constant at operand 2
+     */
+#if 0
+    fmov d0, 1.23456798
+#endif
+EXIT
--- a/userland/arch/aarch64/freestanding/build
+++ b/userland/arch/aarch64/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/freestanding/hello.S
+++ b/userland/arch/aarch64/freestanding/hello.S
@@ -0,0 +1,20 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
+
+.text
+.global _start
+_start:
+asm_main_after_prologue:
+    /* write */
+    mov x0, 1     /* stdout */
+    adr x1, msg   /* buffer */
+    ldr x2, =len  /* len */
+    mov x8, 64    /* syscall number */
+    svc 0
+
+    /* exit */
+    mov x0, 0     /* exit status */
+    mov x8, 93    /* syscall number */
+    svc 0
+msg:
+    .ascii "hello\n"
+len = . - msg
--- a/userland/arch/aarch64/hello_driver.S
+++ b/userland/arch/aarch64/hello_driver.S
@@ -0,0 +1,6 @@
+.text
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+    mov w0, 0
+    ret
--- a/userland/arch/aarch64/immediates.S
+++ b/userland/arch/aarch64/immediates.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
+
+#include "common.h"
+ENTRY
+    mov x0, 1
+    mov x0, 0x1
+    mov x0, 1
+    mov x0, 0x1
+EXIT
--- a/userland/arch/aarch64/movk.S
+++ b/userland/arch/aarch64/movk.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movk */
+
+#include "common.h"
+
+ENTRY
+    movk x0, 0x4444, lsl 0
+    movk x0, 0x3333, lsl 16
+    movk x0, 0x2222, lsl 32
+    movk x0, 0x1111, lsl 48
+    ASSERT_EQ(x0, 0x1111222233334444)
+
+    /* Set a label (addresses are 48-bit) with immediates:
+     *
+     * * https://stackoverflow.com/questions/38570495/aarch64-relocation-prefixes
+     * * https://sourceware.org/binutils/docs-2.26/as/AArch64_002dRelocations.html
+     *
+     * This could be used if the label is too far away for
+     * adr relative addressing.
+     */
+    movz x0, :abs_g2:label     /* bits 32-47, overflow check */
+    movk x0, :abs_g1_nc:label  /* bits 16-31, no overflow check */
+    movk x0, :abs_g0_nc:label  /* bits  0-15, no overflow check */
+    adr x1, label
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/movn.S
+++ b/userland/arch/aarch64/movn.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movn */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x123456789ABCDEF0
+    movn x0, 0x8888, lsl 16
+    ASSERT_EQ(x0, 0xFFFFFFFF7777FFFF)
+EXIT
--- a/userland/arch/aarch64/pc.S
+++ b/userland/arch/aarch64/pc.S
@@ -0,0 +1,78 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
+
+#include "common.h"
+
+ENTRY
+#if 0
+    /* Unlike v7, we can't use PC like any other register in ARMv8,
+     * since it is not a general purpose register anymore.
+     *
+     * Only branch instructions can modify the PC.
+     *
+     * B1.2.1 "Registers in AArch64 state" says:
+     *
+     * Software cannot write directly to the PC. It
+     * can only be updated on a branch, exception entry or
+     * exception return.
+     */
+    ldr pc, =10f
+    FAIL
+10:
+#endif
+#if 0
+    mov x0, pc
+#endif
+
+    /* LDR PC-relative loads exist in ARMv8, but they have a separate encoding
+     * "LDR (literal)" instead of "LDR (immediate)":
+     * https://stackoverflow.com/questions/28638981/howto-write-pc-relative-adressing-on-arm-asm/54480999#54480999
+     */
+    ldr x0, pc_relative_ldr
+    b 1f
+pc_relative_ldr:
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Just for fun, we can also use relative numbers instead of labels.
+     * https://reverseengineering.stackexchange.com/questions/17666/how-does-the-ldr-instruction-work-on-arm/20567#20567
+     */
+    ldr x0, 0x8
+    b 1f
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Analogous for b with PC. */
+    mov x0, 0
+    /* Jumps over mov to ASSERT_EQ. */
+    b 8
+    mov x0, 1
+    ASSERT_EQ(x0, 0)
+
+    /* Trying to use the old "LDR (immediate)" PC-relative
+     * syntax does not work.
+     */
+#if 0
+    /* 64-bit integer or SP register expected at operand 2 -- `ldr x0,[pc]' */
+    ldr x0, [pc]
+#endif
+
+    /* There is however no analogue for str. TODO rationale? */
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,pc_relative_str' */
+    str x0, pc_relative_str
+#endif
+
+    /* You just have to use adr + "STR (register)". */
+    ldr x0, pc_relative_str
+    ASSERT_EQ(x0, 0x0)
+    adr x1, pc_relative_str
+    ldr x0, pc_relative_ldr
+    str x0, [x1]
+    ldr x0, pc_relative_str
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+EXIT
+.data
+pc_relative_str:
+    .quad 0x0000000000000000
--- a/userland/arch/aarch64/regs.S
+++ b/userland/arch/aarch64/regs.S
@@ -0,0 +1,47 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-registers */
+
+#include "common.h"
+
+ENTRY
+
+    /* 31 64-bit eXtended general purpose registers. */
+    mov x0, 0
+    mov x1, 1
+    mov x2, 2
+    mov x3, 3
+    mov x4, 4
+    mov x5, 5
+    mov x6, 6
+    mov x7, 7
+    mov x8, 8
+    mov x9, 9
+    mov x10, 10
+    mov x11, 11
+    mov x12, 12
+    mov x13, 13
+    mov x14, 14
+    mov x15, 15
+    mov x16, 16
+    mov x17, 17
+    mov x18, 18
+    mov x19, 19
+    mov x20, 20
+    mov x21, 21
+    mov x22, 22
+    mov x23, 23
+    mov x24, 24
+    mov x25, 25
+    mov x26, 26
+    mov x27, 27
+    mov x28, 28
+    mov x29, 29
+
+    /* x30 is the link register. BL stores the return address here. */
+    /*mov x30, 30*/
+
+    /* W form addresses the lower 4 bytes word, and zeroes the top. */
+    ldr x0, =0x1111222233334444
+    ldr x1, =0x5555666677778888
+    mov w0, w1
+    ASSERT_EQ(x0, 0x0000000077778888)
+EXIT
--- a/userland/arch/aarch64/ret.S
+++ b/userland/arch/aarch64/ret.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    bl inc
+    ASSERT_EQ(x0, 2)
+    bl inc2
+    ASSERT_EQ(x0, 3)
+    bl inc3
+    ASSERT_EQ(x0, 4)
+EXIT
+
+/* uint64_t inc(uint64_t i) { return i + 1; } */
+inc:
+    add x0, x0, 1
+    ret
+
+/* Same but explicit return register. */
+inc2:
+    add x0, x0, 1
+    ret x30
+
+/* Same but with br. */
+inc3:
+    add x0, x0, 1
+    br x30
--- a/userland/arch/aarch64/simd.S
+++ b/userland/arch/aarch64/simd.S
@@ -0,0 +1,86 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 4x 32-bit integer add.
+     *
+     * s stands for single == 32 bits.
+     *
+     * 1 in ld1 means to load just one register, see:
+     * https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving
+     */
+.data
+    u32_0:          .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
+    u32_1:          .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
+    u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
+.bss
+    u32_sum:        .skip 16
+.text
+    adr x0, u32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, u32_1
+    ld1 {v1.4s}, [x1]
+    add v2.4s, v0.4s, v1.4s
+    adr x0, u32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
+
+    /* 2x 64-bit integer add.
+     *
+     * d stands for double == 64 bits.
+     */
+.data
+    u64_0:          .quad 0xF1111111F1111111, 0xF2222222F2222222
+    u64_1:          .quad 0x1555555515555555, 0x1666666616666666
+    u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
+.bss
+    u64_sum: .skip 16
+.text
+    adr x0, u64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, u64_1
+    ld1 {v1.2d}, [x1]
+    add v2.2d, v0.2d, v1.2d
+    adr x0, u64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
+
+    /* 4x 32-bit float add.
+     *
+     * The only difference between the integer point version
+     * is that we use fadd instead of add.
+     */
+.data
+    f32_0:          .float 1.5, 2.5,  3.5,  4.5
+    f32_1:          .float 5.5, 6.5,  7.5,  8.5
+    f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
+.bss
+    f32_sum: .skip 16
+.text
+    adr x0, f32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, f32_1
+    ld1 {v1.4s}, [x1]
+    fadd v2.4s, v0.4s, v1.4s
+    adr x0, f32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
+
+    /* 2x 64-bit float add. */
+.data
+    f64_0:          .double 1.5, 2.5
+    f64_1:          .double 5.5, 6.5
+    f64_sum_expect: .double 7.0, 9.0
+.bss
+    f64_sum: .skip 16
+.text
+    adr x0, f64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, f64_1
+    ld1 {v1.2d}, [x1]
+    fadd v2.2d, v0.2d, v1.2d
+    adr x0, f64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/simd_interleave.S
+++ b/userland/arch/aarch64/simd_interleave.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving */
+
+#include "common.h"
+
+ENTRY
+.data
+    u32_interleave: .word \
+        0x11111111, 0x55555555, \
+        0x22222222, 0x66666666, \
+        0x33333333, 0x77777777, \
+        0x44444444, 0x88888888
+    u32_interleave_sum_expect: .word \
+        0x66666666, \
+        0x88888888, \
+        0xAAAAAAAA, \
+        0xCCCCCCCC
+.bss
+    u32_interleave_sum: .skip 16
+.text
+    adr x0, u32_interleave
+    ld2 {v0.4s, v1.4s}, [x0]
+    add v2.4s, v0.4s, v1.4s
+    adr x0, u32_interleave_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_interleave_sum, u32_interleave_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/str.S
+++ b/userland/arch/aarch64/str.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-str */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, myvar
+    ASSERT_EQ(x0, 0x12346789ABCDEF0)
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,myvar' */
+    str x0, myvar
+#endif
+EXIT
+    myvar: .quad 0x12346789ABCDEF0
--- a/userland/arch/aarch64/ubfm.S
+++ b/userland/arch/aarch64/ubfm.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfm */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    // lsr alias: imms == 63
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 16, 63
+    ASSERT_EQ(x1, 0x0000112233445566)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 32, 63
+    ASSERT_EQ(x1, 0x0000000011223344)
+EXIT
--- a/userland/arch/aarch64/ubfx.S
+++ b/userland/arch/aarch64/ubfx.S
@@ -0,0 +1,15 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfx */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 16
+    ASSERT_EQ(x1, 0x0000000000006677)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 32
+    ASSERT_EQ(x1, 0x0000000044556677)
+EXIT
--- a/userland/arch/aarch64/x31.S
+++ b/userland/arch/aarch64/x31.S
@@ -0,0 +1,51 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#x31 */
+
+#include "common.h"
+
+ENTRY
+    /* ERROR: can never use the name x31. */
+#if 0
+    mov x31, 31
+#endif
+
+    /* mov (register) is an alias for ORR, which accepts xzr. */
+    mov x0, 1
+    mov x0, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* Same encoding as the mov version. */
+    mov x0, 1
+    orr x0, xzr, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* So, orr, which is not an alias, can only take xzr, not sp. */
+#if 0
+    orr sp, sp, sp
+#endif
+
+    /* Zero register discards result if written to. */
+    mov x0, 1
+    orr xzr, x0, x0
+    ASSERT_EQ(xzr, 0)
+
+    /* MOV (to/from SP) is an alias for ADD (immediate). */
+    mov x0, sp
+    mov sp, 1
+    /* Alias to add. */
+    mov x1, sp
+    /* Exact same encoding as above. */
+    add x1, sp, 0
+    ASSERT_EQ(x1, 1)
+    mov sp, x0
+
+    /* So, ADD (immediate), which is not an alias, can only take sp, not xzr. */
+#if 0
+    /* Error: integer register expected in the extended/shifted operand register at operand 3 -- `add xzr,xzr,1' */
+    add xzr, xzr, 1
+#endif
+
+    /* Note however that ADD (register), unlike ADD (immediate),
+     * does not say anything about SP, and so does accept xzr just fine.
+     */
+    add xzr, xzr, xzr
+EXIT
--- a/userland/arch/arm/add.S
+++ b/userland/arch/arm/add.S
@@ -0,0 +1,58 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+
+    /* Immediate encoding.
+     *
+     * r1 = r0 + 2
+     */
+    mov r0, 1
+    /* r1 = r0 + 2 */
+    add r1, r0, 2
+    ASSERT_EQ(r1, 3)
+
+    /* If src == dest, we can omit one of them.
+     *
+     * r0 = r0 + 2
+     */
+    mov r0, 1
+    add r0, 2
+    ASSERT_EQ(r0, 3)
+
+    /* Same as above but explicit. */
+    mov r0, 1
+    add r0, r0, 2
+    ASSERT_EQ(r0, 3)
+
+#if 0
+    /* But we cannot omit the register if there is a shift when using .syntax unified:
+     * https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes
+     */
+    .syntax unified
+    /* Error: garbage following instruction */
+    add r0, r1, lsl 1
+    /* OK */
+    add r0, r0, r1, lsl 1
+#endif
+
+    /* Register encoding.
+     *
+     * r2 = r0 + r1
+     */
+    mov r0, 1
+    mov r1, 2
+    add r2, r0, r1
+    ASSERT_EQ(r2, 3)
+
+    /* Register encoding, omit implicit register.
+     *
+     * r1 = r1 + r0
+     */
+    mov r0, 1
+    mov r1, 2
+    add r1, r0
+    ASSERT_EQ(r1, 3)
+
+EXIT
--- a/userland/arch/arm/address_modes.S
+++ b/userland/arch/arm/address_modes.S
@@ -0,0 +1,51 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#addressing-modes */
+
+#include "common.h"
+
+ENTRY
+
+    /* Offset mode with immediate. Add 4 to the address register, which ends up
+     * reading myvar2 instead of myvar.
+     */
+    adr r0, myvar
+    ldr r1, [r0, 4]
+    ASSERT_EQ(r1, 0x9ABCDEF0)
+    /* r0 was not modified. */
+    ASSERT_EQ(r0, myvar)
+
+    /* Pre-indexed mode */
+    adr r0, myvar
+    ldr r1, [r0, 4]!
+    ASSERT_EQ(r1, 0x9ABCDEF0)
+    /* r0 was modified. */
+    ASSERT_EQ(r0, myvar2)
+
+    /* Post-indexed mode */
+    adr r0, myvar
+    ldr r1, [r0], 4
+    ASSERT_EQ(r1, 0x12345678)
+    /* r0 was modified. */
+    ASSERT_EQ(r0, myvar2)
+
+    /* Offset in register. */
+    adr r0, myvar
+    mov r1, 4
+    ldr r2, [r0, r1]
+    ASSERT_EQ(r2, 0x9ABCDEF0)
+
+    /* Offset in shifted register:
+     * r2 =
+     * *(r0 + (r1 << 1))
+     * == *(myvar + (2 << 1))
+     * == *(myvar + 4)
+     */
+    adr r0, myvar
+    mov r1, 2
+    ldr r2, [r0, r1, lsl 1]
+    ASSERT_EQ(r2, 0x9ABCDEF0)
+
+EXIT
+myvar:
+    .word 0x12345678
+myvar2:
+    .word 0x9ABCDEF0
--- a/userland/arch/arm/adr.S
+++ b/userland/arch/arm/adr.S
@@ -0,0 +1,33 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+.data
+data_label:
+    .word 0x1234678
+ENTRY
+    adr r0, label
+    /* ldr = pseudo-instruction: objdump tells us that this uses the
+     * literal pool, it does not get converted to adr, which is the
+     * better alternative here.
+     */
+    ldr r1, =label
+    adrl r2, label
+label:
+    ASSERT_EQ_REG(r0, r1)
+    ASSERT_EQ_REG(r0, r2)
+
+#if 0
+    /* Error: symbol .data is in a different section.
+     *
+     * It works however in ARMv8.
+     * I think this means that there is no relocation type
+     * that takes care of this encoding in ARMv7, but there
+     * is one in ARMv8.
+     *
+     * If you have no idea what I'm talking about, read this:
+     * https://stackoverflow.com/questions/3322911/what-do-linkers-do/33690144#33690144
+     */
+    adr r1, data_label
+#endif
+EXIT
--- a/userland/arch/arm/and.S
+++ b/userland/arch/arm/and.S
@@ -0,0 +1,27 @@
+/* Bitwise AND. */
+
+#include "common.h"
+
+ENTRY
+
+    /* 0x00 & 0xFF == 0x00 */
+    mov r0, 0x00
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0x00)
+
+    /* 0x0F & 0xF0 == 0x00 */
+    mov r0, 0x0F
+    and r0, 0xF0
+    ASSERT_EQ(r0, 0x00)
+
+    /* 0x0F & 0xFF == 0x0F */
+    mov r0, 0x0F
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0x0F)
+
+    /* 0xF0 & 0xFF == 0xF0 */
+    mov r0, 0xF0
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0xF0)
+
+EXIT
--- a/userland/arch/arm/b.S
+++ b/userland/arch/arm/b.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#b */
+
+#include "common.h"
+ENTRY
+    /* Jump over the fail. 26-bit PC-relative. */
+    b ok
+    FAIL
+ok:
+EXIT
--- a/userland/arch/arm/beq.S
+++ b/userland/arch/arm/beq.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#beq */
+
+#include "common.h"
+
+ENTRY
+
+    /* Smaller. */
+    mov r0, 1
+    cmp r0, 2
+    ASSERT(ble)
+    ASSERT(blt)
+    ASSERT(bne)
+
+    /* Equal. */
+    mov r1, 0
+    cmp r1, 0
+    ASSERT(beq)
+    ASSERT(bge)
+    ASSERT(ble)
+
+    /* Greater. */
+    mov r0, 2
+    cmp r0, 1
+    ASSERT(bge)
+    ASSERT(bgt)
+    ASSERT(bne)
+
+EXIT
--- a/userland/arch/arm/bfi.S
+++ b/userland/arch/arm/bfi.S
@@ -0,0 +1,10 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
+
+#include "common.h"
+
+ENTRY
+    ldr r0, =0x11223344
+    ldr r1, =0xFFFFFFFF
+    bfi r1, r0, 8, 16
+    ASSERT_EQ(r1, 0xFF3344FF)
+EXIT
--- a/userland/arch/arm/bic.S
+++ b/userland/arch/arm/bic.S
@@ -0,0 +1,10 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bic */
+
+#include "common.h"
+
+ENTRY
+    /* 0x0F & ~0x55 == 0x0F & 0xAA == 0x0A */
+    mov r0, 0x0F
+    bic r0, 0x55
+    ASSERT_EQ(r0, 0x0A)
+EXIT
--- a/userland/arch/arm/bl.S
+++ b/userland/arch/arm/bl.S
@@ -0,0 +1,14 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
+
+#include "common.h"
+
+ENTRY
+    mov r0, 1
+    bl inc
+    ASSERT_EQ(r0, 2)
+EXIT
+
+/* int inc(int i) { return i + 1; } */
+inc:
+    add r0, 1
+    bx lr
--- a/userland/arch/arm/build
+++ b/userland/arch/arm/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/add.c
+++ b/userland/arch/arm/c/add.c
@@ -0,0 +1,17 @@
+/* 1 + 2 == 3 */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint32_t in0 = 1, in1 = 2, out;
+    __asm__ (
+        "add %[out], %[in0], %[in1];"
+        : [out] "=r" (out)
+        : [in0] "r"  (in0),
+          [in1] "r"  (in1)
+    );
+    assert(in0 == 1);
+    assert(in1 == 2);
+    assert(out == 3);
+}
--- a/userland/arch/arm/c/build
+++ b/userland/arch/arm/c/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/freestanding/build
+++ b/userland/arch/arm/c/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/freestanding/hello.c
+++ b/userland/arch/arm/c/freestanding/hello.c
@@ -0,0 +1,35 @@
+#include <inttypes.h>
+
+void _start(void) {
+    uint32_t exit_status;
+
+    /* write: sizeof(msg) - 1 leaves out the string terminator, so only "hello\n" is printed. */
+    {
+        char msg[] = "hello\n";
+        uint32_t syscall_return;
+        register uint32_t r0 __asm__ ("r0") = 1; /* stdout */
+        register char *r1 __asm__ ("r1") = msg;
+        register uint32_t r2 __asm__ ("r2") = sizeof(msg) - 1;
+        register uint32_t r7 __asm__ ("r7") = 4; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (r0)
+            : "r" (r1), "r" (r2), "r" (r7)
+            : "memory"
+        );
+        syscall_return = r0;
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit: status is 0 only if write reported the full message length. */
+    {
+        register uint32_t r0 __asm__ ("r0") = exit_status;
+        register uint32_t r7 __asm__ ("r7") = 1; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (r0)
+            : "r" (r7)
+            :
+        );
+    }
+}
--- a/userland/arch/arm/c/inc.c
+++ b/userland/arch/arm/c/inc.c
@@ -0,0 +1,15 @@
+/* Increment a variable in inline assembly. */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint32_t my_local_var = 1;
+    __asm__ (
+        "add %[my_local_var], %[my_local_var], #1;"
+        : [my_local_var] "+r" (my_local_var)
+        :
+        :
+    );
+    assert(my_local_var == 2);
+}
--- a/userland/arch/arm/c/inc_float.c
+++ b/userland/arch/arm/c/inc_float.c
@@ -0,0 +1,28 @@
+/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly */
+
+#include <assert.h>
+
+int main(void) {
+    float my_float = 1.5;
+    __asm__ (
+        "vmov s0, 1.0;"
+        "vadd.f32 %[my_float], %[my_float], s0;"
+        : [my_float] "+t" (my_float)
+        :
+        : "s0"
+    );
+    assert(my_float == 2.5);
+
+    /* Undocumented %P
+     * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89482
+     */
+    double my_double = 1.5;
+    __asm__ (
+        "vmov.f64 d0, 1.0;"
+        "vadd.f64 %P[my_double], %P[my_double], d0;"
+        : [my_double] "+w" (my_double)
+        :
+        : "d0"
+    );
+    assert(my_double == 2.5);
+}
--- a/userland/arch/arm/c/inc_memory.c
+++ b/userland/arch/arm/c/inc_memory.c
@@ -0,0 +1,32 @@
+/* Like inc.c but less good since we do more work ourselves.
+ *
+ * Just doing this to test out the "m" memory constraint.
+ *
+ * GCC 8.2.0 -O0 assembles ldr line to:
+ *
+ * ....
+ * ldr r0, [fp, #-12]
+ * ....
+ *
+ * and `-O3` assembles to:
+ *
+ * ....
+ * ldr r0, [sp]
+ * ....
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+/* Increments a local variable through a memory operand: the assembly
+ * does the load, add and store itself, using r0 as scratch.
+ */
+int main(void) {
+    uint32_t my_local_var = 1;
+    __asm__ (
+        "ldr r0, %[my_local_var];"
+        "add r0, r0, #1;"
+        "str r0, %[my_local_var];"
+        /* "+m": read-write memory operand, the compiler substitutes
+         * an addressing expression such as [sp] or [fp, #-12]. */
+        : [my_local_var] "+m" (my_local_var)
+        :
+        /* r0 is overwritten by the snippet. */
+        : "r0"
+    );
+    assert(my_local_var == 2);
+}
--- a/userland/arch/arm/c/inc_memory_global.c
+++ b/userland/arch/arm/c/inc_memory_global.c
@@ -0,0 +1,25 @@
+/* GCC 8.2.0 -O0 and -O3 assembles ldr line to:
+ *
+ * ....
+ * movw r3, #<lower address part>
+ * movt r3, #<higher address part>
+ * ldr r0, [r3]
+ * ....
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+/* Global incremented by main through an "m" memory operand. */
+uint32_t my_global_var = 1;
+
+/* Same as the local variable version, but the memory operand refers
+ * to a global, so the compiler first has to materialize its address.
+ */
+int main(void) {
+    __asm__ (
+        "ldr r0, %[my_global_var];"
+        "add r0, r0, #1;"
+        "str r0, %[my_global_var];"
+        : [my_global_var] "+m" (my_global_var)
+        :
+        /* r0 is used as scratch for the load / add / store. */
+        : "r0"
+    );
+    assert(my_global_var == 2);
+}
--- a/userland/arch/arm/c/reg_var.c
+++ b/userland/arch/arm/c/reg_var.c
@@ -0,0 +1,38 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+/* Pins two C variables to r0 and r1 with register variables and
+ * increments each of them from inline assembly.
+ */
+int main(void) {
+    register uint32_t r0 __asm__ ("r0");
+    register uint32_t r1 __asm__ ("r1");
+    uint32_t new_r0;
+    uint32_t new_r1;
+    {
+        /* We must set the registers immediately before calling,
+         * without making any function calls in between.
+         */
+        r0 = 1;
+        r1 = 2;
+        __asm__ (
+            /* We intentionally use an explicit r0 and r1 here,
+             * just to illustrate that we are certain that the
+             * r0 variable will go in r0. Real code would never do this.
+             */
+            "add %[r0], r0, #1;"
+            "add %[r1], r1, #1;"
+            /* We have to specify r0 and r1 in the constraints. */
+            : [r0] "+r" (r0),
+              [r1] "+r" (r1)
+            :
+            :
+        );
+        /* When we are done, we must immediately assign
+         * the register variables to regular variables.
+         */
+        new_r0 = r0;
+        new_r1 = r1;
+    }
+    assert(new_r0 == 2);
+    assert(new_r1 == 3);
+}
--- a/userland/arch/arm/c_from_asm.S
+++ b/userland/arch/arm/c_from_asm.S
@@ -0,0 +1,59 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
+
+#include "common.h"
+
+.data
+/* NUL terminated strings passed to puts and printf below. */
+puts_s:
+    .asciz "hello puts"
+printf_format:
+    .asciz "hello printf %x\n"
+/* Two 16 byte arrays used by the memcmp and memcpy calls. */
+my_array_0:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+my_array_1:
+    .word 0x55555555, 0x66666666, 0x77777777, 0x88888888
+
+ENTRY
+    /* puts("hello puts") */
+    /* r0 is first argument. */
+    ldr r0, =puts_s
+    bl puts
+    /* Check return value >= 0 for success. */
+    cmp r0, 0
+    ASSERT(bge)
+
+    /* printf("hello printf %x\n", 0x12345678): r1 is the second argument. */
+    ldr r0, =printf_format
+    ldr r1, =0x12345678
+    bl printf
+    cmp r0, 0
+    ASSERT(bge)
+
+    /* memcpy and memcmp. r2 is the third argument: 0x10 bytes == 4 words. */
+
+        /* Smaller: my_array_0 compares lower than my_array_1. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcmp
+        cmp r0, 0
+        ASSERT(blt)
+
+        /* Copy my_array_1 over my_array_0. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcpy
+
+        /* Equal after the copy. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcmp
+        ASSERT_EQ(r0, 0)
+
+    /* exit(0) */
+    mov r0, 0
+    bl exit
+
+    /* Never reached, just for the fail symbol. */
+EXIT
--- a/userland/arch/arm/clz.S
+++ b/userland/arch/arm/clz.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* clz: count leading zeroes. Each input clears one more top bit
+     * than the previous one, so the expected count goes 1, 2, 3.
+     */
+    ldr r0, =0x7FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 1)
+
+    ldr r0, =0x3FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 2)
+
+    ldr r0, =0x1FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 3)
+EXIT
--- a/userland/arch/arm/comments.S
+++ b/userland/arch/arm/comments.S
@@ -0,0 +1,14 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
+
+#include "common.h"
+ENTRY
+    /* Line comment styles accepted by GNU as for ARM, besides the
+     * C style comments handled by the C preprocessor. */
+    # mycomment
+    @ mycomment
+    /* # only works at the beginning of the line.
+     * Error: garbage following instruction -- `nop #comment'
+     */
+#if 0
+    nop # mycomment
+#endif
+    /* @ also works after an instruction. */
+    nop @ mycomment
+EXIT
--- a/userland/arch/arm/common_arch.h
+++ b/userland/arch/arm/common_arch.h
@@ -0,0 +1,71 @@
+#ifndef COMMON_ARCH_H
+#define COMMON_ARCH_H
+
+.syntax unified
+
+/* Assert that a register equals a constant.
+ *
+ * * reg: the register to check. Can be r0-r10, but not r11: r11 is used
+ *   as scratch to hold the constant and is overwritten.
+ * * const: the constant to compare to. Only works for literals or labels,
+ *   not for registers, since it is loaded with `ldr r11, =const`.
+ *   For register / register comparison, use ASSERT_EQ_REG.
+ *
+ * Clobbers r11 and the condition flags. What happens on mismatch is
+ * decided by ASSERT, which is defined elsewhere.
+ */
+#define ASSERT_EQ(reg, const) \
+    ldr r11, =const; \
+    cmp reg, r11; \
+    ASSERT(beq); \
+;
+
+/* Assert that two arrays are the same.
+ *
+ * * s1, s2: labels (or literal addresses) of the two arrays
+ * * n: number of bytes to compare
+ *
+ * Expands to a memcmp call followed by a check that it returned 0,
+ * so it overwrites r0-r2, r11, lr and whatever else memcmp is allowed
+ * to clobber.
+ */
+#define ASSERT_MEMCMP(s1, s2, n) \
+    MEMCMP(s1, s2, n); \
+    ASSERT_EQ(r0, 0); \
+;
+
+/* Store all callee saved registers, and LR in case we make further BL calls.
+ *
+ * Also save the input arguments r0-r3 on the stack, so we can access them later on,
+ * despite those registers being overwritten.
+ *
+ * After the stmdb, [sp] holds the r0 that asm_main was called with,
+ * followed by r1-r12 and lr in ascending addresses. EXIT relies on this layout.
+ *
+ * Also switches to the .text section and defines the global asm_main symbol.
+ */
+#define ENTRY \
+.text; \
+.global asm_main; \
+asm_main: \
+    stmdb sp!, {r0-r12, lr}; \
+asm_main_after_prologue: \
+;
+
+/* Meant to be used at the end of the code started by ENTRY.
+ *
+ * Branching to "fail" makes tests fail with exit status 1.
+ * On that path, the current r0 (FAIL puts the source line number there)
+ * is stored to the address that was in r0 when asm_main was entered,
+ * which ENTRY saved at [sp].
+ *
+ * If EXIT is reached, the program ends successfully: r0 and r1 are zeroed.
+ *
+ * Both paths then skip the 16 bytes of saved r0-r3, restore r4-r12 and LR
+ * and bx jump to LR to return from asm_main.
+ */
+#define EXIT \
+    mov r0, 0; \
+    mov r1, 0; \
+    b pass; \
+fail: \
+    ldr r1, [sp]; \
+    str r0, [r1]; \
+    mov r0, 1; \
+pass: \
+    add sp, 16; \
+    ldmia sp!, {r4-r12, lr}; \
+    bx lr; \
+;
+
+/* Always fail.
+ *
+ * Loads the source line number of the FAIL invocation into r0 and
+ * branches to the "fail" label defined by EXIT.
+ */
+#define FAIL \
+    ldr r0, =__LINE__; \
+    b fail; \
+;
+
+/* Call memcmp(s1, s2, n).
+ *
+ * s1, s2 and n are loaded with `ldr rX, =...`, so they must be literals
+ * or labels, not registers. The result of memcmp is left in r0.
+ * Overwrites r0-r2, and lr through the bl.
+ */
+#define MEMCMP(s1, s2, n) \
+    ldr r0, =s1; \
+    ldr r1, =s2; \
+    ldr r2, =n; \
+    bl memcmp; \
+;
+
+#endif
--- a/userland/arch/arm/cond.S
+++ b/userland/arch/arm/cond.S
@@ -0,0 +1,16 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#conditional-execution */
+
+#include "common.h"
+
+ENTRY
+    /* Conditional execution: addeq only runs if the Z flag is set. */
+    mov r0, 0
+    mov r1, 1
+    cmp r0, 1
+    /* Previous cmp failed, skip this operation. */
+    addeq r1, 1
+    ASSERT_EQ(r1, 1)
+    cmp r0, 0
+    /* Previous passed, do this operation. */
+    addeq r1, 1
+    ASSERT_EQ(r1, 2)
+EXIT
--- a/userland/arch/arm/empty.S
+++ b/userland/arch/arm/empty.S
@@ -0,0 +1 @@
+../empty.S
--- a/userland/arch/arm/fail.S
+++ b/userland/arch/arm/fail.S
@@ -0,0 +1 @@
+../fail.S
--- a/userland/arch/arm/freestanding/build
+++ b/userland/arch/arm/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/freestanding/hello.S
+++ b/userland/arch/arm/freestanding/hello.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
+
+/* Freestanding program: no C library, the entry point is _start and
+ * everything is done with raw Linux system calls through svc 0.
+ */
+.syntax unified
+.text
+.global _start
+_start:
+asm_main_after_prologue:
+    /* write */
+    mov r0, 1     /* stdout */
+    adr r1, msg   /* buffer */
+    ldr r2, =len  /* len */
+    mov r7, 4     /* syscall number */
+    svc 0
+
+    /* exit */
+    mov r0, 0     /* exit status */
+    mov r7, 1     /* syscall number */
+    svc 0
+/* The message lives in .text right after the code, so adr can reach it. */
+msg:
+    .ascii "hello\n"
+/* Length of msg in bytes: current location minus start of msg. */
+len = . - msg
--- a/userland/arch/arm/hello_driver.S
+++ b/userland/arch/arm/hello_driver.S
@@ -0,0 +1,23 @@
+/* Minimal example using driver.
+ *
+ * Controls the exit status of the program.
+ */
+
+.syntax unified
+.text
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+
+    /* Set the return value according to the ARM calling convention. */
+    mov r0, 0
+
+    /* Try some whacky value to see tests break. */
+    /*mov r0, 77*/
+
+    /* Branch to the address at register lr.
+     * That is the return address which was put there by the C driver (likely with a bl).
+     *
+     * X means eXchange encoding from thumb back to ARM, which is what the driver uses.
+     */
+    bx lr
--- a/userland/arch/arm/immediates.S
+++ b/userland/arch/arm/immediates.S
@@ -0,0 +1,24 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
+
+#include "common.h"
+
+ENTRY
+    /* This is the default. We hack it in common.h however. */
+.syntax divided
+    /* These fail: divided syntax requires a # or $ prefix on immediates. */
+#if 0
+    mov r0, 1
+    mov r0, 0x1
+#endif
+    mov r0, #1
+    mov r0, #0x1
+    mov r0, $1
+    mov r0, $0x1
+.syntax unified
+    /* Unified syntax accepts all three forms: bare, # and $.
+     * The # forms used to be a duplicate of the bare forms, which left
+     * the # prefix untested under unified syntax.
+     */
+    mov r0, 1
+    mov r0, 0x1
+    mov r0, #1
+    mov r0, #0x1
+    mov r0, $1
+    mov r0, $0x1
+EXIT
--- a/userland/arch/arm/inc_array.S
+++ b/userland/arch/arm/inc_array.S
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#loop-over-array */
+
+#include "common.h"
+
+/* Number of array elements and size of each element in bytes. */
+#define NELEM 4
+#define ELEM_SIZE 4
+
+.data;
+my_array:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+/* my_array with every element incremented by 1. */
+my_array_expect:
+    .word 0x11111112, 0x22222223, 0x33333334, 0x44444445
+
+ENTRY
+    /* Increment. r0 = pointer to the current element, r1 = elements left. */
+    ldr r0, =my_array
+    mov r1, NELEM
+increment:
+    ldr r2, [r0]
+    add r2, 1
+    /* Post index usage: store to [r0], then r0 += ELEM_SIZE. */
+    str r2, [r0], ELEM_SIZE
+    sub r1, 1
+    cmp r1, 0
+    bne increment
+    /* 0x10 == NELEM * ELEM_SIZE bytes. */
+    ASSERT_MEMCMP(my_array, my_array_expect, 0x10)
+EXIT
--- a/userland/arch/arm/ldmia.S
+++ b/userland/arch/arm/ldmia.S
@@ -0,0 +1,62 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldmia */
+
+#include "common.h"
+
+/* NOTE(review): NELEM and ELEM_SIZE are not referenced anywhere below. */
+#define NELEM 4
+#define ELEM_SIZE 4
+
+.data;
+/* my_array_1 is laid out immediately after my_array_0: the stmdb and
+ * ldmia ! tests below depend on that.
+ */
+my_array_0:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+my_array_1:
+    .word 0x55555555, 0x66666666, 0x77777777, 0x88888888
+
+ENTRY
+
+    /* Load r1, r2, r3 and r4 starting from the address in r0. Don't change r0 */
+    ldr r0, =my_array_0
+    ldr r1, =0
+    ldr r2, =0
+    ldr r3, =0
+    ldr r4, =0
+    ldmia r0, {r1-r4}
+    ASSERT_EQ(r0, my_array_0)
+    ASSERT_EQ(r1, 0x11111111)
+    ASSERT_EQ(r2, 0x22222222)
+    ASSERT_EQ(r3, 0x33333333)
+    ASSERT_EQ(r4, 0x44444444)
+
+    /* Swapping the order of r1 and r2 on the mnemonic makes no difference to load order.
+     *
+     * But it gives an assembler warning, so we won't do it by default:
+     *
+     *  ldmia.S: Assembler messages:
+     *  ldmia.S:32: Warning: register range not in ascending order
+     */
+#if 0
+    ldr r0, =my_array_0
+    ldr r1, =0
+    ldr r2, =0
+    ldmia r0, {r2,r1}
+    ASSERT_EQ(r1, 0x11111111)
+    ASSERT_EQ(r2, 0x22222222)
+#endif
+
+    /* Modify my_array_0: stmdb (decrement before) stores r1-r4 to the
+     * 16 bytes just below the address in r0, which is my_array_0 since
+     * r0 points to my_array_1. The values stored are those of my_array_1.
+     */
+    ldr r0, =my_array_1
+    ldr r1, =0x55555555
+    ldr r2, =0x66666666
+    ldr r3, =0x77777777
+    ldr r4, =0x88888888
+    stmdb r0, {r1-r4}
+
+    /* Verify that my_array_0 changed and is equal to my_array_1. */
+    MEMCMP(my_array_0, my_array_1, 0x10)
+    ASSERT_EQ(r0, 0)
+
+    /* Load registers and increment r0: with the ! suffix r0 is advanced
+     * by the 16 bytes that were loaded, so it ends up at my_array_1.
+     */
+    ldr r0, =my_array_0
+    ldmia r0!, {r1-r4}
+    ASSERT_EQ(r0, my_array_1)
+
+EXIT
--- a/userland/arch/arm/ldr_pseudo.S
+++ b/userland/arch/arm/ldr_pseudo.S
@@ -0,0 +1,65 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldr-pseudo-instruction */
+
+#include "common.h"
+
+ENTRY
+
+    /* Mnemonic for a PC relative load:
+     *
+     * ....
+     * ldr r0, [pc, offset]
+     * r0 = myvar
+     * ....
+     */
+    ldr r0, myvar
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* Mnemonic PC relative load with an offset.
+     * Load myvar2 instead of myvar.
+     */
+    ldr r0, myvar + 4
+    ASSERT_EQ(r0, 0x9ABCDEF0)
+
+    /* First store the address in r0 using a magic =myvar, which creates
+     * a new variable containing the address and PC-relative addresses it
+     * https://stackoverflow.com/questions/17214962/what-is-the-difference-between-label-equals-sign-and-label-brackets-in-ar
+     *
+     * Using the adr instruction would likely be better for this application however.
+     *
+     * ....
+     * r0 = &myvar
+     * r1 = *r0
+     * ....
+     */
+    ldr r0, =myvar
+    ldr r1, [r0]
+    ASSERT_EQ(r1, 0x12345678)
+
+    /* More efficiently, use r0 as the address to read, and write to r0 itself. */
+    ldr r0, =myvar
+    ldr r0, [r0]
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* Same as =myvar but store a constant to a register.
+     * Can also be done with movw and movt. */
+    ldr r0, =0x11112222
+    ASSERT_EQ(r0, 0x11112222)
+
+    /* We can also use the GAS :lower16: and :upper16: operators with
+     * movw and movt to load the address of myvar into r0 with two immediates.
+     *
+     * This results in one extra 4 byte instruction read from memory,
+     * and one less data read, so it is likely more cache efficient.
+     *
+     * https://sourceware.org/binutils/docs-2.19/as/ARM_002dRelocations.html
+     */
+    movw r0, #:lower16:myvar
+    movt r0, #:upper16:myvar
+    ldr r1, [r0]
+    ASSERT_EQ(r1, 0x12345678)
+
+EXIT
+/* Read-only data kept after the code, in the same section that ENTRY
+ * selected, so that `ldr r0, myvar` can address it relative to the PC.
+ * myvar2 sits 4 bytes after myvar, which `myvar + 4` relies on.
+ */
+myvar:
+    .word 0x12345678
+myvar2:
+    .word 0x9ABCDEF0
--- a/userland/arch/arm/ldrb.S
+++ b/userland/arch/arm/ldrb.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
+
+#include "common.h"
+
+ENTRY
+    /* ldrb loads a single byte and zero extends it: the byte at the
+     * address of myvar is the least significant one, 0x78.
+     */
+    ldr r0, =myvar
+    mov r1, 0x0
+    ldrb r1, [r0]
+    ASSERT_EQ(r1, 0x00000078)
+EXIT
+myvar:
+    .word 0x12345678
--- a/userland/arch/arm/ldrh.S
+++ b/userland/arch/arm/ldrh.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
+
+#include "common.h"
+
+ENTRY
+    /* ldrh loads a half word (2 bytes) and zero extends it: the two
+     * bytes at the address of myvar are the least significant ones, 0x5678.
+     */
+    ldr r0, =myvar
+    mov r1, 0x0
+    ldrh r1, [r0]
+    ASSERT_EQ(r1, 0x00005678)
+EXIT
+myvar:
+    .word 0x12345678
--- a/userland/arch/arm/mov.S
+++ b/userland/arch/arm/mov.S
@@ -0,0 +1,19 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#mov */
+
+#include "common.h"
+
+ENTRY
+
+    /* Immediate. */
+    mov r0, 0
+    ASSERT_EQ(r0, 0)
+    mov r0, 1
+    ASSERT_EQ(r0, 1)
+
+    /* Register: r1 = r0, overwriting the 1 that was in r1. */
+    mov r0, 0
+    mov r1, 1
+    mov r1, r0
+    ASSERT_EQ(r1, 0)
+
+EXIT
--- a/userland/arch/arm/movw.S
+++ b/userland/arch/arm/movw.S
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movw-and-movt */
+
+#include "common.h"
+
+ENTRY
+
+    /* movt (top) and movw (TODO what is w, presumably "wide") set the higher
+     * and lower 16 bits of the register.
+     *
+     * 0x1234FFFF + 1 == 0x12350000.
+     */
+    movw r0, 0xFFFF
+    movt r0, 0x1234
+    add r0, 1
+    ASSERT_EQ(r0, 0x12350000)
+
+    /* movw also zeroes out the top bits, allowing small 16-bit
+     * C constants to be assigned in a single instruction.
+     *
+     * It differs from mov because mov can only encode 8 bits
+     * at a time, while movw can encode 16.
+     *
+     * movt does not modify the lower bits however.
+     */
+    ldr r0, =0x12345678
+    movw r0, 0x1111
+    ASSERT_EQ(r0, 0x00001111)
+
+EXIT
--- a/userland/arch/arm/mul.S
+++ b/userland/arch/arm/mul.S
@@ -0,0 +1,12 @@
+/* Multiplication. */
+
+#include "common.h"
+
+ENTRY
+    /* 2 * 3 = 6
+     *
+     * Two operand form: r1 = r1 * r2.
+     * NOTE(review): r0 is set but never used by this test.
+     */
+    mov r0, 0
+    mov r1, 2
+    mov r2, 3
+    mul r1, r2
+    ASSERT_EQ(r1, 6)
+EXIT
--- a/userland/arch/arm/nop.S
+++ b/userland/arch/arm/nop.S
@@ -0,0 +1,32 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#nop */
+
+#include "common.h"
+
+ENTRY
+    /* Nothing is asserted here: the point is to look at how each
+     * of the instructions below gets disassembled.
+     */
+
+    /* Disassembles as:
+     *
+     * ....
+     * nop {0}
+     * ....
+     *
+     * TODO what is the `{0}`?
+     */
+    nop
+
+    /* Disassembles as:
+     *
+     * ....
+     * nop ; (mov r0, r0)
+     * ....
+     */
+    mov r0, r0
+
+    /* Disassemble as mov. TODO Why not as nop as in `mov r0, r0`?
+     * Do they have any effect?
+     */
+    mov r1, r1
+    mov r8, r8
+
+    /* And there are other nops as well? Disassembles as `and`. */
+    and r0, r0, r0
+EXIT
--- a/userland/arch/arm/push.S
+++ b/userland/arch/arm/push.S
@@ -0,0 +1,31 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldmia */
+
+#include "common.h"
+
+ENTRY
+
+    /* Save sp before push. */
+    mov r0, sp
+
+    /* Push. */
+    mov r1, 1
+    mov r2, 2
+    push {r1, r2}
+
+    /* Save sp after push. r1 can be reused, its value is on the stack. */
+    mov r1, sp
+
+    /* Restore into different registers to prove the values came from the stack. */
+    mov r3, 0
+    mov r4, 0
+    pop {r3, r4}
+    ASSERT_EQ(r3, 1)
+    ASSERT_EQ(r4, 2)
+
+    /* Check that stack pointer moved down by 8 bytes
+     * (2 registers x 4 bytes each).
+     */
+    sub r0, r1
+    ASSERT_EQ(r0, 8)
+
+EXIT
--- a/userland/arch/arm/rbit.S
+++ b/userland/arch/arm/rbit.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#rbit */
+
+#include "common.h"
+
+ENTRY
+    /* rbit reverses the order of all 32 bits: the expected value is
+     * the input literal read right to left.
+     */
+    ldr r0,      =0b00000001001000110100010101100101
+    rbit r1, r0
+    ASSERT_EQ(r1, 0b10100110101000101100010010000000)
+EXIT
--- a/userland/arch/arm/regs.S
+++ b/userland/arch/arm/regs.S
@@ -0,0 +1,69 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
+
+#include "common.h"
+
+ENTRY
+
+    /* 13 general purpose registers. */
+    mov r0, 0
+    mov r1, 1
+    mov r2, 2
+    mov r3, 3
+    mov r4, 4
+    mov r5, 5
+    mov r6, 6
+    mov r7, 7
+    mov r8, 8
+    mov r9, 9
+    mov r10, 10
+    mov r11, 11
+    mov r12, 12
+
+    /* * r11: aliased to FP (frame pointer, debug stack trace usage only)
+     * +
+     * I think FP is only a convention with no instruction impact, but TODO:
+     * not mentioned on AAPCS. aarch64 AAPCS mentions it though.
+     * * r13: aliased to SP (stack pointer), what push / pop use
+     * * r14: aliased to LR (link register), what bl writes the return address to
+     * * r15: aliased to PC (program counter), contains the current instruction address
+     *
+     * Since SP, LR and PC are just aliases of numbered registers here,
+     * it is possible to use those registers in any place
+     * other registers may be used.
+     *
+     * This is not possible in ARMv8 anymore: there, SP and PC have
+     * dedicated registers in addition to the 31 general purpose ones.
+     * LR is still general purpose as before.
+     *
+     * For example, we can load an address into PC, which is very similar to what B / BX does:
+     * https://stackoverflow.com/questions/32304646/arm-assembly-branch-to-address-inside-register-or-memory/54145818#54145818
+     */
+    ldr pc, =10f
+    /* Skipped if the load into pc jumped to the 10: label. */
+    FAIL
+10:
+
+    /* Same with r15, which is the same as pc. */
+    ldr r15, =10f
+    FAIL
+10:
+
+    /* Another example with mov reading from pc. */
+pc_addr:
+    mov r0, pc
+    /* Why sub 8:
+     * https://stackoverflow.com/questions/24091566/why-does-the-arm-pc-register-point-to-the-instruction-after-the-next-one-to-be-e
+     */
+    sub r0, r0, 8
+
+    /* pc-relative load also just work just like any other register.
+     * [pc] is 8 bytes past the ldr itself, which is where the .word is:
+     * the b in between jumps over the data.
+     */
+    ldr r0, [pc]
+    b 1f
+    .word 0x12345678
+1:
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* We can also use fp in GNU GAS assembly. */
+    mov r11, 0
+    mov fp, 1
+    ASSERT_EQ(r11, 1)
+EXIT
--- a/userland/arch/arm/rev.S
+++ b/userland/arch/arm/rev.S
@@ -0,0 +1,15 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* All bytes in register: reverse the order of the 4 bytes. */
+    ldr r0, =0x11223344
+    rev r1, r0
+    ASSERT_EQ(r1, 0x44332211)
+
+    /* Groups of 16-bits: swap the 2 bytes inside each half word. */
+    ldr r0, =0x11223344
+    rev16 r1, r0
+    ASSERT_EQ(r1, 0x22114433)
+EXIT
--- a/userland/arch/arm/s_suffix.S
+++ b/userland/arch/arm/s_suffix.S
@@ -0,0 +1,35 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#s-suffix */
+
+#include "common.h"
+
+ENTRY
+
+    /* Result is 0, set beq. */
+    movs r0, 0
+    ASSERT(beq)
+
+    /* The opposite. */
+    movs r0, 1
+    ASSERT(bne)
+
+    /* mov without s does not set the status: the flags are still
+     * those of the previous movs, even though r0 is now 1.
+     */
+    movs r0, 0
+    mov r0, 1
+    ASSERT(beq)
+
+    /* movs still moves... */
+    mov r0, 0
+    movs r0, 1
+    ASSERT_EQ(r0, 1)
+
+    /* add: the result is 0. */
+    mov r0, 1
+    adds r0, -1
+    ASSERT(beq)
+
+    /* add: result non 0. */
+    mov r0, 1
+    adds r0, 1
+    ASSERT(bne)
+
+EXIT
--- a/userland/arch/arm/shift.S
+++ b/userland/arch/arm/shift.S
@@ -0,0 +1,79 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes */
+
+#include "common.h"
+
+ENTRY
+
+    /* lsl: logical shift left. */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, lsl 8
+    ldr r2, =0xF00FFF00
+    ASSERT_EQ_REG(r1, r2)
+
+    /* lsr: logical shift right. */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, lsr 8
+    ldr r2, =0x00FFF00F
+    ASSERT_EQ_REG(r1, r2)
+
+    /* ror: rotate right, the low byte wraps around to the top. */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, ror 8
+    ldr r2, =0xFFFFF00F
+    ASSERT_EQ_REG(r1, r2)
+
+    /* asr negative: arithmetic shift right copies the sign bit in. */
+    ldr r0, =0x80000008
+    mov r1, r0, asr 1
+    ldr r2, =0xC0000004
+    ASSERT_EQ_REG(r1, r2)
+
+    /* asr positive */
+    ldr r0, =0x40000008
+    mov r1, r0, asr 1
+    ldr r2, =0x20000004
+    ASSERT_EQ_REG(r1, r2)
+
+    /* There are also direct shift mnemonics for the mov shifts.
+     *
+     * They assemble to the exact same bytes as the mov version
+     */
+    ldr r0, =0xFFF00FFF
+    lsl r1, r0, 8
+    ldr r2, =0xF00FFF00
+    ASSERT_EQ_REG(r1, r2)
+
+    /* If used with the `mov` instruction, it results in a pure shift,
+     * but the suffixes also exist for all the other data processing instructions.
+     *
+     * Here we illustrate a shifted add instruction which calculates:
+     *
+     * ....
+     * r1 = r1 + (r0 << 1)
+     * ....
+     */
+    ldr r0, =0x10
+    ldr r1, =0x100
+    add r1, r1, r0, lsl 1
+    ldr r2, =0x00000120
+    ASSERT_EQ_REG(r1, r2)
+
+    /* The shift takes up the same encoding slot as the immediate,
+     * therefore it is not possible to both use an immediate and shift.
+     *
+     * Error: shift expression expected -- `add r1,r0,1,lsl#1'
+     */
+#if 0
+    add r1, r0, 1, lsl 1
+#endif
+
+    /* However, you can still encode shifted bitmasks of
+     * limited width in immediates, so why not just use the
+     * assembler pre-processing for it?
+     */
+    ldr r1, =0x100
+    add r1, r1, (0x10 << 1)
+    ldr r2, =0x00000120
+    ASSERT_EQ_REG(r1, r2)
+
+EXIT
--- a/userland/arch/arm/simd.S
+++ b/userland/arch/arm/simd.S
@@ -0,0 +1,113 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* vadd.u32
+     *
+     * Add 4x 32-bit unsigned integers in one go.
+     *
+     * q means 128-bits.
+     *
+     * u32 means that we treat memory as uint32_t types.
+     *
+     * 4 is deduced: in 128 bits you can fit 4 u32.
+     *
+     * Observe how the carry is propagated within u32 integers,
+     * but not across them.
+     *
+     * Each test below switches to .data / .bss to define its operands
+     * and result buffer, and then back to .text for the code.
+     */
+.data
+    u32_0:          .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
+    u32_1:          .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
+    u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
+.bss
+    u32_sum: .skip 0x10
+.text
+    ldr r0, =u32_0
+    vld1.32 {q0}, [r0]
+    ldr r0, =u32_1
+    vld1.32 {q1}, [r0]
+    vadd.u32 q2, q0, q1
+    ldr r0, =u32_sum
+    vst1.u32 {q2}, [r0]
+    ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
+
+    /* vadd.u64: 2x 64-bit unsigned integer add. */
+.data
+    u64_0:          .quad 0xF1111111F1111111, 0xF2222222F2222222
+    u64_1:          .quad 0x1555555515555555, 0x1666666616666666
+    u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
+.bss
+    u64_sum: .skip 0x10
+.text
+    ldr r0, =u64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =u64_1
+    vld1.64 {q1}, [r0]
+    vadd.u64 q2, q0, q1
+    ldr r0, =u64_sum
+    vst1.u64 {q2}, [r0]
+    ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
+
+    /* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate
+     * it from unsigned? I think signed and unsigned addition are identical
+     * in two's complement, the only difference is overflow / carry detection
+     * flags. But how do flags work when there are many values being added
+     * at once?
+     */
+.data
+    s64_0:          .quad -1, -2
+    s64_1:          .quad -1, -2
+    s64_sum_expect: .quad -2, -4
+.bss
+    s64_sum: .skip 0x10
+.text
+    ldr r0, =s64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =s64_1
+    vld1.64 {q1}, [r0]
+    vadd.s64 q2, q0, q1
+    ldr r0, =s64_sum
+    vst1.s64 {q2}, [r0]
+    ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10)
+
+    /* vadd.f32: 4x 32-bit float add. */
+.data
+    f32_0:          .float 1.5, 2.5,  3.5,  4.5
+    f32_1:          .float 5.5, 6.5,  7.5,  8.5
+    f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
+.bss
+    f32_sum: .skip 0x10
+.text
+    ldr r0, =f32_0
+    vld1.32 {q0}, [r0]
+    ldr r0, =f32_1
+    vld1.32 {q1}, [r0]
+    vadd.f32 q2, q0, q1
+    ldr r0, =f32_sum
+    vst1.32 {q2}, [r0]
+    ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
+
+    /* vadd.f64: 2x 64-bit float add: appears not possible.
+     *
+     * https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
+     *
+     * Only the loads are assembled, the add and the check are disabled.
+     */
+.data
+    f64_0:          .double 1.5, 2.5
+    f64_1:          .double 5.5, 6.5
+    f64_sum_expect: .double 7.0, 9.0
+.bss
+    f64_sum: .skip 0x10
+.text
+    ldr r0, =f64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =f64_1
+    vld1.64 {q1}, [r0]
+#if 0
+    /* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */
+    vadd.f64 q2, q0, q1
+    ldr r0, =f64_sum
+    vst1.64 {q2}, [r0]
+    ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
+#endif
+EXIT
--- a/userland/arch/arm/str.S
+++ b/userland/arch/arm/str.S
@@ -0,0 +1,60 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#load-and-store-instructions */
+
+#include "common.h"
+
+.data;
+    /* Must be in the .data section, since we want to modify it. */
+myvar:
+    .word 0x12345678
+
+ENTRY
+    /* r0 will contain the address. */
+    ldr r0, =myvar
+
+    /* Sanity check: myvar still has its initial value. */
+    ldr r1, [r0]
+    movw r2, 0x5678
+    movt r2, 0x1234
+    ASSERT_EQ_REG(r1, r2)
+
+    /* Modify the value. */
+    movw r1, 0xDEF0
+    movt r1, 0x9ABC
+    str r1, [r0]
+
+    /* Check that it changed. */
+    ldr r1, [r0]
+    movw r2, 0xDEF0
+    movt r2, 0x9ABC
+    ASSERT_EQ_REG(r1, r2)
+
+    /* Cannot use PC relative addressing to a different segment,
+     * or else it fails with:
+     *
+     * ....
+     * Error: internal_relocation (type: OFFSET_IMM) not fixed up
+     * ....
+     *
+     * https://stackoverflow.com/questions/10094282/internal-relocation-not-fixed-up
+     */
+    /*ldr r0, myvar*/
+
+#if 0
+    /* We could in theory write this to store r1 to a label in the same section,
+     * but it will always segfault under Linux because the text segment is read-only.
+     * This is however useful in baremetal programming.
+     * This construct is not possible in ARMv8 for str:
+     * https://github.com/cirosantilli/arm-assembly-cheat#armv8-str
+     */
+    str r1, var_in_same_section
+var_in_same_section:
+#endif
+
+    /* = sign just doesn't make sense for str, you can't set the
+     * address of a variable.
+     */
+#if 0
+    str r1, =myvar
+#endif
+
+EXIT
--- a/userland/arch/arm/sub.S
+++ b/userland/arch/arm/sub.S
@@ -0,0 +1,11 @@
+/* Subtraction. */
+
+#include "common.h"
+
+ENTRY
+    /* 3 - 2 == 1 , register version: r0 = r0 - r1. */
+    mov r0, 3
+    mov r1, 2
+    sub r0, r0, r1
+    ASSERT_EQ(r0, 1)
+EXIT
--- a/userland/arch/arm/thumb.S
+++ b/userland/arch/arm/thumb.S
@@ -0,0 +1,17 @@
+/* Illustrates features that are only available in thumb. */
+
+.syntax unified
+.text
+/* Marks asm_main as a Thumb function. Per the GNU as documentation this
+ * also implies .thumb, which is presumably why cbz is accepted below.
+ */
+.thumb_func
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+
+    /* CBZ: cmp and branch if zero instruction. Equivalent to CMP + BEQ.
+     * TODO create an interesting assertion here.
+     *
+     * The target is the very next instruction, so both outcomes
+     * continue at the same place.
+     */
+    cbz r1, 1f
+    1:
+
+    /* Return 0 to the caller. */
+    mov r0, 0
+    bx lr
--- a/userland/arch/arm/tst.S
+++ b/userland/arch/arm/tst.S
@@ -0,0 +1,19 @@
+/* Test. Same as ands, but don't store the result, just update flags. */
+
+#include "common.h"
+
+ENTRY
+
+    /* 0x0F & 0xF0 == 0x00, so beq. */
+    mov r0, 0x0F
+    tst r0, 0xF0
+    ASSERT(beq)
+
+    /* 0xFF & 0x0F == 0x0F, which is not 0, so bne. */
+    mov r0, 0xFF
+    tst r0, 0x0F
+    ASSERT(bne)
+    /* r0 was not modified: tst only updates the flags. */
+    ASSERT_EQ(r0, 0xFF)
+
+EXIT
--- a/userland/arch/arm/vcvt.S
+++ b/userland/arch/arm/vcvt.S
@@ -0,0 +1,90 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvt */
+
+#include "common.h"
+
+ENTRY
+    /* SIMD positive: 4 floats to 4 unsigned integers, the fractional
+     * part is dropped as the expected values show.
+     */
+.data
+    vcvt_positive_0:      .float 1.25, 2.5, 3.75, 4.0
+    vcvt_positive_expect: .word  1,    2,   3,    4
+.bss
+    vcvt_positive_result: .skip 0x10
+.text
+    ldr r0, =vcvt_positive_0
+    vld1.32 {q0}, [r0]
+    vcvt.u32.f32 q1, q0
+    ldr r0, =vcvt_positive_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(vcvt_positive_result, vcvt_positive_expect, 0x10)
+
+    /* SIMD negative: same with signed integers. */
+.data
+    vcvt_negative_0:      .float -1.25, -2.5, -3.75, -4.0
+    vcvt_negative_expect: .word  -1,    -2,   -3,    -4
+.bss
+    vcvt_negative_result: .skip 0x10
+.text
+    ldr r0, =vcvt_negative_0
+    vld1.32 {q0}, [r0]
+    vcvt.s32.f32 q1, q0
+    ldr r0, =vcvt_negative_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(vcvt_negative_result, vcvt_negative_expect, 0x10)
+
+    /* Floating point: only s0, the low half of d0, is converted.
+     * The second float of d0 is expected to stay 2.5.
+     */
+.data
+    vcvt_positive_float_0:      .float 1.5, 2.5
+    vcvt_positive_float_expect: .word  1
+                                .float      2.5
+.bss
+    vcvt_positive_float_result: .skip 0x8
+.text
+    ldr r0, =vcvt_positive_float_0
+    vld1.32 {d0}, [r0]
+    vcvt.u32.f32 s0, s0
+    ldr r0, =vcvt_positive_float_result
+    vst1.32 {d0}, [r0]
+    ASSERT_MEMCMP(vcvt_positive_float_result, vcvt_positive_float_expect, 0x8)
+
+    /* Floating point but with immediates.
+     *
+     * You have to worry of course about representability of
+     * the immediate in 4 bytes, which is even more fun for
+     * floating point numbers :-)
+     *
+     * Doing this mostly to illustrate the joys of vmov.i32.
+     *
+     * For some reason, there is no vmov.i32 sn, only dn.
+     * If you try to use sn, it does the same as .f32 and
+     * stores a float instead. Horrible!
+     *
+     * s0 and s1 are the two halves of d0, s2 and s3 those of d1:
+     * the integer result in s0 is compared against the integer 1 in s2.
+     */
+    vmov.f32 d0, 1.5
+    vcvt.u32.f32 s0, s0
+    vmov.i32 d1, 1
+    vcmp.f32 s0, s2
+    /* Copy the comparison flags from fpscr so that beq can test them. */
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+    /* Check that s1 wasn't modified by vcvt. */
+    vmov.f32 s2, 1.5
+    vcmp.f32 s1, s2
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+
+    /* Floating point double precision: the 32-bit integer result goes
+     * to s0, so only the first 4 bytes of the stored d0 are compared.
+     */
+.data
+    vcvt_positive_double_0:      .double 1.5
+    vcvt_positive_double_expect: .word   1
+.bss
+    vcvt_positive_double_result: .skip 0x8
+.text
+    ldr r0, =vcvt_positive_double_0
+    vld1.64 {d0}, [r0]
+    vcvt.u32.f64 s0, d0
+    ldr r0, =vcvt_positive_double_result
+    vst1.32 {d0}, [r0]
+    ASSERT_MEMCMP(
+        vcvt_positive_double_result,
+        vcvt_positive_double_expect,
+        0x4
+    )
+EXIT
--- a/userland/arch/arm/vcvta.S
+++ b/userland/arch/arm/vcvta.S
@@ -0,0 +1,41 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvta */
+
+#include "common.h"
+
+ENTRY
+    /* SIMD positive: unlike vcvt, the expected values are rounded to
+     * the nearest integer, with the 2.5 tie going away from zero to 3.
+     */
+.data
+    vcvta_positive_0:      .float 1.25, 2.5, 3.75, 4.0
+    vcvta_positive_expect: .word  1,    3,   4,    4
+.bss
+    vcvta_positive_result: .skip 0x10
+.text
+    ldr r0, =vcvta_positive_0
+    vld1.32 {q0}, [r0]
+    vcvta.u32.f32 q1, q0
+    ldr r0, =vcvta_positive_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvta_positive_result,
+        vcvta_positive_expect,
+        0x10
+    )
+
+    /* SIMD negative: same with signed integers, -2.5 goes to -3. */
+.data
+    vcvta_negative_0:      .float -1.25, -2.5, -3.75, -4.0
+    vcvta_negative_expect: .word  -1,    -3,   -4,    -4
+.bss
+    vcvta_negative_result: .skip 0x10
+.text
+    ldr r0, =vcvta_negative_0
+    vld1.32 {q0}, [r0]
+    vcvta.s32.f32 q1, q0
+    ldr r0, =vcvta_negative_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvta_negative_result,
+        vcvta_negative_expect,
+        0x10
+    )
+EXIT
--- a/userland/arch/arm/vcvtr.S
+++ b/userland/arch/arm/vcvtr.S
@@ -0,0 +1,46 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvtr */
+
+#include "common.h"
+
+ENTRY
+    /* vcvtr converts float to integer using the rounding mode selected
+     * by bits 23:22 of fpscr, which each test below sets first.
+     *
+     * The conversion is done one s register at a time on purpose:
+     * vcvtr is a scalar VFP instruction. Writing it with q registers
+     * (vcvtr.u32.f32 q1, q0) always rounded towards zero whatever fpscr
+     * said, presumably because the assembler falls back to the SIMD
+     * vcvt, which ignores the fpscr rounding mode.
+     *
+     * s0-s3 overlap q0 and s4-s7 overlap q1, so the q register load and
+     * store still move all four lanes at once.
+     */
+.data
+    vcvtr_0:                    .float 1.25, 2.5, 3.75, 4.0
+    vcvtr_expect_zero:          .word  1,    2,   3,    4
+    vcvtr_expect_plus_infinity: .word  2,    3,   4,    4
+.bss
+    vcvtr_result_zero:          .skip 0x10
+    vcvtr_result_plus_infinity: .skip 0x10
+.text
+
+    /* zero: rounding mode bits = 0b11 */
+    ldr r0, =vcvtr_0
+    vld1.32 {q0}, [r0]
+    vmrs r0, fpscr
+    orr r0, r0, (3 << 22)
+    vmsr fpscr, r0
+    vcvtr.u32.f32 s4, s0
+    vcvtr.u32.f32 s5, s1
+    vcvtr.u32.f32 s6, s2
+    vcvtr.u32.f32 s7, s3
+    ldr r0, =vcvtr_result_zero
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvtr_result_zero,
+        vcvtr_expect_zero,
+        0x10
+    )
+
+    /* plus infinity: rounding mode bits = 0b01 */
+    /* Reload the input: the assertion above may have clobbered q0
+     * (its expansion is not visible from this file).
+     */
+    ldr r0, =vcvtr_0
+    vld1.32 {q0}, [r0]
+    vmrs r0, fpscr
+    mov r1, 1
+    bfi r0, r1, 22, 2
+    vmsr fpscr, r0
+    vcvtr.u32.f32 s4, s0
+    vcvtr.u32.f32 s5, s1
+    vcvtr.u32.f32 s6, s2
+    vcvtr.u32.f32 s7, s3
+    /* Put the rounding mode bits back to 0b00 (round to nearest, assumed
+     * to be the mode the program started with) so that the changed mode
+     * does not leak into whatever runs after this test.
+     */
+    vmrs r0, fpscr
+    bic r0, r0, (3 << 22)
+    vmsr fpscr, r0
+    ldr r0, =vcvtr_result_plus_infinity
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvtr_result_plus_infinity,
+        vcvtr_expect_plus_infinity,
+        0x10
+    )
+EXIT
--- a/userland/arch/arm/vfp.S
+++ b/userland/arch/arm/vfp.S
@@ -0,0 +1,152 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vfp
+ * Adapted from: https://mindplusplus.wordpress.com/2013/06/27/arm-vfp-vector-programming-part-2-examples/ */
+
+#include "common.h"
+
+.data;
+/* Inputs and expected output of the vec_sum call made at the end of
+ * the entry point. Each array holds 8 floats, i.e. 0x20 bytes.
+ */
+addend_0:
+    .single 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5
+addend_1:
+    .single 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5
+sum_result:
+    .space 0x20
+sum_expect:
+    .single 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0
+
+ENTRY
+    /* Single precision with immediates only: check 1.5 + 2.5 == 4.0.
+     * TODO: are the representable float immediates restricted because
+     * the whole instruction has to fit in 4 bytes?
+     */
+    vmov s0, 1.5
+    vmov s1, 2.5
+    vmov s3, 4.0
+    vadd.f32 s2, s0, s1
+    /* vcmp leaves its outcome in fpscr, the floating point status and
+     * control register, not in apsr.
+     */
+    vcmp.f32 s2, s3
+    /* Copy the nzcv bits of fpscr into apsr so that an ordinary
+     * conditional branch can test them.
+     */
+    vmrs apsr_nzcv, fpscr
+    /* beq looks at the Z bit that was just copied over. */
+    ASSERT(beq)
+
+    /* Same sum again, but operands and result go through memory
+     * with vldr and vstr.
+     */
+.data
+vfp_mem_lhs:
+    .single 1.5
+vfp_mem_rhs:
+    .single 2.5
+vfp_mem_sum_expect:
+    .single 4.0
+.bss
+vfp_mem_sum:
+    .space 4
+.text
+    ldr r0, =vfp_mem_lhs
+    vldr s0, [r0]
+    ldr r0, =vfp_mem_rhs
+    vldr s1, [r0]
+    vadd.f32 s2, s0, s1
+    ldr r0, =vfp_mem_sum
+    vstr.f32 s2, [r0]
+    ASSERT_MEMCMP(vfp_mem_sum, vfp_mem_sum_expect, 0x4)
+
+#if 0
+    /* Unlike ldr, vldr cannot take a label directly here, it fails with:
+     * Error: cannot represent CP_OFF_IMM relocation in this object file format
+     * It works on ARMv8 however, so the relocation must have been added.
+     */
+    vldr s0, vfp_mem_lhs
+#endif
+
+    /* Double precision version of the immediate-only check. */
+    vmov.f64 d0, 1.5
+    vmov.f64 d1, 2.5
+    vmov.f64 d3, 4.0
+    vadd.f64 d2, d0, d1
+    vcmp.f64 d2, d3
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+
+    /* vmov also transfers between general purpose and floating point
+     * registers: send r0 through s0 and s1 and read it back into r1.
+     *
+     * Float immediates cannot be used with general purpose registers:
+     * https://stackoverflow.com/questions/6514537/how-do-i-specify-immediate-floating-point-numbers-with-inline-assembly/52906126#52906126
+     */
+    mov r0, 1
+    /* Start r1 off with a different value so the check is meaningful. */
+    mov r1, 2
+    vmov s0, r0
+    vmov s1, s0
+    vmov r1, s1
+    ASSERT_EQ_REG(r0, r1)
+
+    /* Finally the larger example: vec_sum(sum_result, addend_0, addend_1, 8). */
+    mov r3, 8
+    ldr r2, =addend_1
+    ldr r1, =addend_0
+    ldr r0, =sum_result
+    bl vec_sum
+    /* A byte-wise comparison is enough because every float involved
+     * has an exact base-2 representation.
+     */
+    ASSERT_MEMCMP(sum_result, sum_expect, 32)
+EXIT
+
+/* void vec_sum(float *sum, float *a1, float *a2, int length) {
+ *   int i;
+ *   for (i = 0; i < length; i++)
+ *     *(sum+i) = *(a1+i) + *(a2+i);
+ * }
+ *
+ * inputs:
+ * r0: sum, r1: a1, r2: a2, r3: length
+ * notes:
+ * Elements are summed in blocks of 8, so only the first
+ * (length / 8) * 8 elements are written. If length < 8 nothing is
+ * written and fpscr is left untouched.
+ * modified: r2, r3, s8-s15, and fpscr when at least one block is summed
+ * (deconfig leaves it at stride 1, length 1).
+ */
+vec_sum:
+    /* Setup */
+    push {r0, r1, r4, lr}
+    /* s16-s31 (d8-d15) are used below as scratch but are callee-saved
+     * in the ARM procedure call standard, so keep the caller's copy.
+     */
+    vpush {d8-d15}
+    /* r3 = number of whole blocks of 8 elements. Bail out when there is
+     * none: entering the loop with r3 == 0 would make subs wrap around
+     * and run far past the end of the arrays.
+     */
+    asr r3, r3, 3
+    cmp r3, 0
+    ble 2f
+    /* reconfig takes its arguments in r0 and r1, which currently hold
+     * sum and a1.
+     */
+    push {r0, r1}
+    mov r0, 1
+    mov r1, 8
+    bl reconfig
+    pop {r0, r1}
+
+    /* Do the sum. */
+1:
+    vldmia r1!, {s8-s15}
+    vldmia r2!, {s16-s23}
+    /* With the vector length set to 8 by reconfig, this single
+     * instruction is expected to add all of s8-s15 to s16-s23.
+     */
+    vadd.f32 s24, s8, s16
+    vstmia r0!, {s24-s31}
+    subs r3, r3, 1
+    bne 1b
+
+    /* Teardown. */
+    bl deconfig
+2:
+    vpop {d8-d15}
+    pop {r0, r1, r4, pc}
+
+/* Set the vector stride and vector length fields of fpscr, leaving
+ * every other fpscr bit as it was.
+ *
+ * inputs:
+ * r0: desired vector stride (1 or 2)
+ * r1: desired vector length (min. 1, max. 8)
+ * outputs: (none)
+ * modified: FPSCR only (r0-r2 are saved on entry and restored on exit)
+ * notes:
+ * r0 and r1 will be truncated before fitting into FPSCR
+ */
+reconfig:
+    push {r0-r2}
+    /* Stride encoding: 1 becomes 0b00 and 2 becomes 0b11. */
+    and r0, r0, 3
+    eor r0, r0, 1
+    /* The length field stores length - 1 in 3 bits. */
+    sub r1, r1, 1
+    and r1, r1, 7
+    /* r0 = (stride << 20) | (length << 16) */
+    mov r0, r0, lsl 20
+    orr r0, r0, r1, lsl 16
+    /* Read-modify-write: clear bits 21:20 and 18:16 (mask 0x37 << 16),
+     * then merge in the new fields.
+     */
+    vmrs r2, fpscr
+    bic r2, r2, 0x370000
+    orr r2, r2, r0
+    /* Write back the merged value in r2. Writing r0 here would zero
+     * every other fpscr field, e.g. the rounding mode.
+     */
+    vmsr fpscr, r2
+    pop {r0-r2}
+    bx lr
+
+/* Ask reconfig for stride 1 and vector length 1.
+ * The caller's r0 and r1 are saved here and restored on return.
+ */
+deconfig:
+    stmfd sp!, {r0, r1, lr}
+    mov r1, 1
+    mov r0, 1
+    bl reconfig
+    /* Loading pc from the saved lr returns to the caller. */
+    ldmfd sp!, {r0, r1, pc}
--- a/userland/arch/common.h
+++ b/userland/arch/common.h
@@ -0,0 +1,28 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+/* We define in this header only macros that are the same on all archs. */
+
+/* common_arch.h contains arch specific macros. */
+#include "common_arch.h"
+
+/* Declare exit, printf and puts as external symbols. */
+.extern exit, printf, puts;
+
+/* Run FAIL unless branch_if_pass, a branch mnemonic such as beq,
+ * jumps over it to the local label 1.
+ */
+#define ASSERT(branch_if_pass) branch_if_pass 1f; FAIL; 1: ;
+
+/* Compare two registers with cmp and FAIL unless they are equal. */
+#define ASSERT_EQ_REG(reg1, reg2) \
+    cmp reg1, reg2; \
+    ASSERT(beq); \
+;
+
+#endif
--- a/Show More
+++ b/Show More