userland: add assembly support

Move arm assembly cheat here, and start some work on x86 cheat as well.
2026-01-26 03:31:36 +01:00 · 2019-03-22 00:00:00 +00:00
parent 4943c9ed2e
commit 287c83f3f9
117 changed files with 3870 additions and 547 deletions
--- a/userland/arch/aarch64/add.S
+++ b/userland/arch/aarch64/add.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    add x1, x0, 2
+    ASSERT_EQ(x1, 3)
+EXIT
--- a/userland/arch/aarch64/adr.S
+++ b/userland/arch/aarch64/adr.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+.data
+data_label:
+    .word 0x1234678
+ENTRY
+    /* This is not possible in v7 because the label is in another section.
+     * objdump says that this generates a R_AARCH64_ADR_PREL_LO21 relocation,
+     * which looks specific to ADR, and therefore makes it more likely
+     * that there was no such relocation in v7.
+     *
+     * This relocation is particularly important because str does not have a
+     * pc-relative mode in ARMv8.
+     */
+    adr x0, data_label
+    ldr x1, =data_label
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/adrp.S
+++ b/userland/arch/aarch64/adrp.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+ENTRY
+    adrp x0, label
+    adr x1, label
+label:
+    /* Clear the lower 12 bits. */
+    bic x1, x1, 0xFF
+    bic x1, x1, 0xF00
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/asm_hello.c
+++ b/userland/arch/aarch64/asm_hello.c
@@ -1,13 +0,0 @@
-#include <assert.h>
-#include <inttypes.h>
-
-int main(void) {
-    uint32_t myvar = 1;
-    __asm__ (
-        "add %[myvar], %[myvar], 1;"
-        : [myvar] "=r" (myvar)
-        :
-        :
-    );
-    assert(myvar == 2);
-}
--- a/userland/arch/aarch64/beq.S
+++ b/userland/arch/aarch64/beq.S
@@ -0,0 +1,33 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+    /* cbz == 0 */
+    mov x0, 0
+    cbz x0, 1f
+    FAIL
+1:
+
+    /* cbz != 0 */
+    mov x0, 1
+    cbz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+
+    /* cbnz != 0 */
+    mov x0, 1
+    cbnz x0, 1f
+    FAIL
+1:
+
+    /* cbnz == 0 */
+    mov x0, 0
+    cbnz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+EXIT
--- a/userland/arch/aarch64/bfi.S
+++ b/userland/arch/aarch64/bfi.S
@@ -0,0 +1,11 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    bfi x1, x0, 16, 32
+    ASSERT_EQ(x1, 0xFFFF55667788FFFF)
+EXIT
--- a/userland/arch/aarch64/c/asm_from_c.c
+++ b/userland/arch/aarch64/c/asm_from_c.c
@@ -0,0 +1,39 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
+
+#include <assert.h>
+#include <inttypes.h>
+
+uint64_t my_asm_func(void);
+/* { return 42; } */
+__asm__(
+    ".global my_asm_func;"
+    "my_asm_func:"
+    "mov x0, 42;"
+    "ret;"
+);
+
+/* Now a more complex example that also calls a C function.
+ * We have to store the return value x30 for later because bl modifies it.
+ * https://stackoverflow.com/questions/27941220/push-lr-and-pop-lr-in-arm-arch64/34504752#34504752
+ * We are not modifying any other callee saved register in this function,
+ * since my_c_func is not either (unless GCC has a bug ;-)), so everything else is fine.
+ */
+uint64_t my_asm_func_2(void);
+/* { return my_c_func(); } */
+__asm__(
+    ".global my_asm_func_2;"
+    "my_asm_func_2:"
+    "str x30, [sp, -16]!;"
+    "bl my_c_func;"
+    "ldr x30, [sp], 16;"
+    "ret;"
+);
+
+uint64_t my_c_func(void) {
+    return 42;
+}
+
+int main(void) {
+    assert(my_asm_func() == 42);
+    assert(my_asm_func_2() == 42);
+}
--- a/userland/arch/aarch64/c/build
+++ b/userland/arch/aarch64/c/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/earlyclobber.c
+++ b/userland/arch/aarch64/c/earlyclobber.c
@@ -0,0 +1,21 @@
+/* An example of using the '&' earlyclobber modifier.
+ * https://stackoverflow.com/questions/15819794/when-to-use-earlyclobber-constraint-in-extended-gcc-inline-assembly/54853663#54853663
+ * The assertion may fail without it. It actually does fail in GCC 8.2.0 at
+ * 34017bcd0bc96a3cf77f6acba4d58350e67c2694 + 1.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t in = 1;
+    uint64_t out;
+    __asm__ (
+        "add %[out], %[in], 1;"
+        "add %[out], %[in], 1;"
+        : [out] "=&r" (out)
+        : [in] "r" (in)
+        :
+    );
+    assert(out == 2);
+}
--- a/userland/arch/aarch64/c/freestanding/build
+++ b/userland/arch/aarch64/c/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/freestanding/hello.c
+++ b/userland/arch/aarch64/c/freestanding/hello.c
@@ -0,0 +1,37 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#freestanding-linux-inline-assembly-system-calls */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
+        register char *x1 __asm__ ("x1") = msg;
+        register uint64_t x2 __asm__ ("x2") = sizeof(msg);
+        register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x1), "r" (x2), "r" (x8)
+            : "memory"
+        );
+        syscall_return = x0;
+        exit_status = (syscall_return != sizeof(msg));
+    }
+
+    /* exit */
+    {
+        register uint64_t x0 __asm__ ("x0") = exit_status;
+        register uint64_t x8 __asm__ ("x8") = 93;
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x8)
+            :
+        );
+    }
+}
--- a/userland/arch/aarch64/c/freestanding/hello_clobbers.c
+++ b/userland/arch/aarch64/c/freestanding/hello_clobbers.c
@@ -0,0 +1,40 @@
+/* Like hello.c, but trying to do it without named register variables.
+ * The code is more complicated, and I was not able to make it as efficient,
+ * so it is better to just stick to named register variables.
+ */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        __asm__ (
+            "mov x0, 1;" /* stdout */
+            "mov x1, %[msg];"
+            "mov x2, %[len];"
+            "mov x8, 64;" /* syscall number */
+            "svc 0;"
+            "mov %[syscall_return], x0;"
+            : [syscall_return] "=r" (syscall_return)
+            : [msg] "p" (msg),
+            [len] "i" (sizeof(msg))
+            : "x0", "x1", "x2", "x8", "memory"
+        );
+        exit_status = (syscall_return != sizeof(msg));
+    }
+
+    /* exit */
+    __asm__ (
+        "mov x0, %[exit_status];"
+        "mov x8, 93;" /* syscall number */
+        "svc 0;"
+        :
+        : [exit_status] "r" (exit_status)
+        : "x0", "x8"
+    );
+}
+
--- a/userland/arch/aarch64/c/inc.c
+++ b/userland/arch/aarch64/c/inc.c
@@ -0,0 +1,13 @@
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 1;
+    __asm__ (
+        "add %[io], %[io], 1;"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
--- a/userland/arch/aarch64/c/inc_float.c
+++ b/userland/arch/aarch64/c/inc_float.c
@@ -0,0 +1,28 @@
+/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly
+ *
+ * We use the undocumented %s and %d modifiers!
+ */
+
+#include <assert.h>
+
+int main(void) {
+    float my_float = 1.5;
+    __asm__ (
+        "fmov s0, 1.0;"
+        "fadd %s[my_float], %s[my_float], s0;"
+        : [my_float] "+w" (my_float)
+        :
+        : "s0"
+    );
+    assert(my_float == 2.5);
+
+    double my_double = 1.5;
+    __asm__ (
+        "fmov d0, 1.0;"
+        "fadd %d[my_double], %d[my_double], d0;"
+        : [my_double] "+w" (my_double)
+        :
+        : "d0"
+    );
+    assert(my_double == 2.5);
+}
--- a/userland/arch/aarch64/c/multiline.cpp
+++ b/userland/arch/aarch64/c/multiline.cpp
@@ -0,0 +1,18 @@
+// https://stackoverflow.com/questions/3666013/how-to-write-multiline-inline-assembly-code-in-gcc-c/54575948#54575948
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 0;
+    __asm__ (
+        R"(
+add %[io], %[io], #1
+add %[io], %[io], #1
+)"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
--- a/userland/arch/aarch64/c/reg_var.c
+++ b/userland/arch/aarch64/c/reg_var.c
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register uint32_t x0 __asm__ ("x0");
+    register uint32_t x1 __asm__ ("x1");
+    uint32_t new_x0;
+    uint32_t new_x1;
+    {
+        x0 = 1;
+        x1 = 2;
+        __asm__ (
+            "add %[x0], x0, #1;"
+            "add %[x1], x1, #1;"
+            : [x0] "+r" (x0),
+              [x1] "+r" (x1)
+            :
+            :
+        );
+        new_x0 = x0;
+        new_x1 = x1;
+    }
+    assert(new_x0 == 2);
+    assert(new_x1 == 3);
+}
--- a/userland/arch/aarch64/c/reg_var_float.c
+++ b/userland/arch/aarch64/c/reg_var_float.c
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register double d0 __asm__ ("d0");
+    register double d1 __asm__ ("d1");
+    double new_d0;
+    double new_d1;
+    {
+        d0 = 1.5;
+        d1 = 2.5;
+        __asm__ (
+            "fmov d2, 1.5;"
+            "fadd %d[d0], d0, d2;"
+            "fadd %d[d1], d1, d2;"
+            : [d0] "+w" (d0),
+              [d1] "+w" (d1)
+            :
+            : "d2"
+        );
+        new_d0 = d0;
+        new_d1 = d1;
+    }
+    assert(new_d0 == 3.0);
+    assert(new_d1 == 4.0);
+}
--- a/userland/arch/aarch64/cbz.S
+++ b/userland/arch/aarch64/cbz.S
@@ -0,0 +1,19 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+
+    /* Branch. */
+    mov x0, 0x0
+    cbz x0, ok
+    FAIL
+ok:
+
+    /* Don't branch. */
+    mov x0, 0x1
+    cbz x0, ko
+
+EXIT
+ko:
+    FAIL
--- a/userland/arch/aarch64/comments.S
+++ b/userland/arch/aarch64/comments.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
+
+#include "common.h"
+ENTRY
+    # mycomment
+
+    /* ARMv8 uses // instead of @ for comments. */
+    // mycomment
+    nop // mycomment
+
+    /* All these fail. Lol, different than v7, no consistency. */
+#if 0
+    nop # mycomment
+    @ mycomment
+    nop @ mycomment
+#endif
+EXIT
--- a/userland/arch/aarch64/common_arch.h
+++ b/userland/arch/aarch64/common_arch.h
@@ -0,0 +1,64 @@
+#ifndef COMMON_ARCH_H
+#define COMMON_ARCH_H
+
+#define ASSERT_EQ(reg, const) \
+    ldr x11, =const; \
+	cmp reg, x11; \
+	ASSERT(beq); \
+;
+
+#define ASSERT_MEMCMP(s1, s2, n) \
+	MEMCMP(s1, s2, n); \
+	ASSERT_EQ(x0, 0); \
+;
+
+#define ENTRY \
+.text; \
+.global asm_main; \
+asm_main: \
+    sub  sp, sp, 0xA0; \
+    stp  x29, x30, [sp]; \
+    stp  x27, x28, [sp, 0x10]; \
+    stp  x25, x26, [sp, 0x20]; \
+    stp  x23, x24, [sp, 0x30]; \
+    stp  x21, x22, [sp, 0x40]; \
+    stp  x19, x20, [sp, 0x50]; \
+    stp  x6, x7, [sp, 0x60]; \
+    stp  x4, x5, [sp, 0x70]; \
+    stp  x2, x3, [sp, 0x80]; \
+    stp  x0, x1, [sp, 0x90]; \
+asm_main_after_prologue: \
+;
+
+#define EXIT \
+    mov w0, 0; \
+    mov w1, 0; \
+    b pass; \
+fail: \
+    ldr x1, [sp, 0x90]; \
+    str w0, [x1]; \
+    mov w0, 1; \
+pass: \
+    ldp x19, x20, [sp, 0x50]; \
+    ldp x21, x22, [sp, 0x40]; \
+    ldp x23, x24, [sp, 0x30]; \
+    ldp x25, x26, [sp, 0x20]; \
+    ldp x27, x28, [sp, 0x10]; \
+    ldp x29, x30, [sp]; \
+    add sp, sp, 0xA0; \
+    ret; \
+;
+
+#define FAIL \
+    ldr w0, =__LINE__; \
+    b fail; \
+;
+
+#define MEMCMP(s1, s2, n) \
+    adr x0, s1; \
+    adr x1, s2; \
+    ldr x2, =n; \
+    bl memcmp; \
+;
+
+#endif
--- a/userland/arch/aarch64/cset.S
+++ b/userland/arch/aarch64/cset.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cset */
+
+#include "common.h"
+
+ENTRY
+    /* Test values. */
+    mov x0, 0
+    mov x1, 1
+
+    /* eq is true, set x2 = 1. */
+    cmp x0, x0
+    cset x2, eq
+    ASSERT_EQ(x2, 1)
+
+    /* eq is false, set x2 = 0. */
+    cmp x0, x1
+    cset x2, eq
+    ASSERT_EQ(x2, 0)
+
+    /* Same for ne. */
+    cmp x0, x0
+    cset x2, ne
+    ASSERT_EQ(x2, 0)
+
+    cmp x0, x1
+    cset x2, ne
+    ASSERT_EQ(x2, 1)
+EXIT
--- a/userland/arch/aarch64/empty.S
+++ b/userland/arch/aarch64/empty.S
@@ -0,0 +1 @@
+../empty.S
--- a/userland/arch/aarch64/fail.S
+++ b/userland/arch/aarch64/fail.S
@@ -0,0 +1 @@
+../fail.S
--- a/userland/arch/aarch64/floating_point.S
+++ b/userland/arch/aarch64/floating_point.S
@@ -0,0 +1,60 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 1.5 + 2.5 == 4.0
+     * using 64-bit double immediates.
+     */
+    fmov d0, 1.5
+    fmov d1, 2.5
+    fadd d2, d0, d1
+    fmov d3, 4.0
+    /* Unlike VFP vcmp, this stores the status
+     * automatically in the main CPSR.
+     */
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now with a memory stored value. */
+.data
+my_double_0:
+    .double 1.5
+my_double_1:
+    .double 2.5
+my_double_sum_expect:
+    .double 4.0
+.text
+    ldr d0, my_double_0
+    ldr d1, my_double_1
+    fadd d2, d0, d1
+    ldr d3, my_double_sum_expect
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now in 32-bit. */
+    fmov s0, 1.5
+    fmov s1, 2.5
+    fadd s2, s0, s1
+    fmov s3, 4.0
+    fcmp s2, s3
+    ASSERT(beq)
+
+    /* TODO why? What's the point of q then?
+     * Error: operand mismatch -- `fmov q0,1.5'
+     */
+#if 0
+    fmov q0, 1.5
+#endif
+
+    /* Much like integers, immediates are constrained to
+     * fit in 32-bit instructions. TODO exact rules.
+     *
+     * Assembly here would fail with:
+     *
+     * Error: invalid floating-point constant at operand 2
+     */
+#if 0
+    fmov d0, 1.23456798
+#endif
+EXIT
--- a/userland/arch/aarch64/freestanding/build
+++ b/userland/arch/aarch64/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/freestanding/hello.S
+++ b/userland/arch/aarch64/freestanding/hello.S
@@ -0,0 +1,20 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
+
+.text
+.global _start
+_start:
+asm_main_after_prologue:
+    /* write */
+    mov x0, 1     /* stdout */
+    adr x1, msg   /* buffer */
+    ldr x2, =len  /* len */
+    mov x8, 64    /* syscall number */
+    svc 0
+
+    /* exit */
+    mov x0, 0     /* exit status */
+    mov x8, 93    /* syscall number */
+    svc 0
+msg:
+    .ascii "hello\n"
+len = . - msg
--- a/userland/arch/aarch64/hello_driver.S
+++ b/userland/arch/aarch64/hello_driver.S
@@ -0,0 +1,6 @@
+.text
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+    mov w0, 0
+    ret
--- a/userland/arch/aarch64/immediates.S
+++ b/userland/arch/aarch64/immediates.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
+
+#include "common.h"
+ENTRY
+    mov x0, 1    /* Decimal immediate, no prefix. */
+    mov x0, 0x1  /* Hexadecimal immediate, no prefix. */
+    mov x0, #1   /* Same decimal immediate, written with the optional # prefix. */
+    mov x0, #0x1 /* Same hexadecimal immediate, written with the optional # prefix. */
+EXIT
--- a/userland/arch/aarch64/movk.S
+++ b/userland/arch/aarch64/movk.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movk */
+
+#include "common.h"
+
+ENTRY
+    movk x0, 0x4444, lsl 0
+    movk x0, 0x3333, lsl 16
+    movk x0, 0x2222, lsl 32
+    movk x0, 0x1111, lsl 48
+    ASSERT_EQ(x0, 0x1111222233334444)
+
+    /* Set a label (addresses are 48-bit) with immediates:
+     *
+     * * https://stackoverflow.com/questions/38570495/aarch64-relocation-prefixes
+     * * https://sourceware.org/binutils/docs-2.26/as/AArch64_002dRelocations.html
+     *
+     * This could be used if the label is too far away for
+     * adr relative addressing.
+     */
+    movz x0, :abs_g2:label     /* bits 32-47, overflow check */
+    movk x0, :abs_g1_nc:label  /* bits 16-31, no overflow check */
+    movk x0, :abs_g0_nc:label  /* bits  0-15, no overflow check */
+    adr x1, label
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/movn.S
+++ b/userland/arch/aarch64/movn.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movn */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x123456789ABCDEF0
+    movn x0, 0x8888, lsl 16
+    ASSERT_EQ(x0, 0xFFFFFFFF7777FFFF)
+EXIT
--- a/userland/arch/aarch64/pc.S
+++ b/userland/arch/aarch64/pc.S
@@ -0,0 +1,78 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
+
+#include "common.h"
+
+ENTRY
+#if 0
+    /* Unlike v7, we can't use PC like any other register in ARMv8,
+     * since it is not a general purpose register anymore.
+     *
+     * Only branch instructions can modify the PC.
+     *
+     * B1.2.1 "Registers in AArch64 state" says:
+     *
+     * Software cannot write directly to the PC. It
+     * can only be updated on a branch, exception entry or
+     * exception return.
+     */
+    ldr pc, =10f
+    FAIL
+10:
+#endif
+#if 0
+    mov x0, pc
+#endif
+
+    /* LDR PC-relative loads exist in ARMv8, but they have a separate encoding
+     * "LDR (literal)" instead of "LDR (immediate)":
+     * https://stackoverflow.com/questions/28638981/howto-write-pc-relative-adressing-on-arm-asm/54480999#54480999
+     */
+    ldr x0, pc_relative_ldr
+    b 1f
+pc_relative_ldr:
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Just for fun, we can also use relative numbers instead of labels.
+     * https://reverseengineering.stackexchange.com/questions/17666/how-does-the-ldr-instruction-work-on-arm/20567#20567
+     */
+    ldr x0, 0x8
+    b 1f
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Analogous for b with PC. */
+    mov x0, 0
+    /* Jumps over mov to ASSERT_EQ. */
+    b 8
+    mov x0, 1
+    ASSERT_EQ(x0, 0)
+
+    /* Trying to use the old "LDR (immediate)" PC-relative
+     * syntax does not work.
+     */
+#if 0
+    /* 64-bit integer or SP register expected at operand 2 -- `ldr x0,[pc]' */
+    ldr x0, [pc]
+#endif
+
+    /* There is however no analogue for str. TODO rationale? */
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,pc_relative_str' */
+    str x0, pc_relative_str
+#endif
+
+    /* You just have to use adr + "STR (register)". */
+    ldr x0, pc_relative_str
+    ASSERT_EQ(x0, 0x0)
+    adr x1, pc_relative_str
+    ldr x0, pc_relative_ldr
+    str x0, [x1]
+    ldr x0, pc_relative_str
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+EXIT
+.data
+pc_relative_str:
+    .quad 0x0000000000000000
--- a/userland/arch/aarch64/regs.S
+++ b/userland/arch/aarch64/regs.S
@@ -0,0 +1,47 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-registers */
+
+#include "common.h"
+
+ENTRY
+
+    /* 31 64-bit eXtended general purpose registers. */
+    mov x0, 0
+    mov x1, 1
+    mov x2, 2
+    mov x3, 3
+    mov x4, 4
+    mov x5, 5
+    mov x6, 6
+    mov x7, 7
+    mov x8, 8
+    mov x9, 9
+    mov x10, 10
+    mov x11, 11
+    mov x12, 12
+    mov x13, 13
+    mov x14, 14
+    mov x15, 15
+    mov x16, 16
+    mov x17, 17
+    mov x18, 18
+    mov x19, 19
+    mov x20, 20
+    mov x21, 21
+    mov x22, 22
+    mov x23, 23
+    mov x24, 24
+    mov x25, 25
+    mov x26, 26
+    mov x27, 27
+    mov x28, 28
+    mov x29, 29
+
+    /* x30 is the link register. BL stores the return address here. */
+    /*mov x30, 30*/
+
+    /* W form addresses the lower 4 bytes word, and zeroes the top. */
+    ldr x0, =0x1111222233334444
+    ldr x1, =0x5555666677778888
+    mov w0, w1
+    ASSERT_EQ(x0, 0x0000000077778888)
+EXIT
--- a/userland/arch/aarch64/ret.S
+++ b/userland/arch/aarch64/ret.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    bl inc
+    ASSERT_EQ(x0, 2)
+    bl inc2
+    ASSERT_EQ(x0, 3)
+    bl inc3
+    ASSERT_EQ(x0, 4)
+EXIT
+
+/* uint64_t inc(uint64_t i) { return i + 1; } */
+inc:
+    add x0, x0, 1
+    ret
+
+/* Same but explicit return register. */
+inc2:
+    add x0, x0, 1
+    ret x30
+
+/* Same but with br. */
+inc3:
+    add x0, x0, 1
+    br x30
--- a/userland/arch/aarch64/simd.S
+++ b/userland/arch/aarch64/simd.S
@@ -0,0 +1,86 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 4x 32-bit integer add.
+     *
+     * s stands for single == 32 bits.
+     *
+     * 1 in ld1 means to load just one register, see:
+     * https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving
+     */
+.data
+    u32_0:          .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
+    u32_1:          .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
+    u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
+.bss
+    u32_sum:        .skip 16
+.text
+    adr x0, u32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, u32_1
+    ld1 {v1.4s}, [x1]
+    add v2.4s, v0.4s, v1.4s
+    adr x0, u32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
+
+    /* 2x 64-bit integer add.
+     *
+     * d stands for double == 64 bits.
+     */
+.data
+    u64_0:          .quad 0xF1111111F1111111, 0xF2222222F2222222
+    u64_1:          .quad 0x1555555515555555, 0x1666666616666666
+    u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
+.bss
+    u64_sum: .skip 16
+.text
+    adr x0, u64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, u64_1
+    ld1 {v1.2d}, [x1]
+    add v2.2d, v0.2d, v1.2d
+    adr x0, u64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
+
+    /* 4x 32-bit float add.
+     *
+     * The only difference from the integer version
+     * is that we use fadd instead of add.
+     */
+.data
+    f32_0:          .float 1.5, 2.5,  3.5,  4.5
+    f32_1:          .float 5.5, 6.5,  7.5,  8.5
+    f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
+.bss
+    f32_sum: .skip 16
+.text
+    adr x0, f32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, f32_1
+    ld1 {v1.4s}, [x1]
+    fadd v2.4s, v0.4s, v1.4s
+    adr x0, f32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
+
+    /* 2x 64-bit float add. */
+.data
+    f64_0:          .double 1.5, 2.5
+    f64_1:          .double 5.5, 6.5
+    f64_sum_expect: .double 7.0, 9.0
+.bss
+    f64_sum: .skip 16
+.text
+    adr x0, f64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, f64_1
+    ld1 {v1.2d}, [x1]
+    fadd v2.2d, v0.2d, v1.2d
+    adr x0, f64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/simd_interleave.S
+++ b/userland/arch/aarch64/simd_interleave.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving */
+
+#include "common.h"
+
+ENTRY
+.data
+    u32_interleave: .word \
+        0x11111111, 0x55555555, \
+        0x22222222, 0x66666666, \
+        0x33333333, 0x77777777, \
+        0x44444444, 0x88888888
+    u32_interleave_sum_expect: .word \
+        0x66666666, \
+        0x88888888, \
+        0xAAAAAAAA, \
+        0xCCCCCCCC
+.bss
+    u32_interleave_sum: .skip 16
+.text
+    adr x0, u32_interleave
+    ld2 {v0.4s, v1.4s}, [x0]
+    add v2.4s, v0.4s, v1.4s
+    adr x0, u32_interleave_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_interleave_sum, u32_interleave_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/str.S
+++ b/userland/arch/aarch64/str.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-str */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, myvar
+    ASSERT_EQ(x0, 0x12346789ABCDEF0)
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,myvar' */
+    str x0, myvar
+#endif
+EXIT
+    myvar: .quad 0x12346789ABCDEF0
--- a/userland/arch/aarch64/ubfm.S
+++ b/userland/arch/aarch64/ubfm.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfm */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    // lsr alias: imms == 63
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 16, 63
+    ASSERT_EQ(x1, 0x0000112233445566)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 32, 63
+    ASSERT_EQ(x1, 0x0000000011223344)
+EXIT
--- a/userland/arch/aarch64/ubfx.S
+++ b/userland/arch/aarch64/ubfx.S
@@ -0,0 +1,15 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfx */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 16
+    ASSERT_EQ(x1, 0x0000000000006677)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 32
+    ASSERT_EQ(x1, 0x0000000044556677)
+EXIT
--- a/userland/arch/aarch64/x31.S
+++ b/userland/arch/aarch64/x31.S
@@ -0,0 +1,51 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#x31 */
+
+#include "common.h"
+
+ENTRY
+    /* ERROR: can never use the name x31. */
+#if 0
+    mov x31, 31
+#endif
+
+    /* mov (register) is an alias for ORR, which accepts xzr. */
+    mov x0, 1
+    mov x0, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* Same encoding as the mov version. */
+    mov x0, 1
+    orr x0, xzr, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* So, orr, which is not an alias, can only take xzr, not sp. */
+#if 0
+    orr sp, sp, sp
+#endif
+
+    /* Zero register discards result if written to. */
+    mov x0, 1
+    orr xzr, x0, x0
+    ASSERT_EQ(xzr, 0)
+
+    /* MOV (to/from SP) is an alias for ADD (immediate). */
+    mov x0, sp
+    mov sp, 1
+    /* Alias to add. */
+    mov x1, sp
+    /* Exact same encoding as above. */
+    add x1, sp, 0
+    ASSERT_EQ(x1, 1)
+    mov sp, x0
+
+    /* So, ADD (immediate), which is not an alias, can only take sp, not xzr. */
+#if 0
+    /* Error: integer register expected in the extended/shifted operand register at operand 3 -- `add xzr,xzr,1' */
+    add xzr, xzr, 1
+#endif
+
+    /* Note however that ADD (register), unlike ADD (immediate),
+     * does not say anything about SP, and so does accept xzr just fine.
+     */
+    add xzr, xzr, xzr
+EXIT
--- a/userland/arch/arm/add.S
+++ b/userland/arch/arm/add.S
@@ -0,0 +1,58 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+
+    /* Immediate encoding.
+     *
+     * r1 = r0 + 2
+     */
+    mov r0, 1
+    /* r1 = r0 + 2 */
+    add r1, r0, 2
+    ASSERT_EQ(r1, 3)
+
+    /* If src == dest, we can omit one of them.
+     *
+     * r0 = r0 + 2
+     */
+    mov r0, 1
+    add r0, 2
+    ASSERT_EQ(r0, 3)
+
+    /* Same as above but explicit. */
+    mov r0, 1
+    add r0, r0, 2
+    ASSERT_EQ(r0, 3)
+
+#if 0
+    /* But we cannot omit the register if there is a shift when using .syntax unified:
+     * https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes
+     */
+    .syntax unified
+    /* Error: garbage following instruction */
+    add r0, r1, lsl 1
+    /* OK */
+    add r0, r0, r1, lsl 1
+#endif
+
+    /* Register encoding.
+     *
+     * r2 = r0 + r1
+     */
+    mov r0, 1
+    mov r1, 2
+    add r2, r0, r1
+    ASSERT_EQ(r2, 3)
+
+    /* Register encoding, omit implicit register.
+     *
+     * r1 = r1 + r0
+     */
+    mov r0, 1
+    mov r1, 2
+    add r1, r0
+    ASSERT_EQ(r1, 3)
+
+EXIT
--- a/userland/arch/arm/address_modes.S
+++ b/userland/arch/arm/address_modes.S
@@ -0,0 +1,51 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#addressing-modes */
+
+#include "common.h"
+
+ENTRY
+
+    /* Offset mode with immediate. Add 4 to the address register, which ends up
+     * reading myvar2 instead of myvar.
+     */
+    adr r0, myvar
+    ldr r1, [r0, 4]
+    ASSERT_EQ(r1, 0x9ABCDEF0)
+    /* r0 was not modified. */
+    ASSERT_EQ(r0, myvar)
+
+    /* Pre-indexed mode */
+    adr r0, myvar
+    ldr r1, [r0, 4]!
+    ASSERT_EQ(r1, 0x9ABCDEF0)
+    /* r0 was modified. */
+    ASSERT_EQ(r0, myvar2)
+
+    /* Post-indexed mode */
+    adr r0, myvar
+    ldr r1, [r0], 4
+    ASSERT_EQ(r1, 0x12345678)
+    /* r0 was modified. */
+    ASSERT_EQ(r0, myvar2)
+
+    /* Offset in register. */
+    adr r0, myvar
+    mov r1, 4
+    ldr r2, [r0, r1]
+    ASSERT_EQ(r2, 0x9ABCDEF0)
+
+    /* Offset in shifted register:
+     * r2 =
+     * *(r0 + (r1 << 1))
+     * == *(myvar + (2 << 1))
+     * == *(myvar + 4)
+     */
+    adr r0, myvar
+    mov r1, 2
+    ldr r2, [r0, r1, lsl 1]
+    ASSERT_EQ(r2, 0x9ABCDEF0)
+
+EXIT
+myvar:
+    .word 0x12345678
+myvar2:
+    .word 0x9ABCDEF0
--- a/userland/arch/arm/adr.S
+++ b/userland/arch/arm/adr.S
@@ -0,0 +1,33 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+.data
+data_label:
+    .word 0x1234678
+ENTRY
+    adr r0, label /* PC-relative address of label. */
+    /* objdump tells us that this uses the literal pool,
+     * it does not get converted to adr, which is the better
+     * alternative here.
+     */
+    ldr r1, =label
+    adrl r2, label /* GAS pseudo-instruction: like adr, but with a longer range. */
+label:
+    ASSERT_EQ_REG(r0, r1)
+    ASSERT_EQ_REG(r0, r2)
+
+#if 0
+    /* Error: symbol .data is in a different section.
+     *
+     * It works however in ARMv8.
+     * I think this means that there is no relocation type
+     * that takes care of this encoding in ARMv7, but there
+     * is one in ARMv8.
+     *
+     * If you have no idea what I'm talking about, read this:
+     * https://stackoverflow.com/questions/3322911/what-do-linkers-do/33690144#33690144
+     */
+    adr r1, data_label
+#endif
+EXIT
--- a/userland/arch/arm/and.S
+++ b/userland/arch/arm/and.S
@@ -0,0 +1,27 @@
+/* Bitwise AND. */
+
+#include "common.h"
+
+ENTRY
+
+    /* 0x00 & 0xFF == 0x00 */
+    mov r0, 0x00
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0x00)
+
+    /* 0x0F & 0xF0 == 0x00 */
+    mov r0, 0x0F
+    and r0, 0xF0
+    ASSERT_EQ(r0, 0x00)
+
+    /* 0x0F & 0xFF == 0x0F */
+    mov r0, 0x0F
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0x0F)
+
+    /* 0xF0 & 0xFF == 0xF0 */
+    mov r0, 0xF0
+    and r0, 0xFF
+    ASSERT_EQ(r0, 0xF0)
+
+EXIT
--- a/userland/arch/arm/b.S
+++ b/userland/arch/arm/b.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#b */
+
+#include "common.h"
+ENTRY
+    /* Jump over the fail. 26-bit PC-relative. */
+    b ok
+    FAIL
+ok:
+EXIT
--- a/userland/arch/arm/beq.S
+++ b/userland/arch/arm/beq.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#beq */
+
+#include "common.h"
+
+ENTRY
+
+    /* Smaller*/
+    mov r0, 1
+    cmp r0, 2
+    ASSERT(ble)
+    ASSERT(blt)
+    ASSERT(bne)
+
+    /* Equal. */
+    mov r1, 0
+    cmp r1, 0
+    ASSERT(beq)
+    ASSERT(bge)
+    ASSERT(ble)
+
+    /* Greater. */
+    mov r0, 2
+    cmp r0, 1
+    ASSERT(bge)
+    ASSERT(bgt)
+    ASSERT(bne)
+
+EXIT
--- a/userland/arch/arm/bfi.S
+++ b/userland/arch/arm/bfi.S
@@ -0,0 +1,10 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
+
+#include "common.h"
+
+ENTRY
+    ldr r0, =0x11223344
+    ldr r1, =0xFFFFFFFF
+    bfi r1, r0, 8, 16
+    ASSERT_EQ(r1, 0xFF3344FF)
+EXIT
--- a/userland/arch/arm/bic.S
+++ b/userland/arch/arm/bic.S
@@ -0,0 +1,10 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bic */
+
+#include "common.h"
+
+ENTRY
+    /* 0x0F & ~0x55 == 0x0F & 0xAA == 0x0A */
+    mov r0, 0x0F
+    bic r0, 0x55
+    ASSERT_EQ(r0, 0x0A)
+EXIT
--- a/userland/arch/arm/bl.S
+++ b/userland/arch/arm/bl.S
@@ -0,0 +1,14 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
+
+#include "common.h"
+
+ENTRY
+    mov r0, 1
+    bl inc
+    ASSERT_EQ(r0, 2)
+EXIT
+
+/* int inc(int i) { return i + 1; } */
+inc:
+    add r0, 1
+    bx lr
--- a/userland/arch/arm/build
+++ b/userland/arch/arm/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/add.c
+++ b/userland/arch/arm/c/add.c
@@ -0,0 +1,17 @@
+/* 1 + 2 == 3 */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint32_t in0 = 1, in1 = 2, out;
+    __asm__ (
+        "add %[out], %[in0], %[in1];"
+        : [out] "=r" (out)
+        : [in0] "r"  (in0),
+          [in1] "r"  (in1)
+    );
+    assert(in0 == 1);
+    assert(in1 == 2);
+    assert(out == 3);
+}
--- a/userland/arch/arm/c/build
+++ b/userland/arch/arm/c/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/freestanding/build
+++ b/userland/arch/arm/c/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/c/freestanding/hello.c
+++ b/userland/arch/arm/c/freestanding/hello.c
@@ -0,0 +1,35 @@
+#include <inttypes.h>
+
+void _start(void) {
+    uint32_t exit_status;
+
+    /* write(stdout, msg, len): arguments go in r0-r2, syscall number in r7. */
+    {
+        char msg[] = "hello\n";
+        uint32_t syscall_return;
+        register uint32_t r0 __asm__ ("r0") = 1; /* stdout */
+        register char *r1 __asm__ ("r1") = msg;
+        register uint32_t r2 __asm__ ("r2") = sizeof(msg) - 1; /* skip the NUL */
+        register uint32_t r7 __asm__ ("r7") = 4; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (r0)
+            : "r" (r1), "r" (r2), "r" (r7)
+            : "memory"
+        );
+        syscall_return = r0; /* r0 is both the first argument and the result */
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit(exit_status): 0 if the whole message was written, 1 otherwise. */
+    {
+        register uint32_t r0 __asm__ ("r0") = exit_status;
+        register uint32_t r7 __asm__ ("r7") = 1; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (r0)
+            : "r" (r7)
+            :
+        );
+    }
+}
--- a/userland/arch/arm/c/inc.c
+++ b/userland/arch/arm/c/inc.c
@@ -0,0 +1,15 @@
+/* Increment a variable in inline assembly. */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint32_t my_local_var = 1;
+    __asm__ (
+        "add %[my_local_var], %[my_local_var], #1;"
+        : [my_local_var] "+r" (my_local_var)
+        :
+        :
+    );
+    assert(my_local_var == 2);
+}
--- a/userland/arch/arm/c/inc_float.c
+++ b/userland/arch/arm/c/inc_float.c
@@ -0,0 +1,28 @@
+/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly */
+
+#include <assert.h>
+
+int main(void) {
+    float my_float = 1.5;
+    __asm__ (
+        "vmov s0, 1.0;"
+        "vadd.f32 %[my_float], %[my_float], s0;"
+        : [my_float] "+t" (my_float)
+        :
+        : "s0"
+    );
+    assert(my_float == 2.5);
+
+    /* Undocumented %P
+     * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89482
+     */
+    double my_double = 1.5;
+    __asm__ (
+        "vmov.f64 d0, 1.0;"
+        "vadd.f64 %P[my_double], %P[my_double], d0;"
+        : [my_double] "+w" (my_double)
+        :
+        : "d0"
+    );
+    assert(my_double == 2.5);
+}
--- a/userland/arch/arm/c/inc_memory.c
+++ b/userland/arch/arm/c/inc_memory.c
@@ -0,0 +1,32 @@
+/* Like inc.c but less good since we do more work ourselves.
+ *
+ * Just doing this to test out the "m" memory constraint.
+ *
+ * GCC 8.2.0 -O0 assembles ldr line to:
+ *
+ * ....
+ * ldr r0, [fp, #-12]
+ * ....
+ *
+ * and `-O3` assembles to:
+ *
+ * ....
+ * ldr r0, [sp]
+ * ....
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint32_t my_local_var = 1;
+    __asm__ (
+        "ldr r0, %[my_local_var];"
+        "add r0, r0, #1;"
+        "str r0, %[my_local_var];"
+        : [my_local_var] "+m" (my_local_var)
+        :
+        : "r0"
+    );
+    assert(my_local_var == 2);
+}
--- a/userland/arch/arm/c/inc_memory_global.c
+++ b/userland/arch/arm/c/inc_memory_global.c
@@ -0,0 +1,25 @@
+/* GCC 8.2.0 -O0 and -O3 assembles ldr line to:
+ *
+ * ....
+ * movw r3, #<lower address part>
+ * movt r3, #<higher address part>
+ * ldr r0, [r3]
+ * ....
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+uint32_t my_global_var = 1;
+
+int main(void) {
+    __asm__ (
+        "ldr r0, %[my_global_var];"
+        "add r0, r0, #1;"
+        "str r0, %[my_global_var];"
+        : [my_global_var] "+m" (my_global_var)
+        :
+        : "r0"
+    );
+    assert(my_global_var == 2);
+}
--- a/userland/arch/arm/c/reg_var.c
+++ b/userland/arch/arm/c/reg_var.c
@@ -0,0 +1,38 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register uint32_t r0 __asm__ ("r0");
+    register uint32_t r1 __asm__ ("r1");
+    uint32_t new_r0;
+    uint32_t new_r1;
+    {
+        /* We must set the registers immediately before calling,
+         * without making any function calls in between.
+         */
+        r0 = 1;
+        r1 = 2;
+        __asm__ (
+            /* We intentionally use an explicit r0 and r1 here,
+            * just to illustrate that we are certain that the
+            * r0 variable will go in r0. Real code would never do this.
+            */
+            "add %[r0], r0, #1;"
+            "add %[r1], r1, #1;"
+            /* We have to specify r0 in the constraints.*/
+            : [r0] "+r" (r0),
+              [r1] "+r" (r1)
+            :
+            :
+        );
+        /* When we are done, we must immediately assign
+         * the register variables to regular variables.
+         */
+        new_r0 = r0;
+        new_r1 = r1;
+    }
+    assert(new_r0 == 2);
+    assert(new_r1 == 3);
+}
--- a/userland/arch/arm/c_from_asm.S
+++ b/userland/arch/arm/c_from_asm.S
@@ -0,0 +1,59 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
+
+#include "common.h"
+
+.data
+puts_s:
+    .asciz "hello puts"
+printf_format:
+    .asciz "hello printf %x\n"
+my_array_0:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+my_array_1:
+    .word 0x55555555, 0x66666666, 0x77777777, 0x88888888
+
+ENTRY
+    /* puts("hello puts") */
+    /* r0 is first argument. */
+    ldr r0, =puts_s
+    bl puts
+    /* Check return value >= 0 for success. */
+    cmp r0, 0
+    ASSERT(bge)
+
+    /* printf */
+    ldr r0, =printf_format
+    ldr r1, =0x12345678
+    bl printf
+    cmp r0, 0
+    ASSERT(bge)
+
+    /* memcpy and memcmp. */
+
+        /* Smaller. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcmp
+        cmp r0, 0
+        ASSERT(blt)
+
+        /* Copy. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcpy
+
+        /* Equal. */
+        ldr r0, =my_array_0
+        ldr r1, =my_array_1
+        ldr r2, =0x10
+        bl memcmp
+        ASSERT_EQ(r0, 0)
+
+    /* exit(0) */
+    mov r0, 0
+    bl exit
+
+    /* Never reached, just for the fail symbol. */
+EXIT
--- a/userland/arch/arm/clz.S
+++ b/userland/arch/arm/clz.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    ldr r0, =0x7FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 1)
+
+    ldr r0, =0x3FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 2)
+
+    ldr r0, =0x1FFFFFFF
+    clz r1, r0
+    ASSERT_EQ(r1, 3)
+EXIT
--- a/userland/arch/arm/comments.S
+++ b/userland/arch/arm/comments.S
@@ -0,0 +1,14 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
+
+#include "common.h"
+ENTRY
+    # mycomment
+    @ mycomment
+    /* # only works at the beginning of the line.
+     * Error: garbage following instruction -- `nop #comment'
+     */
+#if 0
+    nop # mycomment
+#endif
+    nop @ mycomment
+EXIT
--- a/userland/arch/arm/common_arch.h
+++ b/userland/arch/arm/common_arch.h
@@ -0,0 +1,71 @@
+#ifndef COMMON_ARCH_H
+#define COMMON_ARCH_H
+
+.syntax unified
+
+/* Assert that a register equals a constant.
+ * * reg: the register to check. Can be r0-r10, but not r11. r11 is overwritten.
+ * * const: the constant to compare to. Only works for literals or labels, not for registers.
+ *          For register / register comparison, use ASSERT_EQ_REG.
+ */
+#define ASSERT_EQ(reg, const) \
+    ldr r11, =const; \
+	cmp reg, r11; \
+	ASSERT(beq); \
+;
+
+/* Assert that two arrays are the same. */
+#define ASSERT_MEMCMP(s1, s2, n) \
+	MEMCMP(s1, s2, n); \
+	ASSERT_EQ(r0, 0); \
+;
+
+/* Store all callee saved registers, and LR in case we make further BL calls.
+ *
+ * Also save the input arguments r0-r3 on the stack, so we can access them later on,
+ * despite those registers being overwritten.
+ */
+#define ENTRY \
+.text; \
+.global asm_main; \
+asm_main: \
+    stmdb sp!, {r0-r12, lr}; \
+asm_main_after_prologue: \
+;
+
+/* Meant to be called at the end of ENTRY.
+ *
+ * Branching to "fail" makes tests fail with exit status 1.
+ *
+ * If EXIT is reached, the program ends successfully.
+ *
+ * Restore LR and bx jump to it to return from asm_main.
+ */
+#define EXIT \
+    mov r0, 0; \
+    mov r1, 0; \
+    b pass; \
+fail: \
+    ldr r1, [sp]; \
+    str r0, [r1]; \
+    mov r0, 1; \
+pass: \
+    add sp, 16; \
+    ldmia sp!, {r4-r12, lr}; \
+    bx lr; \
+;
+
+/* Always fail. */
+#define FAIL \
+    ldr r0, =__LINE__; \
+    b fail; \
+;
+
+#define MEMCMP(s1, s2, n) \
+    ldr r0, =s1; \
+    ldr r1, =s2; \
+    ldr r2, =n; \
+    bl memcmp; \
+;
+
+#endif
--- a/userland/arch/arm/cond.S
+++ b/userland/arch/arm/cond.S
@@ -0,0 +1,16 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#conditional-execution */
+
+#include "common.h"
+
+ENTRY
+    mov r0, 0
+    mov r1, 1
+    cmp r0, 1
+    /* Previous cmp failed, skip this operation. */
+    addeq r1, 1
+    ASSERT_EQ(r1, 1)
+    cmp r0, 0
+    /* Previous passed, do this operation. */
+    addeq r1, 1
+    ASSERT_EQ(r1, 2)
+EXIT
--- a/userland/arch/arm/empty.S
+++ b/userland/arch/arm/empty.S
@@ -0,0 +1 @@
+../empty.S
--- a/userland/arch/arm/fail.S
+++ b/userland/arch/arm/fail.S
@@ -0,0 +1 @@
+../fail.S
--- a/userland/arch/arm/freestanding/build
+++ b/userland/arch/arm/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/arm/freestanding/hello.S
+++ b/userland/arch/arm/freestanding/hello.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
+
+.syntax unified
+.text
+.global _start
+_start:
+asm_main_after_prologue:
+    /* write */
+    mov r0, 1     /* stdout */
+    adr r1, msg   /* buffer */
+    ldr r2, =len  /* len */
+    mov r7, 4     /* syscall number */
+    svc 0
+
+    /* exit */
+    mov r0, 0     /* exit status */
+    mov r7, 1     /* syscall number */
+    svc 0
+msg:
+    .ascii "hello\n"
+len = . - msg
--- a/userland/arch/arm/hello_driver.S
+++ b/userland/arch/arm/hello_driver.S
@@ -0,0 +1,23 @@
+/* Minimal example using driver.
+ *
+ * Controls the exit status of the program.
+ */
+
+.syntax unified
+.text
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+
+    /* Set the return value according to the ARM calling convention. */
+    mov r0, 0
+
+    /* Try some whacky value to see tests break. */
+    /*mov r0, 77*/
+
+    /* Branch to the address at register lr.
+     * That is the return address which was put there by the C driver (likely with a bl).
+     *
+     * X means eXchange encoding from thumb back to ARM, which is what the driver uses.
+     */
+    bx lr
--- a/userland/arch/arm/immediates.S
+++ b/userland/arch/arm/immediates.S
@@ -0,0 +1,24 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
+
+#include "common.h"
+
+ENTRY
+    /* Divided is the default. We hack it in common.h however. */
+.syntax divided
+    /* Bare immediates fail in divided syntax; unified below takes bare, # and $ forms. */
+#if 0
+    mov r0, 1
+    mov r0, 0x1
+#endif
+    mov r0, #1
+    mov r0, #0x1
+    mov r0, $1
+    mov r0, $0x1
+.syntax unified
+    mov r0, 1
+    mov r0, 0x1
+    mov r0, #1
+    mov r0, #0x1
+    mov r0, $1
+    mov r0, $0x1
+EXIT
--- a/userland/arch/arm/inc_array.S
+++ b/userland/arch/arm/inc_array.S
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#loop-over-array */
+
+#include "common.h"
+
+#define NELEM 4
+#define ELEM_SIZE 4
+
+.data;
+my_array:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+my_array_expect:
+    .word 0x11111112, 0x22222223, 0x33333334, 0x44444445
+
+ENTRY
+    /* Increment. */
+    ldr r0, =my_array
+    mov r1, NELEM
+increment:
+    ldr r2, [r0]
+    add r2, 1
+    /* Post index usage. */
+    str r2, [r0], ELEM_SIZE
+    sub r1, 1
+    cmp r1, 0
+    bne increment
+    ASSERT_MEMCMP(my_array, my_array_expect, 0x10)
+EXIT
--- a/userland/arch/arm/ldmia.S
+++ b/userland/arch/arm/ldmia.S
@@ -0,0 +1,62 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldmia */
+
+#include "common.h"
+
+#define NELEM 4
+#define ELEM_SIZE 4
+
+.data;
+my_array_0:
+    .word 0x11111111, 0x22222222, 0x33333333, 0x44444444
+my_array_1:
+    .word 0x55555555, 0x66666666, 0x77777777, 0x88888888
+
+ENTRY
+
+    /* Load r1, r2, r3 and r4 starting from the address in r0. Don't change r0 */
+    ldr r0, =my_array_0
+    ldr r1, =0
+    ldr r2, =0
+    ldr r3, =0
+    ldr r4, =0
+    ldmia r0, {r1-r4}
+    ASSERT_EQ(r0, my_array_0)
+    ASSERT_EQ(r1, 0x11111111)
+    ASSERT_EQ(r2, 0x22222222)
+    ASSERT_EQ(r3, 0x33333333)
+    ASSERT_EQ(r4, 0x44444444)
+
+    /* Swapping the order of r1 and r2 on the mnemonic makes no difference to load order.
+     *
+     * But it gives an assembler warning, so we won't do it by default:
+     *
+     *  ldmia.S: Assembler messages:
+     *  ldmia.S:32: Warning: register range not in ascending order
+     */
+#if 0
+    ldr r0, =my_array_0
+    ldr r1, =0
+    ldr r2, =0
+    ldmia r0, {r2,r1}
+    ASSERT_EQ(r1, 0x11111111)
+    ASSERT_EQ(r2, 0x22222222)
+#endif
+
+    /* Modify the array */
+    ldr r0, =my_array_1
+    ldr r1, =0x55555555
+    ldr r2, =0x66666666
+    ldr r3, =0x77777777
+    ldr r4, =0x88888888
+    stmdb r0, {r1-r4}
+
+    /* Verify that my_array_0 changed and is equal to my_array_1. */
+    MEMCMP(my_array_0, my_array_1, 0x10)
+    ASSERT_EQ(r0, 0)
+
+    /* Load registers and increment r0. */
+    ldr r0, =my_array_0
+    ldmia r0!, {r1-r4}
+    ASSERT_EQ(r0, my_array_1)
+
+EXIT
--- a/userland/arch/arm/ldr_pseudo.S
+++ b/userland/arch/arm/ldr_pseudo.S
@@ -0,0 +1,65 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldr-pseudo-instruction */
+
+#include "common.h"
+
+ENTRY
+
+    /* Mnemonic for a PC relative load:
+     *
+     * ....
+     * ldr r0, [pc, offset]
+     * r0 = myvar
+     * ....
+     */
+    ldr r0, myvar
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* Mnemonic PC relative load with an offset.
+     * Load myvar2 instead of myvar.
+     */
+    ldr r0, myvar + 4
+    ASSERT_EQ(r0, 0x9ABCDEF0)
+
+    /* First store the address in r0 using a magic =myvar, which creates
+     * a new variable containing the address and PC-relative addresses it
+     * https://stackoverflow.com/questions/17214962/what-is-the-difference-between-label-equals-sign-and-label-brackets-in-ar
+     *
+     * Using the adr instruction would likely be better for this application however.
+     *
+     * ....
+     * r0 = &myvar
+     * r1 = *r0
+     * ....
+     */
+    ldr r0, =myvar
+    ldr r1, [r0]
+    ASSERT_EQ(r1, 0x12345678)
+
+    /* More efficiently, use r0 as the address to read, and write to r0 itself. */
+    ldr r0, =myvar
+    ldr r0, [r0]
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* Same as =myvar but store a constant to a register.
+     * Can also be done with movw and movt. */
+    ldr r0, =0x11112222
+    ASSERT_EQ(r0, 0x11112222)
+
+    /* We can also use the GAS :lower16: and :upper16: operators with movw and movt
+     * to load the address of myvar into r0 with two immediates.
+     *
+     * This results in one extra 4 byte instruction read from memory,
+     * and one less data read, so it is likely more cache efficient.
+     *
+     * https://sourceware.org/binutils/docs-2.19/as/ARM_002dRelocations.html
+     */
+    movw r0, #:lower16:myvar
+    movt r0, #:upper16:myvar
+    ldr r1, [r0]
+    ASSERT_EQ(r1, 0x12345678)
+
+EXIT
+myvar:
+    .word 0x12345678
+myvar2:
+    .word 0x9ABCDEF0
--- a/userland/arch/arm/ldrb.S
+++ b/userland/arch/arm/ldrb.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
+
+#include "common.h"
+
+ENTRY
+    ldr r0, =myvar
+    mov r1, 0x0
+    ldrb r1, [r0]
+    ASSERT_EQ(r1, 0x00000078)
+EXIT
+myvar:
+    .word 0x12345678
--- a/userland/arch/arm/ldrh.S
+++ b/userland/arch/arm/ldrh.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
+
+#include "common.h"
+
+ENTRY
+    ldr r0, =myvar
+    mov r1, 0x0
+    ldrh r1, [r0]
+    ASSERT_EQ(r1, 0x00005678)
+EXIT
+myvar:
+    .word 0x12345678
--- a/userland/arch/arm/mov.S
+++ b/userland/arch/arm/mov.S
@@ -0,0 +1,19 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#mov */
+
+#include "common.h"
+
+ENTRY
+
+    /* Immediate. */
+    mov r0, 0
+    ASSERT_EQ(r0, 0)
+    mov r0, 1
+    ASSERT_EQ(r0, 1)
+
+    /* Register. */
+    mov r0, 0
+    mov r1, 1
+    mov r1, r0
+    ASSERT_EQ(r1, 0)
+
+EXIT
--- a/userland/arch/arm/movw.S
+++ b/userland/arch/arm/movw.S
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movw-and-movt */
+
+#include "common.h"
+
+ENTRY
+
+    /* movt (top) and movw (TODO what is w) set the higher
+     * and lower 16 bits of the register.
+     */
+    movw r0, 0xFFFF
+    movt r0, 0x1234
+    add r0, 1
+    ASSERT_EQ(r0, 0x12350000)
+
+    /* movw also zeroes out the top bits, allowing small 16-bit
+     * C constants to be assigned in a single instruction.
+     *
+     * It differs from mov because mov can only encode 8 bits
+     * at a time, while movw can encode 16.
+     *
+     * movt does not modify the lower bits however.
+     */
+    ldr r0, =0x12345678
+    movw r0, 0x1111
+    ASSERT_EQ(r0, 0x00001111)
+
+EXIT
--- a/userland/arch/arm/mul.S
+++ b/userland/arch/arm/mul.S
@@ -0,0 +1,12 @@
+/* Multiplication. */
+
+#include "common.h"
+
+ENTRY
+    /* 2 * 3 = 6 */
+    mov r0, 0
+    mov r1, 2
+    mov r2, 3
+    mul r1, r2
+    ASSERT_EQ(r1, 6)
+EXIT
--- a/userland/arch/arm/nop.S
+++ b/userland/arch/arm/nop.S
@@ -0,0 +1,32 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#nop */
+
+#include "common.h"
+
+ENTRY
+    /* Disassembles as:
+     *
+     * ....
+     * nop {0}
+     * ....
+     *
+     * TODO what is the `{0}`?
+     */
+    nop
+
+    /* Disassembles as:
+     *
+     * ....
+     * nop ; (mov r0, r0)
+     * ....
+     */
+    mov r0, r0
+
+    /* Disassemble as mov. TODO Why not as nop as in `mov r0, r0`?
+     * Do they have any effect?
+     */
+    mov r1, r1
+    mov r8, r8
+
+    /* And there are other nops as well? Disassembles as `and`. */
+    and r0, r0, r0
+EXIT
--- a/userland/arch/arm/push.S
+++ b/userland/arch/arm/push.S
@@ -0,0 +1,31 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ldmia */
+
+#include "common.h"
+
+ENTRY
+
+    /* Save sp before push. */
+    mov r0, sp
+
+    /* Push. */
+    mov r1, 1
+    mov r2, 2
+    push {r1, r2}
+
+    /* Save sp after push. */
+    mov r1, sp
+
+    /* Restore. */
+    mov r3, 0
+    mov r4, 0
+    pop {r3, r4}
+    ASSERT_EQ(r3, 1)
+    ASSERT_EQ(r4, 2)
+
+    /* Check that stack pointer moved down by 8 bytes
+     * (2 registers x 4 bytes each).
+     */
+    sub r0, r1
+    ASSERT_EQ(r0, 8)
+
+EXIT
--- a/userland/arch/arm/rbit.S
+++ b/userland/arch/arm/rbit.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#rbit */
+
+#include "common.h"
+
+ENTRY
+    ldr r0,      =0b00000001001000110100010101100101
+    rbit r1, r0
+    ASSERT_EQ(r1, 0b10100110101000101100010010000000)
+EXIT
--- a/userland/arch/arm/regs.S
+++ b/userland/arch/arm/regs.S
@@ -0,0 +1,69 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
+
+#include "common.h"
+
+ENTRY
+
+    /* 13 general purpose registers. */
+    mov r0, 0
+    mov r1, 1
+    mov r2, 2
+    mov r3, 3
+    mov r4, 4
+    mov r5, 5
+    mov r6, 6
+    mov r7, 7
+    mov r8, 8
+    mov r9, 9
+    mov r10, 10
+    mov r11, 11
+    mov r12, 12
+
+    /* * r11: aliased to FP (frame pointer, debug stack trace usage only)
+     * +
+     * I think FP is only a convention with no instruction impact, but TODO:
+     * not mentioned on AAPCS. aarch64 AAPCS mentions it though.
+     * * r13: aliased to SP (stack pointer), what push / pop use
+     * * r14: aliased to LR (link register), what bl writes the return address to
+     * * r15: aliased to PC (program counter), contains the current instruction address
+     *
+     * In ARMv8, SP and PC have dedicated registers in addition to
+     * the 31 general purpose ones. LR is still general purpose as before.
+     *
+     * In ARMv7 however, SP, LR and PC are regular numbered registers, so it is
+     * possible to use them in any place other registers may be used.
+     *
+     * This is not possible in ARMv8 anymore.
+     *
+     * For example, we can load an address into PC, which is very similar to what B / BX does:
+     * https://stackoverflow.com/questions/32304646/arm-assembly-branch-to-address-inside-register-or-memory/54145818#54145818
+     */
+    ldr pc, =10f
+    FAIL
+10:
+
+    /* Same with r15, which is the same as pc. */
+    ldr r15, =10f
+    FAIL
+10:
+
+    /* Another example with mov reading from pc. */
+pc_addr:
+    mov r0, pc
+    /* Why sub 8:
+     * https://stackoverflow.com/questions/24091566/why-does-the-arm-pc-register-point-to-the-instruction-after-the-next-one-to-be-e
+     */
+    sub r0, r0, 8
+
+    /* A pc-relative load also works just like with any other base register. */
+    ldr r0, [pc]
+    b 1f
+    .word 0x12345678
+1:
+    ASSERT_EQ(r0, 0x12345678)
+
+    /* We can also use fp in GNU GAS assembly. */
+    mov r11, 0
+    mov fp, 1
+    ASSERT_EQ(r11, 1)
+EXIT
--- a/userland/arch/arm/rev.S
+++ b/userland/arch/arm/rev.S
@@ -0,0 +1,15 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* All bytes in register. */
+    ldr r0, =0x11223344
+    rev r1, r0
+    ASSERT_EQ(r1, 0x44332211)
+
+    /* Groups of 16-bits. */
+    ldr r0, =0x11223344
+    rev16 r1, r0
+    ASSERT_EQ(r1, 0x22114433)
+EXIT
--- a/userland/arch/arm/s_suffix.S
+++ b/userland/arch/arm/s_suffix.S
@@ -0,0 +1,35 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#s-suffix */
+
+#include "common.h"
+
+ENTRY
+
+    /* Result is 0, set beq. */
+    movs r0, 0
+    ASSERT(beq)
+
+    /* The opposite. */
+    movs r0, 1
+    ASSERT(bne)
+
+    /* mov without s does not set the status. */
+    movs r0, 0
+    mov r0, 1
+    ASSERT(beq)
+
+    /* movs still moves... */
+    mov r0, 0
+    movs r0, 1
+    ASSERT_EQ(r0, 1)
+
+    /* add: the result is 0. */
+    mov r0, 1
+    adds r0, -1
+    ASSERT(beq)
+
+    /* add: result non 0. */
+    mov r0, 1
+    adds r0, 1
+    ASSERT(bne)
+
+EXIT
--- a/userland/arch/arm/shift.S
+++ b/userland/arch/arm/shift.S
@@ -0,0 +1,79 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes */
+
+#include "common.h"
+
+ENTRY
+
+    /* lsl */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, lsl 8
+    ldr r2, =0xF00FFF00
+    ASSERT_EQ_REG(r1, r2)
+
+    /* lsr */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, lsr 8
+    ldr r2, =0x00FFF00F
+    ASSERT_EQ_REG(r1, r2)
+
+    /* ror */
+    ldr r0, =0xFFF00FFF
+    mov r1, r0, ror 8
+    ldr r2, =0xFFFFF00F
+    ASSERT_EQ_REG(r1, r2)
+
+    /* asr negative */
+    ldr r0, =0x80000008
+    mov r1, r0, asr 1
+    ldr r2, =0xC0000004
+    ASSERT_EQ_REG(r1, r2)
+
+    /* asr positive */
+    ldr r0, =0x40000008
+    mov r1, r0, asr 1
+    ldr r2, =0x20000004
+    ASSERT_EQ_REG(r1, r2)
+
+    /* There are also direct shift mnemonics for the mov shifts.
+     *
+     * They assemble to the exact same bytes as the mov version.
+     */
+    ldr r0, =0xFFF00FFF
+    lsl r1, r0, 8
+    ldr r2, =0xF00FFF00
+    ASSERT_EQ_REG(r1, r2)
+
+    /* If used with the `mov` instruction, it results in a pure shift,
+     * but the suffixes also exist for all the other data processing instructions.
+     *
+     * Here we illustrate a shifted add instruction which calculates:
+     *
+     * ....
+     * r1 = r1 + (r0 << 1)
+     * ....
+     */
+    ldr r0, =0x10
+    ldr r1, =0x100
+    add r1, r1, r0, lsl 1
+    ldr r2, =0x00000120
+    ASSERT_EQ_REG(r1, r2)
+
+    /* The shift takes up the same encoding slot as the immediate,
+     * therefore it is not possible to both use an immediate and shift.
+     *
+     * Error: shift expression expected -- `add r1,r0,1,lsl#1'
+     */
+#if 0
+    add r1, r0, 1, lsl 1
+#endif
+
+    /* However, you can still encode shifted bitmasks of
+     * limited width in immediates, so why not just use the
+     * assembler pre-processing for it?
+     */
+    ldr r1, =0x100
+    add r1, r1, (0x10 << 1)
+    ldr r2, =0x00000120
+    ASSERT_EQ_REG(r1, r2)
+
+EXIT
--- a/userland/arch/arm/simd.S
+++ b/userland/arch/arm/simd.S
@@ -0,0 +1,113 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* vadd.u32
+     *
+     * Add 4x 32-bit unsigned integers in one go.
+     *
+     * q means 128-bits.
+     *
+     * u32 means that we treat memory as uint32_t types.
+     *
+     * 4 is deduced: in 128 bits you can fit 4 u32.
+     *
+     * Observe how the carry is propagated within u32 integers,
+     * but not across them.
+     */
+.data
+    u32_0:          .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
+    u32_1:          .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
+    u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
+.bss
+    u32_sum: .skip 0x10
+.text
+    ldr r0, =u32_0
+    vld1.32 {q0}, [r0]
+    ldr r0, =u32_1
+    vld1.32 {q1}, [r0]
+    vadd.u32 q2, q0, q1
+    ldr r0, =u32_sum
+    vst1.u32 {q2}, [r0]
+    ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
+
+    /* vadd.u64: 2x 64-bit unsigned integer add. */
+.data
+    u64_0:          .quad 0xF1111111F1111111, 0xF2222222F2222222
+    u64_1:          .quad 0x1555555515555555, 0x1666666616666666
+    u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
+.bss
+    u64_sum: .skip 0x10
+.text
+    ldr r0, =u64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =u64_1
+    vld1.64 {q1}, [r0]
+    vadd.u64 q2, q0, q1
+    ldr r0, =u64_sum
+    vst1.u64 {q2}, [r0]
+    ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
+
+    /* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate
+     * it from unsigned? I think signed and unsigned addition are identical
+     * in two's complement, the only difference is overflow / carry detection
+     * flags. But how do flags work when there are many values being added
+     * at once?
+     */
+.data
+    s64_0:          .quad -1, -2
+    s64_1:          .quad -1, -2
+    s64_sum_expect: .quad -2, -4
+.bss
+    s64_sum: .skip 0x10
+.text
+    ldr r0, =s64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =s64_1
+    vld1.64 {q1}, [r0]
+    vadd.s64 q2, q0, q1
+    ldr r0, =s64_sum
+    vst1.s64 {q2}, [r0]
+    ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10)
+
+    /* vadd.f32: 4x 32-bit float add. */
+.data
+    f32_0:          .float 1.5, 2.5,  3.5,  4.5
+    f32_1:          .float 5.5, 6.5,  7.5,  8.5
+    f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
+.bss
+    f32_sum: .skip 0x10
+.text
+    ldr r0, =f32_0
+    vld1.32 {q0}, [r0]
+    ldr r0, =f32_1
+    vld1.32 {q1}, [r0]
+    vadd.f32 q2, q0, q1
+    ldr r0, =f32_sum
+    vst1.32 {q2}, [r0]
+    ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
+
+    /* vadd.f64: 2x 64-bit float add: appears not possible.
+     *
+     * https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
+     */
+.data
+    f64_0:          .double 1.5, 2.5
+    f64_1:          .double 5.5, 6.5
+    f64_sum_expect: .double 7.0, 9.0
+.bss
+    f64_sum: .skip 0x10
+.text
+    ldr r0, =f64_0
+    vld1.64 {q0}, [r0]
+    ldr r0, =f64_1
+    vld1.64 {q1}, [r0]
+#if 0
+    /* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */
+    vadd.f64 q2, q0, q1
+    ldr r0, =f64_sum
+    vst1.64 {q2}, [r0]
+    ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
+#endif
+EXIT
--- a/userland/arch/arm/str.S
+++ b/userland/arch/arm/str.S
@@ -0,0 +1,60 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#load-and-store-instructions */
+
+#include "common.h"
+
+.data;
+    /* Must be in the .data section, since we want to modify it. */
+myvar:
+    .word 0x12345678
+
+ENTRY
+    /* r0 will contain the address. */
+    ldr r0, =myvar
+
+    /* Sanity check. */
+    ldr r1, [r0]
+    movw r2, 0x5678
+    movt r2, 0x1234
+    ASSERT_EQ_REG(r1, r2)
+
+    /* Modify the value. */
+    movw r1, 0xDEF0
+    movt r1, 0x9ABC
+    str r1, [r0]
+
+    /* Check that it changed. */
+    ldr r1, [r0]
+    movw r2, 0xDEF0
+    movt r2, 0x9ABC
+    ASSERT_EQ_REG(r1, r2)
+
+    /* Cannot use PC relative addressing to a different segment,
+     * or else it fails with:
+     *
+     * ....
+     * Error: internal_relocation (type: OFFSET_IMM) not fixed up
+     * ....
+     *
+     * https://stackoverflow.com/questions/10094282/internal-relocation-not-fixed-up
+     */
+    /*ldr r0, myvar*/
+
+#if 0
+    /* We could in theory write this to set the address of myvar,
+     * but it will always segfault under Linux because the text segment is read-only.
+     * This is however useful in baremetal programming.
+     * This construct is not possible in ARMv8 for str:
+     * https://github.com/cirosantilli/arm-assembly-cheat#armv8-str
+     */
+    str r1, var_in_same_section
+var_in_same_section:
+#endif
+
+    /* = sign just doesn't make sense for str, you can't set the
+     * address of a variable.
+     */
+#if 0
+    str r1, =myvar
+#endif
+
+EXIT
--- a/userland/arch/arm/sub.S
+++ b/userland/arch/arm/sub.S
@@ -0,0 +1,11 @@
+/* Subtraction. */
+
+#include "common.h"
+
+ENTRY
+    /* 3 - 2 == 1 , register version.*/
+    mov r0, 3
+    mov r1, 2
+    sub r0, r0, r1
+    ASSERT_EQ(r0, 1)
+EXIT
--- a/userland/arch/arm/thumb.S
+++ b/userland/arch/arm/thumb.S
@@ -0,0 +1,17 @@
+/* Illustrates features that are only available in thumb. */
+
+.syntax unified
+.text
+.thumb_func
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+
+    /* CBZ: cmp and branch if zero instruction. Equivalent to CMP + BEQ.
+     * TODO create an interesting assertion here.
+     */
+    cbz r1, 1f
+    1:
+
+    mov r0, 0
+    bx lr
--- a/userland/arch/arm/tst.S
+++ b/userland/arch/arm/tst.S
@@ -0,0 +1,19 @@
+/* Test. Same as ands, but don't store the result, just update flags. */
+
+#include "common.h"
+
+ENTRY
+
+    /* 0x0F & 0xF0 == 0x00, so beq. */
+    mov r0, 0x0F
+    tst r0, 0xF0
+    ASSERT(beq)
+
+    /* bne */
+    mov r0, 0xFF
+    tst r0, 0x0F
+    ASSERT(bne)
+    /* r0 was not modified. */
+    ASSERT_EQ(r0, 0xFF)
+
+EXIT
--- a/userland/arch/arm/vcvt.S
+++ b/userland/arch/arm/vcvt.S
@@ -0,0 +1,90 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvt */
+
+#include "common.h"
+
+ENTRY
+    /* SIMD positive. */
+.data
+    vcvt_positive_0:      .float 1.25, 2.5, 3.75, 4.0
+    vcvt_positive_expect: .word  1,    2,   3,    4
+.bss
+    vcvt_positive_result: .skip 0x10
+.text
+    ldr r0, =vcvt_positive_0
+    vld1.32 {q0}, [r0]
+    vcvt.u32.f32 q1, q0
+    ldr r0, =vcvt_positive_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(vcvt_positive_result, vcvt_positive_expect, 0x10)
+
+    /* SIMD negative. */
+.data
+    vcvt_negative_0:      .float -1.25, -2.5, -3.75, -4.0
+    vcvt_negative_expect: .word  -1,    -2,   -3,    -4
+.bss
+    vcvt_negative_result: .skip 0x10
+.text
+    ldr r0, =vcvt_negative_0
+    vld1.32 {q0}, [r0]
+    vcvt.s32.f32 q1, q0
+    ldr r0, =vcvt_negative_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(vcvt_negative_result, vcvt_negative_expect, 0x10)
+
+    /* Floating point. */
+.data
+    vcvt_positive_float_0:      .float 1.5, 2.5
+    vcvt_positive_float_expect: .word  1
+                                .float      2.5
+.bss
+    vcvt_positive_float_result: .skip 0x8
+.text
+    ldr r0, =vcvt_positive_float_0
+    vld1.32 {d0}, [r0]
+    vcvt.u32.f32 s0, s0
+    ldr r0, =vcvt_positive_float_result
+    vst1.32 {d0}, [r0]
+    ASSERT_MEMCMP(vcvt_positive_float_result, vcvt_positive_float_expect, 0x8)
+
+    /* Floating point but with immediates.
+     *
+     * You have to worry of course about representability of
+     * the immediate in 4 bytes, which is even more fun for
+     * floating point numbers :-)
+     *
+     * Doing this mostly to illustrate the joys of vmov.i32.
+     *
+     * For some reason, there is no vmov.i32 sn, only dn.
+     * If you try to use sn, it does the same as .f32 and
+     * stores a float instead. Horrible!
+     */
+    vmov.f32 d0, 1.5
+    vcvt.u32.f32 s0, s0
+    vmov.i32 d1, 1
+    vcmp.f32 s0, s2
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+    /* Check that s1 wasn't modified by vcvt. */
+    vmov.f32 s2, 1.5
+    vcmp.f32 s1, s2
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+
+    /* Floating point double precision. */
+.data
+    vcvt_positive_double_0:      .double 1.5
+    vcvt_positive_double_expect: .word   1
+.bss
+    vcvt_positive_double_result: .skip 0x8
+.text
+    ldr r0, =vcvt_positive_double_0
+    vld1.64 {d0}, [r0]
+    vcvt.u32.f64 s0, d0
+    ldr r0, =vcvt_positive_double_result
+    vst1.32 {d0}, [r0]
+    ASSERT_MEMCMP(
+        vcvt_positive_double_result,
+        vcvt_positive_double_expect,
+        0x4
+    )
+EXIT
--- a/userland/arch/arm/vcvta.S
+++ b/userland/arch/arm/vcvta.S
@@ -0,0 +1,41 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvta */
+
+#include "common.h"
+
+ENTRY
+    /* SIMD positive: round to nearest, ties away from zero (2.5 -> 3). */
+.data
+    vcvta_positive_0:      .float 1.25, 2.5, 3.75, 4.0
+    vcvta_positive_expect: .word  1,    3,   4,    4
+.bss
+    vcvta_positive_result: .skip 0x10 /* Room for the 4 32-bit results. */
+.text
+    ldr r0, =vcvta_positive_0
+    vld1.32 {q0}, [r0]
+    vcvta.u32.f32 q1, q0 /* Convert all 4 floats of q0 to unsigned integers. */
+    ldr r0, =vcvta_positive_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvta_positive_result,
+        vcvta_positive_expect,
+        0x10
+    )
+
+    /* SIMD negative: ties still round away from zero (-2.5 -> -3). */
+.data
+    vcvta_negative_0:      .float -1.25, -2.5, -3.75, -4.0
+    vcvta_negative_expect: .word  -1,    -3,   -4,    -4
+.bss
+    vcvta_negative_result: .skip 0x10 /* Room for the 4 32-bit results. */
+.text
+    ldr r0, =vcvta_negative_0
+    vld1.32 {q0}, [r0]
+    vcvta.s32.f32 q1, q0 /* Signed variant, needed for the negative results. */
+    ldr r0, =vcvta_negative_result
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvta_negative_result,
+        vcvta_negative_expect,
+        0x10
+    )
+EXIT
--- a/userland/arch/arm/vcvtr.S
+++ b/userland/arch/arm/vcvtr.S
@@ -0,0 +1,46 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vcvtrr */
+
+#include "common.h"
+
+ENTRY
+.data
+    vcvtr_0:                    .float 1.25, 2.5, 3.75, 4.0
+    vcvtr_expect_zero:          .word  1,    2,   3,    4
+    vcvtr_expect_plus_infinity: .word  2,    3,   4,    4
+.bss
+    vcvtr_result_zero:          .skip 0x10
+    vcvtr_result_plus_infinity: .skip 0x10
+.text
+    ldr r0, =vcvtr_0
+    vld1.32 {q0}, [r0]
+
+    /* Round towards zero: set bits 23:22 of FPSCR (the rounding mode field) to 0b11. */
+    vmrs r0, fpscr
+    orr r0, r0, (3 << 22)
+    vmsr fpscr, r0
+    vcvtr.u32.f32 q1, q0
+    ldr r0, =vcvtr_result_zero
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvtr_result_zero,
+        vcvtr_expect_zero,
+        0x10
+    )
+
+#if 0
+    /* TODO why is this not working? Rounds to zero still. */
+    /* Plus infinity. NOTE(review): the q-register form may assemble to the SIMD vcvt, which would ignore the FPSCR rounding mode -- confirm. */
+    vmrs r0, fpscr
+    mov r1, 1
+    bfi r0, r1, 22, 2 /* Bits 23:22 of FPSCR become 0b01. */
+    vmsr fpscr, r0
+    vcvtr.u32.f32 q1, q0
+    ldr r0, =vcvtr_result_plus_infinity
+    vst1.32 {q1}, [r0]
+    ASSERT_MEMCMP(
+        vcvtr_result_plus_infinity,
+        vcvtr_expect_plus_infinity,
+        0x10
+    )
+#endif
+EXIT
--- a/userland/arch/arm/vfp.S
+++ b/userland/arch/arm/vfp.S
@@ -0,0 +1,152 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#vfp
+ * Adapted from: https://mindplusplus.wordpress.com/2013/06/27/arm-vfp-vector-programming-part-2-examples/ */
+
+#include "common.h"
+
+.data;
+a1:
+    .float 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5
+a2:
+    .float 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5
+sum:
+    .skip 32 /* 8 floats of 4 bytes each, filled by the vec_sum call below. */
+sum_expect:
+    .float 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0
+
+ENTRY
+    /* Minimal single precision floating point example.
+     * TODO: floating point representation constraints due to 4-byte instruction?
+     */
+    vmov s0, 1.5
+    vmov s1, 2.5
+    vadd.f32 s2, s0, s1
+    vmov s3, 4.0
+    /* Compare two floating point registers. Stores results in fpscr:
+     * (floating point status and control register).
+     */
+    vcmp.f32 s2, s3
+    /* Move the nzcv bits from fpscr to apsr */
+    vmrs apsr_nzcv, fpscr
+    /* This branch uses the Z bit of apsr, which was set accordingly. */
+    ASSERT(beq)
+
+    /* Now the same from memory with vldr and vstr. */
+.data
+my_float_0:
+    .float 1.5
+my_float_1:
+    .float 2.5
+my_float_sum_expect:
+    .float 4.0
+.bss
+my_float_sum:
+    .skip 4
+.text
+    ldr r0, =my_float_0
+    vldr s0, [r0]
+    ldr r0, =my_float_1
+    vldr s1, [r0]
+    vadd.f32 s2, s0, s1
+    ldr r0, =my_float_sum
+    vstr.f32 s2, [r0]
+    ASSERT_MEMCMP(my_float_sum, my_float_sum_expect, 4)
+
+#if 0
+    /* We can't do pseudo vldr as for ldr, fails with:
+     * Error: cannot represent CP_OFF_IMM relocation in this object file format
+     * It works on ARMv8 however, so the relocation must have been added.
+     */
+    vldr s0, my_float_0
+#endif
+
+    /* Minimal double precision floating point example. */
+    vmov.f64 d0, 1.5
+    vmov.f64 d1, 2.5
+    vadd.f64 d2, d0, d1
+    vmov.f64 d3, 4.0
+    vcmp.f64 d2, d3
+    vmrs apsr_nzcv, fpscr
+    ASSERT(beq)
+
+    /* vmov can also move to general purpose registers.
+     *
+     * Just remember that we can't use float immediates with general purpose registers:
+     * https://stackoverflow.com/questions/6514537/how-do-i-specify-immediate-floating-point-numbers-with-inline-assembly/52906126#52906126
+     */
+    mov r1, 2 /* Differs from r0, so the assert below only passes if the moves worked. */
+    mov r0, 1
+    vmov s0, r0
+    vmov s1, s0
+    vmov r1, s1
+    ASSERT_EQ_REG(r0, r1)
+
+    /* Now a more complex test function. */
+    ldr r0, =sum
+    ldr r1, =a1
+    ldr r2, =a2
+    mov r3, 8 /* Number of floats in each of a1, a2 and sum. */
+    bl vec_sum
+    /* The assert works easily because all floats used
+     * have exact base-2 representation.
+     */
+    ASSERT_MEMCMP(sum, sum_expect, 0x20)
+EXIT
+
+/* void vec_sum(float *sum, float *a1, float *a2, int length) {
+ *   int i;
+ *   for (i=0; i < length; i++)
+ *     *(sum+i) = *(a1+i) + *(a2+i);
+ * }
+ * The loop below handles 8 floats per iteration: length must be a non-zero multiple of 8. */
+vec_sum:
+    /* Setup: ask reconfig for stride 1 and vector length 8. */
+    push {r0, r1, r4, lr}
+    push {r0, r1}
+    mov r0, 1
+    mov r1, 8
+    bl reconfig
+    pop {r0, r1}
+    asr r3, 3 /* Iteration count = length / 8. */
+
+    /* Do the sum. */
+1:
+    fldmias r1!, {s8-s15} /* Load 8 floats from a1 and advance the pointer. */
+    fldmias r2!, {s16-s23} /* Load 8 floats from a2 and advance the pointer. */
+    vadd.f32 s24, s8, s16 /* One instruction: relies on the vector length of 8 set above to fill s24-s31. */
+    fstmias r0!, {s24-s31} /* Store the 8 sums and advance the pointer. */
+    subs r3, r3, 1
+    bne 1b
+
+    /* Teardown: deconfig asks reconfig for stride 1 and length 1 again. */
+    bl deconfig
+    pop {r0, r1, r4, pc}
+
+/* inputs:
+ * r0: desired vector stride (1 or 2)
+ * r1: desired vector length (min. 1, max. 8)
+ * outputs: (none)
+ * modified: FPSCR only (r0-r2 are saved and restored)
+ * notes:
+ * r0 and r1 will be truncated before fitting into FPSCR
+ */
+reconfig:
+    push {r0-r2}
+    and r0, r0, 3
+    eor r0, r0, 1 /* Stride 1 -> 0b00, stride 2 -> 0b11. */
+    sub r1, r1, 1 /* The length is stored as length - 1. */
+    and r1, r1, 7
+    mov r0, r0, lsl 20 /* Stride goes to bits 21:20. */
+    orr r0, r0, r1, lsl 16 /* Length goes to bits 18:16. */
+    vmrs r2, fpscr
+    bic r2, 55*65536 /* Clear only the stride and length bits (0x37 << 16). */
+    orr r2, r2, r0
+    vmsr fpscr, r2 /* Write the merged value so that all other FPSCR bits are kept. */
+    pop {r0-r2}
+    bx lr
+
+deconfig: /* Ask reconfig for stride 1 and vector length 1; r0 and r1 are preserved. */
+    push {r0, r1, lr}
+    mov r0, 1 /* Stride argument of reconfig. */
+    mov r1, 1 /* Length argument of reconfig. */
+    bl reconfig
+    pop {r0, r1, pc} /* Restore r0 and r1 and return by popping the saved lr into pc. */
--- a/userland/arch/common.h
+++ b/userland/arch/common.h
@@ -0,0 +1,28 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+/* We define in this header only macros that are the same on all archs. */
+
+/* common_arch.h contains arch specific macros. */
+#include "common_arch.h"
+
+.extern \
+    exit, \
+    printf, \
+    puts \
+;
+
+/* Assert that the given branch instruction (e.g. beq) is taken; otherwise FAIL runs. */
+#define ASSERT(branch_if_pass) \
+    branch_if_pass 1f; /* On success, jump over FAIL to the local label below. */ \
+    FAIL; \
+1: \
+;
+
+/* Assert that two registers hold the same value: cmp sets the flags, then beq must be taken. */
+#define ASSERT_EQ_REG(reg1, reg2) \
+	cmp reg1, reg2; \
+	ASSERT(beq); \
+;
+
+#endif
--- a/userland/arch/empty.S
+++ b/userland/arch/empty.S
@@ -0,0 +1,6 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#about */
+
+#include "common.h"
+
+ENTRY /* Minimal example: nothing at all between ENTRY and EXIT. */
+EXIT
--- a/userland/arch/fail.S
+++ b/userland/arch/fail.S
@@ -0,0 +1,10 @@
+/* See what happens on test failure. */
+
+#include "common.h"
+
+ENTRY
+#if 0
+    /* Change the #if 0 above to #if 1 to see it fail. */
+    FAIL
+#endif
+EXIT
--- a/userland/arch/main.c
+++ b/userland/arch/main.c
@@ -0,0 +1,17 @@
+/* This is the main entrypoint for all .S examples. */
+
+#include "stdio.h"
+#include "stdint.h"
+
+#include "lkmc.h"
+
+int asm_main(uint32_t *line);
+
+int main(void) { /* Run the assembly example and report a non-zero result. */
+    uint32_t line = 0; /* Presumably set by asm_main to the failing line -- confirm. */
+    int ret = asm_main(&line); /* Same type as the asm_main prototype returns. */
+    if (ret) {
+        printf("error %d at line %lu\n", ret, (unsigned long)line); /* Specifiers now match the argument types. */
+    }
+    return ret;
+}
--- a/userland/arch/x86_64/c/add.c
+++ b/userland/arch/x86_64/c/add.c
@@ -0,0 +1,16 @@
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) { /* Add two 64-bit values with lea and check the result. */
+    uint64_t in1 = 0xFFFFFFFF; /* Largest 32-bit value: adding 1 carries into bit 32. */
+    uint64_t in2 = 0x1;
+    uint64_t out;
+    __asm__ (
+        "lea (%[in1], %[in2]), %[out];" /* out = in1 + in2, computed as an address. */
+        : [out] "=r" (out)
+        : [in1] "r" (in1),
+          [in2] "r" (in2)
+        :
+    );
+    assert(out == 0x100000000); /* Does not fit in 32 bits, so the sum was done on 64 bits. */
+}
--- a/userland/arch/x86_64/c/binutils_hack.c
+++ b/userland/arch/x86_64/c/binutils_hack.c
--- a/userland/arch/x86_64/c/binutils_nohack.c
+++ b/userland/arch/x86_64/c/binutils_nohack.c
@@ -1,3 +1,5 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#your-first-binutils-hack */
+
 #include <assert.h>
 #include <inttypes.h>

--- a/Show More
+++ b/Show More