userland: add assembly support

Move arm assembly cheat here, and start some work on x86 cheat as well.
This commit is contained in:
Ciro Santilli 六四事件 法轮功
2019-03-22 00:00:00 +00:00
parent 4943c9ed2e
commit 287c83f3f9
117 changed files with 3870 additions and 547 deletions

58
userland/arch/arm/add.S Normal file
View File

@@ -0,0 +1,58 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
#include "common.h"
ENTRY
/* Immediate encoding.
*
* r1 = r0 + 2
*/
mov r0, 1
/* r1 = r0 + 2 */
add r1, r0, 2
ASSERT_EQ(r1, 3)
/* If src == dest, we can omit one of them.
*
* r0 = r0 + 2
*/
mov r0, 1
add r0, 2
ASSERT_EQ(r0, 3)
/* Same as above but explicit. */
mov r0, 1
add r0, r0, 2
ASSERT_EQ(r0, 3)
#if 0
/* But we cannot omit the register if there is a shift when using .syntx unified:
* https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes
*/
.syntax unified
/* Error: garbage following instruction */
add r0, r1, lsl 1
/* OK */
add r0, r0, r1, lsl 1
#endif
/* Register encoding.
*
* r2 = r0 + r1
*/
mov r0, 1
mov r1, 2
add r2, r0, r1
ASSERT_EQ(r2, 3)
/* Register encoding, omit implicit register.
*
* r1 = r1 + r0
*/
mov r0, 1
mov r1, 2
add r1, r0
ASSERT_EQ(r1, 3)
EXIT

View File

@@ -0,0 +1,51 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#addressing-modes */
#include "common.h"
ENTRY
/* Offset mode with immediate. Add 4 to the address register, which ends up
* reading myvar2 instead of myvar.
*/
adr r0, myvar
ldr r1, [r0, 4]
ASSERT_EQ(r1, 0x9ABCDEF0)
/* r0 was not modified. */
ASSERT_EQ(r0, myvar)
/* Pre-indexed mode */
adr r0, myvar
ldr r1, [r0, 4]!
ASSERT_EQ(r1, 0x9ABCDEF0)
/* r0 was modified. */
ASSERT_EQ(r0, myvar2)
/* Post-indexed mode */
adr r0, myvar
ldr r1, [r0], 4
ASSERT_EQ(r1, 0x12345678)
/* r0 was modified. */
ASSERT_EQ(r0, myvar2)
/* Offset in register. */
adr r0, myvar
mov r1, 4
ldr r2, [r0, r1]
ASSERT_EQ(r2, 0x9ABCDEF0)
/* Offset in shifted register:
* r2 =
* (r0 + (r1 << 1))
* == *(myvar + (2 << 1))
* == *(myvar + 4)
*/
adr r0, myvar
mov r1, 2
ldr r2, [r0, r1, lsl 1]
ASSERT_EQ(r2, 0x9ABCDEF0)
EXIT
myvar:
.word 0x12345678
myvar2:
.word 0x9ABCDEF0

33
userland/arch/arm/adr.S Normal file
View File

@@ -0,0 +1,33 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
#include "common.h"
.data
data_label:
/* Fixed typo: was 0x1234678 (7 hex digits). */
.word 0x12345678
ENTRY
/* adr: PC-relative address calculation, single add/sub on pc. */
adr r0, label
/* objdump tells us that this uses the literal pool,
 * it does not get converted to adr, which is the better
 * alternative here. (The instruction must be ldr =, not adr,
 * for this comment to apply.)
 */
ldr r1, =label
adrl r2, label
label:
ASSERT_EQ_REG(r0, r1)
ASSERT_EQ_REG(r0, r2)
#if 0
/* Error: symbol .data is in a different section.
 *
 * It works however in ARMv8.
 * I think this means that there is no relocation type
 * that takes care of this encoding in ARMv7 (this file),
 * but there is one in ARMv8.
 *
 * If you have no idea what I'm talking about, read this:
 * https://stackoverflow.com/questions/3322911/what-do-linkers-do/33690144#33690144
 */
adr r1, data_label
#endif
EXIT

27
userland/arch/arm/and.S Normal file
View File

@@ -0,0 +1,27 @@
/* Bitwise AND. (Comments use "&": bitwise, not logical "&&".) */
#include "common.h"
ENTRY
/* 0x00 & 0xFF == 0x00 */
mov r0, 0x00
and r0, 0xFF
ASSERT_EQ(r0, 0x00)
/* 0x0F & 0xF0 == 0x00 */
mov r0, 0x0F
and r0, 0xF0
ASSERT_EQ(r0, 0x00)
/* 0x0F & 0xFF == 0x0F */
mov r0, 0x0F
and r0, 0xFF
ASSERT_EQ(r0, 0x0F)
/* 0xF0 & 0xFF == 0xF0 */
mov r0, 0xF0
and r0, 0xFF
ASSERT_EQ(r0, 0xF0)
EXIT

9
userland/arch/arm/b.S Normal file
View File

@@ -0,0 +1,9 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#b */
#include "common.h"
ENTRY
/* Jump over the fail. 26-bit PC-relative. */
b ok
FAIL
ok:
EXIT

28
userland/arch/arm/beq.S Normal file
View File

@@ -0,0 +1,28 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#beq */
#include "common.h"
ENTRY
/* Smaller: 1 < 2, so the less-than / not-equal conditions hold. */
mov r0, 1
cmp r0, 2
ASSERT(ble)
ASSERT(blt)
ASSERT(bne)
/* Equal. */
mov r1, 0
cmp r1, 0
ASSERT(beq)
ASSERT(bge)
ASSERT(ble)
/* Greater. */
mov r0, 2
cmp r0, 1
ASSERT(bge)
ASSERT(bgt)
ASSERT(bne)
EXIT

10
userland/arch/arm/bfi.S Normal file
View File

@@ -0,0 +1,10 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
#include "common.h"
ENTRY
ldr r0, =0x11223344
ldr r1, =0xFFFFFFFF
bfi r1, r0, 8, 16
ASSERT_EQ(r1, 0xFF3344FF)
EXIT

10
userland/arch/arm/bic.S Normal file
View File

@@ -0,0 +1,10 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#bic */
#include "common.h"
ENTRY
/* 0x0F & ~0x55 == 0x0F & 0xAA == 0x0A */
mov r0, 0x0F
bic r0, 0x55
ASSERT_EQ(r0, 0x0A)
EXIT

14
userland/arch/arm/bl.S Normal file
View File

@@ -0,0 +1,14 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
#include "common.h"
ENTRY
mov r0, 1
/* bl writes the return address to lr and branches to inc. */
bl inc
ASSERT_EQ(r0, 2)
EXIT
/* C equivalent: int inc(int i) { return i + 1; }
 * Argument and return value are both passed by value in r0.
 */
inc:
add r0, 1
bx lr

1
userland/arch/arm/build Symbolic link
View File

@@ -0,0 +1 @@
../build

17
userland/arch/arm/c/add.c Normal file
View File

@@ -0,0 +1,17 @@
/* 1 + 2 == 3 */
#include <assert.h>
#include <inttypes.h>
int main(void) {
uint32_t in0 = 1, in1 = 2, out;
__asm__ (
"add %[out], %[in0], %[in1];"
: [out] "=r" (out)
: [in0] "r" (in0),
[in1] "r" (in1)
);
assert(in0 == 1);
assert(in1 == 2);
assert(out == 3);
}

1
userland/arch/arm/c/build Symbolic link
View File

@@ -0,0 +1 @@
../build

View File

@@ -0,0 +1 @@
../build

View File

@@ -0,0 +1,35 @@
#include <inttypes.h>
/* Freestanding ARM Linux hello world: raw write(2) then exit(2)
 * via svc, using GCC register variables to pin each value to the
 * register the EABI syscall convention expects (r7 = syscall number,
 * r0-r2 = arguments, r0 = return value).
 */
void _start(void) {
    uint32_t exit_status;
    /* write(1, msg, sizeof(msg)) */
    {
        char msg[] = "hello\n";
        uint32_t syscall_return;
        register uint32_t r0 __asm__ ("r0") = 1; /* stdout */
        register char *r1 __asm__ ("r1") = msg;
        register uint32_t r2 __asm__ ("r2") = sizeof(msg);
        /* Renamed from the misleading "r8": this variable is bound to
         * register r7, which holds the syscall number on ARM EABI Linux. */
        register uint32_t r7 __asm__ ("r7") = 4; /* syscall number: write */
        __asm__ __volatile__ (
            "svc 0;"
            : "+r" (r0)
            : "r" (r1), "r" (r2), "r" (r7)
            : "memory"
        );
        syscall_return = r0;
        /* Exit status 0 only if every byte (including the trailing NUL,
         * as in the original) was written. */
        exit_status = (syscall_return != sizeof(msg));
    }
    /* exit(exit_status) */
    {
        register uint32_t r0 __asm__ ("r0") = exit_status;
        register uint32_t r7 __asm__ ("r7") = 1; /* syscall number: exit */
        __asm__ __volatile__ (
            "svc 0;"
            : "+r" (r0)
            : "r" (r7)
            :
        );
    }
}

15
userland/arch/arm/c/inc.c Normal file
View File

@@ -0,0 +1,15 @@
/* Increment a variable in inline assembly. */
#include <assert.h>
#include <inttypes.h>
int main(void) {
uint32_t my_local_var = 1;
__asm__ (
"add %[my_local_var], %[my_local_var], #1;"
: [my_local_var] "+r" (my_local_var)
:
:
);
assert(my_local_var == 2);
}

View File

@@ -0,0 +1,28 @@
/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly */
#include <assert.h>
int main(void) {
float my_float = 1.5;
__asm__ (
"vmov s0, 1.0;"
"vadd.f32 %[my_float], %[my_float], s0;"
: [my_float] "+t" (my_float)
:
: "s0"
);
assert(my_float == 2.5);
/* Undocumented %P
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89482
*/
double my_double = 1.5;
__asm__ (
"vmov.f64 d0, 1.0;"
"vadd.f64 %P[my_double], %P[my_double], d0;"
: [my_double] "+w" (my_double)
:
: "d0"
);
assert(my_double == 2.5);
}

View File

@@ -0,0 +1,32 @@
/* Like inc.c but less good since we do more work ourselves.
*
* Just doing this to test out the "m" memory constraint.
*
* GCC 8.2.0 -O0 assembles ldr line to:
*
* ....
* ldr r0, [fp, #-12]
* ....
*
* and `-O3` assembles to:
*
* ....
* ldr r0, [sp]
* ....
*/
#include <assert.h>
#include <inttypes.h>
int main(void) {
uint32_t my_local_var = 1;
__asm__ (
"ldr r0, %[my_local_var];"
"add r0, r0, #1;"
"str r0, %[my_local_var];"
: [my_local_var] "+m" (my_local_var)
:
: "r0"
);
assert(my_local_var == 2);
}

View File

@@ -0,0 +1,25 @@
/* GCC 8.2.0 -O0 and -O3 assembles ldr line to:
*
* ....
* movw r3, #<lower address part>
* movt r3, #<higher address part>
* ldr r0, [r3]
* ....
*/
#include <assert.h>
#include <inttypes.h>
uint32_t my_global_var = 1;
int main(void) {
__asm__ (
"ldr r0, %[my_global_var];"
"add r0, r0, #1;"
"str r0, %[my_global_var];"
: [my_global_var] "+m" (my_global_var)
:
: "r0"
);
assert(my_global_var == 2);
}

View File

@@ -0,0 +1,38 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
#include <assert.h>
#include <inttypes.h>
int main(void) {
register uint32_t r0 __asm__ ("r0");
register uint32_t r1 __asm__ ("r1");
uint32_t new_r0;
uint32_t new_r1;
{
/* We must set the registers immediately before calling,
* without making any function calls in between.
*/
r0 = 1;
r1 = 2;
__asm__ (
/* We intentionally use an explicit r0 and r1 here,
* just to illustrate that we are certain that the
* r0 variable will go in r0. Real code would never do this.
*/
"add %[r0], r0, #1;"
"add %[r1], r1, #1;"
/* We have to specify r0 in the constraints.*/
: [r0] "+r" (r0),
[r1] "+r" (r1)
:
:
);
/* When we are done, we must immediatly assign
* the register variables to regular variables.
*/
new_r0 = r0;
new_r1 = r1;
}
assert(new_r0 == 2);
assert(new_r1 == 3);
}

View File

@@ -0,0 +1,59 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
#include "common.h"
.data
puts_s:
.asciz "hello puts"
printf_format:
.asciz "hello printf %x\n"
my_array_0:
.word 0x11111111, 0x22222222, 0x33333333, 0x44444444
my_array_1:
.word 0x55555555, 0x66666666, 0x77777777, 0x88888888
ENTRY
/* puts("hello puts") */
/* r0 is first argument. */
ldr r0, =puts_s
bl puts
/* Check return status >= 0 for success. */
cmp r0, 0
ASSERT(bge)
/* printf(printf_format, 0x12345678): r1 is the second argument. */
ldr r0, =printf_format
ldr r1, =0x12345678
bl printf
cmp r0, 0
ASSERT(bge)
/* memcpy and memcmp. */
/* Smaller: my_array_0 < my_array_1 byte-wise, so memcmp < 0. */
ldr r0, =my_array_0
ldr r1, =my_array_1
ldr r2, =0x10
bl memcmp
cmp r0, 0
ASSERT(blt)
/* Copy: memcpy(my_array_0, my_array_1, 0x10) -- dst is the first arg. */
ldr r0, =my_array_0
ldr r1, =my_array_1
ldr r2, =0x10
bl memcpy
/* Equal after the copy. */
ldr r0, =my_array_0
ldr r1, =my_array_1
ldr r2, =0x10
bl memcmp
ASSERT_EQ(r0, 0)
/* exit(0) */
mov r0, 0
bl exit
/* Never reached, just for the fail symbol. */
EXIT

17
userland/arch/arm/clz.S Normal file
View File

@@ -0,0 +1,17 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
#include "common.h"
ENTRY
ldr r0, =0x7FFFFFFF
clz r1, r0
ASSERT_EQ(r1, 1)
ldr r0, =0x3FFFFFFF
clz r1, r0
ASSERT_EQ(r1, 2)
ldr r0, =0x1FFFFFFF
clz r1, r0
ASSERT_EQ(r1, 3)
EXIT

View File

@@ -0,0 +1,14 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
#include "common.h"
ENTRY
# mycomment
@ mycomment
/* # only works at the beginning of the line.
* Error: garbage following instruction -- `nop #comment'
*/
#if 0
nop # mycomment
#endif
nop @ mycomment
EXIT

View File

@@ -0,0 +1,71 @@
#ifndef COMMON_ARCH_H
#define COMMON_ARCH_H
.syntax unified
/* Assert that a register equals a constant.
* * reg: the register to check. Can be r0-r10, but not r11. r11 is overwritten.
* * const: the constant to compare to. Only works for literals or labels, not for registers.
* For register / register comparison, use ASSERT_EQ_REG.
*/
#define ASSERT_EQ(reg, const) \
ldr r11, =const; \
cmp reg, r11; \
ASSERT(beq); \
;
/* Assert that two arrays are the same.
* Clobbers at least r0-r2 and lr through the memcmp call in MEMCMP.
*/
#define ASSERT_MEMCMP(s1, s2, n) \
MEMCMP(s1, s2, n); \
ASSERT_EQ(r0, 0); \
;
/* Open the asm_main function that the C test driver calls.
*
* Store all callee saved registers, and LR in case we make further BL calls.
*
* Also save the input arguments r0-r3 on the stack, so we can access them later on,
* despite those registers being overwritten.
*/
#define ENTRY \
.text; \
.global asm_main; \
asm_main: \
stmdb sp!, {r0-r12, lr}; \
asm_main_after_prologue: \
;
/* Meant to be used once at the end of the asm_main opened by ENTRY.
*
* Branching to "fail" makes tests fail with exit status 1.
* The fail path stores the failing line number (put in r0 by FAIL) through
* the pointer found at [sp], which is the saved first argument (r0) that the
* C driver passed to asm_main -- presumably an output slot for the failing
* line; confirm against the driver.
*
* If EXIT is reached, the program ends successfully.
*
* "add sp, 16" discards the saved r0-r3 so that the following ldmia restores
* the r4-r12 and lr pushed by ENTRY; then bx jumps to LR to return from asm_main.
*/
#define EXIT \
mov r0, 0; \
mov r1, 0; \
b pass; \
fail: \
ldr r1, [sp]; \
str r0, [r1]; \
mov r0, 1; \
pass: \
add sp, 16; \
ldmia sp!, {r4-r12, lr}; \
bx lr; \
;
/* Always fail, recording the current source line number in r0. */
#define FAIL \
ldr r0, =__LINE__; \
b fail; \
;
/* memcmp(s1, s2, n): result left in r0. Clobbers r0-r2 and lr;
* memcmp may clobber other caller-saved registers per the AAPCS.
*/
#define MEMCMP(s1, s2, n) \
ldr r0, =s1; \
ldr r1, =s2; \
ldr r2, =n; \
bl memcmp; \
;
#endif

16
userland/arch/arm/cond.S Normal file
View File

@@ -0,0 +1,16 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#conditional-execution */
#include "common.h"
ENTRY
mov r0, 0
mov r1, 1
cmp r0, 1
/* Previous cmp failed, skip this operation. */
addeq r1, 1
ASSERT_EQ(r1, 1)
cmp r0, 0
/* Previous passed, do this operation. */
addeq r1, 1
ASSERT_EQ(r1, 2)
EXIT

1
userland/arch/arm/empty.S Symbolic link
View File

@@ -0,0 +1 @@
../empty.S

1
userland/arch/arm/fail.S Symbolic link
View File

@@ -0,0 +1 @@
../fail.S

View File

@@ -0,0 +1 @@
../build

View File

@@ -0,0 +1,21 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
.syntax unified
.text
.global _start
_start:
asm_main_after_prologue:
/* write */
mov r0, 1 /* stdout */
adr r1, msg /* buffer */
ldr r2, =len /* len */
mov r7, 4 /* syscall number */
svc 0
/* exit */
mov r0, 0 /* exit status */
mov r7, 1 /* syscall number */
svc 0
msg:
.ascii "hello\n"
len = . - msg

View File

@@ -0,0 +1,23 @@
/* Minimal example using driver.
*
* Controls the exit status of the program.
*/
.syntax unified
.text
.global asm_main
asm_main:
asm_main_after_prologue:
/* Set the return value according to the ARM calling convention. */
mov r0, 0
/* Try some whacky value to see tests break. */
/*mov r0, 77*/
/* Branch to the address at register lr.
* That is the return value which was put there by the C driver (likely with a bl).
*
* X means eXchange encoding from thumb back to ARM, which is what the driver uses.
*/
bx lr

View File

@@ -0,0 +1,24 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
#include "common.h"
ENTRY
/* This is the default. We hack it in common.h however. */
.syntax divided
/* These fail: divided syntax requires a # or $ prefix on immediates. */
#if 0
mov r0, 1
mov r0, 0x1
#endif
mov r0, #1
mov r0, #0x1
mov r0, $1
mov r0, $0x1
/* In unified syntax the # and $ prefixes become optional:
 * all six forms below assemble.
 */
.syntax unified
mov r0, 1
mov r0, 0x1
mov r0, #1
mov r0, #0x1
mov r0, $1
mov r0, $0x1
EXIT

View File

@@ -0,0 +1,27 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#loop-over-array */
#include "common.h"
#define NELEM 4
#define ELEM_SIZE 4
.data;
my_array:
.word 0x11111111, 0x22222222, 0x33333333, 0x44444444
my_array_expect:
.word 0x11111112, 0x22222223, 0x33333334, 0x44444445
ENTRY
/* Increment. */
ldr r0, =my_array
mov r1, NELEM
increment:
ldr r2, [r0]
add r2, 1
/* Post index usage. */
str r2, [r0], ELEM_SIZE
sub r1, 1
cmp r1, 0
bne increment
ASSERT_MEMCMP(my_array, my_array_expect, 0x10)
EXIT

62
userland/arch/arm/ldmia.S Normal file
View File

@@ -0,0 +1,62 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#loop-over-array */
#include "common.h"
#define NELEM 4
#define ELEM_SIZE 4
.data;
my_array_0:
.word 0x11111111, 0x22222222, 0x33333333, 0x44444444
my_array_1:
.word 0x55555555, 0x66666666, 0x77777777, 0x88888888
ENTRY
/* Load r1, r2, r3 and r4 starting from the address in r0. Don't change r0 */
ldr r0, =my_array_0
ldr r1, =0
ldr r2, =0
ldr r3, =0
ldr r4, =0
ldmia r0, {r1-r4}
ASSERT_EQ(r0, my_array_0)
ASSERT_EQ(r1, 0x11111111)
ASSERT_EQ(r2, 0x22222222)
ASSERT_EQ(r3, 0x33333333)
ASSERT_EQ(r4, 0x44444444)
/* Swapping the order of r1 and r2 on the mnemonic makes no difference to load order.
*
* But it gives an assembler warning, so we won't do it by default:
*
* ldmia.S: Assembler messages:
* ldmia.S:32: Warning: register range not in ascending order
*/
#if 0
ldr r0, =my_array_0
ldr r1, =0
ldr r2, =0
ldmia r0, {r2,r1}
ASSERT_EQ(r1, 0x11111111)
ASSERT_EQ(r2, 0x22222222)
#endif
/* Modify the array */
ldr r0, =my_array_1
ldr r1, =0x55555555
ldr r2, =0x66666666
ldr r3, =0x77777777
ldr r4, =0x88888888
stmdb r0, {r1-r4}
/* Verify that my_array_0 changed and is equal to my_array_1. */
MEMCMP(my_array_0, my_array_1, 0x10)
ASSERT_EQ(r0, 0)
/* Load registers and increment r0. */
ldr r0, =my_array_0
ldmia r0!, {r1-r4}
ASSERT_EQ(r0, my_array_1)
EXIT

View File

@@ -0,0 +1,65 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#ldr-pseudo-instruction */
#include "common.h"
ENTRY
/* Mnemonic for a PC relative load:
*
* ....
* ldr r0, [pc, offset]
* r0 = myvar
* ....
*/
ldr r0, myvar
ASSERT_EQ(r0, 0x12345678)
/* Mnemonic PC relative load with an offset.
* Load myvar2 instead of myvar.
*/
ldr r0, myvar + 4
ASSERT_EQ(r0, 0x9ABCDEF0)
/* First store the address in r0 using a magic =myvar, which creates
* a literal pool entry containing the address and PC-relative loads it:
* https://stackoverflow.com/questions/17214962/what-is-the-difference-between-label-equals-sign-and-label-brackets-in-ar
*
* Using the adr instruction would likely be better for this application however.
*
* ....
* r0 = &myvar
* r1 = *r0
* ....
*/
ldr r0, =myvar
ldr r1, [r0]
ASSERT_EQ(r1, 0x12345678)
/* More efficiently, use r0 as the address to read, and write to r0 itself. */
ldr r0, =myvar
ldr r0, [r0]
ASSERT_EQ(r0, 0x12345678)
/* Same as =myvar but store a constant to a register.
* Can also be done with movw and movt. */
ldr r0, =0x11112222
ASSERT_EQ(r0, 0x11112222)
/* We can also use the GAS :lower16: and :upper16: relocation operators
* with movw and movt to load the address of myvar into r0 with two immediates.
*
* This results in one extra 4 byte instruction read from memory,
* and one less data read, so it is likely more cache efficient.
*
* https://sourceware.org/binutils/docs-2.19/as/ARM_002dRelocations.html
*/
movw r0, #:lower16:myvar
movt r0, #:upper16:myvar
ldr r1, [r0]
ASSERT_EQ(r1, 0x12345678)
EXIT
myvar:
.word 0x12345678
myvar2:
.word 0x9ABCDEF0

12
userland/arch/arm/ldrb.S Normal file
View File

@@ -0,0 +1,12 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
#include "common.h"
ENTRY
ldr r0, =myvar
mov r1, 0x0
ldrb r1, [r0]
ASSERT_EQ(r1, 0x00000078)
EXIT
myvar:
.word 0x12345678

12
userland/arch/arm/ldrh.S Normal file
View File

@@ -0,0 +1,12 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#ldrh-and-ldrb */
#include "common.h"
ENTRY
ldr r0, =myvar
mov r1, 0x0
ldrh r1, [r0]
ASSERT_EQ(r1, 0x00005678)
EXIT
myvar:
.word 0x12345678

19
userland/arch/arm/mov.S Normal file
View File

@@ -0,0 +1,19 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#mov */
#include "common.h"
ENTRY
/* Immediate. */
mov r0, 0
ASSERT_EQ(r0, 0)
mov r0, 1
ASSERT_EQ(r0, 1)
/* Register. */
mov r0, 0
mov r1, 1
mov r1, r0
ASSERT_EQ(r1, 0)
EXIT

27
userland/arch/arm/movw.S Normal file
View File

@@ -0,0 +1,27 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#movw-and-movt */
#include "common.h"
ENTRY
/* movt ("move top") and movw ("move wide", i.e. a 16-bit wide
* immediate move) set the higher and lower 16 bits of the register.
*/
movw r0, 0xFFFF
movt r0, 0x1234
add r0, 1
ASSERT_EQ(r0, 0x12350000)
/* movw also zeroes out the top bits, allowing small 16-bit
* C constants to be assigned in a single instruction.
*
* It differs from mov because mov can only encode 8 bits
* at a time, while movw can encode 16.
*
* movt does not modify the lower bits however.
*/
ldr r0, =0x12345678
movw r0, 0x1111
ASSERT_EQ(r0, 0x00001111)
EXIT

12
userland/arch/arm/mul.S Normal file
View File

@@ -0,0 +1,12 @@
/* Multiplication. */
#include "common.h"
ENTRY
/* 2 * 3 = 6 */
mov r0, 0
mov r1, 2
mov r2, 3
mul r1, r2
ASSERT_EQ(r1, 6)
EXIT

32
userland/arch/arm/nop.S Normal file
View File

@@ -0,0 +1,32 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#nop */
#include "common.h"
ENTRY
/* Disassembles as:
*
* ....
* nop {0}
* ....
*
* TODO what is the `{0}`?
*/
nop
/* Disassembles as:
*
* ....
* nop ; (mov r0, r0)
* ....
*/
mov r0, r0
/* Disassemble as mov. TODO Why not as nop as in `mov r0, r0`?
* Do they have any effect?
*/
mov r1, r1
mov r8, r8
/* And there are other nops as well? Disassembles as `and`. */
and r0, r0, r0
EXIT

31
userland/arch/arm/push.S Normal file
View File

@@ -0,0 +1,31 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#ldmia */
#include "common.h"
ENTRY
/* Save sp before push. */
mov r0, sp
/* Push. */
mov r1, 1
mov r2, 2
push {r1, r2}
/* Save sp after push. */
mov r1, sp
/* Restore. */
mov r3, 0
mov r4, 0
pop {r3, r4}
ASSERT_EQ(r3, 1)
ASSERT_EQ(r4, 2)
/* Check that stack pointer moved down by 8 bytes
* (2 registers x 4 bytes each).
*/
sub r0, r1
ASSERT_EQ(r0, 8)
EXIT

9
userland/arch/arm/rbit.S Normal file
View File

@@ -0,0 +1,9 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#rbit */
#include "common.h"
ENTRY
ldr r0, =0b00000001001000110100010101100101
rbit r1, r0
ASSERT_EQ(r1, 0b10100110101000101100010010000000)
EXIT

69
userland/arch/arm/regs.S Normal file
View File

@@ -0,0 +1,69 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
#include "common.h"
ENTRY
/* 13 general purpose registers: r0-r12. */
mov r0, 0
mov r1, 1
mov r2, 2
mov r3, 3
mov r4, 4
mov r5, 5
mov r6, 6
mov r7, 7
mov r8, 8
mov r9, 9
mov r10, 10
mov r11, 11
mov r12, 12
/* * r11: aliased to FP (frame pointer, debug stack trace usage only)
*
* I think FP is only a convention with no instruction impact, but TODO:
* not mentioned on AAPCS. aarch64 AAPCS mentions it though.
* * r13: aliased to SP (stack pointer), what push / pop use
* * r14: aliased to LR (link register), what bl writes the return address to
* * r15: aliased to PC (program counter), contains the current instruction address
*
* In ARMv7 (this file), SP, LR and PC are plain general purpose
* registers (r13-r15) with only a conventional role.
*
* Therefore, it is possible to use those registers in any place
* other registers may be used.
*
* In ARMv8, SP and PC have dedicated registers separate from the
* 31 general purpose ones; LR is still general purpose as before.
* So this is not possible in ARMv8 anymore.
*
* For example, we can load an address into PC, which is very similar to what B / BX does:
* https://stackoverflow.com/questions/32304646/arm-assembly-branch-to-address-inside-register-or-memory/54145818#54145818
*/
ldr pc, =10f
FAIL
10:
/* Same with r15, which is the same as pc. */
ldr r15, =10f
FAIL
10:
/* Another example with mov reading from pc. */
pc_addr:
mov r0, pc
/* Why sub 8:
* https://stackoverflow.com/questions/24091566/why-does-the-arm-pc-register-point-to-the-instruction-after-the-next-one-to-be-e
*/
sub r0, r0, 8
/* PC-relative loads also work just like with any other register.
* In ARM mode pc reads as the current instruction + 8 (see the sub 8
* link above), so [pc] here addresses the .word placed after the branch.
*/
ldr r0, [pc]
b 1f
.word 0x12345678
1:
ASSERT_EQ(r0, 0x12345678)
/* We can also use the fp alias for r11 in GNU GAS assembly. */
mov r11, 0
mov fp, 1
ASSERT_EQ(r11, 1)
EXIT

15
userland/arch/arm/rev.S Normal file
View File

@@ -0,0 +1,15 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
#include "common.h"
ENTRY
/* All bytes in register. */
ldr r0, =0x11223344
rev r1, r0
ASSERT_EQ(r1, 0x44332211)
/* Groups of 16-bits. */
ldr r0, =0x11223344
rev16 r1, r0
ASSERT_EQ(r1, 0x22114433)
EXIT

View File

@@ -0,0 +1,35 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#s-suffix */
#include "common.h"
ENTRY
/* Result is 0, set beq. */
movs r0, 0
ASSERT(beq)
/* The opposite. */
movs r0, 1
ASSERT(bne)
/* mov without s does not set the status. */
movs r0, 0
mov r0, 1
ASSERT(beq)
/* movs still moves... */
mov r0, 0
movs r0, 1
ASSERT_EQ(r0, 1)
/* add: the result is 0. */
mov r0, 1
adds r0, -1
ASSERT(beq)
/* add: result non 0. */
mov r0, 1
adds r0, 1
ASSERT(bne)
EXIT

79
userland/arch/arm/shift.S Normal file
View File

@@ -0,0 +1,79 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#shift-suffixes */
#include "common.h"
ENTRY
/* lsl (comment was swapped with lsr below) */
ldr r0, =0xFFF00FFF
mov r1, r0, lsl 8
ldr r2, =0xF00FFF00
ASSERT_EQ_REG(r1, r2)
/* lsr */
ldr r0, =0xFFF00FFF
mov r1, r0, lsr 8
ldr r2, =0x00FFF00F
ASSERT_EQ_REG(r1, r2)
/* ror */
ldr r0, =0xFFF00FFF
mov r1, r0, ror 8
ldr r2, =0xFFFFF00F
ASSERT_EQ_REG(r1, r2)
/* asr negative: sign bit is replicated. */
ldr r0, =0x80000008
mov r1, r0, asr 1
ldr r2, =0xC0000004
ASSERT_EQ_REG(r1, r2)
/* asr positive */
ldr r0, =0x40000008
mov r1, r0, asr 1
ldr r2, =0x20000004
ASSERT_EQ_REG(r1, r2)
/* There are also direct shift mnemonics for the mov shifts.
*
* They assemble to the exact same bytes as the mov version.
*/
ldr r0, =0xFFF00FFF
lsl r1, r0, 8
ldr r2, =0xF00FFF00
ASSERT_EQ_REG(r1, r2)
/* If used with the `mov` instruction, it results in a pure shift,
* but the suffixes also exist for all the other data processing instructions.
*
* Here we illustrate a shifted add instruction which calculates:
*
* ....
* r1 = r1 + (r0 << 1)
* ....
*/
ldr r0, =0x10
ldr r1, =0x100
add r1, r1, r0, lsl 1
ldr r2, =0x00000120
ASSERT_EQ_REG(r1, r2)
/* The shift takes up the same encoding slot as the immediate,
* therefore it is not possible to both use an immediate and shift.
*
* Error: shift expression expected -- `add r1,r0,1,lsl#1'
*/
#if 0
add r1, r0, 1, lsl 1
#endif
/* However, you can still encode shifted bitmasks of
* limited width in immediates, so why not just use the
* assembler pre-processing for it?
*/
ldr r1, =0x100
add r1, r1, (0x10 << 1)
ldr r2, =0x00000120
ASSERT_EQ_REG(r1, r2)
EXIT

113
userland/arch/arm/simd.S Normal file
View File

@@ -0,0 +1,113 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
#include "common.h"
ENTRY
/* vadd.u32
*
* Add 4x 32-bit unsigned integers in one go.
*
* q means 128-bits.
*
* u32 means that we treat memory as uint32_t types.
*
* 4 is deduced: in 128 bits you can fit 4 u32.
*
* Observe how the carry is propagated within u32 integers,
* but not across them.
*/
.data
u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
.bss
u32_sum: .skip 0x10
.text
ldr r0, =u32_0
vld1.32 {q0}, [r0]
ldr r0, =u32_1
vld1.32 {q1}, [r0]
vadd.u32 q2, q0, q1
ldr r0, =u32_sum
vst1.u32 {q2}, [r0]
ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
/* vadd.u64: 2x 64-bit unsigned integer add. */
.data
u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222
u64_1: .quad 0x1555555515555555, 0x1666666616666666
u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
.bss
u64_sum: .skip 0x10
.text
ldr r0, =u64_0
vld1.64 {q0}, [r0]
ldr r0, =u64_1
vld1.64 {q1}, [r0]
vadd.u64 q2, q0, q1
ldr r0, =u64_sum
vst1.u64 {q2}, [r0]
ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
/* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate
* it from signed? I think signed and unsigned addition are identical
* in two's complement, the only difference is overflow / carry detection
* flags. But how do flags work when there are many values being added
* at once?
*/
.data
s64_0: .quad -1, -2
s64_1: .quad -1, -2
s64_sum_expect: .quad -2, -4
.bss
s64_sum: .skip 0x10
.text
ldr r0, =s64_0
vld1.64 {q0}, [r0]
ldr r0, =s64_1
vld1.64 {q1}, [r0]
vadd.s64 q2, q0, q1
ldr r0, =s64_sum
vst1.s64 {q2}, [r0]
ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10)
/* vadd.f32: 4x 32-bit float add. */
.data
f32_0: .float 1.5, 2.5, 3.5, 4.5
f32_1: .float 5.5, 6.5, 7.5, 8.5
f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
.bss
f32_sum: .skip 0x10
.text
ldr r0, =f32_0
vld1.32 {q0}, [r0]
ldr r0, =f32_1
vld1.32 {q1}, [r0]
vadd.f32 q2, q0, q1
ldr r0, =f32_sum
vst1.32 {q2}, [r0]
ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
/* vadd.f64: 2x 64-bit float add: appears not possible.
*
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
*/
.data
f64_0: .double 1.5, 2.5
f64_1: .double 5.5, 6.5
f64_sum_expect: .double 7.0, 9.0
.bss
f64_sum: .skip 0x10
.text
ldr r0, =f64_0
vld1.64 {q0}, [r0]
ldr r0, =f64_1
vld1.64 {q1}, [r0]
#if 0
/* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */
vadd.f64 q2, q0, q1
ldr r0, =f64_sum
vst1.64 {q2}, [r0]
ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
#endif
EXIT

60
userland/arch/arm/str.S Normal file
View File

@@ -0,0 +1,60 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#load-and-store-instructions */
#include "common.h"
.data;
/* Must be in the .data section, since we want to modify it. */
myvar:
.word 0x12345678
ENTRY
/* r0 will contain the address. */
ldr r0, =myvar
/* Sanity check. */
ldr r1, [r0]
movw r2, 0x5678
movt r2, 0x1234
ASSERT_EQ_REG(r1, r2)
/* Modify the value. */
movw r1, 0xDEF0
movt r1, 0x9ABC
str r1, [r0]
/* Check that it changed. */
ldr r1, [r0]
movw r2, 0xDEF0
movt r2, 0x9ABC
ASSERT_EQ_REG(r1, r2)
/* Cannot use PC relative addressing to a different segment,
* or else it fails with:
*
* ....
* Error: internal_relocation (type: OFFSET_IMM) not fixed up
* ....
*
* https://stackoverflow.com/questions/10094282/internal-relocation-not-fixed-up
*/
/*ldr r0, myvar*/
#if 0
/* We could in theory write this to set the address of myvar,
* but it will always segfault under Linux because the text segment is read-only.
* This is however useful in baremetal programming.
* This construct is not possible in ARMv8 for str:
* https://github.com/cirosantilli/arm-assembly-cheat#armv8-str
*/
str r1, var_in_same_section
var_in_same_section:
#endif
/* = sign just doesn't make sense for str, you can't set the
* address of a variable.
*/
#if 0
str r1, =myvar
#endif
EXIT

11
userland/arch/arm/sub.S Normal file
View File

@@ -0,0 +1,11 @@
/* Subtraction. */
#include "common.h"
ENTRY
/* 3 - 2 == 1 , register version.*/
mov r0, 3
mov r1, 2
sub r0, r0, r1
ASSERT_EQ(r0, 1)
EXIT

17
userland/arch/arm/thumb.S Normal file
View File

@@ -0,0 +1,17 @@
/* Illustrates features that are only available in thumb. */
.syntax unified
.text
.thumb_func
.global asm_main
asm_main:
asm_main_after_prologue:
/* CBZ: cmp and branch if zero instruction. Equivalent to CMP + BEQ.
* TODO create an interesting assertion here.
*/
cbz r1, 1f
1:
mov r0, 0
bx lr

19
userland/arch/arm/tst.S Normal file
View File

@@ -0,0 +1,19 @@
/* Test. Same as ands, but don't store the result, just update flags. */
#include "common.h"
ENTRY
/* 0x0F & 0xF0 == 0x00 (bitwise), so beq. */
mov r0, 0x0F
tst r0, 0xF0
ASSERT(beq)
/* 0xFF & 0x0F == 0x0F != 0, so bne. */
mov r0, 0xFF
tst r0, 0x0F
ASSERT(bne)
/* r0 was not modified. */
ASSERT_EQ(r0, 0xFF)
EXIT

90
userland/arch/arm/vcvt.S Normal file
View File

@@ -0,0 +1,90 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#vcvt */
#include "common.h"
ENTRY
/* SIMD positive. */
.data
vcvt_positive_0: .float 1.25, 2.5, 3.75, 4.0
vcvt_positive_expect: .word 1, 2, 3, 4
.bss
vcvt_positive_result: .skip 0x10
.text
ldr r0, =vcvt_positive_0
vld1.32 {q0}, [r0]
vcvt.u32.f32 q1, q0
ldr r0, =vcvt_positive_result
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(vcvt_positive_result, vcvt_positive_expect, 0x10)
/* SIMD negative. */
.data
vcvt_negative_0: .float -1.25, -2.5, -3.75, -4.0
vcvt_negative_expect: .word -1, -2, -3, -4
.bss
vcvt_negative_result: .skip 0x10
.text
ldr r0, =vcvt_negative_0
vld1.32 {q0}, [r0]
vcvt.s32.f32 q1, q0
ldr r0, =vcvt_negative_result
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(vcvt_negative_result, vcvt_negative_expect, 0x10)
/* Floating point. */
.data
vcvt_positive_float_0: .float 1.5, 2.5
vcvt_positive_float_expect: .word 1
.float 2.5
.bss
vcvt_positive_float_result: .skip 0x8
.text
ldr r0, =vcvt_positive_float_0
vld1.32 {d0}, [r0]
vcvt.u32.f32 s0, s0
ldr r0, =vcvt_positive_float_result
vst1.32 {d0}, [r0]
ASSERT_MEMCMP(vcvt_positive_float_result, vcvt_positive_float_expect, 0x8)
/* Floating point but with immediates.
*
* You have to worry of course about representability of
* the immediate in 4 bytes, which is even more fun for
* floating point numbers :-)
*
* Doing this mostly to illustrate the joys of vmov.i32.
*
* For some reason, there is no vmov.i32 sn, only dn.
* If you try to use sn, it does the same as .f32 and
* stores a float instead. Horrible!
*/
vmov.f32 d0, 1.5
vcvt.u32.f32 s0, s0
vmov.i32 d1, 1
vcmp.f32 s0, s2
vmrs apsr_nzcv, fpscr
ASSERT(beq)
/* Check that s1 wasn't modified by vcvt. */
vmov.f32 s2, 1.5
vcmp.f32 s1, s2
vmrs apsr_nzcv, fpscr
ASSERT(beq)
/* Floating point double precision. */
.data
vcvt_positive_double_0: .double 1.5
vcvt_positive_double_expect: .word 1
.bss
vcvt_positive_double_result: .skip 0x8
.text
ldr r0, =vcvt_positive_double_0
vld1.64 {d0}, [r0]
vcvt.u32.f64 s0, d0
ldr r0, =vcvt_positive_double_result
vst1.32 {d0}, [r0]
ASSERT_MEMCMP(
vcvt_positive_double_result,
vcvt_positive_double_expect,
0x4
)
EXIT

41
userland/arch/arm/vcvta.S Normal file
View File

@@ -0,0 +1,41 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#vcvta */
#include "common.h"
ENTRY
/* SIMD positive. */
.data
vcvta_positive_0: .float 1.25, 2.5, 3.75, 4.0
vcvta_positive_expect: .word 1, 3, 4, 4
.bss
vcvta_positive_result: .skip 0x10
.text
ldr r0, =vcvta_positive_0
vld1.32 {q0}, [r0]
vcvta.u32.f32 q1, q0
ldr r0, =vcvta_positive_result
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(
vcvta_positive_result,
vcvta_positive_expect,
0x10
)
/* SIMD negative. */
.data
vcvta_negative_0: .float -1.25, -2.5, -3.75, -4.0
vcvta_negative_expect: .word -1, -3, -4, -4
.bss
vcvta_negative_result: .skip 0x10
.text
ldr r0, =vcvta_negative_0
vld1.32 {q0}, [r0]
vcvta.s32.f32 q1, q0
ldr r0, =vcvta_negative_result
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(
vcvta_negative_result,
vcvta_negative_expect,
0x10
)
EXIT

46
userland/arch/arm/vcvtr.S Normal file
View File

@@ -0,0 +1,46 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#vcvtrr
 *
 * vcvtr: float to integer conversion that rounds according to the
 * rounding mode currently selected in FPSCR.RMode (bits 23:22).
 */
#include "common.h"
ENTRY
.data
vcvtr_0: .float 1.25, 2.5, 3.75, 4.0
/* Round towards zero: plain truncation. */
vcvtr_expect_zero: .word 1, 2, 3, 4
/* Round towards plus infinity: ceiling. */
vcvtr_expect_plus_infinity: .word 2, 3, 4, 4
.bss
vcvtr_result_zero: .skip 0x10
vcvtr_result_plus_infinity: .skip 0x10
.text
ldr r0, =vcvtr_0
vld1.32 {q0}, [r0]
/* zero: set FPSCR.RMode = 0b11 (round towards zero). */
vmrs r0, fpscr
orr r0, r0, (3 << 22)
vmsr fpscr, r0
/* NOTE(review): vcvtr is documented as a scalar VFP instruction
 * (S registers only); with q operands this presumably assembles to a
 * NEON vcvt encoding instead -- confirm with a disassembly.
 */
vcvtr.u32.f32 q1, q0
ldr r0, =vcvtr_result_zero
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(
vcvtr_result_zero,
vcvtr_expect_zero,
0x10
)
#if 0
/* TODO why is this not working? Rounds to zero still.
 *
 * NOTE(review): likely because the NEON (Advanced SIMD) vcvt encoding
 * always rounds towards zero and ignores FPSCR.RMode -- which would
 * also explain why the "zero" case above passes. A scalar
 * vcvtr.u32.f32 s1, s0 should honour RMode; confirm in the ARM ARM.
 */
/* plus infinity: set FPSCR.RMode = 0b01 (round towards plus infinity)
 * by inserting r1 = 1 into bits 23:22 of the fpscr copy.
 */
vmrs r0, fpscr
mov r1, 1
bfi r0, r1, 22, 2
vmsr fpscr, r0
vcvtr.u32.f32 q1, q0
ldr r0, =vcvtr_result_plus_infinity
vst1.32 {q1}, [r0]
ASSERT_MEMCMP(
vcvtr_result_plus_infinity,
vcvtr_expect_plus_infinity,
0x10
)
#endif
EXIT

152
userland/arch/arm/vfp.S Normal file
View File

@@ -0,0 +1,152 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#vfp
 * Adapted from: https://mindplusplus.wordpress.com/2013/06/27/arm-vfp-vector-programming-part-2-examples/ */
#include "common.h"
.data;
/* Two input vectors of 8 floats each for the vec_sum test below. */
a1:
.float 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5
a2:
.float 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5
/* Output buffer: 8 floats * 4 bytes. */
sum:
.skip 32
/* Expected element-wise sums a1[i] + a2[i]. */
sum_expect:
.float 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0
ENTRY
/* Minimal single precision floating point example.
 * TODO: floating point representation constraints due to 4-byte instruction?
 */
vmov s0, 1.5
vmov s1, 2.5
/* s2 = s0 + s1 = 4.0 */
vadd.f32 s2, s0, s1
vmov s3, 4.0
/* Compare two floating point registers. Stores results in fpscr:
 * (floating point status and control register).
 */
vcmp.f32 s2, s3
/* Move the nzcv bits from fpscr to apsr */
vmrs apsr_nzcv, fpscr
/* This branch uses the Z bit of apsr, which was set accordingly. */
ASSERT(beq)
/* Now the same from memory with vldr and vstr. */
.data
my_float_0:
.float 1.5
my_float_1:
.float 2.5
my_float_sum_expect:
.float 4.0
.bss
my_float_sum:
.skip 4
.text
ldr r0, =my_float_0
vldr s0, [r0]
ldr r0, =my_float_1
vldr s1, [r0]
vadd.f32 s2, s0, s1
ldr r0, =my_float_sum
/* Store the result and compare it against the in-memory expect value. */
vstr.f32 s2, [r0]
ASSERT_MEMCMP(my_float_sum, my_float_sum_expect, 4)
#if 0
/* We can't do pseudo vldr as for ldr, fails with:
 * Error: cannot represent CP_OFF_IMM relocation in this object file format
 * It works on ARMv8 however, so the relocation must have been added.
 */
vldr s0, my_float_0
#endif
/* Minimal double precision floating point example. */
vmov.f64 d0, 1.5
vmov.f64 d1, 2.5
vadd.f64 d2, d0, d1
vmov.f64 d3, 4.0
vcmp.f64 d2, d3
vmrs apsr_nzcv, fpscr
ASSERT(beq)
/* vmov can also move to general purpose registers.
 *
 * Just remember that we can't use float immediates with general purpose registers:
 * https://stackoverflow.com/questions/6514537/how-do-i-specify-immediate-floating-point-numbers-with-inline-assembly/52906126#52906126
 */
/* r1 starts out different from r0 so the final compare is meaningful. */
mov r1, 2
mov r0, 1
/* Round trip the integer bit pattern: r0 -> s0 -> s1 -> r1. */
vmov s0, r0
vmov s1, s0
vmov r1, s1
ASSERT_EQ_REG(r0, r1)
/* Now a more complex test function. */
/* Arguments: r0 = output buffer, r1/r2 = inputs, r3 = element count. */
ldr r0, =sum
ldr r1, =a1
ldr r2, =a2
mov r3, 8
bl vec_sum
/* The assert works easily because all floats used
 * have exact base-2 representation.
 */
ASSERT_MEMCMP(sum, sum_expect, 0x20)
EXIT
/* void vec_sum(float *sum, float *a1, float *a2, int length) {
 * int i;
 * for (i=0; i < length; i++)
 * *(sum+i) = *(a1+i) + *(a2+i);
 * }
 *
 * Uses the deprecated VFP short vector mode: reconfig sets FPSCR LEN
 * to 8, so each vadd.f32 below operates on banks of 8 registers at
 * once. Assumes length is a multiple of 8 (asr r3, 3 truncates).
 */
vec_sum:
/* Setup */
/* NOTE(review): r4 is saved but never used by this function. */
push {r0, r1, r4, lr}
/* Preserve the argument registers across the reconfig call. */
push {r0, r1}
/* reconfig(stride = 1, length = 8) */
mov r0, 1
mov r1, 8
bl reconfig
pop {r0, r1}
/* 8 elements per iteration: iteration count = length / 8. */
asr r3, 3
/* Do the sum. */
1:
/* Load 8 floats from each input, post-incrementing the pointers. */
fldmias r1!, {s8-s15}
fldmias r2!, {s16-s23}
/* Short vector mode: sums s8-s15 + s16-s23 into s24-s31. */
vadd.f32 s24, s8, s16
fstmias r0!, {s24-s31}
subs r3, r3, 1
bne 1b
/* Teardown. */
bl deconfig
pop {r0, r1, r4, pc}
/* Configure VFP short vector mode via the FPSCR STRIDE and LEN fields.
 *
 * inputs:
 * r0: desired vector stride (1 or 2)
 * r1: desired vector length (min. 1, max. 8)
 * outputs: (none)
 * modified: FPSCR only (r0-r2 are saved and restored)
 * notes:
 * r0 and r1 will be truncated before fitting into FPSCR:
 * STRIDE (bits 21:20) encodes stride 1 as 0b00 and stride 2 as 0b11,
 * LEN (bits 18:16) encodes the length as (length - 1).
 */
reconfig:
push {r0-r2}
/* STRIDE field: (stride & 3) ^ 1 maps 1 -> 0b00 and 2 -> 0b11. */
and r0, r0, 3
eor r0, r0, 1
/* LEN field: length - 1, truncated to 3 bits. */
sub r1, r1, 1
and r1, r1, 7
/* Assemble both fields at their FPSCR bit positions in r0. */
mov r0, r0, lsl 20
orr r0, r0, r1, lsl 16
/* Read-modify-write FPSCR, clearing only STRIDE and LEN
 * (55*65536 == 0x370000, the mask covering both fields).
 */
vmrs r2, fpscr
bic r2, 55*65536
orr r2, r2, r0
/* BUGFIX: write back the merged value r2. The original wrote r0 here,
 * which discarded every other FPSCR bit (rounding mode, cumulative
 * exception flags, ...) instead of only updating STRIDE and LEN, and
 * made the masking work above dead code.
 */
vmsr fpscr, r2
pop {r0-r2}
bx lr
/* Restore scalar mode: reconfig(stride = 1, length = 1).
 * Preserves r0 and r1.
 */
deconfig:
push {r0, r1, lr}
mov r0, 1
mov r1, 1
bl reconfig
pop {r0, r1, pc}