userland: add assembly support

Move arm assembly cheat here, and start some work on x86 cheat as well.
2026-01-27 20:14:27 +01:00 · 2019-03-22 00:00:00 +00:00
parent 4943c9ed2e
commit 287c83f3f9
117 changed files with 3870 additions and 547 deletions
--- a/userland/arch/aarch64/add.S
+++ b/userland/arch/aarch64/add.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#data-processing-instructions */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    add x1, x0, 2       /* x1 = x0 + 2, x0 itself is left untouched. */
+    ASSERT_EQ(x1, 3)
+EXIT
--- a/userland/arch/aarch64/adr.S
+++ b/userland/arch/aarch64/adr.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+.data
+data_label:
+    .word 0x1234678
+ENTRY
+    /* This is not possible in v7 because the label is in another section.
+     * objdump reports an R_AARCH64_ADR_PRE* relocation here (name truncated;
+     * presumably R_AARCH64_ADR_PREL_LO21 -- TODO confirm), which looks
+     * specific to ADR, so v7 likely had no equivalent relocation.
+     *
+     * This relocation is particularly important because str does not have a
+     * pc-relative mode in ARMv8.
+     */
+    adr x0, data_label   /* Address of data_label computed relative to the PC. */
+    ldr x1, =data_label  /* Same address obtained through the =label pseudo-op. */
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/adrp.S
+++ b/userland/arch/aarch64/adrp.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#adr */
+
+#include "common.h"
+
+ENTRY
+    adrp x0, label
+    adr x1, label
+label:
+    /* Clear the lower 12 bits of the adr result with a single mask,
+     * so that it can be compared with what adrp produced in x0. */
+    and x1, x1, 0xFFFFFFFFFFFFF000
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/asm_hello.c
+++ b/userland/arch/aarch64/asm_hello.c
@@ -1,13 +0,0 @@
-#include <assert.h>
-#include <inttypes.h>
-
-int main(void) {
-    uint32_t myvar = 1;
-    __asm__ (
-        "add %[myvar], %[myvar], 1;"
-        : [myvar] "=r" (myvar)
-        :
-        :
-    );
-    assert(myvar == 2);
-}
--- a/userland/arch/aarch64/beq.S
+++ b/userland/arch/aarch64/beq.S
@@ -0,0 +1,33 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+    /* cbz with x0 == 0: the branch is taken and skips FAIL. */
+    mov x0, 0
+    cbz x0, 1f
+    FAIL
+1:
+
+    /* cbz with x0 != 0: not taken, so b 2f runs and skips FAIL. */
+    mov x0, 1
+    cbz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+
+    /* cbnz with x0 != 0: the branch is taken and skips FAIL. */
+    mov x0, 1
+    cbnz x0, 1f
+    FAIL
+1:
+
+    /* cbnz with x0 == 0: not taken, so b 2f runs and skips FAIL. */
+    mov x0, 0
+    cbnz x0, 1f
+    b 2f
+1:
+    FAIL
+2:
+EXIT
--- a/userland/arch/aarch64/bfi.S
+++ b/userland/arch/aarch64/bfi.S
@@ -0,0 +1,11 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bfi */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+    /* bfi x1, x0, 16, 32: copy the low 32 bits of x0 into bits 16..47 of x1. */
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    bfi x1, x0, 16, 32
+    ASSERT_EQ(x1, 0xFFFF55667788FFFF) /* Bits outside 16..47 keep their old value. */
+EXIT
--- a/userland/arch/aarch64/c/asm_from_c.c
+++ b/userland/arch/aarch64/c/asm_from_c.c
@@ -0,0 +1,39 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#calling-convention */
+
+#include <assert.h>
+#include <inttypes.h>
+
+uint64_t my_asm_func(void);
+/* Equivalent C body: { return 42; } */
+__asm__(
+    ".global my_asm_func;"
+    "my_asm_func:"
+    "mov x0, 42;"
+    "ret;"
+);
+
+/* Now a more complex example that also calls a C function.
+ * We have to save the return address in x30 because bl overwrites it.
+ * https://stackoverflow.com/questions/27941220/push-lr-and-pop-lr-in-arm-arch64/34504752#34504752
+ * We are not modifying any other callee saved register in this function,
+ * and my_c_func does not either (unless GCC has a bug ;-)), so everything else is fine.
+ */
+uint64_t my_asm_func_2(void);
+/* Equivalent C body: { return my_c_func(); } */
+__asm__(
+    ".global my_asm_func_2;"
+    "my_asm_func_2:"
+    "str x30, [sp, -16]!;" /* Push x30; sp moves by 16 bytes. */
+    "bl my_c_func;"        /* Result is left in x0 and returned as is. */
+    "ldr x30, [sp], 16;"   /* Pop x30 so that ret goes back to our caller. */
+    "ret;"
+);
+
+uint64_t my_c_func(void) {
+    return 42;
+}
+
+int main(void) {
+    assert(my_asm_func() == 42);
+    assert(my_asm_func_2() == 42);
+}
--- a/userland/arch/aarch64/c/build
+++ b/userland/arch/aarch64/c/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/earlyclobber.c
+++ b/userland/arch/aarch64/c/earlyclobber.c
@@ -0,0 +1,21 @@
+/* An example of using the '&' earlyclobber modifier.
+ * https://stackoverflow.com/questions/15819794/when-to-use-earlyclobber-constraint-in-extended-gcc-inline-assembly/54853663#54853663
+ * The assertion may fail without it. It actually does fail in GCC 8.2.0 at
+ * 34017bcd0bc96a3cf77f6acba4d58350e67c2694 + 1.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t in = 1;
+    uint64_t out;
+    __asm__ (
+        "add %[out], %[in], 1;" /* Writes out while in is still needed below. */
+        "add %[out], %[in], 1;" /* Reads in again: wrong if out shared its register. */
+        : [out] "=&r" (out)     /* & marks out as earlyclobber. */
+        : [in] "r" (in)
+        :
+    );
+    assert(out == 2);
+}
--- a/userland/arch/aarch64/c/freestanding/build
+++ b/userland/arch/aarch64/c/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/c/freestanding/hello.c
+++ b/userland/arch/aarch64/c/freestanding/hello.c
@@ -0,0 +1,37 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#freestanding-linux-inline-assembly-system-calls */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write: the length excludes the terminating NUL of the string literal. */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        register uint64_t x0 __asm__ ("x0") = 1; /* stdout */
+        register char *x1 __asm__ ("x1") = msg;
+        register uint64_t x2 __asm__ ("x2") = sizeof(msg) - 1; /* no NUL */
+        register uint64_t x8 __asm__ ("x8") = 64; /* syscall number */
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x1), "r" (x2), "r" (x8)
+            : "memory"
+        );
+        syscall_return = x0;
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit: status is 0 only if write reported the full length. */
+    {
+        register uint64_t x0 __asm__ ("x0") = exit_status;
+        register uint64_t x8 __asm__ ("x8") = 93;
+        __asm__ __volatile__ (
+            "svc 0;"
+            : "+r" (x0)
+            : "r" (x8)
+            :
+        );
+    }
+}
--- a/userland/arch/aarch64/c/freestanding/hello_clobbers.c
+++ b/userland/arch/aarch64/c/freestanding/hello_clobbers.c
@@ -0,0 +1,40 @@
+/* Like hello.c trying to do it without named register variables.
+ * The code is more complicated, and I was not able to get as efficient,
+ * so better just stick to named register variables.
+ */
+
+#include <inttypes.h>
+
+void _start(void) {
+    uint64_t exit_status;
+
+    /* write */
+    {
+        char msg[] = "hello\n";
+        uint64_t syscall_return;
+        __asm__ (
+            "mov x0, 1;" /* stdout */
+            "mov x1, %[msg];"
+            "mov x2, %[len];"
+            "mov x8, 64;" /* syscall number */
+            "svc 0;"
+            "mov %[syscall_return], x0;"
+            : [syscall_return] "=r" (syscall_return)
+            : [msg] "p" (msg),
+            [len] "i" (sizeof(msg) - 1) /* Do not write the terminating NUL. */
+            : "x0", "x1", "x2", "x8", "memory"
+        );
+        exit_status = (syscall_return != sizeof(msg) - 1);
+    }
+
+    /* exit: status is 0 only if write reported the full length. */
+    __asm__ (
+        "mov x0, %[exit_status];"
+        "mov x8, 93;" /* syscall number */
+        "svc 0;"
+        :
+        : [exit_status] "r" (exit_status)
+        : "x0", "x8"
+    );
+}
+
--- a/userland/arch/aarch64/c/inc.c
+++ b/userland/arch/aarch64/c/inc.c
@@ -0,0 +1,13 @@
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 1;
+    __asm__ (
+        "add %[io], %[io], 1;"
+        : [io] "+r" (io) /* "+r": io is both read and written by the asm. */
+        :
+        :
+    );
+    assert(io == 2);
+}
--- a/userland/arch/aarch64/c/inc_float.c
+++ b/userland/arch/aarch64/c/inc_float.c
@@ -0,0 +1,28 @@
+/* https://stackoverflow.com/questions/53960240/armv8-floating-point-output-inline-assembly
+ *
+ * We use the undocumented %s and %d modifiers!
+ */
+
+#include <assert.h>
+
+int main(void) {
+    float my_float = 1.5;
+    __asm__ (
+        "fmov s0, 1.0;"
+        "fadd %s[my_float], %s[my_float], s0;" /* %s prints the operand as an s register. */
+        : [my_float] "+w" (my_float)
+        :
+        : "s0" /* s0 is used as scratch inside the asm. */
+    );
+    assert(my_float == 2.5);
+
+    double my_double = 1.5;
+    __asm__ (
+        "fmov d0, 1.0;"
+        "fadd %d[my_double], %d[my_double], d0;" /* %d prints the operand as a d register. */
+        : [my_double] "+w" (my_double)
+        :
+        : "d0" /* d0 is used as scratch inside the asm. */
+    );
+    assert(my_double == 2.5);
+}
--- a/userland/arch/aarch64/c/multiline.cpp
+++ b/userland/arch/aarch64/c/multiline.cpp
@@ -0,0 +1,18 @@
+// https://stackoverflow.com/questions/3666013/how-to-write-multiline-inline-assembly-code-in-gcc-c/54575948#54575948
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    uint64_t io = 0;
+    __asm__ ( /* Raw string literal: one instruction per source line, no ; or \n needed. */
+        R"(
+add %[io], %[io], #1
+add %[io], %[io], #1
+)"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2); /* Two increments of 1 starting from 0. */
+}
--- a/userland/arch/aarch64/c/reg_var.c
+++ b/userland/arch/aarch64/c/reg_var.c
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register uint32_t x0 __asm__ ("x0"); /* Pinned to the register named x0. */
+    register uint32_t x1 __asm__ ("x1"); /* Pinned to the register named x1. */
+    uint32_t new_x0;
+    uint32_t new_x1;
+    {
+        x0 = 1;
+        x1 = 2;
+        __asm__ (
+            "add %[x0], x0, #1;" /* Source register is hard-coded, relying on the pinning. */
+            "add %[x1], x1, #1;"
+            : [x0] "+r" (x0),
+              [x1] "+r" (x1)
+            :
+            :
+        );
+        new_x0 = x0; /* Copy out to ordinary variables before asserting. */
+        new_x1 = x1;
+    }
+    assert(new_x0 == 2);
+    assert(new_x1 == 3);
+}
--- a/userland/arch/aarch64/c/reg_var_float.c
+++ b/userland/arch/aarch64/c/reg_var_float.c
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#register-variables */
+
+#include <assert.h>
+#include <inttypes.h>
+
+int main(void) {
+    register double d0 __asm__ ("d0"); /* Pinned to the register named d0. */
+    register double d1 __asm__ ("d1"); /* Pinned to the register named d1. */
+    double new_d0;
+    double new_d1;
+    {
+        d0 = 1.5;
+        d1 = 2.5;
+        __asm__ (
+            "fmov d2, 1.5;"        /* d2 is scratch, hence the clobber below. */
+            "fadd %d[d0], d0, d2;" /* Source register is hard-coded, relying on the pinning. */
+            "fadd %d[d1], d1, d2;"
+            : [d0] "+w" (d0),
+              [d1] "+w" (d1)
+            :
+            : "d2"
+        );
+        new_d0 = d0; /* Copy out to ordinary variables before asserting. */
+        new_d1 = d1;
+    }
+    assert(new_d0 == 3.0);
+    assert(new_d1 == 4.0);
+}
--- a/userland/arch/aarch64/cbz.S
+++ b/userland/arch/aarch64/cbz.S
@@ -0,0 +1,19 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cbz */
+
+#include "common.h"
+
+ENTRY
+
+    /* Branch: x0 is zero, so cbz jumps over FAIL. */
+    mov x0, 0x0
+    cbz x0, ok
+    FAIL
+ok:
+
+    /* Don't branch: x0 is non-zero, so execution falls through to EXIT. */
+    mov x0, 0x1
+    cbz x0, ko
+
+EXIT
+ko:
+    FAIL
--- a/userland/arch/aarch64/comments.S
+++ b/userland/arch/aarch64/comments.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#comments */
+
+#include "common.h"
+ENTRY
+    # mycomment
+
+    /* ARMv8 uses // instead of @ for comments. */
+    // mycomment
+    nop // mycomment
+
+    /* All of these fail to assemble. Lol, different than v7, no consistency. */
+#if 0
+    nop # mycomment
+    @ mycomment
+    nop @ mycomment
+#endif
+EXIT
--- a/userland/arch/aarch64/common_arch.h
+++ b/userland/arch/aarch64/common_arch.h
@@ -0,0 +1,64 @@
+#ifndef COMMON_ARCH_H
+#define COMMON_ARCH_H
+
+#define ASSERT_EQ(reg, const) /* Compare reg with const via x11, then ASSERT(beq). Clobbers x11 and flags. */ \
+    ldr x11, =const; \
+	cmp reg, x11; \
+	ASSERT(beq); \
+;
+
+#define ASSERT_MEMCMP(s1, s2, n) /* Run MEMCMP on labels s1 and s2, then require x0 == 0. */ \
+	MEMCMP(s1, s2, n); \
+	ASSERT_EQ(x0, 0); \
+;
+
+#define ENTRY /* Define global asm_main and save registers in a 0xA0-byte stack frame. */ \
+.text; \
+.global asm_main; \
+asm_main: \
+    sub  sp, sp, 0xA0; \
+    stp  x29, x30, [sp]; \
+    stp  x27, x28, [sp, 0x10]; \
+    stp  x25, x26, [sp, 0x20]; \
+    stp  x23, x24, [sp, 0x30]; \
+    stp  x21, x22, [sp, 0x40]; \
+    stp  x19, x20, [sp, 0x50]; \
+    stp  x6, x7, [sp, 0x60]; \
+    stp  x4, x5, [sp, 0x70]; \
+    stp  x2, x3, [sp, 0x80]; \
+    stp  x0, x1, [sp, 0x90]; /* The fail path of EXIT reloads this saved x0. */ \
+asm_main_after_prologue: \
+;
+
+#define EXIT /* Return 0 in w0; via fail, store w0 through the saved x0 and return 1. */ \
+    mov w0, 0; \
+    mov w1, 0; \
+    b pass; \
+fail: \
+    ldr x1, [sp, 0x90]; /* x0 as it was saved by ENTRY. */ \
+    str w0, [x1]; /* w0 holds the __LINE__ value loaded by FAIL. */ \
+    mov w0, 1; \
+pass: \
+    ldp x19, x20, [sp, 0x50]; \
+    ldp x21, x22, [sp, 0x40]; \
+    ldp x23, x24, [sp, 0x30]; \
+    ldp x25, x26, [sp, 0x20]; \
+    ldp x27, x28, [sp, 0x10]; \
+    ldp x29, x30, [sp]; \
+    add sp, sp, 0xA0; /* Release the frame allocated by ENTRY. */ \
+    ret; \
+;
+
+#define FAIL /* Load the current source line into w0 and jump to the fail label of EXIT. */ \
+    ldr w0, =__LINE__; \
+    b fail; \
+;
+
+#define MEMCMP(s1, s2, n) /* Call memcmp on labels s1 and s2 with length n. Overwrites x0-x2 and x30. */ \
+    adr x0, s1; \
+    adr x1, s2; \
+    ldr x2, =n; \
+    bl memcmp; \
+;
+
+#endif
--- a/userland/arch/aarch64/cset.S
+++ b/userland/arch/aarch64/cset.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#cset */
+
+#include "common.h"
+
+ENTRY
+    /* Test values: x0 and x1 differ, so cmp x0, x1 is "not equal". */
+    mov x0, 0
+    mov x1, 1
+
+    /* eq is true, set x2 = 1. */
+    cmp x0, x0
+    cset x2, eq
+    ASSERT_EQ(x2, 1)
+
+    /* eq is false, set x2 = 0. */
+    cmp x0, x1
+    cset x2, eq
+    ASSERT_EQ(x2, 0)
+
+    /* Same for ne: the results are inverted relative to eq. */
+    cmp x0, x0
+    cset x2, ne
+    ASSERT_EQ(x2, 0)
+
+    cmp x0, x1
+    cset x2, ne
+    ASSERT_EQ(x2, 1)
+EXIT
--- a/userland/arch/aarch64/empty.S
+++ b/userland/arch/aarch64/empty.S
@@ -0,0 +1 @@
+../empty.S
--- a/userland/arch/aarch64/fail.S
+++ b/userland/arch/aarch64/fail.S
@@ -0,0 +1 @@
+../fail.S
--- a/userland/arch/aarch64/floating_point.S
+++ b/userland/arch/aarch64/floating_point.S
@@ -0,0 +1,60 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 1.5 + 2.5 == 4.0
+     * using 64-bit double immediates.
+     */
+    fmov d0, 1.5
+    fmov d1, 2.5
+    fadd d2, d0, d1
+    fmov d3, 4.0
+    /* Unlike VFP vcmp, fcmp sets the main condition flags directly,
+     * so the ASSERT(beq) below can test the result immediately.
+     */
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now with a memory stored value. */
+.data
+my_double_0:
+    .double 1.5
+my_double_1:
+    .double 2.5
+my_double_sum_expect:
+    .double 4.0
+.text
+    ldr d0, my_double_0
+    ldr d1, my_double_1
+    fadd d2, d0, d1
+    ldr d3, my_double_sum_expect
+    fcmp d2, d3
+    ASSERT(beq)
+
+    /* Now in 32-bit: same computation with the s register views. */
+    fmov s0, 1.5
+    fmov s1, 2.5
+    fadd s2, s0, s1
+    fmov s3, 4.0
+    fcmp s2, s3
+    ASSERT(beq)
+
+    /* TODO why? What's the point of q then?
+     * Error: operand mismatch -- `fmov q0,1.5'
+     */
+#if 0
+    fmov q0, 1.5
+#endif
+
+    /* Much like integers, immediates are constrained to
+     * fit in 32-bit instructions. TODO exact rules.
+     *
+     * Assembly here would fail with:
+     *
+     * Error: invalid floating-point constant at operand 2
+     */
+#if 0
+    fmov d0, 1.23456798
+#endif
+EXIT
--- a/userland/arch/aarch64/freestanding/build
+++ b/userland/arch/aarch64/freestanding/build
@@ -0,0 +1 @@
+../build
--- a/userland/arch/aarch64/freestanding/hello.S
+++ b/userland/arch/aarch64/freestanding/hello.S
@@ -0,0 +1,20 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#linux-system-calls */
+
+.text
+.global _start
+_start:
+asm_main_after_prologue:
+    /* write */
+    mov x0, 1     /* stdout */
+    adr x1, msg   /* buffer */
+    ldr x2, =len  /* len */
+    mov x8, 64    /* syscall number */
+    svc 0
+
+    /* exit */
+    mov x0, 0     /* exit status */
+    mov x8, 93    /* syscall number */
+    svc 0
+msg:
+    .ascii "hello\n"
+len = . - msg /* Number of bytes emitted since msg. */
--- a/userland/arch/aarch64/hello_driver.S
+++ b/userland/arch/aarch64/hello_driver.S
@@ -0,0 +1,6 @@
+.text
+.global asm_main
+asm_main:
+asm_main_after_prologue:
+    mov w0, 0 /* Set w0 to 0 and return immediately; nothing else is executed. */
+    ret
--- a/userland/arch/aarch64/immediates.S
+++ b/userland/arch/aarch64/immediates.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#immediates */
+
+#include "common.h"
+ENTRY
+    mov x0, 1     /* Decimal immediate without the # prefix. */
+    mov x0, 0x1   /* Hexadecimal immediate without the # prefix. */
+    mov x0, #1    /* The # prefix is optional: same instruction as above. */
+    mov x0, #0x1
+EXIT
--- a/userland/arch/aarch64/movk.S
+++ b/userland/arch/aarch64/movk.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movk */
+
+#include "common.h"
+
+ENTRY
+    movk x0, 0x4444, lsl 0  /* Each movk replaces one 16-bit field, keeping the rest. */
+    movk x0, 0x3333, lsl 16
+    movk x0, 0x2222, lsl 32
+    movk x0, 0x1111, lsl 48 /* All four fields are now set, whatever x0 held before. */
+    ASSERT_EQ(x0, 0x1111222233334444)
+
+    /* Set a label (addresses are 48-bit) with immediates:
+     *
+     * * https://stackoverflow.com/questions/38570495/aarch64-relocation-prefixes
+     * * https://sourceware.org/binutils/docs-2.26/as/AArch64_002dRelocations.html
+     *
+     * This could be used if the label is too far away for
+     * adr relative addressing.
+     */
+    movz x0, :abs_g2:label     /* bits 32-47, overflow check */
+    movk x0, :abs_g1_nc:label  /* bits 16-31, no overflow check */
+    movk x0, :abs_g0_nc:label  /* bits  0-15, no overflow check */
+    adr x1, label              /* Reference value: same label via adr. */
+label:
+    ASSERT_EQ_REG(x0, x1)
+EXIT
--- a/userland/arch/aarch64/movn.S
+++ b/userland/arch/aarch64/movn.S
@@ -0,0 +1,9 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#movn */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x123456789ABCDEF0
+    movn x0, 0x8888, lsl 16 /* x0 = ~(0x8888 << 16); the old x0 value is discarded. */
+    ASSERT_EQ(x0, 0xFFFFFFFF7777FFFF)
+EXIT
--- a/userland/arch/aarch64/pc.S
+++ b/userland/arch/aarch64/pc.S
@@ -0,0 +1,78 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#registers */
+
+#include "common.h"
+
+ENTRY
+#if 0
+    /* Unlike v7, we can't use PC like any other register in ARMv8,
+     * since it is not a general purpose register anymore.
+     *
+     * Only branch instructions can modify the PC.
+     *
+     * B1.2.1 "Registers in AArch64 state" says:
+     *
+     * Software cannot write directly to the PC. It
+     * can only be updated on a branch, exception entry or
+     * exception return.
+     */
+    ldr pc, =10f
+    FAIL
+10:
+#endif
+#if 0
+    mov x0, pc
+#endif
+
+    /* LDR PC-relative loads exist in ARMv8, but they have a separate encoding
+     * "LDR (literal)" instead of "LDR (immediate)":
+     * https://stackoverflow.com/questions/28638981/howto-write-pc-relative-adressing-on-arm-asm/54480999#54480999
+     */
+    ldr x0, pc_relative_ldr
+    b 1f /* Jump over the data word embedded in the code. */
+pc_relative_ldr:
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Just for fun, we can also use relative numbers instead of labels.
+     * https://reverseengineering.stackexchange.com/questions/17666/how-does-the-ldr-instruction-work-on-arm/20567#20567
+     */
+    ldr x0, 0x8 /* Targets the .quad placed right after the following b. */
+    b 1f
+    .quad 0x123456789ABCDEF0
+1:
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+
+    /* Analogous for b with PC. */
+    mov x0, 0
+    /* Jumps over mov to ASSERT_EQ. */
+    b 8
+    mov x0, 1
+    ASSERT_EQ(x0, 0)
+
+    /* Trying to use the old "LDR (immediate)" PC-relative
+     * syntax does not work.
+     */
+#if 0
+    /* 64-bit integer or SP register expected at operand 2 -- `ldr x0,[pc]' */
+    ldr x0, [pc]
+#endif
+
+    /* There is however no analogue for str. TODO rationale? */
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,pc_relative_str' */
+    str x0, pc_relative_str
+#endif
+
+    /* You just have to use adr to get the address and then str through that register. */
+    ldr x0, pc_relative_str
+    ASSERT_EQ(x0, 0x0) /* Initial value from the .data definition below. */
+    adr x1, pc_relative_str
+    ldr x0, pc_relative_ldr
+    str x0, [x1]
+    ldr x0, pc_relative_str /* Read back what was just stored. */
+    ASSERT_EQ(x0, 0x123456789ABCDEF0)
+EXIT
+.data
+pc_relative_str:
+    .quad 0x0000000000000000
--- a/userland/arch/aarch64/regs.S
+++ b/userland/arch/aarch64/regs.S
@@ -0,0 +1,47 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-registers */
+
+#include "common.h"
+
+ENTRY
+
+    /* 31 64-bit eXtended general purpose registers: x0 to x29 here, x30 below. */
+    mov x0, 0
+    mov x1, 1
+    mov x2, 2
+    mov x3, 3
+    mov x4, 4
+    mov x5, 5
+    mov x6, 6
+    mov x7, 7
+    mov x8, 8
+    mov x9, 9
+    mov x10, 10
+    mov x11, 11
+    mov x12, 12
+    mov x13, 13
+    mov x14, 14
+    mov x15, 15
+    mov x16, 16
+    mov x17, 17
+    mov x18, 18
+    mov x19, 19
+    mov x20, 20
+    mov x21, 21
+    mov x22, 22
+    mov x23, 23
+    mov x24, 24
+    mov x25, 25
+    mov x26, 26
+    mov x27, 27
+    mov x28, 28
+    mov x29, 29
+
+    /* x30 is the link register. BL stores the return address here. */
+    /*mov x30, 30*/
+
+    /* W form addresses the lower 4 bytes word, and zeroes the top. */
+    ldr x0, =0x1111222233334444
+    ldr x1, =0x5555666677778888
+    mov w0, w1 /* Only the low word of x1 is copied; the old top of x0 is lost. */
+    ASSERT_EQ(x0, 0x0000000077778888)
+EXIT
--- a/userland/arch/aarch64/ret.S
+++ b/userland/arch/aarch64/ret.S
@@ -0,0 +1,28 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#bl */
+
+#include "common.h"
+
+ENTRY
+    mov x0, 1
+    bl inc
+    ASSERT_EQ(x0, 2)
+    bl inc2
+    ASSERT_EQ(x0, 3)
+    bl inc3
+    ASSERT_EQ(x0, 4)
+EXIT
+
+/* uint64_t inc(uint64_t i) { return i + 1; } */
+inc:
+    add x0, x0, 1
+    ret
+
+/* Same but explicit return register. */
+inc2:
+    add x0, x0, 1
+    ret x30
+
+/* Same but with br. */
+inc3:
+    add x0, x0, 1
+    br x30
--- a/userland/arch/aarch64/simd.S
+++ b/userland/arch/aarch64/simd.S
@@ -0,0 +1,86 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
+
+#include "common.h"
+
+ENTRY
+    /* 4x 32-bit integer add.
+     *
+     * s stands for single == 32 bits.
+     *
+     * 1 in ld1 means to load just one register, see:
+     * https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving
+     */
+.data
+    u32_0:          .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
+    u32_1:          .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
+    u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
+.bss
+    u32_sum:        .skip 16
+.text
+    adr x0, u32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, u32_1
+    ld1 {v1.4s}, [x1]
+    add v2.4s, v0.4s, v1.4s /* Each lane wraps around independently, see the expected values. */
+    adr x0, u32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
+
+    /* 2x 64-bit integer add.
+     *
+     * d stands for double == 64 bits.
+     */
+.data
+    u64_0:          .quad 0xF1111111F1111111, 0xF2222222F2222222
+    u64_1:          .quad 0x1555555515555555, 0x1666666616666666
+    u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
+.bss
+    u64_sum: .skip 16
+.text
+    adr x0, u64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, u64_1
+    ld1 {v1.2d}, [x1]
+    add v2.2d, v0.2d, v1.2d
+    adr x0, u64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
+
+    /* 4x 32-bit float add.
+     *
+     * The only difference from the integer version
+     * is that we use fadd instead of add.
+     */
+.data
+    f32_0:          .float 1.5, 2.5,  3.5,  4.5
+    f32_1:          .float 5.5, 6.5,  7.5,  8.5
+    f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
+.bss
+    f32_sum: .skip 16
+.text
+    adr x0, f32_0
+    ld1 {v0.4s}, [x0]
+    adr x1, f32_1
+    ld1 {v1.4s}, [x1]
+    fadd v2.4s, v0.4s, v1.4s
+    adr x0, f32_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
+
+    /* 2x 64-bit float add. */
+.data
+    f64_0:          .double 1.5, 2.5
+    f64_1:          .double 5.5, 6.5
+    f64_sum_expect: .double 7.0, 9.0
+.bss
+    f64_sum: .skip 16
+.text
+    adr x0, f64_0
+    ld1 {v0.2d}, [x0]
+    adr x1, f64_1
+    ld1 {v1.2d}, [x1]
+    fadd v2.2d, v0.2d, v1.2d
+    adr x0, f64_sum
+    st1 {v2.2d}, [x0]
+    ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/simd_interleave.S
+++ b/userland/arch/aarch64/simd_interleave.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving */
+
+#include "common.h"
+
+ENTRY
+.data
+    u32_interleave: .word \
+        0x11111111, 0x55555555, \
+        0x22222222, 0x66666666, \
+        0x33333333, 0x77777777, \
+        0x44444444, 0x88888888
+    u32_interleave_sum_expect: .word \
+        0x66666666, \
+        0x88888888, \
+        0xAAAAAAAA, \
+        0xCCCCCCCC
+.bss
+    u32_interleave_sum: .skip 16
+.text
+    adr x0, u32_interleave
+    ld2 {v0.4s, v1.4s}, [x0] /* Even-indexed words go to v0, odd-indexed words to v1. */
+    add v2.4s, v0.4s, v1.4s  /* So each lane adds the two words of one input row. */
+    adr x0, u32_interleave_sum
+    st1 {v2.4s}, [x0]
+    ASSERT_MEMCMP(u32_interleave_sum, u32_interleave_sum_expect, 0x10)
+EXIT
--- a/userland/arch/aarch64/str.S
+++ b/userland/arch/aarch64/str.S
@@ -0,0 +1,13 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#armv8-str */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, myvar /* A PC-relative load from a label assembles fine. */
+    ASSERT_EQ(x0, 0x12346789ABCDEF0)
+#if 0
+    /* Error: invalid addressing mode at operand 2 -- `str x0,myvar' */
+    str x0, myvar
+#endif
+EXIT
+    myvar: .quad 0x12346789ABCDEF0
--- a/userland/arch/aarch64/ubfm.S
+++ b/userland/arch/aarch64/ubfm.S
@@ -0,0 +1,17 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfm */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+
+    /* lsr alias: with imms == 63, ubfm x1, x0, N, 63 shifts x0 right by N bits. */
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 16, 63
+    ASSERT_EQ(x1, 0x0000112233445566)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfm x1, x0, 32, 63
+    ASSERT_EQ(x1, 0x0000000011223344)
+EXIT
--- a/userland/arch/aarch64/ubfx.S
+++ b/userland/arch/aarch64/ubfx.S
@@ -0,0 +1,15 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#ubfx */
+
+#include "common.h"
+
+ENTRY
+    ldr x0, =0x1122334455667788
+    /* ubfx dst, src, lsb, width: extract width bits starting at lsb, zeroing the rest. */
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 16
+    ASSERT_EQ(x1, 0x0000000000006677)
+
+    ldr x1, =0xFFFFFFFFFFFFFFFF
+    ubfx x1, x0, 8, 32
+    ASSERT_EQ(x1, 0x0000000044556677)
+EXIT
--- a/userland/arch/aarch64/x31.S
+++ b/userland/arch/aarch64/x31.S
@@ -0,0 +1,51 @@
+/* https://github.com/cirosantilli/arm-assembly-cheat#x31 */
+
+#include "common.h"
+
+ENTRY
+    /* ERROR: can never use the name x31. */
+#if 0
+    mov x31, 31
+#endif
+
+    /* mov (register) is an alias for ORR, which accepts xzr. */
+    mov x0, 1
+    mov x0, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* Same encoding as the mov version. */
+    mov x0, 1
+    orr x0, xzr, xzr
+    ASSERT_EQ(x0, 0)
+
+    /* So, orr, which is not an alias, can only take xzr, not sp. */
+#if 0
+    orr sp, sp, sp
+#endif
+
+    /* Zero register discards result if written to. */
+    mov x0, 1
+    orr xzr, x0, x0
+    ASSERT_EQ(xzr, 0)
+
+    /* MOV (to/from SP) is an alias for ADD (immediate). */
+    mov x0, sp
+    mov sp, 1
+    /* Alias to add. */
+    mov x1, sp
+    /* Exact same encoding as above. */
+    add x1, sp, 0
+    mov sp, x0 /* Restore sp first: the fail path of EXIT loads from [sp, 0x90]. */
+    ASSERT_EQ(x1, 1)
+
+    /* So, ADD (immediate), which is not an alias, can only take sp, not xzr. */
+#if 0
+    /* Error: integer register expected in the extended/shifted operand register at operand 3 -- `add xzr,xzr,1' */
+    add xzr, xzr, 1
+#endif
+
+    /* Note however that ADD (register), unlike ADD (immediate),
+     * does not say anything about SP, and so does accept xzr just fine.
+     */
+    add xzr, xzr, xzr
+EXIT