Rationalize -mcpu for emulators, compilers and assemblers on ARM

Move SVE example in from arm-assembly-cheat.

atomic.cpp: add an aarch64 LSE ldadd placeholder, not compiling yet.
Ciro Santilli 六四事件 法轮功
2019-06-26 00:00:00 +00:00
parent ce3d546ac8
commit 3fdd83c2c5
8 changed files with 168 additions and 19 deletions

@@ -0,0 +1,49 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#arm-sve */
#include <lkmc.h>
.data
x: .double 1.5, 2.5, 3.5, 4.5
y: .double 5.0, 6.0, 7.0, 8.0
y_expect: .double 8.0, 11.0, 14.0, 17.0 /* y[i] + a * x[i] */
a: .double 2.0
n: .word 4
LKMC_PROLOGUE
adr x0, x
adr x1, y
adr x2, a
adr x3, n
bl daxpy
LKMC_ASSERT_MEMCMP(y, y_expect, =0x20)
LKMC_EPILOGUE
/* Multiply a vector by a scalar and add it to another vector.
*
* Operation:
*
* Y += a * X
*
* C signature:
*
* void daxpy(double *x, double *y, double *a, int *n)
*
 * The name "daxpy" (double-precision a*X plus Y) comes from the BLAS level 1
 * routine of the same name:
 * http://www.netlib.org/lapack/explore-html/de/da4/group__double__blas__level1_ga8f99d6a644d3396aa32db472e0cfc91c.html
*
* Adapted from: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf
*/
daxpy:
ldrsw x3, [x3]                   /* x3 = n, sign-extended from 32 bits */
mov x4, 0                        /* x4 = i = 0 */
whilelt p0.d, x4, x3             /* p0 = lanes where i < n */
ld1rd z0.d, p0/z, [x2]           /* z0 = broadcast of the scalar a */
.loop:
ld1d z1.d, p0/z, [x0, x4, lsl 3] /* z1 = x[i..] */
ld1d z2.d, p0/z, [x1, x4, lsl 3] /* z2 = y[i..] */
fmla z2.d, p0/m, z1.d, z0.d      /* z2 += z1 * z0, i.e. y[i] += a * x[i] */
st1d z2.d, p0, [x1, x4, lsl 3]   /* store back to y[i..] */
incd x4                          /* i += number of doubles per SVE vector */
whilelt p0.d, x4, x3             /* recompute the predicate for the tail */
b.first .loop                    /* loop while the first lane is still active */
ret
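
For reference, the scalar loop that the SVE kernel above vectorizes would look something like the following plain C++ sketch (my own illustration matching the commented C signature; daxpy_scalar is a hypothetical name, not part of the commit):

// Scalar reference: y[i] += a * x[i] for i in [0, n).
void daxpy_scalar(double *x, double *y, double *a, int *n) {
    for (int i = 0; i < *n; ++i) {
        y[i] += *a * x[i];
    }
}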

@@ -1,4 +1,4 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#cmpxchg-instruction */
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-cmpxchg-instruction */
#include <lkmc.h>
@@ -24,5 +24,4 @@ LKMC_PROLOGUE
LKMC_ASSERT_EQ(%rax, $0)
LKMC_ASSERT_EQ(%r13, $2)
LKMC_ASSERT_EQ(%r14, $2)
LKMC_EPILOGUE

@@ -1,5 +1,4 @@
// https://github.com/cirosantilli/linux-kernel-module-cheat#cpp
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
//
// The non-atomic counters have undefined values which get printed:
// they are extremely likely to be less than the correct value due to
@@ -15,7 +14,6 @@
// On GCC 4.8 x86-64, using std::atomic offered a 5x performance improvement
// over the same program with mutexes.
#if __cplusplus >= 201103L
#include <atomic>
#include <cassert>
@@ -24,7 +22,7 @@
#include <vector>
std::atomic_ulong my_atomic_ulong(0);
unsigned long my_non_atomic_ulong = 0;
#if defined(__x86_64__)
#if defined(__x86_64__) || defined(__aarch64__)
unsigned long my_arch_atomic_ulong = 0;
unsigned long my_arch_non_atomic_ulong = 0;
#endif
@@ -41,6 +39,7 @@ void threadMain() {
:
:
);
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
__asm__ __volatile__ (
"lock;"
"incq %0;"
@@ -48,6 +47,21 @@ void threadMain() {
:
:
);
#elif defined(__aarch64__)
__asm__ __volatile__ (
"add %0, %0, 1;"
: "+r" (my_arch_non_atomic_ulong)
:
:
);
// https://github.com/cirosantilli/linux-kernel-module-cheat#arm-lse
__asm__ __volatile__ (
"ldadd %[inc], xzr, [%[addr]];"
: "=m" (my_arch_atomic_ulong)
: [inc] "r" (1),
[addr] "r" (&my_arch_atomic_ulong)
:
);
#endif
}
}
@@ -75,7 +89,7 @@ int main(int argc, char **argv) {
// We can also use the atomics directly through `operator T` conversion.
assert(my_atomic_ulong == my_atomic_ulong.load());
std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
#if defined(__x86_64__)
#if defined(__x86_64__) || defined(__aarch64__)
assert(my_arch_atomic_ulong == nthreads * niters);
std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
#endif
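
A note on the aarch64 LSE block above: the commit message flags it as a placeholder that does not compile yet. A minimal sketch of a version that should assemble, assuming the build enables LSE (e.g. -march=armv8.1-a, where LSE is mandatory), could look like this; the name lse_increment and the standalone main are illustrative only, not the commit's eventual code:

// Sketch only. Compile on aarch64 with something like: g++ -march=armv8.1-a lse_sketch.cpp
unsigned long my_arch_atomic_ulong = 0;

void lse_increment(unsigned long *addr) {
    unsigned long inc = 1;
    __asm__ __volatile__ (
        // LDADD Xs, XZR, [Xn]: atomically add Xs to the memory at [Xn],
        // discarding the old value (same effect as the STADD alias).
        "ldadd %[inc], xzr, [%[addr]]"
        : /* no register outputs */
        : [inc] "r" (inc),
          [addr] "r" (addr)
        : "memory"
    );
}

int main() {
    lse_increment(&my_arch_atomic_ulong);
    return my_arch_atomic_ulong == 1 ? 0 : 1;
}

For the simple counter in atomic.cpp the same effect can also be had portably through std::atomic or the __atomic builtins, which GCC can lower to LSE instructions when the selected -march permits it.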