From 3fdd83c2c58327d9714fa2347c724b78d7c05e2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= <ciro.santilli@gmail.com>
Date: Wed, 26 Jun 2019 00:00:00 +0000
Subject: [PATCH] Rationalize -mcpu for emulators, compilers and assemblers on
 ARM

Move SVE example in from arm-assembly-cheat.

atomic.cpp aarch64 add LSE ldadd placeholder, not compiling yet
---
 README.adoc                    | 54 ++++++++++++++++++++++++++++++++++
 build-baremetal                |  1 -
 common.py                      | 37 +++++++++++++++++++++--
 path_properties.py             |  6 ++--
 run                            | 15 +++++-----
 userland/arch/aarch64/sve.S    | 49 ++++++++++++++++++++++++++++++
 userland/arch/x86_64/cmpxchg.S |  3 +-
 userland/cpp/atomic.cpp        | 22 +++++++++++---
 8 files changed, 168 insertions(+), 19 deletions(-)
 create mode 100644 userland/arch/aarch64/sve.S
diff --git a/README.adoc b/README.adoc
index 34d3357..2822420 100644
--- a/README.adoc
+++ b/README.adoc
@@ -14006,6 +14006,60 @@ There are analogous LD3 and LD4 instruction.
 * assembly optimized libraries:
 ** https://github.com/projectNe10/Ne10
 
+==== ARM SVE
+
+Example: link:userland/arch/aarch64/sve.S[]
+
+Scalable Vector Extension.
+
+aarch64 only, newer than <<arm-neon>>.
+
+It is called Scalable because it does not specify the vector width! Therefore we don't have to worry about new vector width instructions every few years! Hurray!
+
+The instructions then allow implicitly tracking the loop index without knowing the actual vector length.
+
+Added to QEMU user mode in 3.0.0.
+
+TODO announcement date. Possibly 2017: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf There is also a 2016 mention: https://community.arm.com/tools/hpc/b/hpc/posts/technology-update-the-scalable-vector-extension-sve-for-the-armv8-a-architecture
+
+The Linux kernel shows `/proc/cpuinfo` compatibility as `sve`.
+
+Official spec: https://developer.arm.com/docs/100891/latest/sve-overview/introducing-sve
+
+===== SVE bibliography
+
+* https://www.rico.cat/files/ICS18-gem5-sve-tutorial.pdf step-by-step walkthrough of a complete code execution example, the best initial tutorial so far
+* https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf paper with a few nice concrete examples, illustrations and rationale
+* https://static.docs.arm.com/dui0965/c/DUI0965C_scalable_vector_extension_guide.pdf
+* https://developer.arm.com/products/software-development-tools/hpc/documentation/writing-inline-sve-assembly quick inlining guide
+
+====== SVE spec
+
+<<armarm8>> A1.7 "ARMv8 architecture extensions" says:
+
+____
+SVE is an optional extension to ARMv8.2. That is, SVE requires the implementation of ARMv8.2.
+____
+
+A1.7.8 "The Scalable Vector Extension (SVE)" then says that only changes to the existing registers are described in that manual, and that you should look instead at the "ARM Architecture Reference Manual Supplement, The Scalable Vector Extension (SVE), for ARMv8-A."
+
+We then download the zip from: https://developer.arm.com/docs/ddi0584/latest/arm-architecture-reference-manual-supplement-the-scalable-vector-extension-sve-for-armv8-a and it contains the PDF: `DDI0584A_d_SVE_supp_armv8A.pdf` which we use here.
+
+That document then describes the SVE instructions and registers.
+
+=== ARMv8 architecture extensions
+
+==== ARMv8.1 architecture extension
+
+<<armarm8-db>> A1.7.3 "The ARMv8.1 architecture extension"
+
+[[arm-lse]]
+===== ARM Large System Extensions (LSE)
+
+<<armarm8-db>> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
+
+* LDADD: link:userland/cpp/atomic.cpp[]
+
 === ARM assembly bibliography
 
 ==== ARM non-official bibliography
diff --git a/build-baremetal b/build-baremetal
index ec9d78a..32ddbb3 100755
--- a/build-baremetal
+++ b/build-baremetal
@@ -48,7 +48,6 @@ Build the baremetal examples with crosstool-NG.
         cc_flags = [
             '-I', self.env['root_dir'], LF,
             '-O{}'.format(self.env['optimization_level']), LF,
-            '-mcpu={}'.format(self.env['mcpu']), LF,
             '-nostartfiles', LF,
         ]
         if self.env['arch'] == 'arm':
diff --git a/common.py b/common.py
index a245d8f..1402992 100644
--- a/common.py
+++ b/common.py
@@ -658,20 +658,40 @@ Incompatible archs are skipped.
             else:
                 env['gem5_build_id'] = consts['default_build_id']
         env['is_arm'] = False
+        # Our approach is as follows:
+        #
+        # * compilers: control the maximum arch version emitted explicitly with -march
+        # +
+        # This helps to prevent blowing up simulation unnecessarily.
+        # +
+        # It does not matter if we miss any perf features for QEMU which is functional,
+        # but it could matter for gem5 perf simulations.
+        # * assemblers: enable as many features as possible.
+        # +
+        # Well, if I'm explicitly writing down the instructions, I want
+        # my emulator to blow up in peace!
+        # * emulators: enable as many features as possible
+        # +
+        # This is the gem5 default behavior, for QEMU TODO not sure if default,
+        # but we select it explicitly with -cpu max.
+        # https://habkost.net/posts/2017/03/qemu-cpu-model-probing-story.html
+        # +
+        # We do this because QEMU does not add all possible Cortex Axx, there are
+        # just too many, and gem5 does not allow selecting lower feature sets in general.
         if env['arch'] == 'arm':
             env['armv'] = 7
-            env['mcpu'] = 'cortex-a15'
             env['buildroot_toolchain_prefix'] = 'arm-buildroot-linux-gnueabihf'
             env['crosstool_ng_toolchain_prefix'] = 'arm-unknown-eabi'
             env['ubuntu_toolchain_prefix'] = 'arm-linux-gnueabihf'
             env['is_arm'] = True
+            env['march'] = 'armv8-a'
         elif env['arch'] == 'aarch64':
             env['armv'] = 8
-            env['mcpu'] = 'cortex-a57'
             env['buildroot_toolchain_prefix'] = 'aarch64-buildroot-linux-gnu'
             env['crosstool_ng_toolchain_prefix'] = 'aarch64-unknown-elf'
             env['ubuntu_toolchain_prefix'] = 'aarch64-linux-gnu'
             env['is_arm'] = True
+            env['march'] = 'armv8-a+lse'
         elif env['arch'] == 'x86_64':
             env['crosstool_ng_toolchain_prefix'] = 'x86_64-unknown-elf'
             env['gem5_arch'] = 'X86'
@@ -1545,6 +1565,10 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
                     cc_flags_after.extend(['-pthread', LF])
             if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
                 cc_flags.extend(my_path_properties['cc_flags'])
+                if self.env['verbose']:
+                    cc_flags.extend([
+                        '-v', LF,
+                    ])
                 cc_flags_after.extend(my_path_properties['cc_flags_after'])
                 if my_path_properties['cc_pedantic']:
                     cc_flags.extend(['-pedantic', LF])
@@ -1557,6 +1581,15 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
                 elif in_ext == self.env['cxx_ext']:
                     cc = self.env['gxx_path']
                     std = my_path_properties['cxx_std']
+                if self.env['is_arm']:
+                    if in_ext == self.env['asm_ext']:
+                        cc_flags.extend([
+                            '-Xassembler', '-march=all', LF,
+                        ])
+                    else:
+                        cc_flags.extend([
+                            '-march={}'.format(self.env['march']), LF,
+                        ])
                 if dirpath_relative_root_components_len > 0:
                     if dirpath_relative_root_components[0] == 'userland':
                         if dirpath_relative_root_components_len > 1:
diff --git a/path_properties.py b/path_properties.py
index d9ddb3c..56afd45 100644
--- a/path_properties.py
+++ b/path_properties.py
@@ -344,7 +344,6 @@ path_properties_tuples = (
                             {
                                 'allowed_archs': {'arm'},
                                 'cc_flags': [
-                                    '-Xassembler', '-mcpu=cortex-a72', LF,
                                     # To prevent:
                                     # > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode
                                     # https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732
@@ -383,7 +382,9 @@ path_properties_tuples = (
                             }
                         ),
                         'aarch64': (
-                            {'allowed_archs': {'aarch64'}},
+                            {
+                                'allowed_archs': {'aarch64'},
+                            },
                             {
                                 'inline_asm': (
                                     {
@@ -399,6 +400,7 @@ path_properties_tuples = (
                                     'signal_generated_by_os': True,
                                     'signal_received': signal.Signals.SIGILL,
                                 },
+                                'sve.S': {'gem5_unimplemented_instruction': True}
                             }
                         ),
                         'x86_64': (
diff --git a/run b/run
index 7c0d05d..711552f 100755
--- a/run
+++ b/run
@@ -576,6 +576,7 @@ Extra options to append at the end of the emulator command line.
                     qemu_user_and_system_options +
                     debug_args
                 )
+                cpu = 'max'
             else:
                 extra_emulator_args.extend(extra_qemu_args)
                 self.make_run_dirs()
@@ -594,9 +595,11 @@ Extra options to append at the end of the emulator command line.
                         serial_monitor = ['-serial', serial, LF]
                 if self.env['kvm']:
                     extra_emulator_args.extend([
-                        '-cpu', 'host', LF,
                         '-enable-kvm', LF,
                     ])
+                    cpu = 'host'
+                else:
+                    cpu = 'max'
                 extra_emulator_args.extend([
                     '-serial',
                     'tcp::{},server,nowait'.format(self.env['extra_serial_port']), LF
@@ -706,19 +709,15 @@ Extra options to append at the end of the emulator command line.
                     ])
                 elif self.env['is_arm']:
                     extra_emulator_args.extend(['-semihosting', LF])
-                    if self.env['arch'] == 'arm':
-                        cpu = 'cortex-a15'
-                    else:
-                        cpu = 'cortex-a57'
                     append = ['-append', '{} {}'.format(root, kernel_cli), LF]
                     cmd.extend(
-                        [
-                            '-cpu', cpu, LF,
-                        ] +
                         virtio_gpu_pci
                     )
                 if self.env['baremetal'] is None:
                     cmd.extend(append)
+            extra_emulator_args.extend([
+                '-cpu', cpu, LF,
+            ])
         if self.env['tmux']:
             tmux_args = '--run-id {}'.format(self.env['run_id'])
             if self.env['tmux_program'] == 'shell':
diff --git a/userland/arch/aarch64/sve.S b/userland/arch/aarch64/sve.S
new file mode 100644
index 0000000..9173fe9
--- /dev/null
+++ b/userland/arch/aarch64/sve.S
@@ -0,0 +1,49 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#arm-sve */
+
+#include <lkmc.h>
+
+.data
+    x: .double        1.5,  2.5,  3.5,  4.5
+    y: .double        5.0,  6.0,  7.0,  8.0
+    y_expect: .double 8.0, 11.0, 14.0, 17.0
+    a: .double        2.0
+    n: .word          4
+
+LKMC_PROLOGUE
+    adr x0, x                           /* x0 = address of the input vector x */
+    adr x1, y                           /* x1 = address of y, which daxpy updates in place */
+    adr x2, a                           /* x2 = address of the scalar a */
+    adr x3, n                           /* x3 = address of the element count n */
+    bl daxpy                            /* y += a * x */
+    LKMC_ASSERT_MEMCMP(y, y_expect, =0x20) /* 0x20 bytes = 4 doubles of 8 bytes each */
+LKMC_EPILOGUE
+
+/* Multiply by a scalar and add.
+ *
+ * Operation:
+ *
+ *      Y += a * X
+ *
+ * C signature:
+ *
+ *      void daxpy(double *x, double *y, double *a, int *n)
+ *
+ * The name "daxpy" comes from LAPACK:
+ * http://www.netlib.org/lapack/explore-html/de/da4/group__double__blas__level1_ga8f99d6a644d3396aa32db472e0cfc91c.html
+ *
+ * Adapted from: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf
+ */
+daxpy:
+    ldrsw x3, [x3]                      /* x3 = *n, sign-extended from 32 to 64 bits */
+    mov x4, 0                           /* x4 = i, index of the next element to process */
+    whilelt p0.d, x4, x3                /* p0 = one active 64-bit lane per index in [i, n) */
+    ld1rd z0.d, p0/z, [x2]              /* z0 = *a replicated into every active lane */
+.loop:
+    ld1d z1.d, p0/z, [x0, x4, lsl 3]    /* z1 = x[i...], inactive lanes zeroed */
+    ld1d z2.d, p0/z, [x1, x4, lsl 3]    /* z2 = y[i...], inactive lanes zeroed */
+    fmla z2.d, p0/m, z1.d, z0.d         /* z2 += z1 * z0 on the active lanes */
+    st1d z2.d, p0, [x1, x4, lsl 3]      /* store the active lanes back to y[i...] */
+    incd x4                             /* i += number of 64-bit lanes in one vector */
+    whilelt p0.d, x4, x3                /* recompute the predicate for the remaining elements */
+    b.first .loop                       /* loop again while the first lane is still active */
+    ret
diff --git a/userland/arch/x86_64/cmpxchg.S b/userland/arch/x86_64/cmpxchg.S
index 595b928..3735273 100644
--- a/userland/arch/x86_64/cmpxchg.S
+++ b/userland/arch/x86_64/cmpxchg.S
@@ -1,4 +1,4 @@
-/* https://github.com/cirosantilli/linux-kernel-module-cheat#cmpxchg-instruction */
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-cmpxchg-instruction */
 
 #include <lkmc.h>
 
@@ -24,5 +24,4 @@ LKMC_PROLOGUE
     LKMC_ASSERT_EQ(%rax, $0)
     LKMC_ASSERT_EQ(%r13, $2)
     LKMC_ASSERT_EQ(%r14, $2)
-
 LKMC_EPILOGUE
diff --git a/userland/cpp/atomic.cpp b/userland/cpp/atomic.cpp
index cb5635b..c98f874 100644
--- a/userland/cpp/atomic.cpp
+++ b/userland/cpp/atomic.cpp
@@ -1,5 +1,4 @@
 // https://github.com/cirosantilli/linux-kernel-module-cheat#cpp
-// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
 //
 // The non-atomic counters have undefined values which get printed:
 // they are extremely likely to be less than the correct value due to
@@ -15,7 +14,6 @@
 // On GCC 4.8 x86-64, using atomic offered a 5x peformance improvement
 // over the same program with mutexes.
 
-
 #if __cplusplus >= 201103L
 #include <atomic>
 #include <cassert>
@@ -24,7 +22,7 @@
 #include <vector>
 std::atomic_ulong my_atomic_ulong(0);
 unsigned long my_non_atomic_ulong = 0;
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
 unsigned long my_arch_atomic_ulong = 0;
 unsigned long my_arch_non_atomic_ulong = 0;
 #endif
@@ -41,6 +39,7 @@ void threadMain() {
             :
             :
         );
+        // https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
         __asm__ __volatile__ (
             "lock;"
             "incq %0;"
@@ -48,6 +47,21 @@ void threadMain() {
             :
             :
         );
+#elif defined(__aarch64__)
+        __asm__ __volatile__ (
+            "add %0, %0, 1;"
+            : "+r" (my_arch_non_atomic_ulong)
+            :
+            :
+        );
+        // https://github.com/cirosantilli/linux-kernel-module-cheat#arm-lse
+        __asm__ __volatile__ (
+            "ldadd %[inc], xzr, [%[addr]];"
+            : "=m" (my_arch_atomic_ulong)
+            : [inc] "r" (1),
+              [addr] "r" (&my_arch_atomic_ulong)
+            :
+        );
 #endif
     }
 }
@@ -75,7 +89,7 @@ int main(int argc, char **argv) {
     // We can also use the atomics direclty through `operator T` conversion.
     assert(my_atomic_ulong == my_atomic_ulong.load());
     std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
     assert(my_arch_atomic_ulong == nthreads * niters);
     std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
 #endif