From a59c773124793ec2b37af523ab8bb4cc57eedeff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?=
Date: Wed, 18 Dec 2019 00:00:00 +0000
Subject: [PATCH] userland/arch/aarch64/inline_asm/reg_var.c: use 64-bit
 variables

The 32-bit ones were likely a copy paste error coming from ARMv7.

Also create userland/arch/aarch64/inline_asm/inc_32.c.

Also create aarch64_ldaxr_stlxr.cpp and start documenting LDAXR and
STLXR.
---
 README.adoc                                 | 63 ++++++++++++++++-----
 userland/arch/aarch64/inline_asm/inc_32.c   | 15 +++++
 userland/arch/aarch64/inline_asm/reg_var.c  |  8 +--
 userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp |  2 +
 userland/cpp/atomic/main.hpp                | 19 ++++++-
 5 files changed, 87 insertions(+), 20 deletions(-)
 create mode 100644 userland/arch/aarch64/inline_asm/inc_32.c
 create mode 100644 userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp

diff --git a/README.adoc b/README.adoc
index 8baeee6..1146bd9 100644
--- a/README.adoc
+++ b/README.adoc
@@ -12056,7 +12056,7 @@ The exact same can be achieved with the older hardcoded `--maxinsts` mechanism p
 
 Other related fs.py options are:
 
-* `--abs-max-tick`: set the maximum number of ticks rather than instructions
+* `--abs-max-tick`: set the maximum guest simulation time in ticks rather than instructions. Ticks use the same scale as the ExecAll trace, e.g. for the above example with 3 instructions, the same trace would be achieved with a value of 3000.
 
 The message also shows on <> deadlocks, for example in link:userland/posix/pthread_deadlock.c[]:
@@ -13988,6 +13988,7 @@ In this set of examples, we exemplify various synchronization mechanisms, includ
 
 * link:userland/cpp/atomic/main.hpp[]: contains all the code, which is then specialized in separate `.cpp` files with macros
 * link:userland/cpp/atomic/aarch64_add.cpp[]: non-synchronized aarch64 inline assembly
+* link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[]: see: <>
 * link:userland/cpp/atomic/aarch64_ldadd.cpp[]: synchronized aarch64 inline assembly with the <> LDADD instruction
 * link:userland/cpp/atomic/fail.cpp[]: non-synchronized C++ operator `++`
 * link:userland/cpp/atomic/mutex.cpp[]: synchronized with `std::mutex`
@@ -14287,7 +14288,9 @@ The following sections are related to multithreading in userland:
 ** <>
 * ISA topics:
 ** <>
-** <>
+** <>
+*** <>
+*** <>
 * emulator topics:
 ** <>
 ** <>
@@ -15338,6 +15341,7 @@ Examples under `arch//c/` directories show how to use inline assembly from
 
 * aarch64
 ** link:userland/arch/aarch64/inline_asm/earlyclobber.c[]
 ** link:userland/arch/aarch64/inline_asm/inc.c[]
+** link:userland/arch/aarch64/inline_asm/inc_32.c[]: how to use 32-bit `w` registers in aarch64: we have to add `w` after the `%`, as in `%w[io]` instead of `%[io]`
 ** link:userland/arch/aarch64/inline_asm/multiline.cpp[]
 
 ==== GCC inline assembly register variables
@@ -17468,25 +17472,54 @@ We then download the zip from: https://developer.arm.com/docs/ddi0584/latest/arm
 
 That document then describes the SVE instructions and registers.
 
+=== ARM thread synchronization primitives
+
+Parent section: <>.
+
+==== ARM LDXR and STXR instructions
+
+Parent section: <>
+
+link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[]
+
+LDXR and STXR vs LDAXR and STLXR: https://stackoverflow.com/questions/21535058/arm64-ldxr-stxr-vs-ldaxr-stlxr TODO: understand better and add an example.
+
+LDXR and STXR implement a so-called "Load-link/store-conditional" (LLSC) pattern: https://en.wikipedia.org/wiki/Load-link/store-conditional which appears in many RISC ISAs.
+
+This pattern works basically as follows:
+
+* LDXR marks an address for exclusive access by the current CPU
+* STXR:
+** clears the exclusive marking of other CPUs that may have done LDXR on the address before
+** stores fine if the address is still marked as exclusive for the current CPU, and writes 0 to a third register to indicate success
+** fails to store if the address is no longer marked as exclusive, and writes 1 to the third register to indicate failure
+
+In case of failure, we just have to loop back to just before the LDXR and try again, as sketched below.
+
+This is therefore basically a spinlock, and should only be used to cover very short critical sections such as atomic increments.
+
+C++ `std::atomic` uses this for increments before v8.1 <>: https://stackoverflow.com/questions/56810/how-do-i-start-threads-in-plain-c/52453291#52453291
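+
+As a minimal sketch, an atomic increment with this pattern could look like the following. This is only an illustration and not taken from any real codebase: it assumes the address of the counter is in `x0`, and uses `x1` and `w2` as arbitrary scratch registers:
+
+....
+1:
+    ldxr x1, [x0]     // load *x0 and mark the address for exclusive access
+    add x1, x1, 1     // increment the loaded value
+    stxr w2, x1, [x0] // try to store it back, w2 == 0 indicates success
+    cbnz w2, 1b       // if the exclusive store failed, loop back and retry
+....
+
+The LDAXR and STLXR variants used in link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[] additionally have acquire and release memory ordering semantics.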
+
+[[arm-lse]]
+==== ARM Large System Extensions (LSE)
+
+Set of atomic and synchronization primitives added in <>.
+
+Documented at <> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
+
+* LDADD: link:userland/cpp/atomic/aarch64_ldadd.cpp[], see also: <>. Kernel inspiration: https://github.com/torvalds/linux/blob/v5.4/arch/arm64/include/asm/atomic_lse.h#L56
+
+Bibliography:
+
+* https://preshing.com/20120710/memory-barriers-are-like-source-control-operations/
+
 === ARMv8 architecture extensions
 
 ==== ARMv8.1 architecture extension
 
 <> A1.7.3 "The ARMv8.1 architecture extension"
 
-[[arm-lse]]
-===== ARM Large System Extensions (LSE)
-
-Parent section: <>.
-
-<> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
-
-* LDADD: link:userland/cpp/atomic/aarch64_ldadd.cpp[], see also: <>. Kernel inspiration: https://github.com/torvalds/linux/blob/v5.4/arch/arm64/include/asm/atomic_lse.h#L56
-
-Bibliography:
-
-* https://stackoverflow.com/questions/21535058/arm64-ldxr-stxr-vs-ldaxr-stlxr
-* https://preshing.com/20120710/memory-barriers-are-like-source-control-operations/
+* <>
 
 === ARM assembly bibliography
diff --git a/userland/arch/aarch64/inline_asm/inc_32.c b/userland/arch/aarch64/inline_asm/inc_32.c
new file mode 100644
index 0000000..49da411
--- /dev/null
+++ b/userland/arch/aarch64/inline_asm/inc_32.c
@@ -0,0 +1,15 @@
+/* https://cirosantilli.com/linux-kernel-module-cheat#gcc-inline-assembly */
+
+#include <assert.h>
+#include <stdint.h>
+
+int main(void) {
+    uint32_t io = 1;
+    __asm__ (
+        "add %w[io], %w[io], 1;"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
diff --git a/userland/arch/aarch64/inline_asm/reg_var.c b/userland/arch/aarch64/inline_asm/reg_var.c
index e5adecb..6bdbe72 100644
--- a/userland/arch/aarch64/inline_asm/reg_var.c
+++ b/userland/arch/aarch64/inline_asm/reg_var.c
@@ -4,10 +4,10 @@
 #include <stdint.h>
 
 int main(void) {
-    register uint32_t x0 __asm__ ("x0");
-    register uint32_t x1 __asm__ ("x1");
-    uint32_t new_x0;
-    uint32_t new_x1;
+    register uint64_t x0 __asm__ ("x0");
+    register uint64_t x1 __asm__ ("x1");
+    uint64_t new_x0;
+    uint64_t new_x1;
     {
         x0 = 1;
         x1 = 2;
diff --git a/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp b/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp
new file mode 100644
index 0000000..2d43cb2
--- /dev/null
+++ b/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp
@@ -0,0 +1,2 @@
+#define LKMC_USERLAND_ATOMIC_LDAXR_STLXR 1
+#include "main.hpp"
diff --git a/userland/cpp/atomic/main.hpp b/userland/cpp/atomic/main.hpp
index 4a01d2b..9011bf5 100644
--- a/userland/cpp/atomic/main.hpp
+++ b/userland/cpp/atomic/main.hpp
@@ -51,11 +51,28 @@ void threadMain() {
         :
         :
     );
+#elif LKMC_USERLAND_ATOMIC_LDAXR_STLXR
+    // This is what std::atomic used for increments before LSE LDADD was added.
+    uint64_t scratch64;
+    uint32_t scratch32;
+    __asm__ __volatile__ (
+        "1:"
+        "ldaxr %[scratch64], [%[addr]];" // load global and mark it for exclusive access
+        "add %[scratch64], %[scratch64], 1;" // increment the loaded value
+        "stlxr %w[scratch32], %[scratch64], [%[addr]];" // try to store it back, 0 on success
+        "cbnz %w[scratch32], 1b;" // if the exclusive store failed, retry
+        : "=m" (global), // indicate that global is modified
+          "+g" (i), // to prevent loop unrolling
+          [scratch64] "=&r" (scratch64),
+          [scratch32] "=&r" (scratch32)
+        : [addr] "r" (&global)
+        :
+    );
 #elif LKMC_USERLAND_ATOMIC_AARCH64_LDADD
     // https://cirosantilli.com/linux-kernel-module-cheat#arm-lse
     __asm__ __volatile__ (
         "ldadd %[inc], xzr, [%[addr]];"
-        : "=m" (global),
+        : "=m" (global), // indicate that global is modified
           "+g" (i) // to prevent loop unrolling
         : [inc] "r" (1),
           [addr] "r" (&global)