From a59c773124793ec2b37af523ab8bb4cc57eedeff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?=
Date: Wed, 18 Dec 2019 00:00:00 +0000
Subject: [PATCH] userland/arch/aarch64/inline_asm/reg_var.c: use 64-bit
 variables

The 32-bit ones were likely a copy paste error coming from ARMv7.

Also create userland/arch/aarch64/inline_asm/inc_32.c.

Also create aarch64_ldaxr_stlxr.cpp and start documenting LDAXR and
STLXR.
---
 README.adoc                                 | 63 ++++++++++++++++-----
 userland/arch/aarch64/inline_asm/inc_32.c   | 15 +++++
 userland/arch/aarch64/inline_asm/reg_var.c  |  8 +--
 userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp |  2 +
 userland/cpp/atomic/main.hpp                | 19 ++++++-
 5 files changed, 87 insertions(+), 20 deletions(-)
 create mode 100644 userland/arch/aarch64/inline_asm/inc_32.c
 create mode 100644 userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp

diff --git a/README.adoc b/README.adoc
index 8baeee6..1146bd9 100644
--- a/README.adoc
+++ b/README.adoc
@@ -12056,7 +12056,7 @@ The exact same can be achieved with the older hardcoded `--maxinsts` mechanism p
 
 Other related fs.py options are:
 
-* `--abs-max-tick`: set the maximum number of ticks rather than instructions
+* `--abs-max-tick`: set the maximum guest simulation time in ticks rather than instructions. Ticks use the same scale as the ExecAll trace, e.g. for the above example with 3 instructions, the same trace would be achieved with a value of 3000.
 
 The message also shows on <> deadlocks, for example in link:userland/posix/pthread_deadlock.c[]:
@@ -13988,6 +13988,7 @@ In this set of examples, we exemplify various synchronization mechanisms, includ
 
 * link:userland/cpp/atomic/main.hpp[]: contains all the code, which is then specialized in separate `.cpp` files with macros
 * link:userland/cpp/atomic/aarch64_add.cpp[]: non-synchronized aarch64 inline assembly
+* link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[]: see: <>
 * link:userland/cpp/atomic/aarch64_ldadd.cpp[]: synchronized aarch64 inline assembly with the <> LDADD instruction
 * link:userland/cpp/atomic/fail.cpp[]: non-synchronized C++ operator `++`
 * link:userland/cpp/atomic/mutex.cpp[]: synchronized with `std::mutex`
@@ -14287,7 +14288,9 @@ The following sections are related to multithreading in userland:
 ** <>
 * ISA topics:
 ** <>
-** <>
+** <>
+*** <>
+*** <>
 * emulator topics:
 ** <>
 ** <>
@@ -15338,6 +15341,7 @@ Examples under `arch//c/` directories show how to use inline assembly from
 
 * aarch64
 ** link:userland/arch/aarch64/inline_asm/earlyclobber.c[]
 ** link:userland/arch/aarch64/inline_asm/inc.c[]
+** link:userland/arch/aarch64/inline_asm/inc_32.c[]: how to use 32-bit `w` registers in aarch64: we have to add `w` after the `%`, as in `%w[io]` instead of `%[io]`
 ** link:userland/arch/aarch64/inline_asm/multiline.cpp[]
 
 ==== GCC inline assembly register variables
@@ -17468,25 +17472,54 @@ We then download the zip from: https://developer.arm.com/docs/ddi0584/latest/arm
 
 That document then describes the SVE instructions and registers.
 
+=== ARM thread synchronization primitives
+
+Parent section: <>.
+
+==== ARM LDXR and STXR instructions
+
+Parent section: <>
+
+link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[]
+
+LDXR and STXR vs LDAXR and STLXR: https://stackoverflow.com/questions/21535058/arm64-ldxr-stxr-vs-ldaxr-stlxr TODO: understand better and add an example.
+
+LDXR and STXR implement a so-called "Load-link/store-conditional" (LLSC) pattern: https://en.wikipedia.org/wiki/Load-link/store-conditional which appears in many RISC ISAs.
+
+This pattern works basically as follows:
+
+* LDXR marks an address for exclusive access by the current CPU
+* STXR:
+** clears the exclusive marking of other CPUs that may have done LDXR on the address before
+** stores fine if the address is still marked as exclusive for the current CPU, and writes 0 to a third register to indicate success
+** fails to store if the address is no longer marked as exclusive, and writes 1 to the third register to indicate failure
+
+In case of failure, we just have to loop back to just before the LDXR and try again, as sketched below.
+
+This is therefore basically a spinlock, and should only be used to cover very short critical sections such as atomic increments.
+
+C++ `std::atomic` uses this for increments before v8.1 <>: https://stackoverflow.com/questions/56810/how-do-i-start-threads-in-plain-c/52453291#52453291
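+
+As a minimal sketch, an atomic increment with this pattern could look like the following. This is only an illustration and not taken from any real codebase: it assumes the address of the counter is in `x0`, and uses `x1` and `w2` as arbitrary scratch registers:
+
+....
+1:
+    ldxr x1, [x0]     // load *x0 and mark the address for exclusive access
+    add x1, x1, 1     // increment the loaded value
+    stxr w2, x1, [x0] // try to store it back, w2 == 0 indicates success
+    cbnz w2, 1b       // if the exclusive store failed, loop back and retry
+....
+
+The LDAXR and STLXR variants used in link:userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp[] additionally have acquire and release memory ordering semantics.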
+
+[[arm-lse]]
+==== ARM Large System Extensions (LSE)
+
+Set of atomic and synchronization primitives added in <>.
+
+Documented at <> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
+
+* LDADD: link:userland/cpp/atomic/aarch64_ldadd.cpp[], see also: <>. Kernel inspiration: https://github.com/torvalds/linux/blob/v5.4/arch/arm64/include/asm/atomic_lse.h#L56
+
+Bibliography:
+
+* https://preshing.com/20120710/memory-barriers-are-like-source-control-operations/
+
 === ARMv8 architecture extensions
 
 ==== ARMv8.1 architecture extension
 
 <> A1.7.3 "The ARMv8.1 architecture extension"
 
-[[arm-lse]]
-===== ARM Large System Extensions (LSE)
-
-Parent section: <>.
-
-<> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
-
-* LDADD: link:userland/cpp/atomic/aarch64_ldadd.cpp[], see also: <>. Kernel inspiration: https://github.com/torvalds/linux/blob/v5.4/arch/arm64/include/asm/atomic_lse.h#L56
-
-Bibliography:
-
-* https://stackoverflow.com/questions/21535058/arm64-ldxr-stxr-vs-ldaxr-stlxr
-* https://preshing.com/20120710/memory-barriers-are-like-source-control-operations/
+* <>
 
 === ARM assembly bibliography
diff --git a/userland/arch/aarch64/inline_asm/inc_32.c b/userland/arch/aarch64/inline_asm/inc_32.c
new file mode 100644
index 0000000..49da411
--- /dev/null
+++ b/userland/arch/aarch64/inline_asm/inc_32.c
@@ -0,0 +1,15 @@
+/* https://cirosantilli.com/linux-kernel-module-cheat#gcc-inline-assembly */
+
+#include <assert.h>
+#include <stdint.h>
+
+int main(void) {
+    uint32_t io = 1;
+    __asm__ (
+        "add %w[io], %w[io], 1;"
+        : [io] "+r" (io)
+        :
+        :
+    );
+    assert(io == 2);
+}
diff --git a/userland/arch/aarch64/inline_asm/reg_var.c b/userland/arch/aarch64/inline_asm/reg_var.c
index e5adecb..6bdbe72 100644
--- a/userland/arch/aarch64/inline_asm/reg_var.c
+++ b/userland/arch/aarch64/inline_asm/reg_var.c
@@ -4,10 +4,10 @@
 #include <stdint.h>
 
 int main(void) {
-    register uint32_t x0 __asm__ ("x0");
-    register uint32_t x1 __asm__ ("x1");
-    uint32_t new_x0;
-    uint32_t new_x1;
+    register uint64_t x0 __asm__ ("x0");
+    register uint64_t x1 __asm__ ("x1");
+    uint64_t new_x0;
+    uint64_t new_x1;
     {
         x0 = 1;
         x1 = 2;
diff --git a/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp b/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp
new file mode 100644
index 0000000..2d43cb2
--- /dev/null
+++ b/userland/cpp/atomic/aarch64_ldaxr_stlxr.cpp
@@ -0,0 +1,2 @@
+#define LKMC_USERLAND_ATOMIC_LDAXR_STLXR 1
+#include "main.hpp"
diff --git a/userland/cpp/atomic/main.hpp b/userland/cpp/atomic/main.hpp
index 4a01d2b..9011bf5 100644
--- a/userland/cpp/atomic/main.hpp
+++ b/userland/cpp/atomic/main.hpp
@@ -51,11 +51,28 @@ void threadMain() {
         :
         :
     );
+#elif LKMC_USERLAND_ATOMIC_LDAXR_STLXR
+    // This is what std::atomic used for increments before LSE LDADD was added.
+    uint64_t scratch64;
+    uint32_t scratch32;
+    __asm__ __volatile__ (
+        "1:"
+        "ldaxr %[scratch64], [%[addr]];" // load global and mark it for exclusive access
+        "add %[scratch64], %[scratch64], 1;" // increment the loaded value
+        "stlxr %w[scratch32], %[scratch64], [%[addr]];" // try to store it back, 0 on success
+        "cbnz %w[scratch32], 1b;" // if the exclusive store failed, retry
+        : "=m" (global), // indicate that global is modified
+          "+g" (i), // to prevent loop unrolling
+          [scratch64] "=&r" (scratch64),
+          [scratch32] "=&r" (scratch32)
+        : [addr] "r" (&global)
+        :
+    );
 #elif LKMC_USERLAND_ATOMIC_AARCH64_LDADD
     // https://cirosantilli.com/linux-kernel-module-cheat#arm-lse
     __asm__ __volatile__ (
         "ldadd %[inc], xzr, [%[addr]];"
-        : "=m" (global),
+        : "=m" (global), // indicate that global is modified
           "+g" (i) // to prevent loop unrolling
         : [inc] "r" (1),
           [addr] "r" (&global)