From 0d5c7f5c4c7983116519da0b322cdace0d428a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Fri, 5 Jun 2020 06:00:05 +0000 Subject: [PATCH] Detailed gem5 analysis of how data races happen And pass niters as a thread argument to all threading implementations... otherwise every loop has to do a memory load from the global! --- README.adoc | 143 +++++++++++++++++++++++++++++++- userland/c/atomic.c | 11 ++- userland/c/snprintf.c | 8 +- userland/cpp/atomic/main.hpp | 8 +- userland/kernel_modules/ioctl.c | 4 + userland/posix/pthread_mutex.c | 7 +- 6 files changed, 162 insertions(+), 19 deletions(-) diff --git a/README.adoc b/README.adoc index cad3f4f..adfbf63 100644 --- a/README.adoc +++ b/README.adoc @@ -17253,12 +17253,74 @@ Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-d Added in C11! -* link:userland/c/atomic.c[]: `atomic_int` and `thrd_create` - Bibliography: +* <> * https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354 +===== atomic.c + +link:userland/c/atomic.c[] + +Demonstrates `atomic_int` and `thrd_create`. + +Disassembly with GDB at LKMC 619fef4b04bddc4a5a38aec5e207dd4d5a25d206 + 1: + +.... +./run-toolchain \ + --arch aarch64 gdb \ + -- \ + -batch \ + -ex 'disas/rs my_thread_main' $(./getvar \ + --arch aarch64 userland_build_dir)/c/atomic.out \ +; +.... + +shows on ARM: + +.... +16 ++cnt; + 0x00000000004008cc <+28>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008d0 <+32>: 00 80 01 91 add x0, x0, #0x60 + 0x00000000004008d4 <+36>: 00 00 40 b9 ldr w0, [x0] + 0x00000000004008d8 <+40>: 01 04 00 11 add w1, w0, #0x1 + 0x00000000004008dc <+44>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008e0 <+48>: 00 80 01 91 add x0, x0, #0x60 + 0x00000000004008e4 <+52>: 01 00 00 b9 str w1, [x0] + +17 ++acnt; + 0x00000000004008e8 <+56>: 20 00 80 52 mov w0, #0x1 // #1 + 0x00000000004008ec <+60>: e0 1b 00 b9 str w0, [sp, #24] + 0x00000000004008f0 <+64>: e0 1b 40 b9 ldr w0, [sp, #24] + 0x00000000004008f4 <+68>: e2 03 00 2a mov w2, w0 + 0x00000000004008f8 <+72>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008fc <+76>: 00 70 01 91 add x0, x0, #0x5c + 0x0000000000400900 <+80>: 03 00 e2 b8 ldaddal w2, w3, [x0] + 0x0000000000400904 <+84>: 61 00 02 0b add w1, w3, w2 + 0x0000000000400908 <+88>: e0 03 01 2a mov w0, w1 + 0x000000000040090c <+92>: e0 1f 00 b9 str w0, [sp, #28] +.... + +so: + +* the atomic increment uses <> +* the non-atomic increment just does LDR, ADD, STR: <> + +With `-O3`: + +.... +16 ++cnt; + 0x0000000000400a00 <+32>: 60 00 40 b9 ldr w0, [x3] + 0x0000000000400a04 <+36>: 00 04 00 11 add w0, w0, #0x1 + 0x0000000000400a08 <+40>: 60 00 00 b9 str w0, [x3] + +17 ++acnt; + 0x0000000000400a0c <+44>: 20 00 80 52 mov w0, #0x1 // #1 + 0x0000000000400a10 <+48>: 40 00 e0 b8 ldaddal w0, w0, [x2] +.... + +so the situation is the same but without all the horrible stack noise. + ==== GCC C extensions ===== C empty struct @@ -17480,6 +17542,83 @@ Bibliography: * https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?" +====== Detailed gem5 analysis of how data races happen + +The smallest data race we managed to come up as of LKMC 7c01b29f1ee7da878c7cc9cb4565f3f3cf516a92 and gem5 872cb227fdc0b4d60acc7840889d567a6936b6e1 was with link:userland/c/atomic.c[] (see also <>): + +.... 
+./run \
+  --arch aarch64 \
+  --cli-args '2 10' \
+  --cpus 3 \
+  --emulator gem5 \
+  --userland userland/c/atomic.c \
+;
+....
+
+which outputs:
+
+....
+atomic 20
+non-atomic 19
+....
+
+Note that the system is very minimal, and doesn't even have caches, so I'm curious as to how this can happen at all.
+
+A variant of this run, using short flags, the `o3` userland build, `ExecAll` tracing, and this time with caches and `TimingSimpleCPU`:
+
+....
+./run -aA -eg -u userland/c/atomic.c --cli-args '2 200' --cpus 3 --userland-build-id o3 -N1 --trace ExecAll -- --caches --cpu-type TimingSimpleCPU
+....
+
+So first we do a run with <> and look at the `my_thread_main` entries.
+
+From there we see that CPU1 enters the function first, since it was spawned first.
+
+Then, for some time, both CPU1 and CPU2 run at the same time.
+
+Finally, CPU1 exits, then CPU2 runs alone for a while to finish its loops, and then CPU2 exits as well.
+
+By grepping the log for the LDR data reads, we can easily spot the moment where things started to go wrong based on the `D=` data:
+
+....
+grep -E 'my_thread_main\+36' trace.txt > trace-ldr.txt
+....
+
+The `grep` output contains:
+
+....
+94024500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000006 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94036500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000007 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94048500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000008 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94070500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94082500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000b A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+....
+
+and so we see that it is at `94058500` that things started going bad, since two consecutive loads from different CPUs read the same value `D=9`! Actually, things were not too bad afterwards, because this was by coincidence the last CPU1 read: we would have missed many more increments if the number of iterations had been larger.
+
+Now that we have the first bad timestamp, let's look at the full disassembly to better understand what happens around that point.
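+
+One possible way to extract that window is to filter `trace.txt` on the tick range around the bad load, e.g. (just a sketch, with the tick bounds hand-picked from the `grep` output above):
+
+....
+awk -F: '$1 >= 94058500 && $1 <= 94060500' trace.txt
+....
+
+The relevant lines are:
+
+....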
+94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad) +94059000: system.cpu2: A0 T0 : @my_thread_main+40 : add w1, w0, #1 : IntAlu : D=0x000000000000000a flags=(IsInteger) +94059000: system.cpu1: A0 T0 : @my_thread_main+120 : b.cc : IntAlu : flags=(IsControl|IsDirectControl|IsCondControl) +94059500: system.cpu1: A0 T0 : @my_thread_main+28 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger) +94059500: system.cpu2: A0 T0 : @my_thread_main+44 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger) +94060000: system.cpu2: A0 T0 : @my_thread_main+48 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger) +94060000: system.cpu1: A0 T0 : @my_thread_main+32 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger) +94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad) +94060500: system.cpu2: A0 T0 : @my_thread_main+52 : str x1, [x0] : MemWrite : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsStore) +.... + +and from this, all becomes crystal clear: + +* 94058500: CPU2 loads +* 94060500: CPU1 loads +* 94060500: CPU2 stores + +so we see that CPU2 just happened to store after CPU1 loads. + +We also understand why LDADD solves the race problem in AtomicSimpleCPU: it does the load and store in one single go! + [[cpp-memory-order]] ===== C++ std::memory_order diff --git a/userland/c/atomic.c b/userland/c/atomic.c index d826951..d8e0bcd 100644 --- a/userland/c/atomic.c +++ b/userland/c/atomic.c @@ -1,4 +1,4 @@ -/* https://cirosantilli.com/linux-kernel-module-cheat#c-multithreading */ +/* https://cirosantilli.com/linux-kernel-module-cheat#atomic-c */ #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #include @@ -9,10 +9,9 @@ atomic_int acnt; int cnt; -size_t niters; -int f(void *thr_data) { - (void)thr_data; +int my_thread_main(void *thr_data) { + size_t niters = *(size_t *)thr_data; for (size_t i = 0; i < niters; ++i) { ++cnt; ++acnt; @@ -23,7 +22,7 @@ int f(void *thr_data) { int main(int argc, char **argv) { #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) - size_t nthreads; + size_t niters, nthreads; thrd_t *threads; if (argc > 1) { nthreads = strtoull(argv[1], NULL, 0); @@ -37,7 +36,7 @@ int main(int argc, char **argv) { } threads = malloc(sizeof(thrd_t) * nthreads); for(size_t i = 0; i < nthreads; ++i) - thrd_create(threads + i, f, NULL); + thrd_create(threads + i, my_thread_main, &niters); for(size_t i = 0; i < nthreads; ++i) thrd_join(threads[i], NULL); free(threads); diff --git a/userland/c/snprintf.c b/userland/c/snprintf.c index da23e48..bb38939 100644 --- a/userland/c/snprintf.c +++ b/userland/c/snprintf.c @@ -39,14 +39,18 @@ int main(void) { /* Less common case where string does not fit. Error handling would * normally follow in a real program. */ { - int in = 1234; - char out[6]; /* The return here is the same as before. * * Because it is >= than the imposed limit of 6, we know that * the write failed to fully complete. */ +#if 0 + /* GCC 8.3.0 with -O3 actually detects this and breaks the build. 
+ * error: ‘cd’ directive output truncated writing 2 bytes into a region of size 0 [-Werror=format-truncation=] */ + int in = 1234; + char out[6]; assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8); assert(strcmp(out, "ab123") == 0); +#endif } #endif return EXIT_SUCCESS; diff --git a/userland/cpp/atomic/main.hpp b/userland/cpp/atomic/main.hpp index 04c5ace..1fed060 100644 --- a/userland/cpp/atomic/main.hpp +++ b/userland/cpp/atomic/main.hpp @@ -8,8 +8,6 @@ #include #include -size_t niters; - #if LKMC_USERLAND_ATOMIC_STD_ATOMIC std::atomic_ulong global(0); #else @@ -20,7 +18,7 @@ uint64_t global = 0; std::mutex mutex; #endif -void threadMain() { +void threadMain(size_t niters) { for (size_t i = 0; i < niters; ++i) { #if LKMC_USERLAND_ATOMIC_MUTEX mutex.lock(); @@ -97,7 +95,7 @@ void threadMain() { int main(int argc, char **argv) { #if __cplusplus >= 201103L - size_t nthreads; + size_t niters, nthreads; if (argc > 1) { nthreads = std::stoull(argv[1], NULL, 0); } else { @@ -110,7 +108,7 @@ int main(int argc, char **argv) { } std::vector threads(nthreads); for (size_t i = 0; i < nthreads; ++i) - threads[i] = std::thread(threadMain); + threads[i] = std::thread(threadMain, niters); for (size_t i = 0; i < nthreads; ++i) threads[i].join(); uint64_t expect = nthreads * niters; diff --git a/userland/kernel_modules/ioctl.c b/userland/kernel_modules/ioctl.c index 897a995..4145412 100644 --- a/userland/kernel_modules/ioctl.c +++ b/userland/kernel_modules/ioctl.c @@ -25,9 +25,13 @@ int main(int argc, char **argv) { request = strtol(argv[2], NULL, 10); if (argc > 3) { arg0 = strtol(argv[3], NULL, 10); + } else { + arg0 = 0; } if (argc > 4) { arg1 = strtol(argv[4], NULL, 10); + } else { + arg1 = 0; } fd = open(ioctl_path, O_RDONLY); diff --git a/userland/posix/pthread_mutex.c b/userland/posix/pthread_mutex.c index 916fb1c..79790e4 100644 --- a/userland/posix/pthread_mutex.c +++ b/userland/posix/pthread_mutex.c @@ -7,12 +7,11 @@ #include #include -unsigned long long niters; unsigned long long global = 0; pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER; void* main_thread(void *arg) { - (void)arg; + unsigned long long niters = *(unsigned long long *)arg; unsigned long long i; for (i = 0; i < niters; ++i) { pthread_mutex_lock(&main_thread_mutex); @@ -24,7 +23,7 @@ void* main_thread(void *arg) { int main(int argc, char **argv) { pthread_t *threads; - unsigned long long i, nthreads; + unsigned long long i, niters, nthreads; /* CLI arguments. */ if (argc > 1) { @@ -41,7 +40,7 @@ int main(int argc, char **argv) { /* Action */ for (i = 0; i < nthreads; ++i) - pthread_create(&threads[i], NULL, main_thread, NULL); + pthread_create(&threads[i], NULL, main_thread, &niters); for (i = 0; i < nthreads; ++i) pthread_join(threads[i], NULL); assert(global == nthreads * niters);