Detailed gem5 analysis of how data races happen

And pass niters as a thread argument to all threading implementations...
otherwise every loop has to do a memory load from the global!
This commit is contained in:
Ciro Santilli 六四事件 法轮功
2020-06-05 06:00:05 +00:00
parent 619fef4b04
commit 0d5c7f5c4c
6 changed files with 162 additions and 19 deletions

View File

@@ -17253,12 +17253,74 @@ Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-d
Added in C11! Added in C11!
* link:userland/c/atomic.c[]: `atomic_int` and `thrd_create`
Bibliography: Bibliography:
* <<userland-multithreading>>
* https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354 * https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354
===== atomic.c
link:userland/c/atomic.c[]
Demonstrates `atomic_int` and `thrd_create`.
Disassembly with GDB at LKMC 619fef4b04bddc4a5a38aec5e207dd4d5a25d206 + 1:
....
./run-toolchain \
--arch aarch64 gdb \
-- \
-batch \
-ex 'disas/rs my_thread_main' $(./getvar \
--arch aarch64 userland_build_dir)/c/atomic.out \
;
....
shows on ARM:
....
16 ++cnt;
0x00000000004008cc <+28>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008d0 <+32>: 00 80 01 91 add x0, x0, #0x60
0x00000000004008d4 <+36>: 00 00 40 b9 ldr w0, [x0]
0x00000000004008d8 <+40>: 01 04 00 11 add w1, w0, #0x1
0x00000000004008dc <+44>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008e0 <+48>: 00 80 01 91 add x0, x0, #0x60
0x00000000004008e4 <+52>: 01 00 00 b9 str w1, [x0]
17 ++acnt;
0x00000000004008e8 <+56>: 20 00 80 52 mov w0, #0x1 // #1
0x00000000004008ec <+60>: e0 1b 00 b9 str w0, [sp, #24]
0x00000000004008f0 <+64>: e0 1b 40 b9 ldr w0, [sp, #24]
0x00000000004008f4 <+68>: e2 03 00 2a mov w2, w0
0x00000000004008f8 <+72>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008fc <+76>: 00 70 01 91 add x0, x0, #0x5c
0x0000000000400900 <+80>: 03 00 e2 b8 ldaddal w2, w3, [x0]
0x0000000000400904 <+84>: 61 00 02 0b add w1, w3, w2
0x0000000000400908 <+88>: e0 03 01 2a mov w0, w1
0x000000000040090c <+92>: e0 1f 00 b9 str w0, [sp, #28]
....
so:
* the atomic increment uses <<arm-lse,`ldadd`>>
* the non-atomic increment just does plain LDR, ADD, STR
With `-O3`:
....
16 ++cnt;
0x0000000000400a00 <+32>: 60 00 40 b9 ldr w0, [x3]
0x0000000000400a04 <+36>: 00 04 00 11 add w0, w0, #0x1
0x0000000000400a08 <+40>: 60 00 00 b9 str w0, [x3]
17 ++acnt;
0x0000000000400a0c <+44>: 20 00 80 52 mov w0, #0x1 // #1
0x0000000000400a10 <+48>: 40 00 e0 b8 ldaddal w0, w0, [x2]
....
so the situation is the same but without all the horrible stack noise.
==== GCC C extensions ==== GCC C extensions
===== C empty struct ===== C empty struct
@@ -17480,6 +17542,83 @@ Bibliography:
* https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?" * https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?"
====== Detailed gem5 analysis of how data races happen
The smallest data race we managed to come up with as of LKMC 7c01b29f1ee7da878c7cc9cb4565f3f3cf516a92 and gem5 872cb227fdc0b4d60acc7840889d567a6936b6e1 was with link:userland/c/atomic.c[] (see also <<c-multithreading>>):
....
./run \
--arch aarch64 \
--cli-args '2 10' \
--cpus 3 \
--emulator gem5 \
--userland userland/c/atomic.c \
;
....
which outputs:
....
atomic 20
non-atomic 19
....
./run -aA -eg -u userland/c/atomic.c --cli-args '2 200' --cpus 3 --userland-build-id o3 -N1 --trace ExecAll -- --caches --cpu-type TimingSimpleCPU
Note that the system is very minimal, and doesn't even have caches, so I'm curious as to how this can happen at all.
So first we do a run with <<gem5-tracing,`--trace Exec`>> and look at the `my_thread_main` entries.
From there we see that first CPU1 enters the function, since it was spawned first.
Then for some time, both CPU1 and CPU2 are running at the same time.
Finally, CPU1 exits, then CPU2 runs alone for a while to finish its loops, and then CPU2 exits.
By grepping the LDR data read from the log, we are able to easily spot the moment where things started to go wrong based on the `D=` data:
....
grep -E 'my_thread_main\+36' trace.txt > trace-ldr.txt
....
The `grep` output contains
....
94024500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000006 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94036500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000007 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94048500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000008 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94070500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94082500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000b A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
....
and so we see that it is at `94058500` that things started going bad, since two consecutive loads from different CPUs read the same value `D=9`! Actually, things were not too bad afterwards because this was by coincidence the last CPU1 read, we would have missed many more increments if the number of iterations had been larger.
Now that we have the first bad time, let's look at the fuller disassembly to better understand what happens around that point.
....
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94059000: system.cpu2: A0 T0 : @my_thread_main+40 : add w1, w0, #1 : IntAlu : D=0x000000000000000a flags=(IsInteger)
94059000: system.cpu1: A0 T0 : @my_thread_main+120 : b.cc <my_thread_main+28> : IntAlu : flags=(IsControl|IsDirectControl|IsCondControl)
94059500: system.cpu1: A0 T0 : @my_thread_main+28 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
94059500: system.cpu2: A0 T0 : @my_thread_main+44 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
94060000: system.cpu2: A0 T0 : @my_thread_main+48 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
94060000: system.cpu1: A0 T0 : @my_thread_main+32 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94060500: system.cpu2: A0 T0 : @my_thread_main+52 : str x1, [x0] : MemWrite : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsStore)
....
and from this, all becomes crystal clear:
* 94058500: CPU2 loads
* 94060500: CPU1 loads
* 94060500: CPU2 stores
so we see that CPU2 just happened to store after CPU1 loads.
We also understand why LDADD solves the race problem in AtomicSimpleCPU: it does the load and store in one single go!
[[cpp-memory-order]] [[cpp-memory-order]]
===== C++ std::memory_order ===== C++ std::memory_order

View File

@@ -1,4 +1,4 @@
/* https://cirosantilli.com/linux-kernel-module-cheat#c-multithreading */ /* https://cirosantilli.com/linux-kernel-module-cheat#atomic-c */
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
#include <stdatomic.h> #include <stdatomic.h>
@@ -9,10 +9,9 @@
atomic_int acnt; atomic_int acnt;
int cnt; int cnt;
size_t niters;
int f(void *thr_data) { int my_thread_main(void *thr_data) {
(void)thr_data; size_t niters = *(size_t *)thr_data;
for (size_t i = 0; i < niters; ++i) { for (size_t i = 0; i < niters; ++i) {
++cnt; ++cnt;
++acnt; ++acnt;
@@ -23,7 +22,7 @@ int f(void *thr_data) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
size_t nthreads; size_t niters, nthreads;
thrd_t *threads; thrd_t *threads;
if (argc > 1) { if (argc > 1) {
nthreads = strtoull(argv[1], NULL, 0); nthreads = strtoull(argv[1], NULL, 0);
@@ -37,7 +36,7 @@ int main(int argc, char **argv) {
} }
threads = malloc(sizeof(thrd_t) * nthreads); threads = malloc(sizeof(thrd_t) * nthreads);
for(size_t i = 0; i < nthreads; ++i) for(size_t i = 0; i < nthreads; ++i)
thrd_create(threads + i, f, NULL); thrd_create(threads + i, my_thread_main, &niters);
for(size_t i = 0; i < nthreads; ++i) for(size_t i = 0; i < nthreads; ++i)
thrd_join(threads[i], NULL); thrd_join(threads[i], NULL);
free(threads); free(threads);

View File

@@ -39,14 +39,18 @@ int main(void) {
/* Less common case where string does not fit. Error handling would /* Less common case where string does not fit. Error handling would
* normally follow in a real program. */ * normally follow in a real program. */
{ {
int in = 1234;
char out[6];
/* The return here is the same as before. /* The return here is the same as before.
* *
* Because it is >= than the imposed limit of 6, we know that * Because it is >= than the imposed limit of 6, we know that
* the write failed to fully complete. */ * the write failed to fully complete. */
#if 0
/* GCC 8.3.0 with -O3 actually detects this and breaks the build.
* error: cd directive output truncated writing 2 bytes into a region of size 0 [-Werror=format-truncation=] */
int in = 1234;
char out[6];
assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8); assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8);
assert(strcmp(out, "ab123") == 0); assert(strcmp(out, "ab123") == 0);
#endif
} }
#endif #endif
return EXIT_SUCCESS; return EXIT_SUCCESS;

View File

@@ -8,8 +8,6 @@
#include <thread> #include <thread>
#include <vector> #include <vector>
size_t niters;
#if LKMC_USERLAND_ATOMIC_STD_ATOMIC #if LKMC_USERLAND_ATOMIC_STD_ATOMIC
std::atomic_ulong global(0); std::atomic_ulong global(0);
#else #else
@@ -20,7 +18,7 @@ uint64_t global = 0;
std::mutex mutex; std::mutex mutex;
#endif #endif
void threadMain() { void threadMain(size_t niters) {
for (size_t i = 0; i < niters; ++i) { for (size_t i = 0; i < niters; ++i) {
#if LKMC_USERLAND_ATOMIC_MUTEX #if LKMC_USERLAND_ATOMIC_MUTEX
mutex.lock(); mutex.lock();
@@ -97,7 +95,7 @@ void threadMain() {
int main(int argc, char **argv) { int main(int argc, char **argv) {
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
size_t nthreads; size_t niters, nthreads;
if (argc > 1) { if (argc > 1) {
nthreads = std::stoull(argv[1], NULL, 0); nthreads = std::stoull(argv[1], NULL, 0);
} else { } else {
@@ -110,7 +108,7 @@ int main(int argc, char **argv) {
} }
std::vector<std::thread> threads(nthreads); std::vector<std::thread> threads(nthreads);
for (size_t i = 0; i < nthreads; ++i) for (size_t i = 0; i < nthreads; ++i)
threads[i] = std::thread(threadMain); threads[i] = std::thread(threadMain, niters);
for (size_t i = 0; i < nthreads; ++i) for (size_t i = 0; i < nthreads; ++i)
threads[i].join(); threads[i].join();
uint64_t expect = nthreads * niters; uint64_t expect = nthreads * niters;

View File

@@ -25,9 +25,13 @@ int main(int argc, char **argv) {
request = strtol(argv[2], NULL, 10); request = strtol(argv[2], NULL, 10);
if (argc > 3) { if (argc > 3) {
arg0 = strtol(argv[3], NULL, 10); arg0 = strtol(argv[3], NULL, 10);
} else {
arg0 = 0;
} }
if (argc > 4) { if (argc > 4) {
arg1 = strtol(argv[4], NULL, 10); arg1 = strtol(argv[4], NULL, 10);
} else {
arg1 = 0;
} }
fd = open(ioctl_path, O_RDONLY); fd = open(ioctl_path, O_RDONLY);

View File

@@ -7,12 +7,11 @@
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
unsigned long long niters;
unsigned long long global = 0; unsigned long long global = 0;
pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER;
void* main_thread(void *arg) { void* main_thread(void *arg) {
(void)arg; unsigned long long niters = *(unsigned long long *)arg;
unsigned long long i; unsigned long long i;
for (i = 0; i < niters; ++i) { for (i = 0; i < niters; ++i) {
pthread_mutex_lock(&main_thread_mutex); pthread_mutex_lock(&main_thread_mutex);
@@ -24,7 +23,7 @@ void* main_thread(void *arg) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
pthread_t *threads; pthread_t *threads;
unsigned long long i, nthreads; unsigned long long i, niters, nthreads;
/* CLI arguments. */ /* CLI arguments. */
if (argc > 1) { if (argc > 1) {
@@ -41,7 +40,7 @@ int main(int argc, char **argv) {
/* Action */ /* Action */
for (i = 0; i < nthreads; ++i) for (i = 0; i < nthreads; ++i)
pthread_create(&threads[i], NULL, main_thread, NULL); pthread_create(&threads[i], NULL, main_thread, &niters);
for (i = 0; i < nthreads; ++i) for (i = 0; i < nthreads; ++i)
pthread_join(threads[i], NULL); pthread_join(threads[i], NULL);
assert(global == nthreads * niters); assert(global == nthreads * niters);