From 0d5c7f5c4c7983116519da0b322cdace0d428a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Fri, 5 Jun 2020 06:00:05 +0000 Subject: [PATCH] Detailed gem5 analysis of how data races happen And pass niters as a thread argument to all threading implementations... otherwise every loop has to do a memory load from the global! --- README.adoc | 143 +++++++++++++++++++++++++++++++- userland/c/atomic.c | 11 ++- userland/c/snprintf.c | 8 +- userland/cpp/atomic/main.hpp | 8 +- userland/kernel_modules/ioctl.c | 4 + userland/posix/pthread_mutex.c | 7 +- 6 files changed, 162 insertions(+), 19 deletions(-) diff --git a/README.adoc b/README.adoc index cad3f4f..adfbf63 100644 --- a/README.adoc +++ b/README.adoc @@ -17253,12 +17253,74 @@ Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-d Added in C11! -* link:userland/c/atomic.c[]: `atomic_int` and `thrd_create` - Bibliography: +* <> * https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354 +===== atomic.c + +link:userland/c/atomic.c[] + +Demonstrates `atomic_int` and `thrd_create`. + +Disassembly with GDB at LKMC 619fef4b04bddc4a5a38aec5e207dd4d5a25d206 + 1: + +.... +./run-toolchain \ + --arch aarch64 gdb \ + -- \ + -batch \ + -ex 'disas/rs my_thread_main' $(./getvar \ + --arch aarch64 userland_build_dir)/c/atomic.out \ +; +.... + +shows on ARM: + +.... +16 ++cnt; + 0x00000000004008cc <+28>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008d0 <+32>: 00 80 01 91 add x0, x0, #0x60 + 0x00000000004008d4 <+36>: 00 00 40 b9 ldr w0, [x0] + 0x00000000004008d8 <+40>: 01 04 00 11 add w1, w0, #0x1 + 0x00000000004008dc <+44>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008e0 <+48>: 00 80 01 91 add x0, x0, #0x60 + 0x00000000004008e4 <+52>: 01 00 00 b9 str w1, [x0] + +17 ++acnt; + 0x00000000004008e8 <+56>: 20 00 80 52 mov w0, #0x1 // #1 + 0x00000000004008ec <+60>: e0 1b 00 b9 str w0, [sp, #24] + 0x00000000004008f0 <+64>: e0 1b 40 b9 ldr w0, [sp, #24] + 0x00000000004008f4 <+68>: e2 03 00 2a mov w2, w0 + 0x00000000004008f8 <+72>: 80 00 00 b0 adrp x0, 0x411000 + 0x00000000004008fc <+76>: 00 70 01 91 add x0, x0, #0x5c + 0x0000000000400900 <+80>: 03 00 e2 b8 ldaddal w2, w3, [x0] + 0x0000000000400904 <+84>: 61 00 02 0b add w1, w3, w2 + 0x0000000000400908 <+88>: e0 03 01 2a mov w0, w1 + 0x000000000040090c <+92>: e0 1f 00 b9 str w0, [sp, #28] +.... + +so: + +* the atomic increment uses <> +* the non-atomic increment just does LDR, ADD, STR: <> + +With `-O3`: + +.... +16 ++cnt; + 0x0000000000400a00 <+32>: 60 00 40 b9 ldr w0, [x3] + 0x0000000000400a04 <+36>: 00 04 00 11 add w0, w0, #0x1 + 0x0000000000400a08 <+40>: 60 00 00 b9 str w0, [x3] + +17 ++acnt; + 0x0000000000400a0c <+44>: 20 00 80 52 mov w0, #0x1 // #1 + 0x0000000000400a10 <+48>: 40 00 e0 b8 ldaddal w0, w0, [x2] +.... + +so the situation is the same but without all the horrible stack noise. + ==== GCC C extensions ===== C empty struct @@ -17480,6 +17542,83 @@ Bibliography: * https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?" +====== Detailed gem5 analysis of how data races happen + +The smallest data race we managed to come up as of LKMC 7c01b29f1ee7da878c7cc9cb4565f3f3cf516a92 and gem5 872cb227fdc0b4d60acc7840889d567a6936b6e1 was with link:userland/c/atomic.c[] (see also <>): + +.... 
+./run \
+  --arch aarch64 \
+  --cli-args '2 10' \
+  --cpus 3 \
+  --emulator gem5 \
+  --userland userland/c/atomic.c \
+;
+....
+
+which outputs:
+
+....
+atomic 20
+non-atomic 19
+....
+
+Note that the system is very minimal, and doesn't even have caches, so I'm curious as to how this can happen at all.
+
+A variant of this run, using short flags, the `o3` userland build, `ExecAll` tracing, and this time with caches and `TimingSimpleCPU`:
+
+....
+./run -aA -eg -u userland/c/atomic.c --cli-args '2 200' --cpus 3 --userland-build-id o3 -N1 --trace ExecAll -- --caches --cpu-type TimingSimpleCPU
+....
+
+So first we do a run with <> and look at the `my_thread_main` entries.
+
+From there we see that CPU1 enters the function first, since it was spawned first.
+
+Then, for some time, both CPU1 and CPU2 run at the same time.
+
+Finally, CPU1 exits, then CPU2 runs alone for a while to finish its loops, and then CPU2 exits as well.
+
+By grepping the log for the LDR data reads, we can easily spot the moment where things started to go wrong based on the `D=` data:
+
+....
+grep -E 'my_thread_main\+36' trace.txt > trace-ldr.txt
+....
+
+The `grep` output contains:
+
+....
+94024500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000006 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94036500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000007 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94048500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000008 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94070500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+94082500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000b A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
+....
+
+and so we see that it is at `94058500` that things started going bad, since two consecutive loads from different CPUs read the same value `D=9`! Actually, things were not too bad afterwards, because this was by coincidence the last CPU1 read: we would have missed many more increments if the number of iterations had been larger.
+
+Now that we have the first bad timestamp, let's look at the full disassembly to better understand what happens around that point.
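+
+One possible way to extract that window is to filter `trace.txt` on the tick range around the bad load, e.g. (just a sketch, with the tick bounds hand-picked from the `grep` output above):
+
+....
+awk -F: '$1 >= 94058500 && $1 <= 94060500' trace.txt
+....
+
+The relevant lines are:
+
+....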
+94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad) +94059000: system.cpu2: A0 T0 : @my_thread_main+40 : add w1, w0, #1 : IntAlu : D=0x000000000000000a flags=(IsInteger) +94059000: system.cpu1: A0 T0 : @my_thread_main+120 : b.cc : IntAlu : flags=(IsControl|IsDirectControl|IsCondControl) +94059500: system.cpu1: A0 T0 : @my_thread_main+28 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger) +94059500: system.cpu2: A0 T0 : @my_thread_main+44 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger) +94060000: system.cpu2: A0 T0 : @my_thread_main+48 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger) +94060000: system.cpu1: A0 T0 : @my_thread_main+32 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger) +94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad) +94060500: system.cpu2: A0 T0 : @my_thread_main+52 : str x1, [x0] : MemWrite : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsStore) +.... + +and from this, all becomes crystal clear: + +* 94058500: CPU2 loads +* 94060500: CPU1 loads +* 94060500: CPU2 stores + +so we see that CPU2 just happened to store after CPU1 loads. + +We also understand why LDADD solves the race problem in AtomicSimpleCPU: it does the load and store in one single go! + [[cpp-memory-order]] ===== C++ std::memory_order diff --git a/userland/c/atomic.c b/userland/c/atomic.c index d826951..d8e0bcd 100644 --- a/userland/c/atomic.c +++ b/userland/c/atomic.c @@ -1,4 +1,4 @@ -/* https://cirosantilli.com/linux-kernel-module-cheat#c-multithreading */ +/* https://cirosantilli.com/linux-kernel-module-cheat#atomic-c */ #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #include @@ -9,10 +9,9 @@ atomic_int acnt; int cnt; -size_t niters; -int f(void *thr_data) { - (void)thr_data; +int my_thread_main(void *thr_data) { + size_t niters = *(size_t *)thr_data; for (size_t i = 0; i < niters; ++i) { ++cnt; ++acnt; @@ -23,7 +22,7 @@ int f(void *thr_data) { int main(int argc, char **argv) { #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) - size_t nthreads; + size_t niters, nthreads; thrd_t *threads; if (argc > 1) { nthreads = strtoull(argv[1], NULL, 0); @@ -37,7 +36,7 @@ int main(int argc, char **argv) { } threads = malloc(sizeof(thrd_t) * nthreads); for(size_t i = 0; i < nthreads; ++i) - thrd_create(threads + i, f, NULL); + thrd_create(threads + i, my_thread_main, &niters); for(size_t i = 0; i < nthreads; ++i) thrd_join(threads[i], NULL); free(threads); diff --git a/userland/c/snprintf.c b/userland/c/snprintf.c index da23e48..bb38939 100644 --- a/userland/c/snprintf.c +++ b/userland/c/snprintf.c @@ -39,14 +39,18 @@ int main(void) { /* Less common case where string does not fit. Error handling would * normally follow in a real program. */ { - int in = 1234; - char out[6]; /* The return here is the same as before. * * Because it is >= than the imposed limit of 6, we know that * the write failed to fully complete. */ +#if 0 + /* GCC 8.3.0 with -O3 actually detects this and breaks the build. 
+ * error: ‘cd’ directive output truncated writing 2 bytes into a region of size 0 [-Werror=format-truncation=] */ + int in = 1234; + char out[6]; assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8); assert(strcmp(out, "ab123") == 0); +#endif } #endif return EXIT_SUCCESS; diff --git a/userland/cpp/atomic/main.hpp b/userland/cpp/atomic/main.hpp index 04c5ace..1fed060 100644 --- a/userland/cpp/atomic/main.hpp +++ b/userland/cpp/atomic/main.hpp @@ -8,8 +8,6 @@ #include #include -size_t niters; - #if LKMC_USERLAND_ATOMIC_STD_ATOMIC std::atomic_ulong global(0); #else @@ -20,7 +18,7 @@ uint64_t global = 0; std::mutex mutex; #endif -void threadMain() { +void threadMain(size_t niters) { for (size_t i = 0; i < niters; ++i) { #if LKMC_USERLAND_ATOMIC_MUTEX mutex.lock(); @@ -97,7 +95,7 @@ void threadMain() { int main(int argc, char **argv) { #if __cplusplus >= 201103L - size_t nthreads; + size_t niters, nthreads; if (argc > 1) { nthreads = std::stoull(argv[1], NULL, 0); } else { @@ -110,7 +108,7 @@ int main(int argc, char **argv) { } std::vector threads(nthreads); for (size_t i = 0; i < nthreads; ++i) - threads[i] = std::thread(threadMain); + threads[i] = std::thread(threadMain, niters); for (size_t i = 0; i < nthreads; ++i) threads[i].join(); uint64_t expect = nthreads * niters; diff --git a/userland/kernel_modules/ioctl.c b/userland/kernel_modules/ioctl.c index 897a995..4145412 100644 --- a/userland/kernel_modules/ioctl.c +++ b/userland/kernel_modules/ioctl.c @@ -25,9 +25,13 @@ int main(int argc, char **argv) { request = strtol(argv[2], NULL, 10); if (argc > 3) { arg0 = strtol(argv[3], NULL, 10); + } else { + arg0 = 0; } if (argc > 4) { arg1 = strtol(argv[4], NULL, 10); + } else { + arg1 = 0; } fd = open(ioctl_path, O_RDONLY); diff --git a/userland/posix/pthread_mutex.c b/userland/posix/pthread_mutex.c index 916fb1c..79790e4 100644 --- a/userland/posix/pthread_mutex.c +++ b/userland/posix/pthread_mutex.c @@ -7,12 +7,11 @@ #include #include -unsigned long long niters; unsigned long long global = 0; pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER; void* main_thread(void *arg) { - (void)arg; + unsigned long long niters = *(unsigned long long *)arg; unsigned long long i; for (i = 0; i < niters; ++i) { pthread_mutex_lock(&main_thread_mutex); @@ -24,7 +23,7 @@ void* main_thread(void *arg) { int main(int argc, char **argv) { pthread_t *threads; - unsigned long long i, nthreads; + unsigned long long i, niters, nthreads; /* CLI arguments. */ if (argc > 1) { @@ -41,7 +40,7 @@ int main(int argc, char **argv) { /* Action */ for (i = 0; i < nthreads; ++i) - pthread_create(&threads[i], NULL, main_thread, NULL); + pthread_create(&threads[i], NULL, main_thread, &niters); for (i = 0; i < nthreads; ++i) pthread_join(threads[i], NULL); assert(global == nthreads * niters);