Detailed gem5 analysis of how data races happen

And pass niters as a thread argument to all threading implementations...
otherwise every loop has to do a memory load from the global!
This commit is contained in:
Ciro Santilli 六四事件 法轮功
2020-06-05 06:00:05 +00:00
parent 619fef4b04
commit 0d5c7f5c4c
6 changed files with 162 additions and 19 deletions

View File

@@ -17253,12 +17253,74 @@ Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-d
Added in C11! Added in C11!
* link:userland/c/atomic.c[]: `atomic_int` and `thrd_create`
Bibliography: Bibliography:
* <<userland-multithreading>>
* https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354 * https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354
===== atomic.c
link:userland/c/atomic.c[]
Demonstrates `atomic_int` and `thrd_create`.
Disassembly with GDB at LKMC 619fef4b04bddc4a5a38aec5e207dd4d5a25d206 + 1:
....
./run-toolchain \
--arch aarch64 gdb \
-- \
-batch \
-ex 'disas/rs my_thread_main' $(./getvar \
--arch aarch64 userland_build_dir)/c/atomic.out \
;
....
shows on ARM:
....
16 ++cnt;
0x00000000004008cc <+28>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008d0 <+32>: 00 80 01 91 add x0, x0, #0x60
0x00000000004008d4 <+36>: 00 00 40 b9 ldr w0, [x0]
0x00000000004008d8 <+40>: 01 04 00 11 add w1, w0, #0x1
0x00000000004008dc <+44>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008e0 <+48>: 00 80 01 91 add x0, x0, #0x60
0x00000000004008e4 <+52>: 01 00 00 b9 str w1, [x0]
17 ++acnt;
0x00000000004008e8 <+56>: 20 00 80 52 mov w0, #0x1 // #1
0x00000000004008ec <+60>: e0 1b 00 b9 str w0, [sp, #24]
0x00000000004008f0 <+64>: e0 1b 40 b9 ldr w0, [sp, #24]
0x00000000004008f4 <+68>: e2 03 00 2a mov w2, w0
0x00000000004008f8 <+72>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
0x00000000004008fc <+76>: 00 70 01 91 add x0, x0, #0x5c
0x0000000000400900 <+80>: 03 00 e2 b8 ldaddal w2, w3, [x0]
0x0000000000400904 <+84>: 61 00 02 0b add w1, w3, w2
0x0000000000400908 <+88>: e0 03 01 2a mov w0, w1
0x000000000040090c <+92>: e0 1f 00 b9 str w0, [sp, #28]
....
so:
* the atomic increment uses <<arm-lse,`ldadd`>>
* the non-atomic increment just does plain LDR, ADD, STR
With `-O3`:
....
16 ++cnt;
0x0000000000400a00 <+32>: 60 00 40 b9 ldr w0, [x3]
0x0000000000400a04 <+36>: 00 04 00 11 add w0, w0, #0x1
0x0000000000400a08 <+40>: 60 00 00 b9 str w0, [x3]
17 ++acnt;
0x0000000000400a0c <+44>: 20 00 80 52 mov w0, #0x1 // #1
0x0000000000400a10 <+48>: 40 00 e0 b8 ldaddal w0, w0, [x2]
....
so the situation is the same but without all the horrible stack noise.
==== GCC C extensions ==== GCC C extensions
===== C empty struct ===== C empty struct
@@ -17480,6 +17542,83 @@ Bibliography:
* https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?" * https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?"
====== Detailed gem5 analysis of how data races happen
The smallest data race we managed to come up with as of LKMC 7c01b29f1ee7da878c7cc9cb4565f3f3cf516a92 and gem5 872cb227fdc0b4d60acc7840889d567a6936b6e1 was with link:userland/c/atomic.c[] (see also <<c-multithreading>>):
....
./run \
--arch aarch64 \
--cli-args '2 10' \
--cpus 3 \
--emulator gem5 \
--userland userland/c/atomic.c \
;
....
which outputs:
....
atomic 20
non-atomic 19
....
./run -aA -eg -u userland/c/atomic.c --cli-args '2 200' --cpus 3 --userland-build-id o3 -N1 --trace ExecAll -- --caches --cpu-type TimingSimpleCPU
Note that the system is very minimal, and doesn't even have caches, so I'm curious as to how this can happen at all.
So first we do a run with <<gem5-tracing,`--trace Exec`>> and look at the `my_thread_main` entries.
From there we see that first CPU1 enters the function, since it was spawned first.
Then for some time, both CPU1 and CPU2 are running at the same time.
Finally, CPU1 exits, then CPU2 runs alone for a while to finish its loops, and then CPU2 exits.
By grepping the LDR data read from the log, we are able to easily spot the moment where things started to go wrong based on the `D=` data:
....
grep -E 'my_thread_main\+36' trace.txt > trace-ldr.txt
....
The `grep` output contains
....
94024500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000006 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94036500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000007 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94048500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000008 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94070500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94082500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000b A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
....
and so we see that it is at `94058500` that things started going bad, since two consecutive loads from different CPUs read the same value `D=9`! Actually, things were not too bad afterwards because this was by coincidence the last CPU1 read, we would have missed many more increments if the number of iterations had been larger.
Now that we have the first bad time, let's look at the fuller disassembly to better understand what happens around that point.
....
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94059000: system.cpu2: A0 T0 : @my_thread_main+40 : add w1, w0, #1 : IntAlu : D=0x000000000000000a flags=(IsInteger)
94059000: system.cpu1: A0 T0 : @my_thread_main+120 : b.cc <my_thread_main+28> : IntAlu : flags=(IsControl|IsDirectControl|IsCondControl)
94059500: system.cpu1: A0 T0 : @my_thread_main+28 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
94059500: system.cpu2: A0 T0 : @my_thread_main+44 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
94060000: system.cpu2: A0 T0 : @my_thread_main+48 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
94060000: system.cpu1: A0 T0 : @my_thread_main+32 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
94060500: system.cpu2: A0 T0 : @my_thread_main+52 : str x1, [x0] : MemWrite : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsStore)
....
and from this, all becomes crystal clear:
* 94058500: CPU2 loads
* 94060500: CPU1 loads
* 94060500: CPU2 stores
so we see that CPU2 just happened to store after CPU1 loads.
We also understand why LDADD solves the race problem in AtomicSimpleCPU: it does the load and store in one single go!
[[cpp-memory-order]] [[cpp-memory-order]]
===== C++ std::memory_order ===== C++ std::memory_order

View File

@@ -1,4 +1,4 @@
/* https://cirosantilli.com/linux-kernel-module-cheat#c-multithreading */ /* https://cirosantilli.com/linux-kernel-module-cheat#atomic-c */
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
#include <stdatomic.h> #include <stdatomic.h>
@@ -9,10 +9,9 @@
atomic_int acnt; atomic_int acnt;
int cnt; int cnt;
size_t niters;
int f(void *thr_data) { int my_thread_main(void *thr_data) {
(void)thr_data; size_t niters = *(size_t *)thr_data;
for (size_t i = 0; i < niters; ++i) { for (size_t i = 0; i < niters; ++i) {
++cnt; ++cnt;
++acnt; ++acnt;
@@ -23,7 +22,7 @@ int f(void *thr_data) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
size_t nthreads; size_t niters, nthreads;
thrd_t *threads; thrd_t *threads;
if (argc > 1) { if (argc > 1) {
nthreads = strtoull(argv[1], NULL, 0); nthreads = strtoull(argv[1], NULL, 0);
@@ -37,7 +36,7 @@ int main(int argc, char **argv) {
} }
threads = malloc(sizeof(thrd_t) * nthreads); threads = malloc(sizeof(thrd_t) * nthreads);
for(size_t i = 0; i < nthreads; ++i) for(size_t i = 0; i < nthreads; ++i)
thrd_create(threads + i, f, NULL); thrd_create(threads + i, my_thread_main, &niters);
for(size_t i = 0; i < nthreads; ++i) for(size_t i = 0; i < nthreads; ++i)
thrd_join(threads[i], NULL); thrd_join(threads[i], NULL);
free(threads); free(threads);

View File

@@ -39,14 +39,18 @@ int main(void) {
/* Less common case where string does not fit. Error handling would /* Less common case where string does not fit. Error handling would
* normally follow in a real program. */ * normally follow in a real program. */
{ {
int in = 1234;
char out[6];
/* The return here is the same as before. /* The return here is the same as before.
* *
* Because it is >= than the imposed limit of 6, we know that * Because it is >= than the imposed limit of 6, we know that
* the write failed to fully complete. */ * the write failed to fully complete. */
#if 0
/* GCC 8.3.0 with -O3 actually detects this and breaks the build.
* error: cd directive output truncated writing 2 bytes into a region of size 0 [-Werror=format-truncation=] */
int in = 1234;
char out[6];
assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8); assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8);
assert(strcmp(out, "ab123") == 0); assert(strcmp(out, "ab123") == 0);
#endif
} }
#endif #endif
return EXIT_SUCCESS; return EXIT_SUCCESS;

View File

@@ -8,8 +8,6 @@
#include <thread> #include <thread>
#include <vector> #include <vector>
size_t niters;
#if LKMC_USERLAND_ATOMIC_STD_ATOMIC #if LKMC_USERLAND_ATOMIC_STD_ATOMIC
std::atomic_ulong global(0); std::atomic_ulong global(0);
#else #else
@@ -20,7 +18,7 @@ uint64_t global = 0;
std::mutex mutex; std::mutex mutex;
#endif #endif
void threadMain() { void threadMain(size_t niters) {
for (size_t i = 0; i < niters; ++i) { for (size_t i = 0; i < niters; ++i) {
#if LKMC_USERLAND_ATOMIC_MUTEX #if LKMC_USERLAND_ATOMIC_MUTEX
mutex.lock(); mutex.lock();
@@ -97,7 +95,7 @@ void threadMain() {
int main(int argc, char **argv) { int main(int argc, char **argv) {
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
size_t nthreads; size_t niters, nthreads;
if (argc > 1) { if (argc > 1) {
nthreads = std::stoull(argv[1], NULL, 0); nthreads = std::stoull(argv[1], NULL, 0);
} else { } else {
@@ -110,7 +108,7 @@ int main(int argc, char **argv) {
} }
std::vector<std::thread> threads(nthreads); std::vector<std::thread> threads(nthreads);
for (size_t i = 0; i < nthreads; ++i) for (size_t i = 0; i < nthreads; ++i)
threads[i] = std::thread(threadMain); threads[i] = std::thread(threadMain, niters);
for (size_t i = 0; i < nthreads; ++i) for (size_t i = 0; i < nthreads; ++i)
threads[i].join(); threads[i].join();
uint64_t expect = nthreads * niters; uint64_t expect = nthreads * niters;

View File

@@ -25,9 +25,13 @@ int main(int argc, char **argv) {
request = strtol(argv[2], NULL, 10); request = strtol(argv[2], NULL, 10);
if (argc > 3) { if (argc > 3) {
arg0 = strtol(argv[3], NULL, 10); arg0 = strtol(argv[3], NULL, 10);
} else {
arg0 = 0;
} }
if (argc > 4) { if (argc > 4) {
arg1 = strtol(argv[4], NULL, 10); arg1 = strtol(argv[4], NULL, 10);
} else {
arg1 = 0;
} }
fd = open(ioctl_path, O_RDONLY); fd = open(ioctl_path, O_RDONLY);

View File

@@ -7,12 +7,11 @@
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
unsigned long long niters;
unsigned long long global = 0; unsigned long long global = 0;
pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER;
void* main_thread(void *arg) { void* main_thread(void *arg) {
(void)arg; unsigned long long niters = *(unsigned long long *)arg;
unsigned long long i; unsigned long long i;
for (i = 0; i < niters; ++i) { for (i = 0; i < niters; ++i) {
pthread_mutex_lock(&main_thread_mutex); pthread_mutex_lock(&main_thread_mutex);
@@ -24,7 +23,7 @@ void* main_thread(void *arg) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
pthread_t *threads; pthread_t *threads;
unsigned long long i, nthreads; unsigned long long i, niters, nthreads;
/* CLI arguments. */ /* CLI arguments. */
if (argc > 1) { if (argc > 1) {
@@ -41,7 +40,7 @@ int main(int argc, char **argv) {
/* Action */ /* Action */
for (i = 0; i < nthreads; ++i) for (i = 0; i < nthreads; ++i)
pthread_create(&threads[i], NULL, main_thread, NULL); pthread_create(&threads[i], NULL, main_thread, &niters);
for (i = 0; i < nthreads; ++i) for (i = 0; i < nthreads; ++i)
pthread_join(threads[i], NULL); pthread_join(threads[i], NULL);
assert(global == nthreads * niters); assert(global == nthreads * niters);