mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
Detailed gem5 analysis of how data races happen
And pass niters as a thread argument to all threading implementations... otherwise every loop has to do a memory load from the global!
This commit is contained in:
143
README.adoc
143
README.adoc
@@ -17253,12 +17253,74 @@ Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-d
|
||||
|
||||
Added in C11!
|
||||
|
||||
* link:userland/c/atomic.c[]: `atomic_int` and `thrd_create`
|
||||
|
||||
Bibliography:
|
||||
|
||||
* <<userland-multithreading>>
|
||||
* https://stackoverflow.com/questions/3908031/how-to-multithread-c-code/52453354#52453354
|
||||
|
||||
===== atomic.c
|
||||
|
||||
link:userland/c/atomic.c[]
|
||||
|
||||
Demonstrates `atomic_int` and `thrd_create`.
|
||||
|
||||
Disassembly with GDB at LKMC 619fef4b04bddc4a5a38aec5e207dd4d5a25d206 + 1:
|
||||
|
||||
....
|
||||
./run-toolchain \
|
||||
--arch aarch64 gdb \
|
||||
-- \
|
||||
-batch \
|
||||
-ex 'disas/rs my_thread_main' $(./getvar \
|
||||
--arch aarch64 userland_build_dir)/c/atomic.out \
|
||||
;
|
||||
....
|
||||
|
||||
shows on ARM:
|
||||
|
||||
....
|
||||
16 ++cnt;
|
||||
0x00000000004008cc <+28>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
|
||||
0x00000000004008d0 <+32>: 00 80 01 91 add x0, x0, #0x60
|
||||
0x00000000004008d4 <+36>: 00 00 40 b9 ldr w0, [x0]
|
||||
0x00000000004008d8 <+40>: 01 04 00 11 add w1, w0, #0x1
|
||||
0x00000000004008dc <+44>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
|
||||
0x00000000004008e0 <+48>: 00 80 01 91 add x0, x0, #0x60
|
||||
0x00000000004008e4 <+52>: 01 00 00 b9 str w1, [x0]
|
||||
|
||||
17 ++acnt;
|
||||
0x00000000004008e8 <+56>: 20 00 80 52 mov w0, #0x1 // #1
|
||||
0x00000000004008ec <+60>: e0 1b 00 b9 str w0, [sp, #24]
|
||||
0x00000000004008f0 <+64>: e0 1b 40 b9 ldr w0, [sp, #24]
|
||||
0x00000000004008f4 <+68>: e2 03 00 2a mov w2, w0
|
||||
0x00000000004008f8 <+72>: 80 00 00 b0 adrp x0, 0x411000 <malloc@got.plt>
|
||||
0x00000000004008fc <+76>: 00 70 01 91 add x0, x0, #0x5c
|
||||
0x0000000000400900 <+80>: 03 00 e2 b8 ldaddal w2, w3, [x0]
|
||||
0x0000000000400904 <+84>: 61 00 02 0b add w1, w3, w2
|
||||
0x0000000000400908 <+88>: e0 03 01 2a mov w0, w1
|
||||
0x000000000040090c <+92>: e0 1f 00 b9 str w0, [sp, #28]
|
||||
....
|
||||
|
||||
so:
|
||||
|
||||
* the atomic increment uses <<arm-lse,`ldadd`>>
|
||||
* the non-atomic increment just does LDR, ADD, STR: <<arm-lse,`ldadd`>>
|
||||
|
||||
With `-O3`:
|
||||
|
||||
....
|
||||
16 ++cnt;
|
||||
0x0000000000400a00 <+32>: 60 00 40 b9 ldr w0, [x3]
|
||||
0x0000000000400a04 <+36>: 00 04 00 11 add w0, w0, #0x1
|
||||
0x0000000000400a08 <+40>: 60 00 00 b9 str w0, [x3]
|
||||
|
||||
17 ++acnt;
|
||||
0x0000000000400a0c <+44>: 20 00 80 52 mov w0, #0x1 // #1
|
||||
0x0000000000400a10 <+48>: 40 00 e0 b8 ldaddal w0, w0, [x2]
|
||||
....
|
||||
|
||||
so the situation is the same but without all the horrible stack noise.
|
||||
|
||||
==== GCC C extensions
|
||||
|
||||
===== C empty struct
|
||||
@@ -17480,6 +17542,83 @@ Bibliography:
|
||||
|
||||
* https://stackoverflow.com/questions/31978324/what-exactly-is-stdatomic/58904448#58904448 "What exactly is std::atomic?"
|
||||
|
||||
====== Detailed gem5 analysis of how data races happen
|
||||
|
||||
The smallest data race we managed to come up as of LKMC 7c01b29f1ee7da878c7cc9cb4565f3f3cf516a92 and gem5 872cb227fdc0b4d60acc7840889d567a6936b6e1 was with link:userland/c/atomic.c[] (see also <<c-multithreading>>):
|
||||
|
||||
....
|
||||
./run \
|
||||
--arch aarch64 \
|
||||
--cli-args '2 10' \
|
||||
--cpus 3 \
|
||||
--emulator gem5 \
|
||||
--userland userland/c/atomic.c \
|
||||
;
|
||||
....
|
||||
|
||||
which outputs:
|
||||
|
||||
....
|
||||
atomic 20
|
||||
non-atomic 19
|
||||
....
|
||||
|
||||
./run -aA -eg -u userland/c/atomic.c --cli-args '2 200' --cpus 3 --userland-build-id o3 -N1 --trace ExecAll -- --caches --cpu-type TimingSimpleCPU
|
||||
|
||||
Note that that the system is very minimal, and doesn't even have caches, so I'm curious as to how this can happen at all.
|
||||
|
||||
So first we do a run with <<gem5-tracing,`--trace Exec`>> and look at the `my_thread_main` entries.
|
||||
|
||||
From there we see that first CPU1 enters the function, since it was spawned first.
|
||||
|
||||
Then for some time, both CPU1 and CPU2 are running at the same time.
|
||||
|
||||
Finally, CPU1 exists, then CPU2 runs alone for a while to finish its loops, and then CPU2 exits.
|
||||
|
||||
By greping the LDR data read from the log, we are able to easily spot the moment where things started to go wrong based on the `D=` data:
|
||||
|
||||
....
|
||||
grep -E 'my_thread_main\+36' trace.txt > trace-ldr.txt
|
||||
....
|
||||
|
||||
The `grep` output contains
|
||||
|
||||
....
|
||||
94024500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000006 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94036500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000007 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94048500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000008 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94070500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94082500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x000000000000000b A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
....
|
||||
|
||||
and so se see that it is at `94058500` that things started going bad, since two consecutive loads from different CPUs read the same value `D=9`! Actually, things were not too bad afterwards because this was by coincidence the last CPU1 read, we would have missed many more increments if the number of iterations had been larger.
|
||||
|
||||
Now that we have the first bad time, let's look at the fuller disassembly to better understand what happens around that point.
|
||||
|
||||
....
|
||||
94058500: system.cpu2: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94059000: system.cpu2: A0 T0 : @my_thread_main+40 : add w1, w0, #1 : IntAlu : D=0x000000000000000a flags=(IsInteger)
|
||||
94059000: system.cpu1: A0 T0 : @my_thread_main+120 : b.cc <my_thread_main+28> : IntAlu : flags=(IsControl|IsDirectControl|IsCondControl)
|
||||
94059500: system.cpu1: A0 T0 : @my_thread_main+28 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
|
||||
94059500: system.cpu2: A0 T0 : @my_thread_main+44 : adrp x0, #69632 : IntAlu : D=0x0000000000411000 flags=(IsInteger)
|
||||
94060000: system.cpu2: A0 T0 : @my_thread_main+48 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
|
||||
94060000: system.cpu1: A0 T0 : @my_thread_main+32 : add x0, x0, #96 : IntAlu : D=0x0000000000411060 flags=(IsInteger)
|
||||
94060500: system.cpu1: A0 T0 : @my_thread_main+36 : ldr x0, [x0] : MemRead : D=0x0000000000000009 A=0x411060 flags=(IsInteger|IsMemRef|IsLoad)
|
||||
94060500: system.cpu2: A0 T0 : @my_thread_main+52 : str x1, [x0] : MemWrite : D=0x000000000000000a A=0x411060 flags=(IsInteger|IsMemRef|IsStore)
|
||||
....
|
||||
|
||||
and from this, all becomes crystal clear:
|
||||
|
||||
* 94058500: CPU2 loads
|
||||
* 94060500: CPU1 loads
|
||||
* 94060500: CPU2 stores
|
||||
|
||||
so we see that CPU2 just happened to store after CPU1 loads.
|
||||
|
||||
We also understand why LDADD solves the race problem in AtomicSimpleCPU: it does the load and store in one single go!
|
||||
|
||||
[[cpp-memory-order]]
|
||||
===== C++ std::memory_order
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* https://cirosantilli.com/linux-kernel-module-cheat#c-multithreading */
|
||||
/* https://cirosantilli.com/linux-kernel-module-cheat#atomic-c */
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
|
||||
#include <stdatomic.h>
|
||||
@@ -9,10 +9,9 @@
|
||||
|
||||
atomic_int acnt;
|
||||
int cnt;
|
||||
size_t niters;
|
||||
|
||||
int f(void *thr_data) {
|
||||
(void)thr_data;
|
||||
int my_thread_main(void *thr_data) {
|
||||
size_t niters = *(size_t *)thr_data;
|
||||
for (size_t i = 0; i < niters; ++i) {
|
||||
++cnt;
|
||||
++acnt;
|
||||
@@ -23,7 +22,7 @@ int f(void *thr_data) {
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
#if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
|
||||
size_t nthreads;
|
||||
size_t niters, nthreads;
|
||||
thrd_t *threads;
|
||||
if (argc > 1) {
|
||||
nthreads = strtoull(argv[1], NULL, 0);
|
||||
@@ -37,7 +36,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
threads = malloc(sizeof(thrd_t) * nthreads);
|
||||
for(size_t i = 0; i < nthreads; ++i)
|
||||
thrd_create(threads + i, f, NULL);
|
||||
thrd_create(threads + i, my_thread_main, &niters);
|
||||
for(size_t i = 0; i < nthreads; ++i)
|
||||
thrd_join(threads[i], NULL);
|
||||
free(threads);
|
||||
|
||||
@@ -39,14 +39,18 @@ int main(void) {
|
||||
/* Less common case where string does not fit. Error handling would
|
||||
* normally follow in a real program. */
|
||||
{
|
||||
int in = 1234;
|
||||
char out[6];
|
||||
/* The return here is the same as before.
|
||||
*
|
||||
* Because it is >= than the imposed limit of 6, we know that
|
||||
* the write failed to fully complete. */
|
||||
#if 0
|
||||
/* GCC 8.3.0 with -O3 actually detects this and breaks the build.
|
||||
* error: ‘cd’ directive output truncated writing 2 bytes into a region of size 0 [-Werror=format-truncation=] */
|
||||
int in = 1234;
|
||||
char out[6];
|
||||
assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8);
|
||||
assert(strcmp(out, "ab123") == 0);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return EXIT_SUCCESS;
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
size_t niters;
|
||||
|
||||
#if LKMC_USERLAND_ATOMIC_STD_ATOMIC
|
||||
std::atomic_ulong global(0);
|
||||
#else
|
||||
@@ -20,7 +18,7 @@ uint64_t global = 0;
|
||||
std::mutex mutex;
|
||||
#endif
|
||||
|
||||
void threadMain() {
|
||||
void threadMain(size_t niters) {
|
||||
for (size_t i = 0; i < niters; ++i) {
|
||||
#if LKMC_USERLAND_ATOMIC_MUTEX
|
||||
mutex.lock();
|
||||
@@ -97,7 +95,7 @@ void threadMain() {
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
#if __cplusplus >= 201103L
|
||||
size_t nthreads;
|
||||
size_t niters, nthreads;
|
||||
if (argc > 1) {
|
||||
nthreads = std::stoull(argv[1], NULL, 0);
|
||||
} else {
|
||||
@@ -110,7 +108,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
std::vector<std::thread> threads(nthreads);
|
||||
for (size_t i = 0; i < nthreads; ++i)
|
||||
threads[i] = std::thread(threadMain);
|
||||
threads[i] = std::thread(threadMain, niters);
|
||||
for (size_t i = 0; i < nthreads; ++i)
|
||||
threads[i].join();
|
||||
uint64_t expect = nthreads * niters;
|
||||
|
||||
@@ -25,9 +25,13 @@ int main(int argc, char **argv) {
|
||||
request = strtol(argv[2], NULL, 10);
|
||||
if (argc > 3) {
|
||||
arg0 = strtol(argv[3], NULL, 10);
|
||||
} else {
|
||||
arg0 = 0;
|
||||
}
|
||||
if (argc > 4) {
|
||||
arg1 = strtol(argv[4], NULL, 10);
|
||||
} else {
|
||||
arg1 = 0;
|
||||
}
|
||||
|
||||
fd = open(ioctl_path, O_RDONLY);
|
||||
|
||||
@@ -7,12 +7,11 @@
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
unsigned long long niters;
|
||||
unsigned long long global = 0;
|
||||
pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
void* main_thread(void *arg) {
|
||||
(void)arg;
|
||||
unsigned long long niters = *(unsigned long long *)arg;
|
||||
unsigned long long i;
|
||||
for (i = 0; i < niters; ++i) {
|
||||
pthread_mutex_lock(&main_thread_mutex);
|
||||
@@ -24,7 +23,7 @@ void* main_thread(void *arg) {
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
pthread_t *threads;
|
||||
unsigned long long i, nthreads;
|
||||
unsigned long long i, niters, nthreads;
|
||||
|
||||
/* CLI arguments. */
|
||||
if (argc > 1) {
|
||||
@@ -41,7 +40,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
/* Action */
|
||||
for (i = 0; i < nthreads; ++i)
|
||||
pthread_create(&threads[i], NULL, main_thread, NULL);
|
||||
pthread_create(&threads[i], NULL, main_thread, &niters);
|
||||
for (i = 0; i < nthreads; ++i)
|
||||
pthread_join(threads[i], NULL);
|
||||
assert(global == nthreads * niters);
|
||||
|
||||
Reference in New Issue
Block a user