From efc4205416efa88912620959311cbfb92f40b4e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Tue, 27 Aug 2019 00:00:00 +0000 Subject: [PATCH] Become a memory accounting amateur --- README.adoc | 171 ++++++++++++++++++++++---- buildroot_config/default | 3 + userland/c/hello.c | 4 +- userland/c/malloc.c | 16 ++- userland/c/malloc_max.c | 18 --- userland/c/malloc_size.c | 26 ++++ userland/c/snprintf.c | 54 ++++++++ userland/linux/mmap_anonymous.c | 29 ++--- userland/linux/mmap_anonymous_touch.c | 125 +++++++++++++++++++ userland/linux/total_memory.c | 30 +++++ 10 files changed, 411 insertions(+), 65 deletions(-) delete mode 100644 userland/c/malloc_max.c create mode 100644 userland/c/malloc_size.c create mode 100644 userland/c/snprintf.c create mode 100644 userland/linux/mmap_anonymous_touch.c create mode 100644 userland/linux/total_memory.c diff --git a/README.adoc b/README.adoc index 55db2bf..ced00aa 100644 --- a/README.adoc +++ b/README.adoc @@ -3198,7 +3198,7 @@ One downside of this method is that it has to put the entire filesystem into mem end Kernel panic - not syncing: Out of memory and no killable processes... .... -This can be solved by increasing the memory with: +This can be solved by increasing the memory as explained at <>: .... ./run --initrd --memory 256M @@ -10746,15 +10746,79 @@ TODO: now to verify this with the Linux kernel? Besides raw performance benchmar ===== Memory size .... -./run --arch arm --memory 512M +./run --memory 512M .... -and verify inside the guest with: +We can verify this on the guest directly from the kernel with: .... -free -m +cat /proc/meminfo .... +as of LKMC 1e969e832f66cb5a72d12d57c53fb09e9721d589 this output contains: + +.... +MemTotal: 498472 kB +.... + +which we expand with: + +.... +printf '0x%X\n' $((498472 * 1024)) +.... + +to: + +.... +0x1E6CA000 +.... + +TODO: why is this value a bit smaller than 512M? 
+ +`free` also gives the same result: + +.... +free -b +.... + +contains: + +.... + total used free shared buffers cached +Mem: 510435328 20385792 490049536 0 503808 2760704 +-/+ buffers/cache: 17121280 493314048 +Swap: 0 0 0 +.... + +which we expand with: + +.... +printf '0x%X\n' 510435328 +.... + +`man free` from Ubuntu's procps 3.3.15 tells us that `free` obtains this information from `/proc/meminfo` as well. + +From C, we can get this information with `sysconf(_SC_PHYS_PAGES)` or `get_phys_pages()`: + +.... +./linux/total_memory.out +.... + +Source: link:userland/linux/total_memory.c[] + +Output: + +.... +sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE) = 0x1E6CA000 +sysconf(_SC_AVPHYS_PAGES) * sysconf(_SC_PAGESIZE) = 0x1D178000 +get_phys_pages() * sysconf(_SC_PAGESIZE) = 0x1E6CA000 +get_avphys_pages() * sysconf(_SC_PAGESIZE) = 0x1D178000 +.... + +This is mentioned at: https://stackoverflow.com/questions/22670257/getting-ram-size-in-c-linux-non-precise-result/22670407#22670407 + +AV means available and gives the free memory: https://stackoverflow.com/questions/14386856/c-check-available-ram/57659190#57659190 + ===== gem5 disk and network latency TODO These look promising: @@ -12707,8 +12771,9 @@ Programs under link:userland/c/[] are examples of https://en.wikipedia.org/wiki/ *** exit **** link:userland/c/abort.c[] ** `stdio.h` -*** link:userland/c/stderr.c[] *** link:userland/c/getchar.c[] +*** link:userland/c/snprintf.c[] +*** link:userland/c/stderr.c[] *** File IO **** link:userland/c/file_write_read.c[] * Fun @@ -12722,39 +12787,99 @@ link:userland/c/malloc.c[]: `malloc` hello world: allocate two ints and use them LInux 5.1 / glibc 2.29 implements it with the <>. +===== malloc implementation + +TODO: the exact answer is going to be hard. + +But at least let's verify that large `malloc` calls use the `mmap` syscall with: + +.... 
+strace -x ./c/malloc_size.out 0x100000 2>&1 | grep mmap | tail -n 1 +strace -x ./c/malloc_size.out 0x200000 2>&1 | grep mmap | tail -n 1 +strace -x ./c/malloc_size.out 0x400000 2>&1 | grep mmap | tail -n 1 +.... + +Source: link:userland/c/malloc_size.c[]. + +From this we see that the last `mmap` calls are: + +.... +mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffff7ef2000 +mmap(NULL, 2101248, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffff7271000 +mmap(NULL, 4198400, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffff7071000 +.... + +which in hex are: + +.... +printf '%x\n' 1052672 +# 101000 +printf '%x\n' 2101248 +# 201000 +printf '%x\n' 4198400 +# 401000 +.... + +so we figured out the pattern: those 1, 2, and 4 MiB mallocs are mmaping N + 0x1000 bytes. + ===== malloc maximum size -Test how much memory Linux lets us allocate by doubling a buffer with `realloc` until it fails: +General overview at: https://stackoverflow.com/questions/2798330/maximum-memory-which-malloc-can-allocate + +See also: + +* https://stackoverflow.com/questions/13127855/what-is-the-size-limit-for-mmap +* https://stackoverflow.com/questions/7504139/malloc-allocates-memory-more-than-ram + +From <> and `./run --help`, we see that we set the emulator memory by default to 256MB. Let's see how much Linux allows us to malloc. + +Then from <> we see that `malloc` is implemented with `mmap`. Therefore, let's simplify the problem and try to understand what is the largest mmap we can do first. This way we can ignore how glibc implements malloc for now. + +In Linux, the maximum `mmap` value is controlled by: .... -./run --userland userland/c/malloc_max.c +cat /proc/sys/vm/overcommit_memory .... -Source: link:userland/c/malloc_max.c[] +which is documented in `man proc`. -Outcome at c03d5d18ea971ae85d008101528d84c2ff25eb27 on Ubuntu 19.04 <> host (16GiB RAM): prints up to `0x1000000000` (64GiB). - -TODO dive into source code. 
- -TODO: if we do direct <> allocations with link:userland/c/malloc.c[] or <> with link:userland/linux/mmap_anonymous.c[], then the limit was smaller than 64GiB! - -These work: +The default value is `0`, which I can't find precise documentation for. `2` is precisely documented but I'm too lazy to do all calculations. So let's just verify `0` vs `1` by trying to `mmap` 1GiB of memory: .... -./userland/c/malloc.out 0x100000000 -./userland/linux/mmap_anonymous.out 0x100000000 +echo 0 > /proc/sys/vm/overcommit_memory +./linux/mmap_anonymous.out 0x40000000 +echo 1 > /proc/sys/vm/overcommit_memory +./linux/mmap_anonymous.out 0x40000000 .... -which is `4Gib * sizeof(int) == 16GiB`, but these fail at 32GiB: +Source: link:userland/linux/mmap_anonymous.c[] + +With `0`, we get a failure: .... -./userland/c/malloc.out 0x200000000 -./userland/linux/mmap_anonymous.out 0x200000000 +mmap: Cannot allocate memory .... -`malloc` returns NULL, and `mmap` goes a bit further and segfauls on the first assignment `array[0] = 1`. +but with `1` the allocation works. -Bibliography: https://stackoverflow.com/questions/2798330/maximum-memory-which-malloc-can-allocate +We are allowed to allocate more than the actual memory + swap because the memory is only virtual, as explained at: https://stackoverflow.com/questions/7880784/what-is-rss-and-vsz-in-linux-memory-management/57453334#57453334 + +If we start using the pages, the OOM killer would sooner or later step in and kill our process: <>. + +====== Linux out-of-memory killer + +We can observe the OOM in LKMC 1e969e832f66cb5a72d12d57c53fb09e9721d589 which defaults to 256MiB of memory with: + +.... +echo 1 > /proc/sys/vm/overcommit_memory +./linux/mmap_anonymous_touch.out 0x40000000 0x8000000 +.... + +This first allows memory overcommit so that the program can mmap 1GiB, 4x more than total RAM without failing as mentioned at <>. + +It then walks over every page and writes a value in it to ensure that it is used. 
+ +Algorithm used by the OOM: https://unix.stackexchange.com/questions/153585/how-does-the-oom-killer-decide-which-process-to-kill-first ==== GCC C extensions @@ -17122,7 +17247,7 @@ Or to conveniently do a clean build without affecting your current one: cat ../linux-kernel-module-cheat-regression/*/build-time.log .... -===== Find which packages are making the build slow and big +===== Find which Buildroot packages are making the build slow and big .... ./build-buildroot -- graph-build graph-size graph-depends diff --git a/buildroot_config/default b/buildroot_config/default index 4403a30..1876ce3 100644 --- a/buildroot_config/default +++ b/buildroot_config/default @@ -30,6 +30,9 @@ BR2_PACKAGE_HOST_GDB_PYTHON=y BR2_PACKAGE_HOST_GDB_SIM=y BR2_PACKAGE_HOST_GDB_TUI=y +# Host debug tools. +BR2_PACKAGE_STRACE=y + # DTC. BR2_PACKAGE_HOST_DTC=y diff --git a/userland/c/hello.c b/userland/c/hello.c index 42cb55d..aa9f0c5 100644 --- a/userland/c/hello.c +++ b/userland/c/hello.c @@ -1,4 +1,6 @@ -/* Print hello to stdout ;-) */ +/* https://cirosantilli.com/linux-kernel-module-cheat#c + * + * Print hello to stdout ;-) */ #include #include diff --git a/userland/c/malloc.c b/userland/c/malloc.c index f522933..812245b 100644 --- a/userland/c/malloc.c +++ b/userland/c/malloc.c @@ -4,22 +4,26 @@ #include #include +/* We do this in a separate function just to illustrate that + * this is allowed for malloc memory! This is unlike regular stack + * variables which may be deallocated when the function returns. */ +void *allocate_bytes(size_t nbytes) { + return malloc(nbytes); +} + int main(int argc, char **argv) { int *is; size_t nbytes, nints; - /* Decide how many ints to allocate. */ + /* Decide how many ints to allocate. + * Unlike usual non-VLA arrays, the size is determined dynamically at runtime! */ if (argc < 2) { nints = 2; } else { nints = strtoull(argv[1], NULL, 0); } nbytes = nints * sizeof(*is); - - /* Allocate the ints. 
- * Note that unlike traditional stack arrays (non-VLA) - * this value does not have to be determined at compile time! */ - is = malloc(nbytes); + is = allocate_bytes(nbytes); /* This can happen for example if we ask for too much memory. */ if (is == NULL) { diff --git a/userland/c/malloc_max.c b/userland/c/malloc_max.c deleted file mode 100644 index 8eafb4c..0000000 --- a/userland/c/malloc_max.c +++ /dev/null @@ -1,18 +0,0 @@ -/* https://cirosantilli.com/linux-kernel-module-cheat#malloc-maximum-size */ - -#include -#include - -int main(void) { - char *ptr = NULL; - size_t size = 1; - while (1) { - printf("0x%zx\n", size); - ptr = realloc(ptr, size); - if (ptr == NULL) { - break; - } else { - size <<= 1; - } - } -} diff --git a/userland/c/malloc_size.c b/userland/c/malloc_size.c new file mode 100644 index 0000000..762888a --- /dev/null +++ b/userland/c/malloc_size.c @@ -0,0 +1,26 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#malloc + * + * Malloc n bytes as given from the command line. + */ + +#include +#include +#include + +int main(int argc, char **argv) { + char *chars; + size_t nbytes; + + if (argc < 2) { + nbytes = 2; + } else { + nbytes = strtoull(argv[1], NULL, 0); + } + chars = malloc(nbytes); + if (chars == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + free(chars); + return EXIT_SUCCESS; +} diff --git a/userland/c/snprintf.c b/userland/c/snprintf.c new file mode 100644 index 0000000..da23e48 --- /dev/null +++ b/userland/c/snprintf.c @@ -0,0 +1,54 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#c + * + * Like `sprintf`, but writes at most n bytes, so it is safer, + * because it may not be possible or easy to calculate the resulting + * size of a formatted string. + * + * The size given includes the null terminator. */ + +#include +#include +#include +#include + +int main(void) { +#if __STDC_VERSION__ >= 199901L + /* Common usage when string fits. + * + * Ensures that there will be no out of bounds access on out. 
*/ + { + int in = 1234; + char out[1024]; + int snprintf_return; + snprintf_return = snprintf(out, sizeof(out), "ab%dcd", in); + + /* The usual error checking. */ + if (snprintf_return < 0) { + perror("snprintf"); + exit(EXIT_FAILURE); + } + assert((size_t)snprintf_return < sizeof(out)); + + /* Assert because we know the return here. */ + assert(snprintf_return == 8); + + /* What it actually copied. */ + assert(strcmp(out, "ab1234cd") == 0); + } + + /* Less common case where string does not fit. Error handling would + * normally follow in a real program. */ + { + int in = 1234; + char out[6]; + /* The return here is the same as before. + * + * Because it is >= than the imposed limit of 6, we know that + * the write failed to fully complete. */ + assert(snprintf(out, sizeof(out), "ab%dcd", in) == 8); + assert(strcmp(out, "ab123") == 0); + } +#endif + return EXIT_SUCCESS; +} + diff --git a/userland/linux/mmap_anonymous.c b/userland/linux/mmap_anonymous.c index 1b8f1da..5ea2fa8 100644 --- a/userland/linux/mmap_anonymous.c +++ b/userland/linux/mmap_anonymous.c @@ -1,7 +1,9 @@ -/* https://cirosantilli.com/linux-kernel-module-cheat#mmap-map-anonymous */ +/* https://cirosantilli.com/linux-kernel-module-cheat#mmap-map-anonymous + * + * Malloc n bytes as given from the command line. + */ #define _GNU_SOURCE -#include #include #include #include @@ -9,19 +11,18 @@ #include int main(int argc, char **argv) { - int *is; - size_t nbytes, nints; + char *chars; + size_t nbytes; /* Decide how many ints to allocate. */ if (argc < 2) { - nints = 2; + nbytes = 2; } else { - nints = strtoull(argv[1], NULL, 0); + nbytes = strtoull(argv[1], NULL, 0); } - nbytes = nints * sizeof(*is); - /* Allocate 2 ints. */ - is = mmap( + /* Allocate the bytes. */ + chars = mmap( NULL, nbytes, PROT_READ | PROT_WRITE, @@ -31,19 +32,13 @@ int main(int argc, char **argv) { ); /* This can happen for example if we ask for too much memory. 
*/ - if (is == NULL) { + if (chars == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); } - /* Write to and read from the allocated memory. */ - is[0] = 1; - is[1] = 2; - assert(is[0] == 1); - assert(is[1] == 2); - /* Free the allocated memory. */ - munmap(is, nbytes); + munmap(chars, nbytes); return EXIT_SUCCESS; } diff --git a/userland/linux/mmap_anonymous_touch.c b/userland/linux/mmap_anonymous_touch.c new file mode 100644 index 0000000..513864f --- /dev/null +++ b/userland/linux/mmap_anonymous_touch.c @@ -0,0 +1,125 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#malloc-maximum-size + * + * mmap memory, then write something to each page to ensure it is not just virtual. + * We want to meet the OOM. + * + * ./prog [nbytes [print_interval]] + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + unsigned long size,resident,share,text,lib,data,dt; +} ProcStatm; + +/* https://stackoverflow.com/questions/1558402/memory-usage-of-current-process-in-c/7212248#7212248 */ +void ProcStat_init(ProcStatm *result) { + const char* statm_path = "/proc/self/statm"; + FILE *f = fopen(statm_path, "r"); + if(!f) { + perror(statm_path); + abort(); + } + if(7 != fscanf( + f, + "%ld %ld %ld %ld %ld %ld %ld", + &(result->size), + &(result->resident), + &(result->share), + &(result->text), + &(result->lib), + &(result->data), + &(result->dt) + )) { + perror(statm_path); + abort(); + } + fclose(f); +} + +int main(int argc, char **argv) { + ProcStatm proc_statm; + char *base, *p; + char system_cmd[1024]; + long page_size; + size_t i, nbytes, print_interval, bytes_since_last_print; + int snprintf_return; + + /* Decide how many bytes to allocate and how often to print. */ + if (argc < 2) { + nbytes = 0x10000; + } else { + nbytes = strtoull(argv[1], NULL, 0); + } + if (argc < 3) { + print_interval = 0x1000; + } else { + print_interval = strtoull(argv[2], NULL, 0); + } + page_size = sysconf(_SC_PAGESIZE); + + /* Allocate the memory. 
*/ + base = mmap( + NULL, + nbytes, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + -1, + 0 + ); + if (base == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + /* Write to all the allocated pages. */ + i = 0; + p = base; + bytes_since_last_print = 0; + /* Produce the ps command that lists only our VSZ and RSS. */ + snprintf_return = snprintf( + system_cmd, + sizeof(system_cmd), + "ps -o pid,vsz,rss | awk '{if (NR == 1 || $1 == \"%ju\") print}'", + (uintmax_t)getpid() + ); + assert(snprintf_return >= 0); + assert((size_t)snprintf_return < sizeof(system_cmd)); + bytes_since_last_print = print_interval; + do { + /* Modify a byte in the page. */ + *p = i; + p += page_size; + bytes_since_last_print += page_size; + /* Print process memory usage every print_interval bytes. + * We count memory using a few techniques from: + * https://stackoverflow.com/questions/1558402/memory-usage-of-current-process-in-c */ + if (bytes_since_last_print > print_interval) { + bytes_since_last_print -= print_interval; + printf("extra_memory_committed %lu KiB\n", (i * page_size) / 1024); + ProcStat_init(&proc_statm); + /* Check /proc/self/statm */ + printf( + "/proc/self/statm size resident %lu %lu KiB\n", + (proc_statm.size * page_size) / 1024, + (proc_statm.resident * page_size) / 1024 + ); + /* Check ps. */ + puts(system_cmd); + system(system_cmd); + puts(""); + } + i++; + } while (p < base + nbytes); + + /* Cleanup. */ + munmap(base, nbytes); + return EXIT_SUCCESS; +} diff --git a/userland/linux/total_memory.c b/userland/linux/total_memory.c new file mode 100644 index 0000000..426d37f --- /dev/null +++ b/userland/linux/total_memory.c @@ -0,0 +1,30 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#memory-size */ + +#define _GNU_SOURCE +#include +#include +#include + +int main(void) { + /* PAGESIZE is POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/ + * but PHYS_PAGES and AVPHYS_PAGES are glibc extensions. I bet those are + * parsed from /proc/meminfo. 
*/ + printf( + "sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE) = 0x%lX\n", + sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE) + ); + printf( + "sysconf(_SC_AVPHYS_PAGES) * sysconf(_SC_PAGESIZE) = 0x%lX\n", + sysconf(_SC_AVPHYS_PAGES) * sysconf(_SC_PAGESIZE) + ); + + /* glibc extensions. man says they are parsed from /proc/meminfo. */ + printf( + "get_phys_pages() * sysconf(_SC_PAGESIZE) = 0x%lX\n", + get_phys_pages() * sysconf(_SC_PAGESIZE) + ); + printf( + "get_avphys_pages() * sysconf(_SC_PAGESIZE) = 0x%lX\n", + get_avphys_pages() * sysconf(_SC_PAGESIZE) + ); +}