diff --git a/README.adoc b/README.adoc index 125ffcb..e57fa6e 100644 --- a/README.adoc +++ b/README.adoc @@ -3864,8 +3864,8 @@ We pass `-L` by default, so everything just works. However, in case something goes wrong, you can also try statically linked executables, since this mechanism tends to be a bit more stable, for example: -* gem5 user mode currently only supports static executables as mentioned at: xref:gem5-syscall-emulation-mode[xrefstyle=full] * QEMU x86_64 guest on x86_64 host was failing with <>, but we found a workaround +* gem5 user only supported static executables in the past, as mentioned at: xref:gem5-syscall-emulation-mode[xrefstyle=full] Running statically linked executables sometimes makes things break: @@ -3875,6 +3875,7 @@ Running statically linked executables sometimes makes things break: .... ./run --static --userland userland/c/file_write_read.c .... ++ fails our assertion that the data was read back correctly: + .... @@ -3965,7 +3966,7 @@ Input from a file by explicitly requesting our scripts to use it via the Python .... printf a > f.tmp -./run --emulator gem5 --userland userland/c/getchar.c --static --stdin-file f.tmp +./run --emulator gem5 --userland userland/c/getchar.c --stdin-file f.tmp .... This is especially useful when running tests that require stdin input. @@ -3978,24 +3979,12 @@ Less robust than QEMU's, but still usable: There are much more unimplemented syscalls in gem5 than in QEMU. Many of those are trivial to implement however. -As of 185c2730cc78d5adda683d76c0e3b35e7cb534f0, dynamically linked executables only work on x86, and they can only use the host libraries, which is ugly: - -* https://stackoverflow.com/questions/50542222/how-to-run-a-dynamically-linked-executable-syscall-emulation-mode-se-py-in-gem5 -* https://www.mail-archive.com/gem5-users@gem5.org/msg15585.html - -If you try dynamically linked executables on ARM, they fail with: - -.... -fatal: Unable to open dynamic executable's interpreter. -.... 
+Support for dynamic linking was added in November 2019: https://stackoverflow.com/questions/50542222/how-to-run-a-dynamically-linked-executable-syscall-emulation-mode-se-py-in-gem5/50696098#50696098 So let's just play with some static ones: .... -./build-userland \ - --arch aarch64 \ - --static \ -; +./build-userland --arch aarch64 ./run \ --arch aarch64 \ --emulator gem5 \ @@ -4013,14 +4002,12 @@ TODO: how to escape spaces on the command line arguments? --arch aarch64 \ --emulator gem5 \ --gdb-wait \ - --static \ --userland userland/c/command_line_arguments.c \ --userland-args 'asdf "qw er"' \ ; ./run-gdb \ --arch aarch64 \ --emulator gem5 \ - --static \ --userland userland/c/command_line_arguments.c \ main \ ; @@ -4031,7 +4018,7 @@ TODO: how to escape spaces on the command line arguments? As of gem5 7fa4c946386e7207ad5859e8ade0bbfc14000d91, the crappy `se.py` script does not forward the exit status of syscall emulation mode, you can test it with: .... -./run --dry-run --emulator gem5 --static --userland userland/c/false.c +./run --dry-run --emulator gem5 --userland userland/c/false.c .... Source: link:userland/c/false.c[]. @@ -4061,8 +4048,7 @@ Since gem5 has to implement syscalls itself in syscall emulation mode, it can of .... ./run \ --emulator gem5 \ - --static userland/arch/x86_64/freestanding/linux/hello.S \ - --userland \ + --userland userland/arch/x86_64/freestanding/linux/hello.S \ --trace-stdout \ --trace ExecAll,SyscallBase,SyscallVerbose \ ; @@ -10749,7 +10735,7 @@ gem5 user mode multithreading has been particularly flaky compared <> deadlocks, for example in lin .... 
./run \ --emulator gem5 \ - --static \ --userland userland/posix/pthread_deadlock.c \ --userland-args 1 \ ; @@ -14381,13 +14363,11 @@ To benchmark on gem5, we first build the benchmark with <> e --arch x86_64 \ --ccflags='-DLKMC_M5OPS_ENABLE=1' \ --force-rebuild userland/cpp/bst_vs_heap_vs_hashmap.cpp \ - --static \ --optimization-level 3 \ ; ./run \ --arch x86_64 \ --emulator gem5 \ - --static \ --userland userland/cpp/bst_vs_heap_vs_hashmap.cpp \ --userland-args='100000 1 0' \ -- \ @@ -14504,6 +14484,15 @@ https://en.wikipedia.org/wiki/Dhrystone Created in the 80's, it is not a representative measure of performance in modern computers anymore. It has mostly been replaced by https://en.wikipedia.org/wiki/SPECint[SPEC], which is... closed source! Unbelievable. +Dhrystone is very simple: + +* there is one loop in the `dhry_1.c` main function that gets executed N times +* that loop calls 9 short functions called `Proc_0` to `Proc_9`, most of which are defined in `dhry_1.c`, and a few others in `dhry_2.c` + +The benchmark is single-threaded. + +After a quick look at it, Dhrystone in `-O3` is very likely completely CPU bound, as there are no loops over variable sized arrays, except for some dummy ones that only run once. It just does a bunch of operations on local and global C variables, which are very likely to be inlined and treated fully in registers until the final write back. TODO confirm with some kind of measurement. + <> has a `dhrystone` package, but because it is so interesting to us, we decided to also build it ourselves, which allows things like static and baremetal compilation more easily. Build and run on QEMU <>: @@ -14525,8 +14514,8 @@ Increase the number of loops to try and reach more meaningful results: Build and run on gem5 user mode: ....
-./build-dhrystone --optimization-level 3 --static -./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/dhrystone/dhrystone" +./build-dhrystone --optimization-level 3 +./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone" .... Run natively on the host: @@ -14615,8 +14604,8 @@ Decrease the benchmark size and the retry count to finish simulation faster, but Build and run on gem5 user mode: .... -./build-stream --optimization-level 3 --static -./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '1000 2' +./build-stream --optimization-level 3 +./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '1000 2' .... ==== PARSEC benchmark @@ -16077,7 +16066,7 @@ TODO: review this section, make a more controlled userland experiment with <> `system.cpu.numCycles` cycle count with the https://en.wikipedia.org/wiki/Time_Stamp_Counter[x86 RDTSC instruction] that is supposed to do the same thing: .... -./build-userland --static userland/arch/x86_64/inline_asm/rdtsc.S +./build-userland userland/arch/x86_64/inline_asm/rdtsc.S ./run --eval './arch/x86_64/rdtsc.out;m5 exit;' --emulator gem5 ./gem5-stat .... @@ -17157,9 +17146,9 @@ gem5 covered at: https://stackoverflow.com/questions/57692765/how-to-change-the- It is fun to observe this directly with the <> in SE: .... 
-./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --static --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 1' -./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --static --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 2' -./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --static --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 4' +./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 1' +./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 2' +./run --arch aarch64 --userland userland/arch/aarch64/sve_addvl.S --emulator gem5 -- --param 'system.cpu[:].isa[:].sve_vl_se = 4' .... which consecutively: @@ -19052,7 +19041,7 @@ TODO: automate this further, produce the results table automatically, possibly b For now we can just run on gem5 to estimate the instruction count per input size and extrapolate? -For example, the simplest scalable CPU content would be a busy loop: link:userland/gcc/busy_loop.c[], so let's focus on that for now. +For example, the simplest scalable CPU content would be a busy loop: link:userland/gcc/busy_loop.c[], so let's start by analyzing that one. Summary of manually collected results on <> at LKMC a18f28e263c91362519ef550150b5c9d75fa3679 + 1: xref:table-busy-loop-dmips[xrefstyle=full]. As expected, the less native / more detailed / more complex simulations are slower! 
@@ -19060,46 +19049,64 @@ Summary of manually collected results on <> at LKMC a18f28e263c91362519ef55 .Busy loop MIPS for different simulator setups [options="header"] |=== -|Simulator |Loops |Time (s) |Instruction count |Approximate MIPS +|LKMC |Benchmark |Emulator |Loops |Time (s) |Instruction count |Approximate MIPS +|a18f28e263c91362519ef550150b5c9d75fa3679 + 1 +|userland/gcc/busy_loop.c -O0 |`qemu --arch aarch64` |10^10 |68 |1.1 * 10^11 (approx) |2000 +|a18f28e263c91362519ef550150b5c9d75fa3679 + 1 +|userland/gcc/busy_loop.c -O0 |`gem5 --arch aarch64` |10^7 |100 |1.10018162 * 10^8 |1 +|a18f28e263c91362519ef550150b5c9d75fa3679 + 1 +|userland/gcc/busy_loop.c -O0 |`+gem5 --arch aarch64 -- --cpu-type MinorCPU --caches+` |10^6 |31 |1.1018152 * 10^7 |0.4 +|a18f28e263c91362519ef550150b5c9d75fa3679 + 1 +|userland/gcc/busy_loop.c -O0 |`+gem5 --arch aarch64 -- --cpu-type DerivO3CPU --caches+` |10^6 |52 |1.1018128 * 10^7 |0.2 +|a18f28e263c91362519ef550150b5c9d75fa3679 + 1 +|userland/gcc/busy_loop.c -O0 |`+gem5 --arch aarch64 --gem5-build-id MOESI_CMP_directory -- --cpu-type DerivO3CPU --caches --ruby+` |1 * 1000000 = 10^6 |63 |1.1005150 * 10^7 |0.2 +|a605448f07e6380634b1aa7e9732d111759f69fd + 1 +|<> -O3 +|`gem5 --arch aarch64` +|4 * 10^5 +|68 +|9.2034139 * 10^7 +|1.6 + |=== -The first step is to determine a number of loops that will run long enough to have meaningful results, but not too long that we will get bored. +The first step is to determine a number of loops that will run long enough to have meaningful results, but not too long that we will get bored, so about 1 minute. On our <> machine, we found 10^7 (10 million == 1000 times 10000) loops to be a good number for a gem5 atomic simulation: .... -./run --arch aarch64 --emulator gem5 --userland userland/gcc/busy_loop.c --userland-args '1 10000000' --static +./run --arch aarch64 --emulator gem5 --userland userland/gcc/busy_loop.c --userland-args '1 10000000' ./gem5-stat --arch aarch64 sim_insts .... 
@@ -20690,6 +20697,22 @@ Properties of parent directories apply to all children. Lists coming from parent directories are extended instead of overwritten by children, this is especially useful for C compiler flags. +To quickly determine which properties a path has, you can use link:getprops[], e.g.: + +.... +./getprops userland/c/hello.c +.... + +which outputs values such as: + +.... +allowed_archs=None +allowed_emulators=None +arm_aarch32=False +arm_sve=False +baremetal=True +.... + === Update a forked submodule This is a template update procedure for submodules for which we have some patches on on top of mainline. diff --git a/common.py b/common.py index a050be5..e95516c 100644 --- a/common.py +++ b/common.py @@ -151,6 +151,8 @@ consts['build_type_choices'] = [ 'debug' ] consts['build_type_default'] = 'opt' +# Files whose basename starts with this are gitignored. +consts['tmp_prefix'] = 'tmp.' class ExitLoop(Exception): pass @@ -543,7 +545,8 @@ are available. default=False, help='''\ Build userland executables statically. Set --userland-build-id to 'static' -if one was not given explicitly.
See also: +https://cirosantilli.com/linux-kernel-module-cheat#user-mode-static-executables ''', ) self.add_argument( @@ -1103,6 +1106,7 @@ lunch aosp_{}-eng env['buildroot_toolchain_prefix'] ) env['userland_library_dir'] = env['buildroot_target_dir'] + env['userland_library_redirects'] = ['lib', 'lib64', os.path.join('usr', 'lib'), os.path.join('usr', 'lib64')] env['pkg_config'] = env['buildroot_pkg_config'] elif env['gcc_which'] == 'crosstool-ng': env['toolchain_prefix'] = os.path.join( diff --git a/getprops b/getprops new file mode 100755 index 0000000..0b30be3 --- /dev/null +++ b/getprops @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import common +import json +import path_properties + +class Main(common.LkmcCliFunction): + def __init__(self): + super().__init__( + defaults = { + 'show_time': False, + }, + description='''\ +Get the path_properties for an userland executable: +https://cirosantilli.com/linux-kernel-module-cheat#path-properties +TODO check that the path exists. +''', + ) + self.add_argument('path') + + def timed_main(self): + properties = path_properties.get(self.env['path']).properties + for key in sorted(properties): + print('{}={}'.format(key, properties[key])) + +if __name__ == '__main__': + Main().cli() diff --git a/path_properties.py b/path_properties.py index 99dbfba..5203459 100644 --- a/path_properties.py +++ b/path_properties.py @@ -231,10 +231,12 @@ class PathProperties: ) def should_be_tested(self, env): + basename = self.path_components[-1] return ( self.should_be_built( env, ) and + not basename.startswith(env['tmp_prefix']) and not ( env['mode'] == 'baremetal' and ( self['arm_aarch32'] or @@ -523,7 +525,13 @@ path_properties_tuples = ( 'freestanding': freestanding_properties, 'lkmc_assert_eq_fail.S': {'signal_received': signal.Signals.SIGABRT}, 'lkmc_assert_memcmp_fail.S': {'signal_received': signal.Signals.SIGABRT}, - 'nostartfiles': nostartfiles_properties, + 'nostartfiles': ( + nostartfiles_properties, + { + # 
https://github.com/cirosantilli/linux-kernel-module-cheat/issues/107 + 'exit.s': {'skip_run_unclassified': True}, + } + ), 'udf.S': { 'signal_generated_by_os': True, 'signal_received': signal.Signals.SIGILL, diff --git a/run b/run index 15cc1c9..2c1cef4 100755 --- a/run +++ b/run @@ -498,6 +498,17 @@ Extra options to append at the end of the emulator command line. ]) if self.env['userland_args'] is not None: cmd.extend(['--options', self.env['userland_args'], LF]) + if not self.env['static']: + for path in self.env['userland_library_redirects']: + cmd.extend([ + '--redirects', + '{}={}'.format( + os.sep + path, + os.path.join(self.env['userland_library_dir'], path) + ), + LF + ]) + cmd.extend(['--interp-dir', self.env['userland_library_dir'], LF]) else: if self.env['gem5_script'] == 'fs': if self.env['gem5_restore'] is not None: diff --git a/submodules/dhrystone b/submodules/dhrystone index fb5e012..1621d23 160000 --- a/submodules/dhrystone +++ b/submodules/dhrystone @@ -1 +1 @@ -Subproject commit fb5e01298a16e793672316067d8034d608e84c13 +Subproject commit 1621d234df82f406bd74d0231d13e6a6f3ebae9c diff --git a/submodules/gem5 b/submodules/gem5 index fa877e1..bcf041f 160000 --- a/submodules/gem5 +++ b/submodules/gem5 @@ -1 +1 @@ -Subproject commit fa877e19d31203744aecbb252a4f0207dc7491dd +Subproject commit bcf041f257623e5c9e77d35b7531bae59edc0423 diff --git a/test-executables b/test-executables index f4ced74..7b5c3a5 100755 --- a/test-executables +++ b/test-executables @@ -44,8 +44,6 @@ If given, run only the given tests. Otherwise, run all tests. 
def timed_main(self): run_args = self.get_common_args() - if self.env['mode'] == 'userland' and self.env['emulator'] == 'gem5': - run_args['userland_build_id'] = 'static' rootdir_abs_len = len(self.env['root_dir']) with thread_pool.ThreadPool( self.run_test, diff --git a/userland/algorithm/set/generate_io b/userland/algorithm/set/generate_io index 93c1584..6fe68b7 100755 --- a/userland/algorithm/set/generate_io +++ b/userland/algorithm/set/generate_io @@ -21,10 +21,10 @@ random.seed(args.seed) input_data = common.LkmcList() for i in range(args.size): input_data.append(random.randint(args.min, args.max)) -with open('tmp.i', 'w') as i: +with open(common.consts['tmp_prefix'] + 'i', 'w') as i: i.write(str(input_data) + '\n') if args.unique: input_data = common.LkmcList(set(input_data)) input_data.sort() -with open('tmp.e', 'w') as e: +with open(common.consts['tmp_prefix'] + 'e', 'w') as e: e.write(str(input_data) + '\n')