From ba2976cc7f3fedde691f771d38fcdb4ce2e12b94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Sun, 25 Nov 2018 00:00:00 +0000 Subject: [PATCH] gem5: fix arm multicore with system.auto_reset_addr = True baremetal: fix aarch64/no_bootloader/semihost_exit.S which was wrong because was using unset sp for register block. Tests needed urgently!! --- README.adoc | 102 +++++++++++++++++- baremetal/arch/aarch64/multicore.S | 11 +- .../aarch64/no_bootloader/semihost_exit.S | 15 +-- baremetal/arch/aarch64/semihost_exit.S | 15 +-- baremetal/arch/arm/multicore.S | 37 +++++++ common.py | 8 +- run | 20 ++-- run-gdb | 6 +- run-toolchain | 2 +- 9 files changed, 180 insertions(+), 36 deletions(-) create mode 100644 baremetal/arch/arm/multicore.S diff --git a/README.adoc b/README.adoc index b94c6cb..46523c2 100644 --- a/README.adoc +++ b/README.adoc @@ -10560,9 +10560,14 @@ output: .... ./run --arch aarch64 --baremetal arch/aarch64/multicore --cpus 2 ./run --arch aarch64 --baremetal arch/aarch64/multicore --cpus 2 --gem5 +./run --arch arm --baremetal arch/arm/multicore --cpus 2 +./run --arch arm --baremetal arch/arm/multicore --cpus 2 --gem5 .... -Source: link:baremetal/arch/aarch64/multicore.S[] +Sources: + +* link:baremetal/arch/aarch64/multicore.S[] +* link:baremetal/arch/arm/multicore.S[] CPU 0 of this program enters a spinlock loop: it repeatedly checks if a given memory address is `1`. @@ -10576,6 +10581,26 @@ Don't believe me? Then try: and watch it hang forever. +Note that if you try the same thing on gem5: + +.... +./run --arch aarch64 --baremetal arch/aarch64/multicore --cpus 1 --gem5 +.... + +then gem5 actually exits, but with a different message: + +.... +Exiting @ tick 18446744073709551615 because simulate() limit reached +.... + +as opposed to the expected: + +.... +Exiting @ tick 36500 because m5_exit instruction encountered +.... 
+ +since gem5 is able to detect when nothing will ever happen, and exits. + When GDB step debugging, switch between cores with the usual `thread` commands, see also: <>. Bibliography: @@ -10594,6 +10619,81 @@ However, likely no implementation likely does (TODO confirm), since: and power consumption is key in ARM applications. +In QEMU 3.0.0, `SEV` is a NOP, and `WFE` might be, but I'm not sure, see: https://github.com/qemu/qemu/blob/v3.0.0/target/arm/translate-a64.c#L1423 + +.... + case 2: /* WFE */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + s->base.is_jmp = DISAS_WFE; + } + return; + case 4: /* SEV */ + case 5: /* SEVL */ + /* we treat all as NOP at least for now */ + return; +.... + +TODO: what does the WFE code do? How can it not be a NOP if SEV is a NOP? https://github.com/qemu/qemu/blob/v3.0.0/target/arm/translate.c#L4609 might explain why, but it is Chinese to me (I only understand 30% ;-)): + +.... + * For WFI we will halt the vCPU until an IRQ. For WFE and YIELD we + * only call the helper when running single threaded TCG code to ensure + * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we + * just skip this instruction. Currently the SEV/SEVL instructions + * which are *one* of many ways to wake the CPU from WFE are not + * implemented so we can't sleep like WFI does. + */ +.... + +For gem5 however, if we comment out the `SEV` instruction, then it actually exits with `simulate() limit reached`, so the CPU truly never wakes up, which is a more realistic behaviour. 
+ +The following Raspberry Pi bibliography helped us get this sample up and running: + +* https://github.com/bztsrc/raspi3-tutorial/tree/a3f069b794aeebef633dbe1af3610784d55a0efa/02_multicorec +* https://github.com/dwelch67/raspberrypi/tree/a09771a1d5a0b53d8e7a461948dc226c5467aeec/multi00 +* https://github.com/LdB-ECM/Raspberry-Pi/blob/3b628a2c113b3997ffdb408db03093b2953e4961/Multicore/SmartStart64.S +* https://github.com/LdB-ECM/Raspberry-Pi/blob/3b628a2c113b3997ffdb408db03093b2953e4961/Multicore/SmartStart32.S + +===== PSCI + +In QEMU, CPU 1 starts in a halted state. This can be observed from GDB, where: + +.... +info threads +.... + +shows something like: + +.... +* 1 Thread 1 (CPU#0 [running]) mystart + 2 Thread 2 (CPU#1 [halted ]) mystart +.... + +To wake up CPU 1 on QEMU, we must use the Power State Coordination Interface (PSCI) which is documented at: link:https://developer.arm.com/docs/den0022/latest/arm-power-state-coordination-interface-platform-design-document[]. + +This interface uses `HVC` calls, and the calling convention is documented at "SMC CALLING CONVENTION" link:https://developer.arm.com/docs/den0028/latest[]. + +If we boot the Linux kernel on QEMU and <>, we observe that it contains the address of the PSCI CPU_ON call: + +.... + psci { + method = "hvc"; + compatible = "arm,psci-0.2", "arm,psci"; + cpu_on = <0xc4000003>; + migrate = <0xc4000005>; + cpu_suspend = <0xc4000001>; + cpu_off = <0x84000002>; + }; +.... + +The Linux kernel wakes up the secondary cores in this exact same way at: https://github.com/torvalds/linux/blob/v4.19/drivers/firmware/psci.c#L122 We first actually got it working here by grepping the kernel and step debugging that call :-) + +In gem5, CPU 1 starts woken up from the start, so PSCI is not needed. TODO gem5 actually blows up if we try to do the `hvc` call, understand why. 
+ +===== DMB + +TODO: create and study a minimal example in gem5 where the `DMB` instruction leads to fewer cycles: https://stackoverflow.com/questions/15491751/real-life-use-cases-of-barriers-dsb-dmb-isb-in-arm + === How we got some baremetal stuff to work It is nice when thing just work. diff --git a/baremetal/arch/aarch64/multicore.S b/baremetal/arch/aarch64/multicore.S index 527ba37..9b8020c 100644 --- a/baremetal/arch/aarch64/multicore.S +++ b/baremetal/arch/aarch64/multicore.S @@ -7,10 +7,12 @@ main: ldr x1, =spinlock str x0, [x1] - /* Read cpu id into x1. */ + /* Read cpu id into x1. + * TODO: cores beyond 4th? + */ mrs x1, mpidr_el1 - and x1, x1, 3 - cbz x1, cpu0_only + ands x1, x1, 3 + beq cpu0_only cpu1_only: /* Only CPU 1 reaches this point and sets the spinlock. */ mov x0, 1 @@ -35,8 +37,7 @@ cpu0_only: #if !defined(GEM5) /* Wake up CPU 1 from initial sleep! - * In gem5, CPU 1 starts woken up from the start, - * so this is not needed. + * See: https://github.com/cirosantilli/linux-kernel-module-cheat#psci */ /* Function identifier: PCSI CPU_ON. 
*/ ldr w0, =0xc4000003 diff --git a/baremetal/arch/aarch64/no_bootloader/semihost_exit.S b/baremetal/arch/aarch64/no_bootloader/semihost_exit.S index fd96772..5d87c96 100644 --- a/baremetal/arch/aarch64/no_bootloader/semihost_exit.S +++ b/baremetal/arch/aarch64/no_bootloader/semihost_exit.S @@ -2,11 +2,14 @@ .global mystart mystart: - mov x1, #0x26 - movk x1, #2, lsl #16 - str x1, [sp,#0] + mov x1, 0x26 + movk x1, 2, lsl 16 + ldr x2, =semihost_args + str x1, [x2, 0] mov x0, #0 - str x0, [sp,#8] - mov x1, sp - mov w0, #0x18 + str x0, [x2, 8] + mov x1, x2 + mov w0, 0x18 hlt 0xf000 +semihost_args: + .skip 16 diff --git a/baremetal/arch/aarch64/semihost_exit.S b/baremetal/arch/aarch64/semihost_exit.S index 6bdecfb..ce09a36 100644 --- a/baremetal/arch/aarch64/semihost_exit.S +++ b/baremetal/arch/aarch64/semihost_exit.S @@ -1,20 +1,21 @@ .global main main: /* 0x20026 == ADP_Stopped_ApplicationExit */ - mov x1, #0x26 - movk x1, #2, lsl #16 - str x1, [sp,#0] + mov x1, 0x26 + movk x1, 2, lsl 16 + str x1, [sp, 0] /* Exit status code. Host QEMU process exits with that status. */ - mov x0, #0 - str x0, [sp,#8] + mov x0, 0 + str x0, [sp, 8] /* x1 contains the address of parameter block. - * Any memory address could be used. */ + * Any memory address could be used. + */ mov x1, sp /* SYS_EXIT */ - mov w0, #0x18 + mov w0, 0x18 /* Do the semihosting call on A64. */ hlt 0xf000 diff --git a/baremetal/arch/arm/multicore.S b/baremetal/arch/arm/multicore.S new file mode 100644 index 0000000..9f461a0 --- /dev/null +++ b/baremetal/arch/arm/multicore.S @@ -0,0 +1,37 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#arm-multicore */ + +.global main +main: + mov r0, #0 + ldr r1, =spinlock + str r0, [r1] + /* Get CPU ID. */ + mrc p15, 0, r1, c0, c0, 5 + ands r1, r1, #3 + beq cpu0_only +cpu1_only: + mov r0, #1 + ldr r1, =spinlock + str r0, [r1] + dmb sy + sev +cpu1_sleep_forever: + wfe + b cpu1_sleep_forever +cpu0_only: +#if !defined(GEM5) + /* PSCI CPU_ON. 
*/ + ldr r0, =0x84000003 + mov r1, #1 + ldr r2, =cpu1_only + mov r3, #0 + hvc 0 +#endif +spinlock_start: + ldr r0, spinlock + wfe + cmp r0, #0 + beq spinlock_start + bx lr +spinlock: + .skip 4 diff --git a/common.py b/common.py index f84e90b..d697cbc 100644 --- a/common.py +++ b/common.py @@ -931,7 +931,7 @@ def setup(parser): common.qcow2_file = common.buildroot_qcow2_file # Image. - if args.baremetal is None: + if common.baremetal is None: if common.emulator == 'gem5': common.image = common.vmlinux common.disk_image = common.rootfs_raw_file @@ -940,11 +940,11 @@ def setup(parser): common.disk_image = common.qcow2_file else: common.disk_image = common.gem5_fake_iso - if args.baremetal == 'all': - path = args.baremetal + if common.baremetal == 'all': + path = common.baremetal else: path = common.resolve_executable( - args.baremetal, + common.baremetal, common.baremetal_src_dir, common.baremetal_build_dir, common.baremetal_build_ext, diff --git a/run b/run index 69d294b..7174b6a 100755 --- a/run +++ b/run @@ -128,7 +128,7 @@ def main(args, extra_args=None): raise Exception('Baremetal ELF file not found. Tried:\n' + '\n'.join(paths)) cmd = debug_vm.copy() if common.emulator == 'gem5': - if args.baremetal is None: + if common.baremetal is None: if not os.path.exists(common.rootfs_raw_file): if not os.path.exists(common.qcow2_file): raise_rootfs_not_found() @@ -139,7 +139,7 @@ def main(args, extra_args=None): common.write_string_to_file(common.gem5_fake_iso, 'a' * 512) if not os.path.exists(common.image): # This is to run gem5 from a prebuilt download. 
- if (not args.baremetal is None) or (not os.path.exists(common.linux_image)): + if (not common.baremetal is None) or (not os.path.exists(common.linux_image)): raise_image_not_found() common.run_cmd([os.path.join(common.extract_vmlinux, common.linux_image)]) os.makedirs(os.path.dirname(common.gem5_readfile), exist_ok=True) @@ -194,15 +194,17 @@ def main(args, extra_args=None): '--dtb-filename', os.path.join(common.gem5_system_dir, 'arm', 'dt', 'armv{}_gem5_v1_{}cpu.dtb'.format(common.armv, args.cpus)), common.Newline, '--machine-type', common.machine, common.Newline, ]) - if args.baremetal is None: + if common.baremetal is None: cmd.extend([ '--param', 'system.panic_on_panic = True', common.Newline]) else: - cmd.extend(['--bare-metal', common.Newline]) + cmd.extend([ + '--bare-metal', common.Newline, + '--param', 'system.auto_reset_addr = True', common.Newline, + ]) if args.arch == 'aarch64': # https://stackoverflow.com/questions/43682311/uart-communication-in-gem5-with-arm-bare-metal/50983650#50983650 cmd.extend(['--param', 'system.highest_el_is_64 = True', common.Newline]) - cmd.extend(['--param', 'system.auto_reset_addr = True', common.Newline]) elif args.gem5_script == 'biglittle': if args.kvm: cpu_type = 'kvm' @@ -319,7 +321,7 @@ def main(args, extra_args=None): root = 'root=/dev/vda' rrid = '' snapshot = ',snapshot' - if args.baremetal is None: + if common.baremetal is None: if not os.path.exists(common.qcow2_file): if not os.path.exists(common.rootfs_raw_file): raise_rootfs_not_found() @@ -364,7 +366,7 @@ def main(args, extra_args=None): ] + virtio_gpu_pci ) - if args.baremetal is None: + if common.baremetal is None: cmd.extend(append) if args.tmux is not None: tmux_args = '--run-id {}'.format(args.run_id) @@ -381,8 +383,8 @@ def main(args, extra_args=None): args.linux_build_id, args.run_id, ) - if args.baremetal: - tmux_args += " --baremetal '{}'".format(args.baremetal) + if common.baremetal: + tmux_args += " --baremetal '{}'".format(common.baremetal) if 
args.userland: tmux_args += " --userland '{}'".format(args.userland) tmux_args += ' {}'.format(args.tmux) diff --git a/run-gdb b/run-gdb index 6b4b16c..55fc02f 100755 --- a/run-gdb +++ b/run-gdb @@ -120,15 +120,15 @@ def main(args, extra_args=None): break_at = ['-ex', 'break {}'.format(args.break_at), common.Newline] else: break_at = [] - linux_full_system = (args.baremetal is None and args.userland is None) + linux_full_system = (common.baremetal is None and args.userland is None) if args.userland: image = common.resolve_userland(args.userland) - elif args.baremetal: + elif common.baremetal: image = common.image test_script_path = os.path.splitext(common.source_path)[0] + '.py' else: image = common.vmlinux - if args.baremetal: + if common.baremetal: allowed_toolchains = ['crosstool-ng', 'buildroot', 'host'] else: allowed_toolchains = ['buildroot', 'crosstool-ng', 'host'] diff --git a/run-toolchain b/run-toolchain index f2d6e7c..041e1b5 100755 --- a/run-toolchain +++ b/run-toolchain @@ -35,7 +35,7 @@ parser.add_argument( nargs='*' ) args = common.setup(parser) -if args.baremetal is None: +if common.baremetal is None: image = common.vmlinux else: image = common.image