From 988359440b222bd6f533c46ee4aaa0c2b2181608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Wed, 15 Jul 2020 01:00:00 +0000 Subject: [PATCH] A bunch of gem5 o3 example timelines. Not fully understood, but a good start. --- README.adoc | 343 +++++++++++++++++- .../arch/aarch64/freestanding/linux/hazard.S | 4 +- .../arch/aarch64/freestanding/linux/hazard4.S | 38 ++ .../aarch64/freestanding/linux/speculative.S | 42 +++ .../aarch64/freestanding/linux/stall-gain.S | 74 ++++ .../freestanding/linux/stall-hazard4.S | 74 ++++ .../arch/aarch64/freestanding/linux/stall.S | 74 ++++ 7 files changed, 641 insertions(+), 8 deletions(-) create mode 100644 userland/arch/aarch64/freestanding/linux/hazard4.S create mode 100644 userland/arch/aarch64/freestanding/linux/speculative.S create mode 100644 userland/arch/aarch64/freestanding/linux/stall-gain.S create mode 100644 userland/arch/aarch64/freestanding/linux/stall-hazard4.S create mode 100644 userland/arch/aarch64/freestanding/linux/stall.S diff --git a/README.adoc b/README.adoc index 322183a..f846bd2 100644 --- a/README.adoc +++ b/README.adoc @@ -13505,17 +13505,16 @@ Mentioned at: http://www.m5sim.org/Visualization -- \ --cpu-type DerivO3CPU \ --caches \ - > o3pipeview.tmp.log ; -"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview-post.tmp.log --color o3pipeview.tmp.log -less -R o3pipeview-post.tmp.log +"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview.tmp.log --color "$(./getvar --arch aarch64 trace_txt_file)" +less -R o3pipeview.tmp.log .... Or without color: .... -"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview-post.tmp.log o3pipeview.tmp.log -less o3pipeview-post.tmp.log +"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview.tmp.log "$(./getvar --arch aarch64 trace_txt_file)" +less o3pipeview.tmp.log .... 
A sample output for this can be seen at: <>. @@ -13530,6 +13529,8 @@ Appears to be browser based, so you can zoom in and out, rather than the forced Uses the same data source as `util/o3-pipeview.py`. +<> shows how the text-based visualization can get problematic due to stalls requiring wraparounds. + ==== gem5 ARM RSK https://github.com/arm-university/arm-gem5-rsk/blob/aa3b51b175a0f3b6e75c9c856092ae0c8f2a7cdc/gem5_rsk.pdf @@ -15924,6 +15925,8 @@ The key new <> is `O3CPUAll`: The output is huge and contains about 7 thousand lines!!! +This section and children are tested at LKMC 144a552cf926ea630ef9eadbb22b79fe2468c456. + ====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: hazardless Let's have a look at the arguably simplest example link:userland/arch/aarch64/freestanding/linux/hazardless.S[]. @@ -16168,6 +16171,318 @@ Now let's do the same as in <> but a hazard of depth 4: link:userland/arch/aarch64/freestanding/linux/hazard.S[]. + +.... +// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.ic.r........................................................................fdn]-( 40000) 0x00400078.0 movz x0, #0, #0 [ 1] +[.ic.r........................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2] +[....................fdn.ic.r....................................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3] +[....................fdn.pic.r...................................................]-( 120000) 0x00400084.0 add x3, x2, #1 [ 4] +[....................fdn.p.ic.r..................................................]-( 120000) 0x00400088.0 add x4, x3, #1 [ 5] +[....................fdn.p..ic.r.................................................]-( 120000) 0x0040008c.0 add x5, x4, #1 [ 6] +[....................fdn.p...ic.r................................................]-( 120000) 0x00400090.0 add x6, x5, #1 
[ 7] +[....................fdn.ic.....r................................................]-( 120000) 0x00400094.0 movz x7, #7, #0 [ 8] +[....................fdn.ic.....r................................................]-( 120000) 0x00400098.0 movz x8, #8, #0 [ 9] +[....................fdn.ic.....r................................................]-( 120000) 0x0040009c.0 movz x9, #9, #0 [ 10] +[.....................fdn.ic....r................................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 11] +[.....................fdn.ic....r................................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 12] +[.....................fdn.ic....r................................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 13] +[.....................fdn.ic....r................................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 14] +[.....................fdn.ic.....r...............................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 15] +[.....................fdn.pic....r...............................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 16] +[.....................fdn.pic....r...............................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 17] +[.....................fdn.pic....r...............................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 18] +[............................................fdn.ic.r............................]-( 160000) 0x004000c0.0 movz x18, #18, #0 [ 19] +[............................................fdn.ic.r............................]-( 160000) 0x004000c4.0 movz x19, #19, #0 [ 20] +[............................................fdn.ic.r............................]-( 160000) 0x004000c8.0 movz x20, #20, #0 [ 21] +[............................................fdn.ic.r............................]-( 160000) 0x004000cc.0 movz x21, #21, #0 [ 22] 
+[............................................fdn.ic.r............................]-( 160000) 0x004000d0.0 movz x22, #22, #0 [ 23] +[............................................fdn.ic.r............................]-( 160000) 0x004000d4.0 movz x23, #23, #0 [ 24] +[............................................fdn.pic.r...........................]-( 160000) 0x004000d8.0 movz x24, #24, #0 [ 25] +[............................................fdn.pic.r...........................]-( 160000) 0x004000dc.0 movz x25, #25, #0 [ 26] +[.............................................fdn.ic.r...........................]-( 160000) 0x004000e0.0 movz x0, #0, #0 [ 27] +[.............................................fdn.ic.r...........................]-( 160000) 0x004000e4.0 movz x8, #93, #0 [ 28] +.... + +====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall + +Like <> but now with an LDR stall: link:userland/arch/aarch64/freestanding/linux/stall.S[]. + +We can see here that: + +* the addition of a data section entry changed our previous address setup a bit, the entry point was now 0x004000b0 which fits 4 instructions in the cacheline instead of 2 +* the <> happens to be the fourth instruction, so it takes a long time to retire. The time is about 40k ticks, which is about the same time it takes for the instruction fetch as expected. +* fetch does not continue past the LDR, and so nothing is gained in this particular example, since the next instructions haven't been fetched from memory yet! + +.... 
+ +// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1] +[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2] +[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 adr x2, #65780 [ 3] +[.............................................................................fdn]-( 40000) 0x004000bc.0 ldr x3, [x2] [ 4] +[.pic............................................................................]-( 80000) ... +[................................r...............................................]-( 120000) ... +[....................fdn.ic......r...............................................]-( 120000) 0x004000c0.0 movz x4, #4, #0 [ 5] +[....................fdn.ic......r...............................................]-( 120000) 0x004000c4.0 movz x5, #5, #0 [ 6] +[....................fdn.ic......r...............................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7] +[....................fdn.ic......r...............................................]-( 120000) 0x004000cc.0 movz x7, #7, #0 [ 8] +[....................fdn.ic......r...............................................]-( 120000) 0x004000d0.0 movz x8, #8, #0 [ 9] +[....................fdn.ic......r...............................................]-( 120000) 0x004000d4.0 movz x9, #9, #0 [ 10] +[....................fdn.pic.....r...............................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11] +[....................fdn.pic......r..............................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12] +[.....................fdn.ic......r..............................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13] 
+[.....................fdn.ic......r..............................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14] +[.....................fdn.ic......r..............................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15] +[.....................fdn.ic......r..............................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16] +[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17] +[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18] +[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19] +[.....................fdn.pic......r.............................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20] +.... + +====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall-gain + +Like <> but now with the LDR stall placed early in a cache line, so that the instructions following it have already been fetched: link:userland/arch/aarch64/freestanding/linux/stall-gain.S[]. + +So in this case we see that there were actual potential gains, since the `movz x11` started running immediately. We just stopped at `movz x20` because a new ifetch was needed. + +.... 
+// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1] +[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2] +[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 movz x2, #4, #0 [ 3] +[.ic.r........................................................................fdn]-( 40000) 0x004000bc.0 movz x3, #5, #0 [ 4] +[....................fdn.ic.r....................................................]-( 120000) 0x004000c0.0 adr x4, #65772 [ 5] +[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6] +[........................................................r.......................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7] +[........................................................r.......................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000cc.0 movz x7, #7, #0 [ 8] +[........................................................r.......................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000d0.0 movz x8, #8, #0 [ 9] +[........................................................r.......................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000d4.0 movz x9, #9, #0 [ 10] +[........................................................r.......................]-( 160000) ... 
+[....................fdn.ic......................................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11] +[........................................................r.......................]-( 160000) ... +[....................fdn.pic.....................................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12] +[........................................................r.......................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13] +[........................................................r.......................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14] +[.........................................................r......................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15] +[.........................................................r......................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16] +[.........................................................r......................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17] +[.........................................................r......................]-( 160000) ... +[.....................fdn.pic....................................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18] +[.........................................................r......................]-( 160000) ... 
+[.....................fdn.pic....................................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19] +[.........................................................r......................]-( 160000) ... +[.....................fdn.pic....................................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20] +[.........................................................r......................]-( 160000) ... +[............................................fdn.ic.......r......................]-( 160000) 0x00400100.0 movz x20, #20, #0 [ 21] +[............................................fdn.ic........r.....................]-( 160000) 0x00400104.0 movz x21, #21, #0 [ 22] +[............................................fdn.ic........r.....................]-( 160000) 0x00400108.0 movz x22, #22, #0 [ 23] +[............................................fdn.ic........r.....................]-( 160000) 0x0040010c.0 movz x23, #23, #0 [ 24] +[............................................fdn.ic........r.....................]-( 160000) 0x00400110.0 movz x24, #24, #0 [ 25] +[............................................fdn.ic........r.....................]-( 160000) 0x00400114.0 movz x25, #25, #0 [ 26] +[............................................fdn.pic.......r.....................]-( 160000) 0x00400118.0 movz x26, #26, #0 [ 27] +[............................................fdn.pic.......r.....................]-( 160000) 0x0040011c.0 movz x27, #27, #0 [ 28] +[.............................................fdn.ic.......r.....................]-( 160000) 0x00400120.0 movz x28, #28, #0 [ 29] +[.............................................fdn.ic........r....................]-( 160000) 0x00400124.0 movz x29, #29, #0 [ 30] +[.............................................fdn.ic........r....................]-( 160000) 0x00400128.0 movz x0, #0, #0 [ 31] +[.............................................fdn.ic........r....................]-( 160000) 
0x0040012c.0 movz x1, #1, #0 [ 32] +[.............................................fdn.pic.......r....................]-( 160000) 0x00400130.0 movz x2, #2, #0 [ 33] +[.............................................fdn.pic.......r....................]-( 160000) 0x00400134.0 movz x3, #3, #0 [ 34] +[.............................................fdn.pic.......r....................]-( 160000) 0x00400138.0 movz x4, #4, #0 [ 35] +[.............................................fdn.pic.......r....................]-( 160000) 0x0040013c.0 movz x5, #5, #0 [ 36] +.... + +We now also understand the graph better from lines such as these: + +.... +[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6] +[........................................................r.......................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7] +[........................................................r.......................]-( 160000) ... +.... + +We see that extra lines are drawn (the `160000 ...` lines here) whenever something stalls for a period longer than the width of the visualisation. + +Things are still relatively readable because the wrapping aligns them with events that actually happened on that line directly e.g. `160000) 0x00400100.0 movz x20, #20, #0`. + +But from this we kind of see the need for: <>. + +====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall-hazard4 + +Like <> but now with some dependencies after the LDR: link:userland/arch/aarch64/freestanding/linux/stall-hazard4.S[]. + +So in this case the `ic` of dependencies like `add x6, x5, #1` has to wait until the LDR is finished: + +.... 
+// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1] +[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2] +[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 movz x2, #4, #0 [ 3] +[.ic.r........................................................................fdn]-( 40000) 0x004000bc.0 movz x3, #5, #0 [ 4] +[....................fdn.ic.r....................................................]-( 120000) 0x004000c0.0 adr x4, #65772 [ 5] +[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6] +[........................................................r.......................]-( 160000) ... +[....................fdn.p.......................................................]-( 120000) 0x004000c8.0 add x6, x5, #1 [ 7] +[......................................................ic.r......................]-( 160000) ... +[....................fdn.p.......................................................]-( 120000) 0x004000cc.0 add x7, x6, #1 [ 8] +[.......................................................ic.r.....................]-( 160000) ... +[....................fdn.p.......................................................]-( 120000) 0x004000d0.0 add x8, x7, #1 [ 9] +[........................................................ic.r....................]-( 160000) ... +[....................fdn.p.......................................................]-( 120000) 0x004000d4.0 add x9, x8, #1 [ 10] +[.........................................................ic.r...................]-( 160000) ... 
+[....................fdn.ic......................................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11] +[............................................................r...................]-( 160000) ... +[....................fdn.ic......................................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17] +[............................................................r...................]-( 160000) ... +[.....................fdn.ic.....................................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18] +[.............................................................r..................]-( 160000) ... 
+[.....................fdn.pic....................................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19] +[.............................................................r..................]-( 160000) ... +[.....................fdn.pic....................................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20] +[.............................................................r..................]-( 160000) ... +[............................................fdn.ic...........r..................]-( 160000) 0x00400100.0 movz x20, #20, #0 [ 21] +[............................................fdn.ic...........r..................]-( 160000) 0x00400104.0 movz x21, #21, #0 [ 22] +[............................................fdn.ic...........r..................]-( 160000) 0x00400108.0 movz x22, #22, #0 [ 23] +[............................................fdn.ic...........r..................]-( 160000) 0x0040010c.0 movz x23, #23, #0 [ 24] +[............................................fdn.ic...........r..................]-( 160000) 0x00400110.0 movz x24, #24, #0 [ 25] +[............................................fdn.ic............r.................]-( 160000) 0x00400114.0 movz x25, #25, #0 [ 26] +[............................................fdn.pic...........r.................]-( 160000) 0x00400118.0 movz x26, #26, #0 [ 27] +[............................................fdn.pic...........r.................]-( 160000) 0x0040011c.0 movz x27, #27, #0 [ 28] +[.............................................fdn.ic...........r.................]-( 160000) 0x00400120.0 movz x28, #28, #0 [ 29] +[.............................................fdn.ic...........r.................]-( 160000) 0x00400124.0 movz x29, #29, #0 [ 30] +[.............................................fdn.ic...........r.................]-( 160000) 0x00400128.0 movz x0, #0, #0 [ 31] +[.............................................fdn.ic...........r.................]-( 160000) 
0x0040012c.0 movz x1, #1, #0 [ 32] +[.............................................fdn.pic..........r.................]-( 160000) 0x00400130.0 movz x2, #2, #0 [ 33] +[.............................................fdn.pic...........r................]-( 160000) 0x00400134.0 movz x3, #3, #0 [ 34] +[.............................................fdn.pic...........r................]-( 160000) 0x00400138.0 movz x4, #4, #0 [ 35] +[.............................................fdn.pic...........r................]-( 160000) 0x0040013c.0 movz x5, #5, #0 [ 36] +.... + +====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: speculative + +Now let's try to see some <> in action with link:userland/arch/aarch64/freestanding/linux/speculative.S[]. + +That program is set up such that the branch is not taken if an extra CLI argument is passed with `--cli-args`. + +We purposefully set things up so that speculation will be running from the icache so we can see what is going on more clearly without ifetch stalls. + +Without an extra CLI argument (the branch is taken): + +.... +// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1] +[.ic.............................................................................]-( 80000) ... +[................................r...............................................]-( 120000) ... +[.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2] +[.ic.............................................................................]-( 80000) ... +[................................r...............................................]-( 120000) ... 
+[....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3] +[....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4] +[....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5] +[....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6] +[....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7] +[....................fdn.p.....ic..r.............................................]-( 120000) 0x00400094.0 subs x0, #2 [ 8] +[....................fdn.ic........r.............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9] +[....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10] +[=====================fdn=ic=====================================================]-( 120000) 0x004000a0.0 -----movz x10, #10, #0 [ 11] +[=====================fdn=ic=====================================================]-( 120000) 0x004000a4.0 -----movz x11, #11, #0 [ 12] +[=====================fdn=ic=====================================================]-( 120000) 0x004000a8.0 -----movz x12, #12, #0 [ 13] +[=====================fdn=ic=====================================================]-( 120000) 0x004000ac.0 -----movz x13, #13, #0 [ 14] +[=====================fdn=ic=====================================================]-( 120000) 0x004000b0.0 -----movz x14, #14, #0 [ 15] +[=====================fdn=ic=====================================================]-( 120000) 0x004000b4.0 -----movz x15, #15, #0 [ 16] +[=====================fdn=pic====================================================]-( 120000) 0x004000b8.0 -----movz x16, #16, #0 [ 17] 
+[=====================fdn=pic====================================================]-( 120000) 0x004000bc.0 -----movz x17, #17, #0 [ 18] +[.....................................fdn.ic.r...................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 19] +[.....................................fdn.ic.r...................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 20] +[.....................................fdn.ic.r...................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 21] +[.....................................fdn.ic.r...................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 22] +[.....................................fdn.ic.r...................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 23] +[.....................................fdn.pic.r..................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 25] +[.....................................fdn.pic.r..................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 26] +[......................................fdn.ic.r..................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 27] +[......................................fdn.ic.r..................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 28] +[......................................fdn.ic.r..................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 29] +[......................................fdn.ic.r..................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 30] +[......................................fdn.pic.r.................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 31] +[......................................fdn.pic.r.................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 32] +[......................................fdn.pic.r.................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 33] 
+[......................................fdn.pic.r.................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 34] +[.............................................fdn.ic.r...........................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 35] +[.............................................fdn.ic.r...........................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 36] +.... + +So here we see that the CPU mispredicted! After the <>, the CPU continued to run `movz x10`, assuming that the branch would not be taken. + +Then, at time 120000, the LDR data came back, after the wrong prediction had already been fully executed. + +The CPU then noticed that it mispredicted, and so it started again from the correct branch target `movz x2`, and the instructions that were thrown away are marked as `=====` in the timeline. + +With an extra CLI argument (the branch is not taken): + +.... +// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire + + timeline tick pc.upc disasm seq_num +[.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1] +[.ic.............................................................................]-( 80000) ... +[................................r...............................................]-( 120000) ... +[.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2] +[.ic.............................................................................]-( 80000) ... +[................................r...............................................]-( 120000) ... 
+[....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3] +[....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4] +[....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5] +[....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6] +[....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7] +[....................fdn.ic.......r..............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9] +[....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 11] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 12] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 13] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 14] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 15] +[.....................fdn.ic.......r.............................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 16] +[.....................fdn.pic......r.............................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 17] +[.....................fdn.pic.......r............................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 18] 
+[............................................fdn.ic.r............................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 19] +[............................................fdn.ic.r............................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 20] +.... + +So this time the prediction was correct. Retire is delayed until the memory comes back, but we otherwise just kept running forward until hitting the next ifetch cache line. + ==== gem5 instruction definitions This is one of the parts of gem5 that rely on semi-useless <> inside the `.isa` sublanguage. @@ -19585,6 +19900,14 @@ and run inside the guest from `PATH` with: dhrystone .... +==== LMbench + +http://www.bitmover.com/lmbench/ + +Canonical source at https://sourceforge.net/projects/lmbench/ but Intel has a fork at: https://github.com/intel/lmbench which has more recent build updates, so I think that's the one I'd put my money on as of 2020. + +Feels old, guessing not representative anymore like <>. But hey, history! + ==== STREAM benchmark http://www.cs.virginia.edu/stream/ref.html @@ -25237,6 +25560,16 @@ Likely used on basically all (?) 2020 non-power-constrained CPUs. As mentioned at: https://stackoverflow.com/questions/10074831/what-is-general-difference-between-superscalar-and-ooo-execution it is in theory possible for an out-of-order CPU to not a <>, but the combination is so natural (since you can look ahead, you might as well run it!) that it is not super common. +==== Speculative execution + +https://en.wikipedia.org/wiki/Speculative_execution + +A gem5 example can be seen at: <>. 
+ +Bibliography: + +* https://stackoverflow.com/questions/49601910/out-of-order-execution-vs-speculative-execution + ==== Re-order buffer https://en.wikipedia.org/wiki/Re-order_buffer diff --git a/userland/arch/aarch64/freestanding/linux/hazard.S b/userland/arch/aarch64/freestanding/linux/hazard.S index 420f099..3468086 100644 --- a/userland/arch/aarch64/freestanding/linux/hazard.S +++ b/userland/arch/aarch64/freestanding/linux/hazard.S @@ -10,7 +10,7 @@ _start: asm_main_after_prologue: /* Skip these two instructions to align with the cache line. * Now that's some proper microbenchmarking! - * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-minorcpu-syscall-emulation-freestanding-example-analysis-hazardless */ + * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazardless */ mov x0, 0 mov x1, 1 @@ -22,7 +22,6 @@ asm_main_after_prologue: mov x7, 7 mov x8, 8 mov x9, 9 - mov x10, 10 mov x11, 11 mov x12, 12 @@ -31,7 +30,6 @@ asm_main_after_prologue: mov x15, 15 mov x16, 16 mov x17, 17 - mov x18, 18 mov x19, 19 mov x20, 20 diff --git a/userland/arch/aarch64/freestanding/linux/hazard4.S b/userland/arch/aarch64/freestanding/linux/hazard4.S new file mode 100644 index 0000000..0c61dbe --- /dev/null +++ b/userland/arch/aarch64/freestanding/linux/hazard4.S @@ -0,0 +1,38 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazard4 */ + +.text +.global _start +_start: +asm_main_after_prologue: + mov x0, 0 + mov x1, 1 + + mov x2, 2 /* x2 is the root of the dependency chain below */ + add x3, x2, 1 /* x3..x6: each add reads the register written by the instruction just before it */ + add x4, x3, 1 + add x5, x4, 1 + add x6, x5, 1 /* last of the 4 dependent adds; everything after is independent again */ + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + + /* exit */ + mov x0, 0 + mov x8, 93 + svc 0 diff
--git a/userland/arch/aarch64/freestanding/linux/speculative.S b/userland/arch/aarch64/freestanding/linux/speculative.S new file mode 100644 index 0000000..b6b2170 --- /dev/null +++ b/userland/arch/aarch64/freestanding/linux/speculative.S @@ -0,0 +1,42 @@ +/* Speculative execution example: the blt below depends on argc, which is + * loaded from memory, so a speculating CPU must guess the branch direction. + * argc == 1 runs the .Lifetch2 block twice, argc >= 2 runs it only once: + * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-speculative + */ + +.text +.global _start +_start: +asm_main_after_prologue: + + /* ifetch 1 */ + /* argc */ + ldr x0, [sp] + mov x1, 1 + + /* ifetch 2 */ +.Lifetch2: + mov x2, 2 + mov x3, 3 + mov x4, 4 + mov x5, 5 + mov x6, 6 + /* Check if a CLI argument was passed. */ + cmp x0, 2 + /* Ensure that we will break out on the next loop. */ + mov x0, 3 + blt .Lifetch2 /* still uses the flags set by cmp: the mov in between does not modify them */ + + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + + /* exit */ + mov x0, 0 + mov x8, 93 + svc 0 diff --git a/userland/arch/aarch64/freestanding/linux/stall-gain.S b/userland/arch/aarch64/freestanding/linux/stall-gain.S new file mode 100644 index 0000000..8c6ab35 --- /dev/null +++ b/userland/arch/aarch64/freestanding/linux/stall-gain.S @@ -0,0 +1,74 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-gain */ + +.text +.global _start +_start: +asm_main_after_prologue: + mov x0, 0 + mov x1, 1 + mov x2, 4 + mov x3, 5 + adr x4, mydata /* x4 = address of mydata */ + ldr x5, [x4] /* only memory load in this file; the loaded x5 is never read afterwards */ + mov x6, 6 + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + mov x0, 0 + mov x1, 1 + mov x2, 2 +
mov x3, 3 + mov x4, 4 + mov x5, 5 + mov x6, 6 + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + /* exit */ + mov x0, 0 + mov x8, 93 + svc 0 +.data + mydata: .skip 16 diff --git a/userland/arch/aarch64/freestanding/linux/stall-hazard4.S b/userland/arch/aarch64/freestanding/linux/stall-hazard4.S new file mode 100644 index 0000000..4f0b664 --- /dev/null +++ b/userland/arch/aarch64/freestanding/linux/stall-hazard4.S @@ -0,0 +1,74 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-hazard4 */ + +.text +.global _start +_start: +asm_main_after_prologue: + mov x0, 0 + mov x1, 1 + mov x2, 4 + mov x3, 5 + adr x4, mydata /* x4 = address of mydata */ + ldr x5, [x4] /* only memory load in this file; NOTE(review): presumably the stall under study - confirm against the README section */ + add x6, x5, 1 /* x6..x9: 4 adds, each reading the previous result, rooted at the loaded x5 */ + add x7, x6, 1 + add x8, x7, 1 + add x9, x8, 1 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + mov x0, 0 + mov x1, 1 + mov x2, 2 + mov x3, 3 + mov x4, 4 + mov x5, 5 + mov x6, 6 + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + /* exit */ + mov x0, 0 + mov x8, 93 + svc 0 +.data + mydata: .skip 16 diff --git a/userland/arch/aarch64/freestanding/linux/stall.S
b/userland/arch/aarch64/freestanding/linux/stall.S new file mode 100644 index 0000000..ea073bf --- /dev/null +++ b/userland/arch/aarch64/freestanding/linux/stall.S @@ -0,0 +1,74 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall */ + +.text +.global _start +_start: +asm_main_after_prologue: + mov x0, 0 + mov x1, 1 + adr x2, mydata /* x2 = address of mydata (16 bytes reserved in .data at the end of this file) */ + ldr x3, [x2] /* only memory load in this file; x3 is never read afterwards. NOTE(review): presumably the stall under study - confirm */ + mov x4, 4 + mov x5, 5 + mov x6, 6 + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + mov x0, 0 + mov x1, 1 + mov x2, 2 + mov x3, 3 + mov x4, 4 + mov x5, 5 + mov x6, 6 + mov x7, 7 + mov x8, 8 + mov x9, 9 + mov x10, 10 + mov x11, 11 + mov x12, 12 + mov x13, 13 + mov x14, 14 + mov x15, 15 + mov x16, 16 + mov x17, 17 + mov x18, 18 + mov x19, 19 + mov x20, 20 + mov x21, 21 + mov x22, 22 + mov x23, 23 + mov x24, 24 + mov x25, 25 + mov x26, 26 + mov x27, 27 + mov x28, 28 + mov x29, 29 + + /* exit */ + mov x0, 0 + mov x8, 93 + svc 0 +.data + mydata: .skip 16