A bunch of gem5 o3 example timelines. Not fully understood, but a good start.

This commit is contained in:
Ciro Santilli 六四事件 法轮功
2020-07-15 01:00:00 +00:00
parent 144a552cf9
commit 988359440b
7 changed files with 641 additions and 8 deletions

View File

@@ -13505,17 +13505,16 @@ Mentioned at: http://www.m5sim.org/Visualization
-- \
--cpu-type DerivO3CPU \
--caches \
> o3pipeview.tmp.log
;
"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview-post.tmp.log --color o3pipeview.tmp.log
less -R o3pipeview-post.tmp.log
"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview.tmp.log --color "$(./getvar --arch aarch64 trace_txt_file)"
less -R o3pipeview.tmp.log
....
Or without color:
....
"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview-post.tmp.log o3pipeview.tmp.log
less o3pipeview-post.tmp.log
"$(./getvar gem5_source_dir)/util/o3-pipeview.py" -c 500 -o o3pipeview.tmp.log "$(./getvar --arch aarch64 trace_txt_file)"
less o3pipeview.tmp.log
....
A sample output for this can be seen at: <<hazardless-o3-pipeline>>.
@@ -13530,6 +13529,8 @@ Appears to be browser based, so you can zoom in and out, rather than the forced
Uses the same data source as `util/o3-pipeview.py`.
<<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-gain>> shows how the text-based visualization can get problematic due to stalls requiring wraparounds.
==== gem5 ARM RSK
https://github.com/arm-university/arm-gem5-rsk/blob/aa3b51b175a0f3b6e75c9c856092ae0c8f2a7cdc/gem5_rsk.pdf
@@ -15924,6 +15925,8 @@ The key new <<gem5-tracing,debug flag>> is `O3CPUAll`:
The output is huge and contains about 7 thousand lines!!!
This section and children are tested at LKMC 144a552cf926ea630ef9eadbb22b79fe2468c456.
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: hazardless
Let's have a look at the arguably simplest example link:userland/arch/aarch64/freestanding/linux/hazardless.S[].
@@ -16168,6 +16171,318 @@ Now let's do the same as in <<gem5-event-queue-derivo3cpu-syscall-emulation-free
TODO understand how the hazard happens in detail.
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: hazard4
Like <<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazard>> but a hazard of depth 4: link:userland/arch/aarch64/freestanding/linux/hazard.S[].
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.ic.r........................................................................fdn]-( 40000) 0x00400078.0 movz x0, #0, #0 [ 1]
[.ic.r........................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2]
[....................fdn.ic.r....................................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3]
[....................fdn.pic.r...................................................]-( 120000) 0x00400084.0 add x3, x2, #1 [ 4]
[....................fdn.p.ic.r..................................................]-( 120000) 0x00400088.0 add x4, x3, #1 [ 5]
[....................fdn.p..ic.r.................................................]-( 120000) 0x0040008c.0 add x5, x4, #1 [ 6]
[....................fdn.p...ic.r................................................]-( 120000) 0x00400090.0 add x6, x5, #1 [ 7]
[....................fdn.ic.....r................................................]-( 120000) 0x00400094.0 movz x7, #7, #0 [ 8]
[....................fdn.ic.....r................................................]-( 120000) 0x00400098.0 movz x8, #8, #0 [ 9]
[....................fdn.ic.....r................................................]-( 120000) 0x0040009c.0 movz x9, #9, #0 [ 10]
[.....................fdn.ic....r................................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 11]
[.....................fdn.ic....r................................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 12]
[.....................fdn.ic....r................................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 13]
[.....................fdn.ic....r................................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 14]
[.....................fdn.ic.....r...............................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 15]
[.....................fdn.pic....r...............................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 16]
[.....................fdn.pic....r...............................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 17]
[.....................fdn.pic....r...............................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 18]
[............................................fdn.ic.r............................]-( 160000) 0x004000c0.0 movz x18, #18, #0 [ 19]
[............................................fdn.ic.r............................]-( 160000) 0x004000c4.0 movz x19, #19, #0 [ 20]
[............................................fdn.ic.r............................]-( 160000) 0x004000c8.0 movz x20, #20, #0 [ 21]
[............................................fdn.ic.r............................]-( 160000) 0x004000cc.0 movz x21, #21, #0 [ 22]
[............................................fdn.ic.r............................]-( 160000) 0x004000d0.0 movz x22, #22, #0 [ 23]
[............................................fdn.ic.r............................]-( 160000) 0x004000d4.0 movz x23, #23, #0 [ 24]
[............................................fdn.pic.r...........................]-( 160000) 0x004000d8.0 movz x24, #24, #0 [ 25]
[............................................fdn.pic.r...........................]-( 160000) 0x004000dc.0 movz x25, #25, #0 [ 26]
[.............................................fdn.ic.r...........................]-( 160000) 0x004000e0.0 movz x0, #0, #0 [ 27]
[.............................................fdn.ic.r...........................]-( 160000) 0x004000e4.0 movz x8, #93, #0 [ 28]
....
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall
Like <<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazard>> but now with an LDR stall: link:userland/arch/aarch64/freestanding/linux/stall.S[].
We can see here that:
* the addition of a data section entry changed our previous address setup a bit: the entry point is now 0x004000b0, which fits 4 instructions in the cache line instead of 2
* the <<arm-ldr-instruction,LDR>> happens to be the fourth instruction, so it takes a long time to retire. The time is about 40k ticks, which is about the same time it takes for the instruction fetch as expected.
* fetch does not continue past the LDR, and so nothing is gained in this particular example, since the next instructions haven't been fetched from memory yet!
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1]
[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2]
[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 adr x2, #65780 [ 3]
[.............................................................................fdn]-( 40000) 0x004000bc.0 ldr x3, [x2] [ 4]
[.pic............................................................................]-( 80000) ...
[................................r...............................................]-( 120000) ...
[....................fdn.ic......r...............................................]-( 120000) 0x004000c0.0 movz x4, #4, #0 [ 5]
[....................fdn.ic......r...............................................]-( 120000) 0x004000c4.0 movz x5, #5, #0 [ 6]
[....................fdn.ic......r...............................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7]
[....................fdn.ic......r...............................................]-( 120000) 0x004000cc.0 movz x7, #7, #0 [ 8]
[....................fdn.ic......r...............................................]-( 120000) 0x004000d0.0 movz x8, #8, #0 [ 9]
[....................fdn.ic......r...............................................]-( 120000) 0x004000d4.0 movz x9, #9, #0 [ 10]
[....................fdn.pic.....r...............................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11]
[....................fdn.pic......r..............................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12]
[.....................fdn.ic......r..............................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13]
[.....................fdn.ic......r..............................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14]
[.....................fdn.ic......r..............................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15]
[.....................fdn.ic......r..............................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16]
[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17]
[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18]
[.....................fdn.pic.....r..............................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19]
[.....................fdn.pic......r.............................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20]
....
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall-gain
Like <<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall>> but now the stalling LDR is near the start of its cache line, so the instructions that follow it have already been fetched: link:userland/arch/aarch64/freestanding/linux/stall-gain.S[].
So in this case we see that there were actual potential gains, since the instructions after the LDR, starting with `movz x6`, started running immediately. We just stopped at `movz x20` because a new ifetch was needed.
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1]
[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2]
[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 movz x2, #4, #0 [ 3]
[.ic.r........................................................................fdn]-( 40000) 0x004000bc.0 movz x3, #5, #0 [ 4]
[....................fdn.ic.r....................................................]-( 120000) 0x004000c0.0 adr x4, #65772 [ 5]
[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000cc.0 movz x7, #7, #0 [ 8]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000d0.0 movz x8, #8, #0 [ 9]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000d4.0 movz x9, #9, #0 [ 10]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11]
[........................................................r.......................]-( 160000) ...
[....................fdn.pic.....................................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12]
[........................................................r.......................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13]
[........................................................r.......................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14]
[.........................................................r......................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15]
[.........................................................r......................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16]
[.........................................................r......................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17]
[.........................................................r......................]-( 160000) ...
[.....................fdn.pic....................................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18]
[.........................................................r......................]-( 160000) ...
[.....................fdn.pic....................................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19]
[.........................................................r......................]-( 160000) ...
[.....................fdn.pic....................................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20]
[.........................................................r......................]-( 160000) ...
[............................................fdn.ic.......r......................]-( 160000) 0x00400100.0 movz x20, #20, #0 [ 21]
[............................................fdn.ic........r.....................]-( 160000) 0x00400104.0 movz x21, #21, #0 [ 22]
[............................................fdn.ic........r.....................]-( 160000) 0x00400108.0 movz x22, #22, #0 [ 23]
[............................................fdn.ic........r.....................]-( 160000) 0x0040010c.0 movz x23, #23, #0 [ 24]
[............................................fdn.ic........r.....................]-( 160000) 0x00400110.0 movz x24, #24, #0 [ 25]
[............................................fdn.ic........r.....................]-( 160000) 0x00400114.0 movz x25, #25, #0 [ 26]
[............................................fdn.pic.......r.....................]-( 160000) 0x00400118.0 movz x26, #26, #0 [ 27]
[............................................fdn.pic.......r.....................]-( 160000) 0x0040011c.0 movz x27, #27, #0 [ 28]
[.............................................fdn.ic.......r.....................]-( 160000) 0x00400120.0 movz x28, #28, #0 [ 29]
[.............................................fdn.ic........r....................]-( 160000) 0x00400124.0 movz x29, #29, #0 [ 30]
[.............................................fdn.ic........r....................]-( 160000) 0x00400128.0 movz x0, #0, #0 [ 31]
[.............................................fdn.ic........r....................]-( 160000) 0x0040012c.0 movz x1, #1, #0 [ 32]
[.............................................fdn.pic.......r....................]-( 160000) 0x00400130.0 movz x2, #2, #0 [ 33]
[.............................................fdn.pic.......r....................]-( 160000) 0x00400134.0 movz x3, #3, #0 [ 34]
[.............................................fdn.pic.......r....................]-( 160000) 0x00400138.0 movz x4, #4, #0 [ 35]
[.............................................fdn.pic.......r....................]-( 160000) 0x0040013c.0 movz x5, #5, #0 [ 36]
....
We now also understand the graph better from lines such as this:
....
[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6]
[........................................................r.......................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000c8.0 movz x6, #6, #0 [ 7]
[........................................................r.......................]-( 160000) ...
....
We see that extra lines are drawn (the `160000) ...` lines here) whenever something stalls for a period longer than the width of the visualisation.
Things are still relatively readable because the wrapping aligns them with events that actually happened on that line directly, e.g. `160000) 0x00400100.0 movz x20, #20, #0`.
But from this we kind of see the need for: <<gem5-konata-o3-pipeline-viewer>>.
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: stall-hazard4
Like <<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-gain>> but now with some dependencies after the LDR: link:userland/arch/aarch64/freestanding/linux/stall-hazard4.S[].
So in this case the `ic` of dependent instructions like `add x6, x5, #1` has to wait until the LDR is finished:
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.ic.r........................................................................fdn]-( 40000) 0x004000b0.0 movz x0, #0, #0 [ 1]
[.ic.r........................................................................fdn]-( 40000) 0x004000b4.0 movz x1, #1, #0 [ 2]
[.ic.r........................................................................fdn]-( 40000) 0x004000b8.0 movz x2, #4, #0 [ 3]
[.ic.r........................................................................fdn]-( 40000) 0x004000bc.0 movz x3, #5, #0 [ 4]
[....................fdn.ic.r....................................................]-( 120000) 0x004000c0.0 adr x4, #65772 [ 5]
[....................fdn.pic.....................................................]-( 120000) 0x004000c4.0 ldr x5, [x4] [ 6]
[........................................................r.......................]-( 160000) ...
[....................fdn.p.......................................................]-( 120000) 0x004000c8.0 add x6, x5, #1 [ 7]
[......................................................ic.r......................]-( 160000) ...
[....................fdn.p.......................................................]-( 120000) 0x004000cc.0 add x7, x6, #1 [ 8]
[.......................................................ic.r.....................]-( 160000) ...
[....................fdn.p.......................................................]-( 120000) 0x004000d0.0 add x8, x7, #1 [ 9]
[........................................................ic.r....................]-( 160000) ...
[....................fdn.p.......................................................]-( 120000) 0x004000d4.0 add x9, x8, #1 [ 10]
[.........................................................ic.r...................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000d8.0 movz x10, #10, #0 [ 11]
[............................................................r...................]-( 160000) ...
[....................fdn.ic......................................................]-( 120000) 0x004000dc.0 movz x11, #11, #0 [ 12]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e0.0 movz x12, #12, #0 [ 13]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e4.0 movz x13, #13, #0 [ 14]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000e8.0 movz x14, #14, #0 [ 15]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000ec.0 movz x15, #15, #0 [ 16]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000f0.0 movz x16, #16, #0 [ 17]
[............................................................r...................]-( 160000) ...
[.....................fdn.ic.....................................................]-( 120000) 0x004000f4.0 movz x17, #17, #0 [ 18]
[.............................................................r..................]-( 160000) ...
[.....................fdn.pic....................................................]-( 120000) 0x004000f8.0 movz x18, #18, #0 [ 19]
[.............................................................r..................]-( 160000) ...
[.....................fdn.pic....................................................]-( 120000) 0x004000fc.0 movz x19, #19, #0 [ 20]
[.............................................................r..................]-( 160000) ...
[............................................fdn.ic...........r..................]-( 160000) 0x00400100.0 movz x20, #20, #0 [ 21]
[............................................fdn.ic...........r..................]-( 160000) 0x00400104.0 movz x21, #21, #0 [ 22]
[............................................fdn.ic...........r..................]-( 160000) 0x00400108.0 movz x22, #22, #0 [ 23]
[............................................fdn.ic...........r..................]-( 160000) 0x0040010c.0 movz x23, #23, #0 [ 24]
[............................................fdn.ic...........r..................]-( 160000) 0x00400110.0 movz x24, #24, #0 [ 25]
[............................................fdn.ic............r.................]-( 160000) 0x00400114.0 movz x25, #25, #0 [ 26]
[............................................fdn.pic...........r.................]-( 160000) 0x00400118.0 movz x26, #26, #0 [ 27]
[............................................fdn.pic...........r.................]-( 160000) 0x0040011c.0 movz x27, #27, #0 [ 28]
[.............................................fdn.ic...........r.................]-( 160000) 0x00400120.0 movz x28, #28, #0 [ 29]
[.............................................fdn.ic...........r.................]-( 160000) 0x00400124.0 movz x29, #29, #0 [ 30]
[.............................................fdn.ic...........r.................]-( 160000) 0x00400128.0 movz x0, #0, #0 [ 31]
[.............................................fdn.ic...........r.................]-( 160000) 0x0040012c.0 movz x1, #1, #0 [ 32]
[.............................................fdn.pic..........r.................]-( 160000) 0x00400130.0 movz x2, #2, #0 [ 33]
[.............................................fdn.pic...........r................]-( 160000) 0x00400134.0 movz x3, #3, #0 [ 34]
[.............................................fdn.pic...........r................]-( 160000) 0x00400138.0 movz x4, #4, #0 [ 35]
[.............................................fdn.pic...........r................]-( 160000) 0x0040013c.0 movz x5, #5, #0 [ 36]
....
====== gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: speculative
Now let's try to see some <<speculative-execution>> in action with link:userland/arch/aarch64/freestanding/linux/speculative.S[].
That program is set up such that the branch is not taken if an extra CLI argument is passed with `--cli-args`.
We purposefully set things up so that speculation will be running from the icache so we can see what is going on more clearly without ifetch stalls.
Without an extra CLI argument (the branch is taken):
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1]
[.ic.............................................................................]-( 80000) ...
[................................r...............................................]-( 120000) ...
[.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2]
[.ic.............................................................................]-( 80000) ...
[................................r...............................................]-( 120000) ...
[....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3]
[....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4]
[....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5]
[....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6]
[....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7]
[....................fdn.p.....ic..r.............................................]-( 120000) 0x00400094.0 subs x0, #2 [ 8]
[....................fdn.ic........r.............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9]
[....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10]
[=====================fdn=ic=====================================================]-( 120000) 0x004000a0.0 -----movz x10, #10, #0 [ 11]
[=====================fdn=ic=====================================================]-( 120000) 0x004000a4.0 -----movz x11, #11, #0 [ 12]
[=====================fdn=ic=====================================================]-( 120000) 0x004000a8.0 -----movz x12, #12, #0 [ 13]
[=====================fdn=ic=====================================================]-( 120000) 0x004000ac.0 -----movz x13, #13, #0 [ 14]
[=====================fdn=ic=====================================================]-( 120000) 0x004000b0.0 -----movz x14, #14, #0 [ 15]
[=====================fdn=ic=====================================================]-( 120000) 0x004000b4.0 -----movz x15, #15, #0 [ 16]
[=====================fdn=pic====================================================]-( 120000) 0x004000b8.0 -----movz x16, #16, #0 [ 17]
[=====================fdn=pic====================================================]-( 120000) 0x004000bc.0 -----movz x17, #17, #0 [ 18]
[.....................................fdn.ic.r...................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 19]
[.....................................fdn.ic.r...................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 20]
[.....................................fdn.ic.r...................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 21]
[.....................................fdn.ic.r...................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 22]
[.....................................fdn.ic.r...................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 23]
[.....................................fdn.pic.r..................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 25]
[.....................................fdn.pic.r..................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 26]
[......................................fdn.ic.r..................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 27]
[......................................fdn.ic.r..................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 28]
[......................................fdn.ic.r..................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 29]
[......................................fdn.ic.r..................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 30]
[......................................fdn.pic.r.................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 31]
[......................................fdn.pic.r.................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 32]
[......................................fdn.pic.r.................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 33]
[......................................fdn.pic.r.................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 34]
[.............................................fdn.ic.r...........................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 35]
[.............................................fdn.ic.r...........................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 36]
....
So here we see that the CPU mispredicted! After the <<arm-branch-instructions,BLT instruction>>, the CPU continued to run `movz x10`, assuming that the branch would not be taken.
Then, at time 120000, the LDR data came back, after the wrong prediction had already been fully executed.
The CPU then noticed that it mispredicted, and so it started again from the correct branch target `movz x2`, and the instructions that were thrown away are marked as `=====` in the timeline.
With an extra CLI argument (the branch is not taken):
....
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire
timeline tick pc.upc disasm seq_num
[.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1]
[.ic.............................................................................]-( 80000) ...
[................................r...............................................]-( 120000) ...
[.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2]
[.ic.............................................................................]-( 80000) ...
[................................r...............................................]-( 120000) ...
[....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3]
[....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4]
[....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5]
[....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6]
[....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7]
[....................fdn.ic.......r..............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9]
[....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 11]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 12]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 13]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 14]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 15]
[.....................fdn.ic.......r.............................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 16]
[.....................fdn.pic......r.............................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 17]
[.....................fdn.pic.......r............................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 18]
[............................................fdn.ic.r............................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 19]
[............................................fdn.ic.r............................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 20]
....
So this time the prediction was correct. Retire is delayed until the memory comes back, but we otherwise just kept running forward until hitting the next ifetch cache line.
==== gem5 instruction definitions
This is one of the parts of gem5 that rely on semi-useless <<gem5-code-generation,code generation>> inside the `.isa` sublanguage.
@@ -19585,6 +19900,14 @@ and run inside the guest from `PATH` with:
dhrystone
....
==== LMbench
http://www.bitmover.com/lmbench/
Canonical source at https://sourceforge.net/projects/lmbench/ but Intel has a fork at: https://github.com/intel/lmbench which has more recent build updates, so I think that's the one I'd put my money on as of 2020.
Feels old; I'm guessing it is no longer representative, much like <<dhrystone>>. But hey, history!
==== STREAM benchmark
http://www.cs.virginia.edu/stream/ref.html
@@ -25237,6 +25560,16 @@ Likely used on basically all (?) 2020 non-power-constrained CPUs.
As mentioned at: https://stackoverflow.com/questions/10074831/what-is-general-difference-between-superscalar-and-ooo-execution it is in theory possible for an out-of-order CPU to not be a <<superscalar-processor>>, but the combination is so natural (since you can look ahead, you might as well run it!) that out-of-order CPUs that are not superscalar are not super common.
==== Speculative execution
https://en.wikipedia.org/wiki/Speculative_execution
A gem5 example can be seen at: <<gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-speculative>>.
Bibliography:
* https://stackoverflow.com/questions/49601910/out-of-order-execution-vs-speculative-execution
==== Re-order buffer
https://en.wikipedia.org/wiki/Re-order_buffer

View File

@@ -10,7 +10,7 @@ _start:
asm_main_after_prologue:
/* Skip these two instructions to align with the cache line.
* Now that's some proper microbenchmarking!
* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-minorcpu-syscall-emulation-freestanding-example-analysis-hazardless */
* https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazardless */
mov x0, 0
mov x1, 1
@@ -22,7 +22,6 @@ asm_main_after_prologue:
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
@@ -31,7 +30,6 @@ asm_main_after_prologue:
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20

View File

@@ -0,0 +1,38 @@
/* A run of independent immediate moves containing one chain of four
 * dependent adds, where each add reads the register written by the
 * instruction right before it.
 *
 * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-hazard4 */
.text
.global _start
_start:
asm_main_after_prologue:
/* Immediate moves: none of these reads a register. */
mov x0, 0
mov x1, 1
mov x2, 2
/* Dependency chain x2 -> x3 -> x4 -> x5 -> x6: every add consumes the
 * result of the previous instruction. */
add x3, x2, 1
add x4, x3, 1
add x5, x4, 1
add x6, x5, 1
/* Back to immediate moves that do not read any register. */
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
/* exit */
/* Linux AArch64 syscall: x0 = exit status 0, x8 = syscall number 93 (exit). */
mov x0, 0
mov x8, 93
svc 0

View File

@@ -0,0 +1,42 @@
/* An example with a conditional backward branch whose outcome depends on
 * the value loaded from [sp] (argc), to observe how the CPU handles a
 * branch that can only be resolved once the load has completed.
 *
 * The instructions from .Lifetch2 up to the blt run twice when the loaded
 * value is less than 2, and once otherwise.
 *
 * Useful during microarchitectural analysis such as:
 * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-speculative
 * NOTE(review): anchor taken from the "Speculative execution" section of the
 * documentation; confirm it is the section that analyses this file.
 */
.text
.global _start
_start:
asm_main_after_prologue:
/* ifetch 1 */
/* argc */
ldr x0, [sp]
mov x1, 1
/* ifetch 2 */
.Lifetch2:
mov x2, 2
mov x3, 3
mov x4, 4
mov x5, 5
mov x6, 6
/* Check if a CLI argument was passed. */
cmp x0, 2
/* Ensure that we will break out on the next loop. */
/* This mov does not touch the flags set by the cmp above, so the blt
 * below still sees the comparison against the original x0. On the second
 * pass x0 is 3, the signed "less than 2" test fails and we fall through. */
mov x0, 3
blt .Lifetch2
/* Immediate moves after the branch: none of these reads a register. */
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
/* exit */
/* Linux AArch64 syscall: x0 = exit status 0, x8 = syscall number 93 (exit). */
mov x0, 0
mov x8, 93
svc 0

View File

@@ -0,0 +1,74 @@
/* A single memory load placed after four immediate moves and followed by a
 * long run of immediate moves. Nothing reads the loaded register x5 before
 * it is overwritten further down.
 *
 * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-gain */
.text
.global _start
_start:
asm_main_after_prologue:
/* Immediate moves ahead of the load: none of these reads a register. */
mov x0, 0
mov x1, 1
mov x2, 4
mov x3, 5
/* x4 = address of mydata (PC-relative), then a 64-bit load from it. */
adr x4, mydata
ldr x5, [x4]
/* First run of immediate moves, x6 to x29: no register is read, so none
 * of them depends on the load result. */
mov x6, 6
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* Second run, x0 to x29. It also overwrites x4 (the address) and x5 (the
 * loaded value) without ever having read them. */
mov x0, 0
mov x1, 1
mov x2, 2
mov x3, 3
mov x4, 4
mov x5, 5
mov x6, 6
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* exit */
/* Linux AArch64 syscall: x0 = exit status 0, x8 = syscall number 93 (exit). */
mov x0, 0
mov x8, 93
svc 0
.data
/* 16 bytes of storage; the ldr above reads the first 8. With no fill
 * argument, GNU as .skip emits zero bytes. */
mydata: .skip 16

View File

@@ -0,0 +1,74 @@
/* A memory load immediately followed by a chain of four dependent adds
 * rooted at the loaded value, and then a long run of immediate moves that
 * do not read any register.
 *
 * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall-hazard4 */
.text
.global _start
_start:
asm_main_after_prologue:
/* Immediate moves ahead of the load: none of these reads a register. */
mov x0, 0
mov x1, 1
mov x2, 4
mov x3, 5
/* x4 = address of mydata (PC-relative), then a 64-bit load from it. */
adr x4, mydata
ldr x5, [x4]
/* Dependency chain x5 -> x6 -> x7 -> x8 -> x9: the first add needs the
 * load result, and each later add needs the add right before it. */
add x6, x5, 1
add x7, x6, 1
add x8, x7, 1
add x9, x8, 1
/* Immediate moves, x10 to x29: no register is read, so none of them
 * depends on the load or on the add chain. */
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* Second run, x0 to x29. It overwrites x4 to x9, i.e. the address, the
 * loaded value and the results of the add chain. */
mov x0, 0
mov x1, 1
mov x2, 2
mov x3, 3
mov x4, 4
mov x5, 5
mov x6, 6
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* exit */
/* Linux AArch64 syscall: x0 = exit status 0, x8 = syscall number 93 (exit). */
mov x0, 0
mov x8, 93
svc 0
.data
/* 16 bytes of storage; the ldr above reads the first 8. With no fill
 * argument, GNU as .skip emits zero bytes. */
mydata: .skip 16

View File

@@ -0,0 +1,74 @@
/* A single memory load near the start of the program, followed by a long
 * run of immediate moves. Nothing reads the loaded register x3 before it
 * is overwritten further down.
 *
 * https://cirosantilli.com/linux-kernel-module-cheat#gem5-event-queue-derivo3cpu-syscall-emulation-freestanding-example-analysis-stall */
.text
.global _start
_start:
asm_main_after_prologue:
/* Immediate moves ahead of the load: none of these reads a register. */
mov x0, 0
mov x1, 1
/* x2 = address of mydata (PC-relative), then a 64-bit load from it. */
adr x2, mydata
ldr x3, [x2]
/* First run of immediate moves, x4 to x29: no register is read, so none
 * of them depends on the load result. */
mov x4, 4
mov x5, 5
mov x6, 6
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* Second run, x0 to x29. It also overwrites x2 (the address) and x3 (the
 * loaded value) without ever having read them. */
mov x0, 0
mov x1, 1
mov x2, 2
mov x3, 3
mov x4, 4
mov x5, 5
mov x6, 6
mov x7, 7
mov x8, 8
mov x9, 9
mov x10, 10
mov x11, 11
mov x12, 12
mov x13, 13
mov x14, 14
mov x15, 15
mov x16, 16
mov x17, 17
mov x18, 18
mov x19, 19
mov x20, 20
mov x21, 21
mov x22, 22
mov x23, 23
mov x24, 24
mov x25, 25
mov x26, 26
mov x27, 27
mov x28, 28
mov x29, 29
/* exit */
/* Linux AArch64 syscall: x0 = exit status 0, x8 = syscall number 93 (exit). */
mov x0, 0
mov x8, 93
svc 0
.data
/* 16 bytes of storage; the ldr above reads the first 8. With no fill
 * argument, GNU as .skip emits zero bytes. */
mydata: .skip 16