diff --git a/README.adoc b/README.adoc index 9ca18fa..423dae8 100644 --- a/README.adoc +++ b/README.adoc @@ -15112,6 +15112,7 @@ or for example the key methods of an <> we see that `completeAcc` gets called from `TimingSimpleCPU::completeDataAccess`. + ==== gem5 port system The gem5 memory system is connected in a very flexible way through the port system. @@ -15872,6 +15875,239 @@ therefore there is one `ExecContext` for each `ThreadContext`, and each `ExecCon This makes sense, since each `ThreadContext` represents one CPU register set, and therefore needs a separate `ExecContext` which allows instruction implementations to access those registers. +[[gem5-execcontext-readintregoperand-register-resolution]] +====== gem5 `ExecContext::readIntRegOperand` register resolution + +Let's have a look at how `ExecContext::readIntRegOperand` actually matches registers to decoded register IDs, since it is not obvious. + +Let's study a simple aarch64 register-register addition: + +.... +add x0, x1, x2 +.... + +which corresponds to the `AddXSReg` instruction (formatted and simplified): + +.... +Fault AddXSReg::execute(ExecContext *xc, Trace::InstRecord *traceData) const { + uint64_t Op264 = 0; + uint64_t Dest64 = 0; + uint64_t Op164 = 0; + Op264 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth)); + Op164 = ((xc->readIntRegOperand(this, 1)) & mask(intWidth)); + uint64_t secOp = shiftReg64(Op264, shiftAmt, shiftType, intWidth); + Dest64 = Op164 + secOp; + uint64_t final_val = Dest64; + xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth)); + if (traceData) { traceData->setData(final_val); } + return NoFault; +} +.... + +So what are those magic `0` and `1` constants on `xc->readIntRegOperand(this, 0)` and `xc->readIntRegOperand(this, 1)`? + +First, we guess that they must be related to the reading of `x1` and `x2`, which are the inputs of the addition. 
+ +Next, we also guess that the `0` read must correspond to `x2`, since it later gets potentially shifted as mentioned at xref:arm-shift-suffixes[xrefstyle=full]. + +Let's also have a look at the decoder code that builds the instruction instance in `build/ARM/arch/arm/generated/decoder-ns.cc.inc`: + +.... + +ArmShiftType type = + (ArmShiftType)(uint8_t)bits(machInst, 23, 22); +if (type == ROR) + return new Unknown64(machInst); +uint8_t imm6 = bits(machInst, 15, 10); +if (!bits(machInst, 31) && bits(imm6, 5)) + return new Unknown64(machInst); +IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0); +IntRegIndex rdzr = makeZero(rd); +IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5); +IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16); + +return new AddXSReg(machInst, rdzr, rn, rm, imm6, type); +.... + +and the ARM instruction pseudocode from the <>: + +.... +ADD <Xd>, <Xn>, <Xm>{, <shift> #<amount>} +.... + +and the constructor: + +.... +AddXSReg::AddXSReg(ExtMachInst machInst, + IntRegIndex _dest, + IntRegIndex _op1, + IntRegIndex _op2, + int32_t _shiftAmt, + ArmShiftType _shiftType +) : DataXSRegOp("add", machInst, IntAluOp, + _dest, _op1, _op2, _shiftAmt, _shiftType) { + _numSrcRegs = 0; + _numDestRegs = 0; + _numFPDestRegs = 0; + _numVecDestRegs = 0; + _numVecElemDestRegs = 0; + _numVecPredDestRegs = 0; + _numIntDestRegs = 0; + _numCCDestRegs = 0; + _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2); + _destRegIdx[_numDestRegs++] = RegId(IntRegClass, dest); + _numIntDestRegs++; + _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1); + flags[IsInteger] = true;; +} +.... + +where `RegId` is just a container class, and so the lines that we care about for now are: + +.... +_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2); +_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1); +.... + +which matches the guess we made earlier: `op2` is `0` and `op1` is `1` (`op1` and `op2` are the same as `_op1` and `_op2` which are set in the base constructor `DataXSRegOp`). 
+ +We also note that the register decodings (which the ARM spec says are `1` for `x1` and `2` for `x2`) are actually passed as enum `IntRegIndex`: + +.... + IntRegIndex _op1, + IntRegIndex _op2, +.... + +which are defined at `src/arch/arm/intregs.hh`: + +.... +enum IntRegIndex +{ + /* All the unique register indices. */ + INTREG_R0, + INTREG_R1, + INTREG_R2, +.... + +Then `SimpleExecContext::readIntRegOperand` does: + +.... + /** Reads an integer register. */ + RegVal + readIntRegOperand(const StaticInst *si, int idx) override + { + numIntRegReads++; + const RegId& reg = si->srcRegIdx(idx); + assert(reg.isIntReg()); + return thread->readIntReg(reg.index()); + } +.... + +and: + +.... +const RegId& srcRegIdx(int i) const { return _srcRegIdx[i]; } +.... + +which is what is populated in the constructor. + +Then, `RegId::index() { return regIdx; }` just returns the decoded register bytes, and now `SimpleThread::readIntReg`: + +.... +RegVal readIntReg(RegIndex reg_idx) const override { + int flatIndex = isa->flattenIntIndex(reg_idx); + return readIntRegFlat(flatIndex); +} +.... + +`readIntRegFlat` is what finally reads from the int register array: + +.... +RegVal SimpleThreadContext::readIntRegFlat(RegIndex idx) const override { return intRegs[idx]; } + +std::array SimpleThreadContext::intRegs; +.... + +and then there is the flattening magic at: + +.... +int +flattenIntIndex(int reg) const +{ + assert(reg >= 0); + if (reg < NUM_ARCH_INTREGS) { + return intRegMap[reg]; + } else if (reg < NUM_INTREGS) { + return reg; + } else if (reg == INTREG_SPX) { + CPSR cpsr = miscRegs[MISCREG_CPSR]; + ExceptionLevel el = opModeToEL( + (OperatingMode) (uint8_t) cpsr.mode); + if (!cpsr.sp && el != EL0) + return INTREG_SP0; + switch (el) { + case EL3: + return INTREG_SP3; + case EL2: + return INTREG_SP2; + case EL1: + return INTREG_SP1; + case EL0: + return INTREG_SP0; + default: + panic("Invalid exception level"); + return 0; // Never happens. 
+ } + } else { + return flattenIntRegModeIndex(reg); + } +} +.... + +Then: + +.... + NUM_ARCH_INTREGS = 32, +.... + +so we understand that this covers x0 to x31. `NUM_INTREGS` is also 32, so I'm a bit confused: the `reg < NUM_INTREGS` case is never reached. + +.... + INTREG_SPX = NUM_INTREGS, +.... + + +SP is 32, but it is a bit more magic, since in ARM there is one SP per <> as mentioned at <>. + +.... + INTREG_SPX = NUM_INTREGS +.... + + + +We can also have a quick look at the `AddXImm` instruction which corresponds to a simple addition of an immediate as shown in link:userland/arch/aarch64/add.S[]: + +.... +add x0, x1, 2 +.... + +Its <> contains in `build/ARM/arch/arm/generated/exec-ns.cc.inc` (hand formatted and slightly simplified): + +.... +Fault AddXImm::execute(ExecContext *xc, Trace::InstRecord *traceData) const { + uint64_t Dest64 = 0; + uint64_t Op164 = 0; + Op164 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth)); + Dest64 = Op164 + imm; + uint64_t final_val = Dest64; + xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth)); + if (traceData) { traceData->setData(final_val); } + return NoFault; +} +.... + +and `imm` is set directly on the constructor. + ===== gem5 `Process` The `Process` class is used only for <>, and it represents a process like a Linux userland process, in addition to any further gem5 specific data needed to represent the process. @@ -17117,9 +17353,10 @@ POSIX' multithreading API. Contrast with <> which is for processes. This was for a looong time the only "portable" multithreading alternative, until <>, thus also extending the portability to Windows. 
-* link:userland/posix/pthread_count.c[] -* link:userland/posix/pthread_deadlock.c[] -* link:userland/posix/pthread_self.c[] +* link:userland/posix/pthread_self.c[]: the simplest example possible +* link:userland/posix/pthread_count.c[]: count an atomic variable across threads +* link:userland/posix/pthread_deadlock.c[]: purposefully create a deadlock to see what it looks like +* link:userland/posix/pthread_barrier.c[]: related: https://stackoverflow.com/questions/28663622/understanding-posix-barrier-mechanism [[pthread-mutex]] ===== pthread_mutex @@ -18582,7 +18819,10 @@ The example in `man futex` is also a must. [[getcpu]] ==== `getcpu` system call and the `sched_getaffinity` glibc wrapper -Example: link:userland/linux/sched_getcpu.c[] +Examples: + +* link:userland/linux/sched_getcpu.c[] +* link:userland/linux/sched_getcpu_barrier.c[]: this uses a barrier to ensure that gem5 will run each thread on a separate CPU Returns the CPU that the process/thread is currently running on: @@ -21160,6 +21400,10 @@ TODO. Create a minimal runnable example of going into EL0 and jumping to EL1. See <> D1.6.2 "The stack pointer registers". +There is one SP per <>. + +This can also be seen clearly in the analysis at <>. + TODO create a minimal runnable example. TODO: how to select to use SP0 in an exception handler? 
diff --git a/path_properties.py b/path_properties.py index 00a5bdd..3af79bd 100644 --- a/path_properties.py +++ b/path_properties.py @@ -754,6 +754,10 @@ path_properties_tuples = ( 'requires_syscall_getcpu': True, 'test_run_args': {'cpus': 2}, }, + 'sched_getcpu_barrier.c': { + 'requires_syscall_getcpu': True, + 'test_run_args': {'cpus': 2}, + }, 'time_boot.c': {'requires_sudo': True}, 'virt_to_phys_user.c': {'requires_argument': True}, } diff --git a/userland/linux/sched_getcpu_barrier.c b/userland/linux/sched_getcpu_barrier.c new file mode 100644 index 0000000..eb20083 --- /dev/null +++ b/userland/linux/sched_getcpu_barrier.c @@ -0,0 +1,46 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#getcpu */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_barrier_t barrier; /* Shared by all threads; main() initializes it with a count of nthreads. */ + +void* main_thread(void *arg) { /* Thread entry point: prints the value returned by sched_getcpu(), then waits on the barrier. */ + (void)arg; /* arg is unused */ + printf("%d\n", sched_getcpu()); + pthread_barrier_wait(&barrier); /* NOTE(review): presumably this keeps every thread alive until all nthreads have started, so that each ends up on its own CPU -- confirm */ + return NULL; +} + +int main(int argc, char **argv) { /* Creates nthreads threads running main_thread, joins them all, then destroys the barrier. */ + pthread_t *threads; + unsigned int nthreads, i; + if (argc > 1) { + nthreads = strtoll(argv[1], NULL, 0); /* thread count comes from argv[1]; base 0 lets strtoll accept decimal, 0x hex and 0-prefixed octal */ + } else { + nthreads = 1; /* default when no argument is given */ + } + assert(!pthread_barrier_init(&barrier, NULL, nthreads)); /* NOTE(review): the call sits inside assert(), so it is compiled out when NDEBUG is defined; the same holds for the pthread_create and pthread_barrier_destroy asserts below */ + threads = malloc(nthreads * sizeof(*threads)); /* NOTE(review): the result is not checked for NULL */ + for (i = 0; i < nthreads; ++i) { + assert(pthread_create( + &threads[i], + NULL, + main_thread, + NULL + ) == 0); + } + for (i = 0; i < nthreads; ++i) { + pthread_join(threads[i], NULL); /* return value is ignored */ + } + free(threads); + assert(!pthread_barrier_destroy(&barrier)); + return EXIT_SUCCESS; +} diff --git a/userland/posix/pthread_barrier.c b/userland/posix/pthread_barrier.c new file mode 100644 index 0000000..460141e --- /dev/null +++ b/userland/posix/pthread_barrier.c @@ -0,0 +1,47 @@ +/* https://cirosantilli.com/linux-kernel-module-cheat#pthreads */ + +#define _XOPEN_SOURCE 700 +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_barrier_t barrier; /* Shared by all threads; main() initializes it with a count of nthreads. */ + +void* main_thread(void *arg) { /* Thread entry point: prints a "before" line, waits on the barrier, then prints an "after" line, each with this thread's pthread_self() value. */ + (void)arg; /* arg is unused */ + 
printf("before %ju\n", (uintmax_t)pthread_self()); /* NOTE(review): the cast assumes pthread_t is an arithmetic type -- confirm for the target libc */ + pthread_barrier_wait(&barrier); + printf("after %ju\n", (uintmax_t)pthread_self()); + return NULL; +} + +int main(int argc, char **argv) { /* Creates nthreads threads running main_thread, joins them all, then destroys the barrier. */ + pthread_t *threads; + unsigned int nthreads, i; + if (argc > 1) { + nthreads = strtoll(argv[1], NULL, 0); /* thread count comes from argv[1]; base 0 lets strtoll accept decimal, 0x hex and 0-prefixed octal */ + } else { + nthreads = 1; /* default when no argument is given */ + } + assert(!pthread_barrier_init(&barrier, NULL, nthreads)); /* NOTE(review): the call sits inside assert(), so it is compiled out when NDEBUG is defined; the same holds for the pthread_create and pthread_barrier_destroy asserts below */ + threads = malloc(nthreads * sizeof(*threads)); /* NOTE(review): the result is not checked for NULL */ + for (i = 0; i < nthreads; ++i) { + assert(pthread_create( + &threads[i], + NULL, + main_thread, + NULL + ) == 0); + } + for (i = 0; i < nthreads; ++i) { + pthread_join(threads[i], NULL); /* return value is ignored */ + } + free(threads); + assert(!pthread_barrier_destroy(&barrier)); + return EXIT_SUCCESS; +}