mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
cpu register interface
pthread_barrier
This commit is contained in:
252
README.adoc
252
README.adoc
@@ -15112,6 +15112,7 @@ or for example the key methods of an <<arm-str-instruction,ARM 64-bit (X) STR wi
|
|||||||
}
|
}
|
||||||
....
|
....
|
||||||
|
|
||||||
|
We also notice that the key argument passed to those instructions is of type `ExecContext`, which is discussed further at: xref:gem5-execcontext[xrefstyle=full].
|
||||||
|
|
||||||
The file is an include so that compilation can be split up into chunks by the autogenerated includers
|
The file is an include so that compilation can be split up into chunks by the autogenerated includers
|
||||||
|
|
||||||
@@ -15316,6 +15317,8 @@ Fault STXRX64::completeAcc(PacketPtr pkt, ExecContext *xc,
|
|||||||
}
|
}
|
||||||
....
|
....
|
||||||
|
|
||||||
|
From GDB on <<timingsimplecpu-analysis-ldr-stall>> we see that `completeAcc` gets called from `TimingSimpleCPU::completeDataAccess`.
|
||||||
|
|
||||||
==== gem5 port system
|
==== gem5 port system
|
||||||
|
|
||||||
The gem5 memory system is connected in a very flexible way through the port system.
|
The gem5 memory system is connected in a very flexible way through the port system.
|
||||||
@@ -15872,6 +15875,239 @@ therefore there is one `ExecContext` for each `ThreadContext`, and each `ExecCon
|
|||||||
|
|
||||||
This makes sense, since each `ThreadContext` represents one CPU register set, and therefore needs a separate `ExecContext` which allows instruction implementations to access those registers.
|
This makes sense, since each `ThreadContext` represents one CPU register set, and therefore needs a separate `ExecContext` which allows instruction implementations to access those registers.
|
||||||
|
|
||||||
|
[[gem5-execcontext-readintregoperand-register-resolution]]
|
||||||
|
====== gem5 `ExecContext::readIntRegOperand` register resolution
|
||||||
|
|
||||||
|
Let's have a look at how `ExecContext::readIntRegOperand` actually matches registers to decoded registers IDs, since it is not obvious.
|
||||||
|
|
||||||
|
Let's study a simple aarch64 register-register addition:
|
||||||
|
|
||||||
|
....
|
||||||
|
add x0, x1, x2
|
||||||
|
....
|
||||||
|
|
||||||
|
which corresponds to the `AddXSReg` instruction (formatted and simplified):
|
||||||
|
|
||||||
|
....
|
||||||
|
Fault AddXSReg::execute(ExecContext *xc, Trace::InstRecord *traceData) const {
|
||||||
|
uint64_t Op264 = 0;
|
||||||
|
uint64_t Dest64 = 0;
|
||||||
|
uint64_t Op164 = 0;
|
||||||
|
Op264 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth));
|
||||||
|
Op164 = ((xc->readIntRegOperand(this, 1)) & mask(intWidth));
|
||||||
|
uint64_t secOp = shiftReg64(Op264, shiftAmt, shiftType, intWidth);
|
||||||
|
Dest64 = Op164 + secOp;
|
||||||
|
uint64_t final_val = Dest64;
|
||||||
|
xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth));
|
||||||
|
if (traceData) { traceData->setData(final_val); }
|
||||||
|
return NoFault;
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
So what are those magic `0` and `1` constants on `xc->readIntRegOperand(this, 0)` and `xc->readIntRegOperand(this, 1)`?
|
||||||
|
|
||||||
|
First, we guess that they must be related to the reading of `x1` and `x2`, which are the inputs of the addition.
|
||||||
|
|
||||||
|
Next, we also guess that the `0` read must correspond to `x2`, since it later gets potentially shifted as mentioned at xref:arm-shift-suffixes[xrefstyle=full].
|
||||||
|
|
||||||
|
Let's also have a look at the decoder code that builds the instruction instance in `build/ARM/arch/arm/generated/decoder-ns.cc.inc`:
|
||||||
|
|
||||||
|
....
|
||||||
|
|
||||||
|
ArmShiftType type =
|
||||||
|
(ArmShiftType)(uint8_t)bits(machInst, 23, 22);
|
||||||
|
if (type == ROR)
|
||||||
|
return new Unknown64(machInst);
|
||||||
|
uint8_t imm6 = bits(machInst, 15, 10);
|
||||||
|
if (!bits(machInst, 31) && bits(imm6, 5))
|
||||||
|
return new Unknown64(machInst);
|
||||||
|
IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
|
||||||
|
IntRegIndex rdzr = makeZero(rd);
|
||||||
|
IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
|
||||||
|
IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
|
||||||
|
|
||||||
|
return new AddXSReg(machInst, rdzr, rn, rm, imm6, type);
|
||||||
|
....
|
||||||
|
|
||||||
|
and the ARM instruction pseudocode from the <<armarm8>>:
|
||||||
|
|
||||||
|
....
|
||||||
|
ADD <Xd>, <Xn>, <Xm>{, <shift> #<amount>}
|
||||||
|
....
|
||||||
|
|
||||||
|
and the constructor:
|
||||||
|
|
||||||
|
....
|
||||||
|
AddXSReg::AddXSReg(ExtMachInst machInst,
|
||||||
|
IntRegIndex _dest,
|
||||||
|
IntRegIndex _op1,
|
||||||
|
IntRegIndex _op2,
|
||||||
|
int32_t _shiftAmt,
|
||||||
|
ArmShiftType _shiftType
|
||||||
|
) : DataXSRegOp("add", machInst, IntAluOp,
|
||||||
|
_dest, _op1, _op2, _shiftAmt, _shiftType) {
|
||||||
|
_numSrcRegs = 0;
|
||||||
|
_numDestRegs = 0;
|
||||||
|
_numFPDestRegs = 0;
|
||||||
|
_numVecDestRegs = 0;
|
||||||
|
_numVecElemDestRegs = 0;
|
||||||
|
_numVecPredDestRegs = 0;
|
||||||
|
_numIntDestRegs = 0;
|
||||||
|
_numCCDestRegs = 0;
|
||||||
|
_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2);
|
||||||
|
_destRegIdx[_numDestRegs++] = RegId(IntRegClass, dest);
|
||||||
|
_numIntDestRegs++;
|
||||||
|
_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1);
|
||||||
|
flags[IsInteger] = true;;
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
where `RegId` is just a container class, and so the lines that we care about for now are:
|
||||||
|
|
||||||
|
....
|
||||||
|
_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op2);
|
||||||
|
_srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, op1);
|
||||||
|
....
|
||||||
|
|
||||||
|
which matches the guess we made earlier: `op2` is `0` and `op1` is `1` (`op1` and `op2` are the same as `_op1` and `_op2` which are set in the base constructor `DataXSRegOp`).
|
||||||
|
|
||||||
|
We also note that the register decodings (which the ARM spec says are `1` for `x1` and `2` for `x2`) are actually passed as enum `IntRegIndex`:
|
||||||
|
|
||||||
|
....
|
||||||
|
IntRegIndex _op1,
|
||||||
|
IntRegIndex _op2,
|
||||||
|
....
|
||||||
|
|
||||||
|
which are defined at `src/arch/arm/intregs.hh`:
|
||||||
|
|
||||||
|
....
|
||||||
|
enum IntRegIndex
|
||||||
|
{
|
||||||
|
/* All the unique register indices. */
|
||||||
|
INTREG_R0,
|
||||||
|
INTREG_R1,
|
||||||
|
INTREG_R2,
|
||||||
|
....
|
||||||
|
|
||||||
|
Then `SimpleExecContext::readIntRegOperand` does:
|
||||||
|
|
||||||
|
....
|
||||||
|
/** Reads an integer register. */
|
||||||
|
RegVal
|
||||||
|
readIntRegOperand(const StaticInst *si, int idx) override
|
||||||
|
{
|
||||||
|
numIntRegReads++;
|
||||||
|
const RegId& reg = si->srcRegIdx(idx);
|
||||||
|
assert(reg.isIntReg());
|
||||||
|
return thread->readIntReg(reg.index());
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
and:
|
||||||
|
|
||||||
|
....
|
||||||
|
const RegId& srcRegIdx(int i) const { return _srcRegIdx[i]; }
|
||||||
|
....
|
||||||
|
|
||||||
|
which is what is populated in the constructor.
|
||||||
|
|
||||||
|
Then, `RegId::index() { return regIdx; }` just returns the decoded register index, and now `SimpleThread::readIntReg`:
|
||||||
|
|
||||||
|
....
|
||||||
|
RegVal readIntReg(RegIndex reg_idx) const override {
|
||||||
|
int flatIndex = isa->flattenIntIndex(reg_idx);
|
||||||
|
return readIntRegFlat(flatIndex);
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
`readIntRegFlat` is what finally reads from the int register array:
|
||||||
|
|
||||||
|
....
|
||||||
|
RegVal SimpleThreadContext::readIntRegFlat(RegIndex idx) const override { return intRegs[idx]; }
|
||||||
|
|
||||||
|
std::array<RegVal, TheISA::NumIntRegs> SimpleThreadContext::intRegs;
|
||||||
|
....
|
||||||
|
|
||||||
|
and then there is the flattening magic at:
|
||||||
|
|
||||||
|
....
|
||||||
|
int
|
||||||
|
flattenIntIndex(int reg) const
|
||||||
|
{
|
||||||
|
assert(reg >= 0);
|
||||||
|
if (reg < NUM_ARCH_INTREGS) {
|
||||||
|
return intRegMap[reg];
|
||||||
|
} else if (reg < NUM_INTREGS) {
|
||||||
|
return reg;
|
||||||
|
} else if (reg == INTREG_SPX) {
|
||||||
|
CPSR cpsr = miscRegs[MISCREG_CPSR];
|
||||||
|
ExceptionLevel el = opModeToEL(
|
||||||
|
(OperatingMode) (uint8_t) cpsr.mode);
|
||||||
|
if (!cpsr.sp && el != EL0)
|
||||||
|
return INTREG_SP0;
|
||||||
|
switch (el) {
|
||||||
|
case EL3:
|
||||||
|
return INTREG_SP3;
|
||||||
|
case EL2:
|
||||||
|
return INTREG_SP2;
|
||||||
|
case EL1:
|
||||||
|
return INTREG_SP1;
|
||||||
|
case EL0:
|
||||||
|
return INTREG_SP0;
|
||||||
|
default:
|
||||||
|
panic("Invalid exception level");
|
||||||
|
return 0; // Never happens.
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return flattenIntRegModeIndex(reg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
Then:
|
||||||
|
|
||||||
|
....
|
||||||
|
NUM_ARCH_INTREGS = 32,
|
||||||
|
....
|
||||||
|
|
||||||
|
so we understand that this covers x0 to x31. `NUM_INTREGS` is also 32, so I'm a bit confused: the `reg < NUM_INTREGS` case is never reached.
|
||||||
|
|
||||||
|
....
|
||||||
|
INTREG_SPX = NUM_INTREGS,
|
||||||
|
....
|
||||||
|
|
||||||
|
|
||||||
|
SP is 32, but it is a bit more magic, since in ARM there is one SP per <<arm-exception-levels,exception level>> as mentioned at <<arm-sp0-vs-spx>>.
|
||||||
|
|
||||||
|
....
|
||||||
|
INTREG_SPX = NUM_INTREGS
|
||||||
|
....
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
We can also have a quick look at the `AddXImm` instruction which corresponds to a simple addition of an immediate as shown in link:userland/arch/aarch64/add.S[]:
|
||||||
|
|
||||||
|
....
|
||||||
|
add x0, x1, 2
|
||||||
|
....
|
||||||
|
|
||||||
|
Its <<gem5-execute-vs-initiateacc-vs-completeacc,`execute` method>> is found in `build/ARM/arch/arm/generated/exec-ns.cc.inc` (hand formatted and slightly simplified):
|
||||||
|
|
||||||
|
....
|
||||||
|
Fault AddXImm::execute(ExecContext *xc, Trace::InstRecord *traceData) const {
|
||||||
|
uint64_t Dest64 = 0;
|
||||||
|
uint64_t Op164 = 0;
|
||||||
|
Op164 = ((xc->readIntRegOperand(this, 0)) & mask(intWidth));
|
||||||
|
Dest64 = Op164 + imm;
|
||||||
|
uint64_t final_val = Dest64;
|
||||||
|
xc->setIntRegOperand(this, 0, (Dest64) & mask(intWidth));
|
||||||
|
if (traceData) { traceData->setData(final_val); }
|
||||||
|
return NoFault;
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
and `imm` is set directly on the constructor.
|
||||||
|
|
||||||
===== gem5 `Process`
|
===== gem5 `Process`
|
||||||
|
|
||||||
The `Process` class is used only for <<gem5-syscall-emulation-mode>>, and it represents a process like a Linux userland process, in addition to any further gem5 specific data needed to represent the process.
|
The `Process` class is used only for <<gem5-syscall-emulation-mode>>, and it represents a process like a Linux userland process, in addition to any further gem5 specific data needed to represent the process.
|
||||||
@@ -17117,9 +17353,10 @@ POSIX' multithreading API. Contrast with <<fork>> which is for processes.
|
|||||||
|
|
||||||
This was for a looong time the only "portable" multithreading alternative, until <<cpp-multithreading,C++11 finally added threads>>, thus also extending the portability to Windows.
|
This was for a looong time the only "portable" multithreading alternative, until <<cpp-multithreading,C++11 finally added threads>>, thus also extending the portability to Windows.
|
||||||
|
|
||||||
* link:userland/posix/pthread_count.c[]
|
* link:userland/posix/pthread_self.c[]: the simplest example possible
|
||||||
* link:userland/posix/pthread_deadlock.c[]
|
* link:userland/posix/pthread_count.c[]: count an atomic variable across threads
|
||||||
* link:userland/posix/pthread_self.c[]
|
* link:userland/posix/pthread_deadlock.c[]: purposefully create a deadlock to see what it looks like
|
||||||
|
* link:userland/posix/pthread_barrier.c[]: related: https://stackoverflow.com/questions/28663622/understanding-posix-barrier-mechanism
|
||||||
|
|
||||||
[[pthread-mutex]]
|
[[pthread-mutex]]
|
||||||
===== pthread_mutex
|
===== pthread_mutex
|
||||||
@@ -18582,7 +18819,10 @@ The example in `man futex` is also a must.
|
|||||||
[[getcpu]]
|
[[getcpu]]
|
||||||
==== `getcpu` system call and the `sched_getaffinity` glibc wrapper
|
==== `getcpu` system call and the `sched_getaffinity` glibc wrapper
|
||||||
|
|
||||||
Example: link:userland/linux/sched_getcpu.c[]
|
Examples:
|
||||||
|
|
||||||
|
* link:userland/linux/sched_getcpu.c[]
|
||||||
|
* link:userland/linux/sched_getcpu_barrier.c[]: this uses a barrier to ensure that gem5 will run each thread on one separate CPU
|
||||||
|
|
||||||
Returns the CPU that the process/thread is currently running on:
|
Returns the CPU that the process/thread is currently running on:
|
||||||
|
|
||||||
@@ -21160,6 +21400,10 @@ TODO. Create a minimal runnable example of going into EL0 and jumping to EL1.
|
|||||||
|
|
||||||
See <<armarm8-db>> D1.6.2 "The stack pointer registers".
|
See <<armarm8-db>> D1.6.2 "The stack pointer registers".
|
||||||
|
|
||||||
|
There is one SP per <<arm-exception-levels,exception level>>.
|
||||||
|
|
||||||
|
This can also be seen clearly on the analysis at <<gem5-execcontext-readintregoperand-register-resolution>>.
|
||||||
|
|
||||||
TODO create a minimal runnable example.
|
TODO create a minimal runnable example.
|
||||||
|
|
||||||
TODO: how to select to use SP0 in an exception handler?
|
TODO: how to select to use SP0 in an exception handler?
|
||||||
|
|||||||
@@ -754,6 +754,10 @@ path_properties_tuples = (
|
|||||||
'requires_syscall_getcpu': True,
|
'requires_syscall_getcpu': True,
|
||||||
'test_run_args': {'cpus': 2},
|
'test_run_args': {'cpus': 2},
|
||||||
},
|
},
|
||||||
|
'sched_getcpu_barrier.c': {
|
||||||
|
'requires_syscall_getcpu': True,
|
||||||
|
'test_run_args': {'cpus': 2},
|
||||||
|
},
|
||||||
'time_boot.c': {'requires_sudo': True},
|
'time_boot.c': {'requires_sudo': True},
|
||||||
'virt_to_phys_user.c': {'requires_argument': True},
|
'virt_to_phys_user.c': {'requires_argument': True},
|
||||||
}
|
}
|
||||||
|
|||||||
46
userland/linux/sched_getcpu_barrier.c
Normal file
46
userland/linux/sched_getcpu_barrier.c
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
/* https://cirosantilli.com/linux-kernel-module-cheat#getcpu */
|
||||||
|
|
||||||
|
#define _GNU_SOURCE
|
||||||
|
#include <assert.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
pthread_barrier_t barrier;
|
||||||
|
|
||||||
|
void* main_thread(void *arg) {
|
||||||
|
(void)arg;
|
||||||
|
printf("%d\n", sched_getcpu());
|
||||||
|
pthread_barrier_wait(&barrier);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
pthread_t *threads;
|
||||||
|
unsigned int nthreads, i;
|
||||||
|
if (argc > 1) {
|
||||||
|
nthreads = strtoll(argv[1], NULL, 0);
|
||||||
|
} else {
|
||||||
|
nthreads = 1;
|
||||||
|
}
|
||||||
|
assert(!pthread_barrier_init(&barrier, NULL, nthreads));
|
||||||
|
threads = malloc(nthreads * sizeof(*threads));
|
||||||
|
for (i = 0; i < nthreads; ++i) {
|
||||||
|
assert(pthread_create(
|
||||||
|
&threads[i],
|
||||||
|
NULL,
|
||||||
|
main_thread,
|
||||||
|
NULL
|
||||||
|
) == 0);
|
||||||
|
}
|
||||||
|
for (i = 0; i < nthreads; ++i) {
|
||||||
|
pthread_join(threads[i], NULL);
|
||||||
|
}
|
||||||
|
free(threads);
|
||||||
|
assert(!pthread_barrier_destroy(&barrier));
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
47
userland/posix/pthread_barrier.c
Normal file
47
userland/posix/pthread_barrier.c
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
/* https://cirosantilli.com/linux-kernel-module-cheat#pthreads */
|
||||||
|
|
||||||
|
#define _XOPEN_SOURCE 700
|
||||||
|
#include <assert.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
pthread_barrier_t barrier;
|
||||||
|
|
||||||
|
void* main_thread(void *arg) {
|
||||||
|
(void)arg;
|
||||||
|
printf("before %ju\n", (uintmax_t)pthread_self());
|
||||||
|
pthread_barrier_wait(&barrier);
|
||||||
|
printf("after %ju\n", (uintmax_t)pthread_self());
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
pthread_t *threads;
|
||||||
|
unsigned int nthreads, i;
|
||||||
|
if (argc > 1) {
|
||||||
|
nthreads = strtoll(argv[1], NULL, 0);
|
||||||
|
} else {
|
||||||
|
nthreads = 1;
|
||||||
|
}
|
||||||
|
assert(!pthread_barrier_init(&barrier, NULL, nthreads));
|
||||||
|
threads = malloc(nthreads * sizeof(*threads));
|
||||||
|
for (i = 0; i < nthreads; ++i) {
|
||||||
|
assert(pthread_create(
|
||||||
|
&threads[i],
|
||||||
|
NULL,
|
||||||
|
main_thread,
|
||||||
|
NULL
|
||||||
|
) == 0);
|
||||||
|
}
|
||||||
|
for (i = 0; i < nthreads; ++i) {
|
||||||
|
pthread_join(threads[i], NULL);
|
||||||
|
}
|
||||||
|
free(threads);
|
||||||
|
assert(!pthread_barrier_destroy(&barrier));
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user