mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
Rationalize -mcpu for emulators, compilers and assemblers on ARM
Move SVE example in from arm-assembly-cheat. atomic.cpp aarch64 add LSE ldadd placeholder, not compiling yet
This commit is contained in:
54
README.adoc
54
README.adoc
@@ -14006,6 +14006,60 @@ There are analogous LD3 and LD4 instruction.
|
|||||||
* assembly optimized libraries:
|
* assembly optimized libraries:
|
||||||
** https://github.com/projectNe10/Ne10
|
** https://github.com/projectNe10/Ne10
|
||||||
|
|
||||||
|
==== ARM SVE
|
||||||
|
|
||||||
|
Example: link:userland/arch/aarch64/sve.S[]
|
||||||
|
|
||||||
|
Scalable Vector Extension.
|
||||||
|
|
||||||
|
aarch64 only, newer than <<arm-neon>>.
|
||||||
|
|
||||||
|
It is called Scalable because it does not specify the vector width! Therefore we don't have to worry about new vector width instructions every few years! Hurray!
|
||||||
|
|
||||||
|
The instructions then allow implicitly tracking the loop index without knowing the actual vector length.
|
||||||
|
|
||||||
|
Added to QEMU user mode in 3.0.0.
|
||||||
|
|
||||||
|
TODO announcement date. Possibly 2017: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf There is also a 2016 mention: https://community.arm.com/tools/hpc/b/hpc/posts/technology-update-the-scalable-vector-extension-sve-for-the-armv8-a-architecture
|
||||||
|
|
||||||
|
The Linux kernel shows `/proc/cpuinfo` compatibility as `sve`.
|
||||||
|
|
||||||
|
Official spec: https://developer.arm.com/docs/100891/latest/sve-overview/introducing-sve
|
||||||
|
|
||||||
|
===== SVE bibliography
|
||||||
|
|
||||||
|
* https://www.rico.cat/files/ICS18-gem5-sve-tutorial.pdf step-by-step walkthrough of a complete code execution example, the best initial tutorial so far
|
||||||
|
* https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf paper with a few nice concrete examples, illustrations and rationale
|
||||||
|
* https://static.docs.arm.com/dui0965/c/DUI0965C_scalable_vector_extension_guide.pdf
|
||||||
|
* https://developer.arm.com/products/software-development-tools/hpc/documentation/writing-inline-sve-assembly quick inlining guide
|
||||||
|
|
||||||
|
====== SVE spec
|
||||||
|
|
||||||
|
<<armarm8>> A1.7 "ARMv8 architecture extensions" says:
|
||||||
|
|
||||||
|
____
|
||||||
|
SVE is an optional extension to ARMv8.2. That is, SVE requires the implementation of ARMv8.2.
|
||||||
|
____
|
||||||
|
|
||||||
|
A1.7.8 "The Scalable Vector Extension (SVE)" then says that only changes to the existing registers are described in that manual, and that you should look instead at the "ARM Architecture Reference Manual Supplement, The Scalable Vector Extension (SVE), for ARMv8-A."
|
||||||
|
|
||||||
|
We then download the zip from: https://developer.arm.com/docs/ddi0584/latest/arm-architecture-reference-manual-supplement-the-scalable-vector-extension-sve-for-armv8-a and it contains the PDF: `DDI0584A_d_SVE_supp_armv8A.pdf` which we use here.
|
||||||
|
|
||||||
|
That document then describes the SVE instructions and registers.
|
||||||
|
|
||||||
|
=== ARMv8 architecture extensions
|
||||||
|
|
||||||
|
==== ARMv8.1 architecture extension
|
||||||
|
|
||||||
|
<<armarm8-db>> A1.7.3 "The ARMv8.1 architecture extension"
|
||||||
|
|
||||||
|
[[arm-lse]]
|
||||||
|
===== ARM Large System Extensions (LSE)
|
||||||
|
|
||||||
|
<<armarm8-db>> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
|
||||||
|
|
||||||
|
* LDADD: link:userland/cpp/atomic.cpp[]
|
||||||
|
|
||||||
=== ARM assembly bibliography
|
=== ARM assembly bibliography
|
||||||
|
|
||||||
==== ARM non-official bibliography
|
==== ARM non-official bibliography
|
||||||
|
|||||||
@@ -48,7 +48,6 @@ Build the baremetal examples with crosstool-NG.
|
|||||||
cc_flags = [
|
cc_flags = [
|
||||||
'-I', self.env['root_dir'], LF,
|
'-I', self.env['root_dir'], LF,
|
||||||
'-O{}'.format(self.env['optimization_level']), LF,
|
'-O{}'.format(self.env['optimization_level']), LF,
|
||||||
'-mcpu={}'.format(self.env['mcpu']), LF,
|
|
||||||
'-nostartfiles', LF,
|
'-nostartfiles', LF,
|
||||||
]
|
]
|
||||||
if self.env['arch'] == 'arm':
|
if self.env['arch'] == 'arm':
|
||||||
|
|||||||
37
common.py
37
common.py
@@ -658,20 +658,40 @@ Incompatible archs are skipped.
|
|||||||
else:
|
else:
|
||||||
env['gem5_build_id'] = consts['default_build_id']
|
env['gem5_build_id'] = consts['default_build_id']
|
||||||
env['is_arm'] = False
|
env['is_arm'] = False
|
||||||
|
# Our approach is as follows:
|
||||||
|
#
|
||||||
|
# * compilers: control the maximum arch version emitted explicitly with -march
|
||||||
|
# +
|
||||||
|
# This helps to prevent blowing up simulation unnecessarily.
|
||||||
|
# +
|
||||||
|
# It does not matter if we miss any perf features for QEMU which is functional,
|
||||||
|
# but it could matter for gem5 perf simulations.
|
||||||
|
# * assemblers: enable as many features as possible.
|
||||||
|
# +
|
||||||
|
# Well, if I'm explicitly writing down the instructions, I want
|
||||||
|
# my emulator to blow up in peace!
|
||||||
|
# * emulators: enable as many features as possible
|
||||||
|
# +
|
||||||
|
# This is the gem5 default behavior, for QEMU TODO not sure if default,
|
||||||
|
# but we select it explicitly with -cpu max.
|
||||||
|
# https://habkost.net/posts/2017/03/qemu-cpu-model-probing-story.html
|
||||||
|
# +
|
||||||
|
# We do this because QEMU does not add all possible Cortex Axx, there are
|
||||||
|
# just too many, and gem5 does not allow selecting lower feature in general.
|
||||||
if env['arch'] == 'arm':
|
if env['arch'] == 'arm':
|
||||||
env['armv'] = 7
|
env['armv'] = 7
|
||||||
env['mcpu'] = 'cortex-a15'
|
|
||||||
env['buildroot_toolchain_prefix'] = 'arm-buildroot-linux-gnueabihf'
|
env['buildroot_toolchain_prefix'] = 'arm-buildroot-linux-gnueabihf'
|
||||||
env['crosstool_ng_toolchain_prefix'] = 'arm-unknown-eabi'
|
env['crosstool_ng_toolchain_prefix'] = 'arm-unknown-eabi'
|
||||||
env['ubuntu_toolchain_prefix'] = 'arm-linux-gnueabihf'
|
env['ubuntu_toolchain_prefix'] = 'arm-linux-gnueabihf'
|
||||||
env['is_arm'] = True
|
env['is_arm'] = True
|
||||||
|
env['march'] = 'armv8-a'
|
||||||
elif env['arch'] == 'aarch64':
|
elif env['arch'] == 'aarch64':
|
||||||
env['armv'] = 8
|
env['armv'] = 8
|
||||||
env['mcpu'] = 'cortex-a57'
|
|
||||||
env['buildroot_toolchain_prefix'] = 'aarch64-buildroot-linux-gnu'
|
env['buildroot_toolchain_prefix'] = 'aarch64-buildroot-linux-gnu'
|
||||||
env['crosstool_ng_toolchain_prefix'] = 'aarch64-unknown-elf'
|
env['crosstool_ng_toolchain_prefix'] = 'aarch64-unknown-elf'
|
||||||
env['ubuntu_toolchain_prefix'] = 'aarch64-linux-gnu'
|
env['ubuntu_toolchain_prefix'] = 'aarch64-linux-gnu'
|
||||||
env['is_arm'] = True
|
env['is_arm'] = True
|
||||||
|
env['march'] = 'armv8-a+lse'
|
||||||
elif env['arch'] == 'x86_64':
|
elif env['arch'] == 'x86_64':
|
||||||
env['crosstool_ng_toolchain_prefix'] = 'x86_64-unknown-elf'
|
env['crosstool_ng_toolchain_prefix'] = 'x86_64-unknown-elf'
|
||||||
env['gem5_arch'] = 'X86'
|
env['gem5_arch'] = 'X86'
|
||||||
@@ -1545,6 +1565,10 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
|
|||||||
cc_flags_after.extend(['-pthread', LF])
|
cc_flags_after.extend(['-pthread', LF])
|
||||||
if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
|
if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
|
||||||
cc_flags.extend(my_path_properties['cc_flags'])
|
cc_flags.extend(my_path_properties['cc_flags'])
|
||||||
|
if self.env['verbose']:
|
||||||
|
cc_flags.extend([
|
||||||
|
'-v', LF,
|
||||||
|
])
|
||||||
cc_flags_after.extend(my_path_properties['cc_flags_after'])
|
cc_flags_after.extend(my_path_properties['cc_flags_after'])
|
||||||
if my_path_properties['cc_pedantic']:
|
if my_path_properties['cc_pedantic']:
|
||||||
cc_flags.extend(['-pedantic', LF])
|
cc_flags.extend(['-pedantic', LF])
|
||||||
@@ -1557,6 +1581,15 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
|
|||||||
elif in_ext == self.env['cxx_ext']:
|
elif in_ext == self.env['cxx_ext']:
|
||||||
cc = self.env['gxx_path']
|
cc = self.env['gxx_path']
|
||||||
std = my_path_properties['cxx_std']
|
std = my_path_properties['cxx_std']
|
||||||
|
if self.env['is_arm']:
|
||||||
|
if in_ext == self.env['asm_ext']:
|
||||||
|
cc_flags.extend([
|
||||||
|
'-Xassembler', '-march=all', LF,
|
||||||
|
])
|
||||||
|
else:
|
||||||
|
cc_flags.extend([
|
||||||
|
'-march={}'.format(self.env['march']), LF,
|
||||||
|
])
|
||||||
if dirpath_relative_root_components_len > 0:
|
if dirpath_relative_root_components_len > 0:
|
||||||
if dirpath_relative_root_components[0] == 'userland':
|
if dirpath_relative_root_components[0] == 'userland':
|
||||||
if dirpath_relative_root_components_len > 1:
|
if dirpath_relative_root_components_len > 1:
|
||||||
|
|||||||
@@ -344,7 +344,6 @@ path_properties_tuples = (
|
|||||||
{
|
{
|
||||||
'allowed_archs': {'arm'},
|
'allowed_archs': {'arm'},
|
||||||
'cc_flags': [
|
'cc_flags': [
|
||||||
'-Xassembler', '-mcpu=cortex-a72', LF,
|
|
||||||
# To prevent:
|
# To prevent:
|
||||||
# > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode
|
# > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode
|
||||||
# https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732
|
# https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732
|
||||||
@@ -383,7 +382,9 @@ path_properties_tuples = (
|
|||||||
}
|
}
|
||||||
),
|
),
|
||||||
'aarch64': (
|
'aarch64': (
|
||||||
{'allowed_archs': {'aarch64'}},
|
{
|
||||||
|
'allowed_archs': {'aarch64'},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
'inline_asm': (
|
'inline_asm': (
|
||||||
{
|
{
|
||||||
@@ -399,6 +400,7 @@ path_properties_tuples = (
|
|||||||
'signal_generated_by_os': True,
|
'signal_generated_by_os': True,
|
||||||
'signal_received': signal.Signals.SIGILL,
|
'signal_received': signal.Signals.SIGILL,
|
||||||
},
|
},
|
||||||
|
'sve.S': {'gem5_unimplemented_instruction': True}
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
'x86_64': (
|
'x86_64': (
|
||||||
|
|||||||
15
run
15
run
@@ -576,6 +576,7 @@ Extra options to append at the end of the emulator command line.
|
|||||||
qemu_user_and_system_options +
|
qemu_user_and_system_options +
|
||||||
debug_args
|
debug_args
|
||||||
)
|
)
|
||||||
|
cpu = 'max'
|
||||||
else:
|
else:
|
||||||
extra_emulator_args.extend(extra_qemu_args)
|
extra_emulator_args.extend(extra_qemu_args)
|
||||||
self.make_run_dirs()
|
self.make_run_dirs()
|
||||||
@@ -594,9 +595,11 @@ Extra options to append at the end of the emulator command line.
|
|||||||
serial_monitor = ['-serial', serial, LF]
|
serial_monitor = ['-serial', serial, LF]
|
||||||
if self.env['kvm']:
|
if self.env['kvm']:
|
||||||
extra_emulator_args.extend([
|
extra_emulator_args.extend([
|
||||||
'-cpu', 'host', LF,
|
|
||||||
'-enable-kvm', LF,
|
'-enable-kvm', LF,
|
||||||
])
|
])
|
||||||
|
cpu = 'host'
|
||||||
|
else:
|
||||||
|
cpu = 'max'
|
||||||
extra_emulator_args.extend([
|
extra_emulator_args.extend([
|
||||||
'-serial',
|
'-serial',
|
||||||
'tcp::{},server,nowait'.format(self.env['extra_serial_port']), LF
|
'tcp::{},server,nowait'.format(self.env['extra_serial_port']), LF
|
||||||
@@ -706,19 +709,15 @@ Extra options to append at the end of the emulator command line.
|
|||||||
])
|
])
|
||||||
elif self.env['is_arm']:
|
elif self.env['is_arm']:
|
||||||
extra_emulator_args.extend(['-semihosting', LF])
|
extra_emulator_args.extend(['-semihosting', LF])
|
||||||
if self.env['arch'] == 'arm':
|
|
||||||
cpu = 'cortex-a15'
|
|
||||||
else:
|
|
||||||
cpu = 'cortex-a57'
|
|
||||||
append = ['-append', '{} {}'.format(root, kernel_cli), LF]
|
append = ['-append', '{} {}'.format(root, kernel_cli), LF]
|
||||||
cmd.extend(
|
cmd.extend(
|
||||||
[
|
|
||||||
'-cpu', cpu, LF,
|
|
||||||
] +
|
|
||||||
virtio_gpu_pci
|
virtio_gpu_pci
|
||||||
)
|
)
|
||||||
if self.env['baremetal'] is None:
|
if self.env['baremetal'] is None:
|
||||||
cmd.extend(append)
|
cmd.extend(append)
|
||||||
|
extra_emulator_args.extend([
|
||||||
|
'-cpu', cpu, LF,
|
||||||
|
])
|
||||||
if self.env['tmux']:
|
if self.env['tmux']:
|
||||||
tmux_args = '--run-id {}'.format(self.env['run_id'])
|
tmux_args = '--run-id {}'.format(self.env['run_id'])
|
||||||
if self.env['tmux_program'] == 'shell':
|
if self.env['tmux_program'] == 'shell':
|
||||||
|
|||||||
49
userland/arch/aarch64/sve.S
Normal file
49
userland/arch/aarch64/sve.S
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#arm-sve */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
.data
|
||||||
|
x: .double 1.5, 2.5, 3.5, 4.5
|
||||||
|
y: .double 5.0, 6.0, 7.0, 8.0
|
||||||
|
y_expect: .double 8.0, 11.0, 14.0, 17.0
|
||||||
|
a: .double 2.0
|
||||||
|
n: .word 4
|
||||||
|
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
adr x0, x
|
||||||
|
adr x1, y
|
||||||
|
adr x2, a
|
||||||
|
adr x3, n
|
||||||
|
bl daxpy
|
||||||
|
LKMC_ASSERT_MEMCMP(y, y_expect, =0x20)
|
||||||
|
LKMC_EPILOGUE
|
||||||
|
|
||||||
|
/* Multiply by a scalar and add.
|
||||||
|
*
|
||||||
|
* Operation:
|
||||||
|
*
|
||||||
|
* Y += a * X
|
||||||
|
*
|
||||||
|
* C signature:
|
||||||
|
*
|
||||||
|
* void daxpy(double *x, double *y, double *a, int *n)
|
||||||
|
*
|
||||||
|
* The name "daxpy" comes from LAPACK:
|
||||||
|
* http://www.netlib.org/lapack/explore-html/de/da4/group__double__blas__level1_ga8f99d6a644d3396aa32db472e0cfc91c.html
|
||||||
|
*
|
||||||
|
* Adapted from: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf
|
||||||
|
*/
|
||||||
|
daxpy:
|
||||||
|
ldrsw x3, [x3]
|
||||||
|
mov x4, 0
|
||||||
|
whilelt p0.d, x4, x3
|
||||||
|
ld1rd z0.d, p0/z, [x2]
|
||||||
|
.loop:
|
||||||
|
ld1d z1.d, p0/z, [x0, x4, lsl 3]
|
||||||
|
ld1d z2.d, p0/z, [x1, x4, lsl 3]
|
||||||
|
fmla z2.d, p0/m, z1.d, z0.d
|
||||||
|
st1d z2.d, p0, [x1, x4, lsl 3]
|
||||||
|
incd x4
|
||||||
|
whilelt p0.d, x4, x3
|
||||||
|
b.first .loop
|
||||||
|
ret
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#cmpxchg-instruction */
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-cmpxchg-instruction */
|
||||||
|
|
||||||
#include <lkmc.h>
|
#include <lkmc.h>
|
||||||
|
|
||||||
@@ -24,5 +24,4 @@ LKMC_PROLOGUE
|
|||||||
LKMC_ASSERT_EQ(%rax, $0)
|
LKMC_ASSERT_EQ(%rax, $0)
|
||||||
LKMC_ASSERT_EQ(%r13, $2)
|
LKMC_ASSERT_EQ(%r13, $2)
|
||||||
LKMC_ASSERT_EQ(%r14, $2)
|
LKMC_ASSERT_EQ(%r14, $2)
|
||||||
|
|
||||||
LKMC_EPILOGUE
|
LKMC_EPILOGUE
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
// https://github.com/cirosantilli/linux-kernel-module-cheat#cpp
|
// https://github.com/cirosantilli/linux-kernel-module-cheat#cpp
|
||||||
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
|
|
||||||
//
|
//
|
||||||
// The non-atomic counters have undefined values which get printed:
|
// The non-atomic counters have undefined values which get printed:
|
||||||
// they are extremely likely to be less than the correct value due to
|
// they are extremely likely to be less than the correct value due to
|
||||||
@@ -15,7 +14,6 @@
|
|||||||
// On GCC 4.8 x86-64, using atomic offered a 5x performance improvement
|
// On GCC 4.8 x86-64, using atomic offered a 5x performance improvement
|
||||||
// over the same program with mutexes.
|
// over the same program with mutexes.
|
||||||
|
|
||||||
|
|
||||||
#if __cplusplus >= 201103L
|
#if __cplusplus >= 201103L
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@@ -24,7 +22,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
std::atomic_ulong my_atomic_ulong(0);
|
std::atomic_ulong my_atomic_ulong(0);
|
||||||
unsigned long my_non_atomic_ulong = 0;
|
unsigned long my_non_atomic_ulong = 0;
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__) || defined(__aarch64__)
|
||||||
unsigned long my_arch_atomic_ulong = 0;
|
unsigned long my_arch_atomic_ulong = 0;
|
||||||
unsigned long my_arch_non_atomic_ulong = 0;
|
unsigned long my_arch_non_atomic_ulong = 0;
|
||||||
#endif
|
#endif
|
||||||
@@ -41,6 +39,7 @@ void threadMain() {
|
|||||||
:
|
:
|
||||||
:
|
:
|
||||||
);
|
);
|
||||||
|
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
"lock;"
|
"lock;"
|
||||||
"incq %0;"
|
"incq %0;"
|
||||||
@@ -48,6 +47,21 @@ void threadMain() {
|
|||||||
:
|
:
|
||||||
:
|
:
|
||||||
);
|
);
|
||||||
|
#elif defined(__aarch64__)
|
||||||
|
__asm__ __volatile__ (
|
||||||
|
"add %0, %0, 1;"
|
||||||
|
: "+r" (my_arch_non_atomic_ulong)
|
||||||
|
:
|
||||||
|
:
|
||||||
|
);
|
||||||
|
// https://github.com/cirosantilli/linux-kernel-module-cheat#arm-lse
|
||||||
|
__asm__ __volatile__ (
|
||||||
|
"ldadd %[inc], xzr, [%[addr]];"
|
||||||
|
: "=m" (my_arch_atomic_ulong)
|
||||||
|
: [inc] "r" (1),
|
||||||
|
[addr] "r" (&my_arch_atomic_ulong)
|
||||||
|
:
|
||||||
|
);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -75,7 +89,7 @@ int main(int argc, char **argv) {
|
|||||||
// We can also use the atomics directly through `operator T` conversion.
|
// We can also use the atomics directly through `operator T` conversion.
|
||||||
assert(my_atomic_ulong == my_atomic_ulong.load());
|
assert(my_atomic_ulong == my_atomic_ulong.load());
|
||||||
std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
|
std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__) || defined(__aarch64__)
|
||||||
assert(my_arch_atomic_ulong == nthreads * niters);
|
assert(my_arch_atomic_ulong == nthreads * niters);
|
||||||
std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
|
std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user