Rationalize -mcpu for emulators, compilers and assemblers on ARM

Move SVE example in from arm-assembly-cheat.

atomic.cpp aarch64 add LSE ldadd placeholder, not compiling yet
This commit is contained in:
Ciro Santilli 六四事件 法轮功
2019-06-26 00:00:00 +00:00
parent ce3d546ac8
commit 3fdd83c2c5
8 changed files with 168 additions and 19 deletions

View File

@@ -14006,6 +14006,60 @@ There are analogous LD3 and LD4 instruction.
* assembly optimized libraries: * assembly optimized libraries:
** https://github.com/projectNe10/Ne10 ** https://github.com/projectNe10/Ne10
==== ARM SVE
Example: link:userland/arch/aarch64/sve.S[]
Scalable Vector Extension.
aarch64 only, newer than <<arm-neon>>.
It is called Scalable because it does not specify the vector width! Therefore we don't have to worry about new vector width instructions every few years! Hurray!
The instructions then allow implicitly tracking the loop index without knowing the actual vector length.
Added to QEMU user mode in 3.0.0.
TODO announcement date. Possibly 2017: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf There is also a 2016 mention: https://community.arm.com/tools/hpc/b/hpc/posts/technology-update-the-scalable-vector-extension-sve-for-the-armv8-a-architecture
The Linux kernel shows `/proc/cpuinfo` compatibility as `sve`.
Official spec: https://developer.arm.com/docs/100891/latest/sve-overview/introducing-sve
===== SVE bibliography
* https://www.rico.cat/files/ICS18-gem5-sve-tutorial.pdf step-by-step walkthrough of a complete code execution example, the best initial tutorial so far
* https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf paper with a few nice concrete examples, illustrations and rationale
* https://static.docs.arm.com/dui0965/c/DUI0965C_scalable_vector_extension_guide.pdf
* https://developer.arm.com/products/software-development-tools/hpc/documentation/writing-inline-sve-assembly quick inlining guide
====== SVE spec
<<armarm8>> A1.7 "ARMv8 architecture extensions" says:
____
SVE is an optional extension to ARMv8.2. That is, SVE requires the implementation of ARMv8.2.
____
A1.7.8 "The Scalable Vector Extension (SVE)" then says that only changes to the existing registers are described in that manual, and that you should look instead at the "ARM Architecture Reference Manual Supplement, The Scalable Vector Extension (SVE), for ARMv8-A."
We then download the zip from: https://developer.arm.com/docs/ddi0584/latest/arm-architecture-reference-manual-supplement-the-scalable-vector-extension-sve-for-armv8-a and it contains the PDF: `DDI0584A_d_SVE_supp_armv8A.pdf` which we use here.
That document then describes the SVE instructions and registers.
=== ARMv8 architecture extensions
==== ARMv8.1 architecture extension
<<armarm8-db>> A1.7.3 "The ARMv8.1 architecture extension"
[[arm-lse]]
===== ARM Large System Extensions (LSE)
<<armarm8-db>> "ARMv8.1-LSE, ARMv8.1 Large System Extensions"
* LDADD: link:userland/cpp/atomic.cpp[]
=== ARM assembly bibliography === ARM assembly bibliography
==== ARM non-official bibliography ==== ARM non-official bibliography

View File

@@ -48,7 +48,6 @@ Build the baremetal examples with crosstool-NG.
cc_flags = [ cc_flags = [
'-I', self.env['root_dir'], LF, '-I', self.env['root_dir'], LF,
'-O{}'.format(self.env['optimization_level']), LF, '-O{}'.format(self.env['optimization_level']), LF,
'-mcpu={}'.format(self.env['mcpu']), LF,
'-nostartfiles', LF, '-nostartfiles', LF,
] ]
if self.env['arch'] == 'arm': if self.env['arch'] == 'arm':

View File

@@ -658,20 +658,40 @@ Incompatible archs are skipped.
else: else:
env['gem5_build_id'] = consts['default_build_id'] env['gem5_build_id'] = consts['default_build_id']
env['is_arm'] = False env['is_arm'] = False
# Our approach is as follows:
#
# * compilers: explicitly control the maximum arch version emitted with -march
# +
# This helps to prevent blowing up simulation unnecessarily.
# +
# It does not matter if we miss any perf features for QEMU which is functional,
# but it could matter for gem5 perf simulations.
# * assemblers: enable as many features as possible.
# +
# Well, if I'm explicitly writing down the instructions, I want
# my emulator to blow up in peace!
# * emulators: enable as many features as possible
# +
# This is the gem5 default behavior, for QEMU TODO not sure if default,
# but we select it explicitly with -cpu max.
# https://habkost.net/posts/2017/03/qemu-cpu-model-probing-story.html
# +
# We do this because QEMU does not add all possible Cortex Axx, there are
# just too many, and gem5 does not allow selecting lower feature levels in general.
if env['arch'] == 'arm': if env['arch'] == 'arm':
env['armv'] = 7 env['armv'] = 7
env['mcpu'] = 'cortex-a15'
env['buildroot_toolchain_prefix'] = 'arm-buildroot-linux-gnueabihf' env['buildroot_toolchain_prefix'] = 'arm-buildroot-linux-gnueabihf'
env['crosstool_ng_toolchain_prefix'] = 'arm-unknown-eabi' env['crosstool_ng_toolchain_prefix'] = 'arm-unknown-eabi'
env['ubuntu_toolchain_prefix'] = 'arm-linux-gnueabihf' env['ubuntu_toolchain_prefix'] = 'arm-linux-gnueabihf'
env['is_arm'] = True env['is_arm'] = True
env['march'] = 'armv8-a'
elif env['arch'] == 'aarch64': elif env['arch'] == 'aarch64':
env['armv'] = 8 env['armv'] = 8
env['mcpu'] = 'cortex-a57'
env['buildroot_toolchain_prefix'] = 'aarch64-buildroot-linux-gnu' env['buildroot_toolchain_prefix'] = 'aarch64-buildroot-linux-gnu'
env['crosstool_ng_toolchain_prefix'] = 'aarch64-unknown-elf' env['crosstool_ng_toolchain_prefix'] = 'aarch64-unknown-elf'
env['ubuntu_toolchain_prefix'] = 'aarch64-linux-gnu' env['ubuntu_toolchain_prefix'] = 'aarch64-linux-gnu'
env['is_arm'] = True env['is_arm'] = True
env['march'] = 'armv8-a+lse'
elif env['arch'] == 'x86_64': elif env['arch'] == 'x86_64':
env['crosstool_ng_toolchain_prefix'] = 'x86_64-unknown-elf' env['crosstool_ng_toolchain_prefix'] = 'x86_64-unknown-elf'
env['gem5_arch'] = 'X86' env['gem5_arch'] = 'X86'
@@ -1545,6 +1565,10 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
cc_flags_after.extend(['-pthread', LF]) cc_flags_after.extend(['-pthread', LF])
if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path): if self.need_rebuild([in_path] + extra_objs + extra_deps, out_path):
cc_flags.extend(my_path_properties['cc_flags']) cc_flags.extend(my_path_properties['cc_flags'])
if self.env['verbose']:
cc_flags.extend([
'-v', LF,
])
cc_flags_after.extend(my_path_properties['cc_flags_after']) cc_flags_after.extend(my_path_properties['cc_flags_after'])
if my_path_properties['cc_pedantic']: if my_path_properties['cc_pedantic']:
cc_flags.extend(['-pedantic', LF]) cc_flags.extend(['-pedantic', LF])
@@ -1557,6 +1581,15 @@ https://github.com/cirosantilli/linux-kernel-module-cheat#gem5-debug-build
elif in_ext == self.env['cxx_ext']: elif in_ext == self.env['cxx_ext']:
cc = self.env['gxx_path'] cc = self.env['gxx_path']
std = my_path_properties['cxx_std'] std = my_path_properties['cxx_std']
if self.env['is_arm']:
if in_ext == self.env['asm_ext']:
cc_flags.extend([
'-Xassembler', '-march=all', LF,
])
else:
cc_flags.extend([
'-march={}'.format(self.env['march']), LF,
])
if dirpath_relative_root_components_len > 0: if dirpath_relative_root_components_len > 0:
if dirpath_relative_root_components[0] == 'userland': if dirpath_relative_root_components[0] == 'userland':
if dirpath_relative_root_components_len > 1: if dirpath_relative_root_components_len > 1:

View File

@@ -344,7 +344,6 @@ path_properties_tuples = (
{ {
'allowed_archs': {'arm'}, 'allowed_archs': {'arm'},
'cc_flags': [ 'cc_flags': [
'-Xassembler', '-mcpu=cortex-a72', LF,
# To prevent: # To prevent:
# > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode # > vfp.S: Error: selected processor does not support <FPU instruction> in ARM mode
# https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732 # https://stackoverflow.com/questions/41131432/cross-compiling-error-selected-processor-does-not-support-fmrx-r3-fpexc-in/52875732#52875732
@@ -383,7 +382,9 @@ path_properties_tuples = (
} }
), ),
'aarch64': ( 'aarch64': (
{'allowed_archs': {'aarch64'}}, {
'allowed_archs': {'aarch64'},
},
{ {
'inline_asm': ( 'inline_asm': (
{ {
@@ -399,6 +400,7 @@ path_properties_tuples = (
'signal_generated_by_os': True, 'signal_generated_by_os': True,
'signal_received': signal.Signals.SIGILL, 'signal_received': signal.Signals.SIGILL,
}, },
'sve.S': {'gem5_unimplemented_instruction': True}
} }
), ),
'x86_64': ( 'x86_64': (

15
run
View File

@@ -576,6 +576,7 @@ Extra options to append at the end of the emulator command line.
qemu_user_and_system_options + qemu_user_and_system_options +
debug_args debug_args
) )
cpu = 'max'
else: else:
extra_emulator_args.extend(extra_qemu_args) extra_emulator_args.extend(extra_qemu_args)
self.make_run_dirs() self.make_run_dirs()
@@ -594,9 +595,11 @@ Extra options to append at the end of the emulator command line.
serial_monitor = ['-serial', serial, LF] serial_monitor = ['-serial', serial, LF]
if self.env['kvm']: if self.env['kvm']:
extra_emulator_args.extend([ extra_emulator_args.extend([
'-cpu', 'host', LF,
'-enable-kvm', LF, '-enable-kvm', LF,
]) ])
cpu = 'host'
else:
cpu = 'max'
extra_emulator_args.extend([ extra_emulator_args.extend([
'-serial', '-serial',
'tcp::{},server,nowait'.format(self.env['extra_serial_port']), LF 'tcp::{},server,nowait'.format(self.env['extra_serial_port']), LF
@@ -706,19 +709,15 @@ Extra options to append at the end of the emulator command line.
]) ])
elif self.env['is_arm']: elif self.env['is_arm']:
extra_emulator_args.extend(['-semihosting', LF]) extra_emulator_args.extend(['-semihosting', LF])
if self.env['arch'] == 'arm':
cpu = 'cortex-a15'
else:
cpu = 'cortex-a57'
append = ['-append', '{} {}'.format(root, kernel_cli), LF] append = ['-append', '{} {}'.format(root, kernel_cli), LF]
cmd.extend( cmd.extend(
[
'-cpu', cpu, LF,
] +
virtio_gpu_pci virtio_gpu_pci
) )
if self.env['baremetal'] is None: if self.env['baremetal'] is None:
cmd.extend(append) cmd.extend(append)
extra_emulator_args.extend([
'-cpu', cpu, LF,
])
if self.env['tmux']: if self.env['tmux']:
tmux_args = '--run-id {}'.format(self.env['run_id']) tmux_args = '--run-id {}'.format(self.env['run_id'])
if self.env['tmux_program'] == 'shell': if self.env['tmux_program'] == 'shell':

View File

@@ -0,0 +1,49 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#arm-sve */
#include <lkmc.h>
/* Input data for the daxpy call below: x, y and y_expect each hold 4 doubles (0x20 bytes). */
.data
x: .double 1.5, 2.5, 3.5, 4.5
y: .double 5.0, 6.0, 7.0, 8.0
/* Expected contents of y after y += a * x, e.g. 5.0 + 2.0 * 1.5 = 8.0. */
y_expect: .double 8.0, 11.0, 14.0, 17.0
a: .double 2.0
/* Number of elements in x and y. */
n: .word 4
LKMC_PROLOGUE
/* Every argument is passed by address, including the scalars a and n. */
adr x0, x
adr x1, y
adr x2, a
adr x3, n
bl daxpy
/* daxpy stores its result into y: compare all 0x20 bytes against y_expect. */
LKMC_ASSERT_MEMCMP(y, y_expect, =0x20)
LKMC_EPILOGUE
/* Multiply by a scalar and add.
*
* Operation:
*
* Y += a * X
*
* C signature:
*
* void daxpy(double *x, double *y, double *a, int *n)
*
* The name "daxpy" comes from LAPACK:
* http://www.netlib.org/lapack/explore-html/de/da4/group__double__blas__level1_ga8f99d6a644d3396aa32db472e0cfc91c.html
*
* Adapted from: https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf
*/
daxpy:
/* x3 arrives as a pointer to n: replace it with the sign-extended 32-bit value. */
ldrsw x3, [x3]
/* x4 is the element index i, starting at 0. */
mov x4, 0
/* p0 = predicate with the 64-bit lanes active for which i + lane < n. */
whilelt p0.d, x4, x3
/* Load the scalar a and replicate it into the active lanes of z0. */
ld1rd z0.d, p0/z, [x2]
.loop:
/* Load the next chunk of x and y under the predicate.
 * lsl 3 scales the element index by 8, the size of a double. */
ld1d z1.d, p0/z, [x0, x4, lsl 3]
ld1d z2.d, p0/z, [x1, x4, lsl 3]
/* z2 += z1 * z0 on the active lanes, i.e. y[i] += x[i] * a. */
fmla z2.d, p0/m, z1.d, z0.d
/* Write the updated chunk back to y. */
st1d z2.d, p0, [x1, x4, lsl 3]
/* Advance i by the number of 64-bit lanes in one vector,
 * so the code never hard-codes the vector width. */
incd x4
/* Recompute the predicate for the remaining elements and keep
 * looping while its first lane is still active. */
whilelt p0.d, x4, x3
b.first .loop
ret

View File

@@ -1,4 +1,4 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#cmpxchg-instruction */ /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-cmpxchg-instruction */
#include <lkmc.h> #include <lkmc.h>
@@ -24,5 +24,4 @@ LKMC_PROLOGUE
LKMC_ASSERT_EQ(%rax, $0) LKMC_ASSERT_EQ(%rax, $0)
LKMC_ASSERT_EQ(%r13, $2) LKMC_ASSERT_EQ(%r13, $2)
LKMC_ASSERT_EQ(%r14, $2) LKMC_ASSERT_EQ(%r14, $2)
LKMC_EPILOGUE LKMC_EPILOGUE

View File

@@ -1,5 +1,4 @@
// https://github.com/cirosantilli/linux-kernel-module-cheat#cpp // https://github.com/cirosantilli/linux-kernel-module-cheat#cpp
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
// //
// The non-atomic counters have undefined values which get printed: // The non-atomic counters have undefined values which get printed:
// they are extremely likely to be less than the correct value due to // they are extremely likely to be less than the correct value due to
@@ -15,7 +14,6 @@
// On GCC 4.8 x86-64, using atomic offered a 5x performance improvement // On GCC 4.8 x86-64, using atomic offered a 5x performance improvement
// over the same program with mutexes. // over the same program with mutexes.
#if __cplusplus >= 201103L #if __cplusplus >= 201103L
#include <atomic> #include <atomic>
#include <cassert> #include <cassert>
@@ -24,7 +22,7 @@
#include <vector> #include <vector>
std::atomic_ulong my_atomic_ulong(0); std::atomic_ulong my_atomic_ulong(0);
unsigned long my_non_atomic_ulong = 0; unsigned long my_non_atomic_ulong = 0;
#if defined(__x86_64__) #if defined(__x86_64__) || defined(__aarch64__)
unsigned long my_arch_atomic_ulong = 0; unsigned long my_arch_atomic_ulong = 0;
unsigned long my_arch_non_atomic_ulong = 0; unsigned long my_arch_non_atomic_ulong = 0;
#endif #endif
@@ -41,6 +39,7 @@ void threadMain() {
: :
: :
); );
// https://github.com/cirosantilli/linux-kernel-module-cheat#x86-lock-prefix
__asm__ __volatile__ ( __asm__ __volatile__ (
"lock;" "lock;"
"incq %0;" "incq %0;"
@@ -48,6 +47,21 @@ void threadMain() {
: :
: :
); );
#elif defined(__aarch64__)
__asm__ __volatile__ (
"add %0, %0, 1;"
: "+r" (my_arch_non_atomic_ulong)
:
:
);
// https://github.com/cirosantilli/linux-kernel-module-cheat#arm-lse
__asm__ __volatile__ (
"ldadd %[inc], xzr, [%[addr]];"
: "=m" (my_arch_atomic_ulong)
: [inc] "r" (1),
[addr] "r" (&my_arch_atomic_ulong)
:
);
#endif #endif
} }
} }
@@ -75,7 +89,7 @@ int main(int argc, char **argv) {
// We can also use the atomics directly through `operator T` conversion. // We can also use the atomics directly through `operator T` conversion.
assert(my_atomic_ulong == my_atomic_ulong.load()); assert(my_atomic_ulong == my_atomic_ulong.load());
std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl; std::cout << "my_non_atomic_ulong " << my_non_atomic_ulong << std::endl;
#if defined(__x86_64__) #if defined(__x86_64__) || defined(__aarch64__)
assert(my_arch_atomic_ulong == nthreads * niters); assert(my_arch_atomic_ulong == nthreads * niters);
std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl; std::cout << "my_arch_non_atomic_ulong " << my_arch_non_atomic_ulong << std::endl;
#endif #endif