mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-25 03:01:36 +01:00
assembly SIMD add: make uniform for all ISAs, mark as entry point to learning SIMD
This commit is contained in:
19
README.adoc
19
README.adoc
@@ -11622,6 +11622,23 @@ Other infrastructure sanity checks that you might want to look into include:
|
||||
* `ASSERT_MEMCMP` tests
|
||||
** link:userland/arch/x86_64/lkmc_assert_memcmp_fail.S[]
|
||||
|
||||
=== Assembly SIMD
|
||||
|
||||
Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA:
|
||||
|
||||
* x86
|
||||
** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD`
|
||||
** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB`
|
||||
* arm
|
||||
** link:userland/arch/arm/vadd.S[]
|
||||
* aarch64
|
||||
** link:userland/arch/aarch64/add_vector.S[]
|
||||
** link:userland/arch/aarch64/fadd_vector.S[]
|
||||
|
||||
Then it is just a huge copy paste of infinite boring details:
|
||||
|
||||
* <<x86-simd>>
|
||||
|
||||
=== User vs system assembly
|
||||
|
||||
By "userland assembly", we mean "the parts of the ISA which can be freely used from userland".
|
||||
@@ -11854,8 +11871,6 @@ History:
|
||||
* link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: 1997
|
||||
* link:https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions[SSE]: Streaming SIMD Extensions. 1999. 128-bit XMM registers.
|
||||
* link:https://en.wikipedia.org/wiki/SSE2[SSE2]: 2004
|
||||
** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD`
|
||||
** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB`
|
||||
* link:https://en.wikipedia.org/wiki/SSE3[SSE3]: 2006
|
||||
* link:https://en.wikipedia.org/wiki/SSE4[SSE4]: 2006
|
||||
* link:https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX]: Advanced Vector Extensions. 2011. 256-bit YMM registers. Extension of XMM.
|
||||
|
||||
32
userland/arch/aarch64/add_vector.S
Normal file
32
userland/arch/aarch64/add_vector.S
Normal file
@@ -0,0 +1,32 @@
|
||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
|
||||
*
|
||||
* Add a bunch of integers in one go.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
ENTRY
|
||||
.data
|
||||
input0: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4
|
||||
input1: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515
|
||||
expect_4s: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09
|
||||
expect_2d: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A
|
||||
.bss
|
||||
output: .skip 16
|
||||
.text
|
||||
#define TEST(size) \
|
||||
adr x0, input0; \
|
||||
ld1 {v0. ## size}, [x0]; \
|
||||
adr x1, input1; \
|
||||
ld1 {v1. ## size}, [x1]; \
|
||||
add v2. ## size, v0. ## size, v1. ## size; \
|
||||
adr x0, output; \
|
||||
st1 {v2. ## size}, [x0]; \
|
||||
ASSERT_MEMCMP(output, expect_ ## size, 0x10)
|
||||
|
||||
/* 4x 32-bit */
|
||||
TEST(4s)
|
||||
/* 2x 64-bit */
|
||||
TEST(2d)
|
||||
#undef TEST
|
||||
EXIT
|
||||
34
userland/arch/aarch64/fadd_vector.S
Normal file
34
userland/arch/aarch64/fadd_vector.S
Normal file
@@ -0,0 +1,34 @@
|
||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
|
||||
*
|
||||
* Add a bunch of floating point numbers in one go.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
ENTRY
|
||||
.data
|
||||
input0_4s: .float 1.5, 2.5, 3.5, 4.5
|
||||
input1_4s: .float 5.5, 6.5, 7.5, 8.5
|
||||
expect_4s: .float 7.0, 9.0, 11.0, 13.0
|
||||
input0_2d: .double 1.5, 2.5
|
||||
input1_2d: .double 5.5, 6.5
|
||||
expect_2d: .double 7.0, 9.0
|
||||
.bss
|
||||
output: .skip 16
|
||||
.text
|
||||
#define TEST(size) \
|
||||
adr x0, input0_ ## size; \
|
||||
ld1 {v0. ## size}, [x0]; \
|
||||
adr x1, input1_ ## size; \
|
||||
ld1 {v1. ## size}, [x1]; \
|
||||
fadd v2. ## size, v0. ## size, v1. ## size; \
|
||||
adr x0, output; \
|
||||
st1 {v2. ## size}, [x0]; \
|
||||
ASSERT_MEMCMP(output, expect_ ## size, 0x10)
|
||||
|
||||
/* 4x 32-bit */
|
||||
TEST(4s)
|
||||
/* 2x 64-bit */
|
||||
TEST(2d)
|
||||
#undef TEST
|
||||
EXIT
|
||||
@@ -1,86 +0,0 @@
|
||||
/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
|
||||
|
||||
#include "common.h"
|
||||
|
||||
ENTRY
|
||||
/* 4x 32-bit integer add.
|
||||
*
|
||||
* s stands for single == 32 bits.
|
||||
*
|
||||
* 1 in ld1 means to load just one register, see:
|
||||
* https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving
|
||||
*/
|
||||
.data
|
||||
u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
|
||||
u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
|
||||
u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
|
||||
.bss
|
||||
u32_sum: .skip 16
|
||||
.text
|
||||
adr x0, u32_0
|
||||
ld1 {v0.4s}, [x0]
|
||||
adr x1, u32_1
|
||||
ld1 {v1.4s}, [x1]
|
||||
add v2.4s, v0.4s, v1.4s
|
||||
adr x0, u32_sum
|
||||
st1 {v2.4s}, [x0]
|
||||
ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
|
||||
|
||||
/* 2x 64-bit integer add.
|
||||
*
|
||||
* d stands for double == 64 bits.
|
||||
*/
|
||||
.data
|
||||
u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222
|
||||
u64_1: .quad 0x1555555515555555, 0x1666666616666666
|
||||
u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
|
||||
.bss
|
||||
u64_sum: .skip 16
|
||||
.text
|
||||
adr x0, u64_0
|
||||
ld1 {v0.2d}, [x0]
|
||||
adr x1, u64_1
|
||||
ld1 {v1.2d}, [x1]
|
||||
add v2.2d, v0.2d, v1.2d
|
||||
adr x0, u64_sum
|
||||
st1 {v2.2d}, [x0]
|
||||
ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
|
||||
|
||||
/* 4x 32-bit float add.
|
||||
*
|
||||
* The only difference from the integer version
|
||||
* is that we use fadd instead of add.
|
||||
*/
|
||||
.data
|
||||
f32_0: .float 1.5, 2.5, 3.5, 4.5
|
||||
f32_1: .float 5.5, 6.5, 7.5, 8.5
|
||||
f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
|
||||
.bss
|
||||
f32_sum: .skip 16
|
||||
.text
|
||||
adr x0, f32_0
|
||||
ld1 {v0.4s}, [x0]
|
||||
adr x1, f32_1
|
||||
ld1 {v1.4s}, [x1]
|
||||
fadd v2.4s, v0.4s, v1.4s
|
||||
adr x0, f32_sum
|
||||
st1 {v2.4s}, [x0]
|
||||
ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
|
||||
|
||||
/* 2x 64-bit float add. */
|
||||
.data
|
||||
f64_0: .double 1.5, 2.5
|
||||
f64_1: .double 5.5, 6.5
|
||||
f64_sum_expect: .double 7.0, 9.0
|
||||
.bss
|
||||
f64_sum: .skip 16
|
||||
.text
|
||||
adr x0, f64_0
|
||||
ld1 {v0.2d}, [x0]
|
||||
adr x1, f64_1
|
||||
ld1 {v1.2d}, [x1]
|
||||
fadd v2.2d, v0.2d, v1.2d
|
||||
adr x0, f64_sum
|
||||
st1 {v2.2d}, [x0]
|
||||
ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
|
||||
EXIT
|
||||
@@ -1,113 +0,0 @@
|
||||
/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
|
||||
|
||||
#include "common.h"
|
||||
|
||||
ENTRY
|
||||
/* vadd.u32
|
||||
*
|
||||
* Add 4x 32-bit unsigned integers in one go.
|
||||
*
|
||||
* q means 128-bits.
|
||||
*
|
||||
* u32 means that we treat memory as uint32_t types.
|
||||
*
|
||||
* 4 is deduced: in 128 bits you can fit 4 u32.
|
||||
*
|
||||
* Observe how the carry is propagated within u32 integers,
|
||||
* but not across them.
|
||||
*/
|
||||
.data
|
||||
u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
|
||||
u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
|
||||
u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
|
||||
.bss
|
||||
u32_sum: .skip 0x10
|
||||
.text
|
||||
ldr r0, =u32_0
|
||||
vld1.32 {q0}, [r0]
|
||||
ldr r0, =u32_1
|
||||
vld1.32 {q1}, [r0]
|
||||
vadd.u32 q2, q0, q1
|
||||
ldr r0, =u32_sum
|
||||
vst1.u32 {q2}, [r0]
|
||||
ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
|
||||
|
||||
/* vadd.u64: 2x 64-bit unsigned integer add. */
|
||||
.data
|
||||
u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222
|
||||
u64_1: .quad 0x1555555515555555, 0x1666666616666666
|
||||
u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
|
||||
.bss
|
||||
u64_sum: .skip 0x10
|
||||
.text
|
||||
ldr r0, =u64_0
|
||||
vld1.64 {q0}, [r0]
|
||||
ldr r0, =u64_1
|
||||
vld1.64 {q1}, [r0]
|
||||
vadd.u64 q2, q0, q1
|
||||
ldr r0, =u64_sum
|
||||
vst1.u64 {q2}, [r0]
|
||||
ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
|
||||
|
||||
/* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate
|
||||
* it from unsigned? I think signed and unsigned addition are identical
|
||||
* in two's complement, the only difference is overflow / carry detection
|
||||
* flags. But how do flags work when there are many values being added
|
||||
* at once?
|
||||
*/
|
||||
.data
|
||||
s64_0: .quad -1, -2
|
||||
s64_1: .quad -1, -2
|
||||
s64_sum_expect: .quad -2, -4
|
||||
.bss
|
||||
s64_sum: .skip 0x10
|
||||
.text
|
||||
ldr r0, =s64_0
|
||||
vld1.64 {q0}, [r0]
|
||||
ldr r0, =s64_1
|
||||
vld1.64 {q1}, [r0]
|
||||
vadd.s64 q2, q0, q1
|
||||
ldr r0, =s64_sum
|
||||
vst1.s64 {q2}, [r0]
|
||||
ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10)
|
||||
|
||||
/* vadd.f32: 4x 32-bit float add. */
|
||||
.data
|
||||
f32_0: .float 1.5, 2.5, 3.5, 4.5
|
||||
f32_1: .float 5.5, 6.5, 7.5, 8.5
|
||||
f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
|
||||
.bss
|
||||
f32_sum: .skip 0x10
|
||||
.text
|
||||
ldr r0, =f32_0
|
||||
vld1.32 {q0}, [r0]
|
||||
ldr r0, =f32_1
|
||||
vld1.32 {q1}, [r0]
|
||||
vadd.f32 q2, q0, q1
|
||||
ldr r0, =f32_sum
|
||||
vst1.32 {q2}, [r0]
|
||||
ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
|
||||
|
||||
/* vadd.f64: 2x 64-bit float add: appears not possible.
|
||||
*
|
||||
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
|
||||
*/
|
||||
.data
|
||||
f64_0: .double 1.5, 2.5
|
||||
f64_1: .double 5.5, 6.5
|
||||
f64_sum_expect: .double 7.0, 9.0
|
||||
.bss
|
||||
f64_sum: .skip 0x10
|
||||
.text
|
||||
ldr r0, =f64_0
|
||||
vld1.64 {q0}, [r0]
|
||||
ldr r0, =f64_1
|
||||
vld1.64 {q1}, [r0]
|
||||
#if 0
|
||||
/* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */
|
||||
vadd.f64 q2, q0, q1
|
||||
ldr r0, =f64_sum
|
||||
vst1.64 {q2}, [r0]
|
||||
ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
|
||||
#endif
|
||||
EXIT
|
||||
71
userland/arch/arm/vadd.S
Normal file
71
userland/arch/arm/vadd.S
Normal file
@@ -0,0 +1,71 @@
|
||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd */
|
||||
|
||||
#include "common.h"
|
||||
|
||||
.bss
|
||||
output: .skip 16
|
||||
ENTRY
|
||||
/* Integer. */
|
||||
.data
|
||||
input0_u: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4
|
||||
input1_u: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515
|
||||
expect_u_32: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09
|
||||
expect_u_64: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A
|
||||
.text
|
||||
#define TEST(size) \
|
||||
ldr r0, =input0_u; \
|
||||
vld1. ## size {q0}, [r0]; \
|
||||
ldr r0, =input1_u; \
|
||||
vld1. ## size {q1}, [r0]; \
|
||||
vadd.u ## size q2, q0, q1; \
|
||||
ldr r0, =output; \
|
||||
vst1.u ## size {q2}, [r0]; \
|
||||
ASSERT_MEMCMP(output, expect_u_ ## size, 0x10)
|
||||
|
||||
/* vadd.u32
|
||||
*
|
||||
* Add 4x 32-bit unsigned integers in one go.
|
||||
*
|
||||
* q means quad (128-bits)
|
||||
*
|
||||
* u32 means that we treat memory as uint32_t types.
|
||||
*
|
||||
* 4 is deduced: in 128 bits you can fit 4x u32.
|
||||
*/
|
||||
TEST(32)
|
||||
/* 2x 64-bit */
|
||||
TEST(64)
|
||||
#undef TEST
|
||||
|
||||
/* Floating point. */
|
||||
.data
|
||||
input0_f_32: .float 1.5, 2.5, 3.5, 4.5
|
||||
input1_f_32: .float 5.5, 6.5, 7.5, 8.5
|
||||
expect_f_32: .float 7.0, 9.0, 11.0, 13.0
|
||||
input0_f_64: .double 1.5, 2.5
|
||||
input1_f_64: .double 5.5, 6.5
|
||||
expect_f_64: .double 7.0, 9.0
|
||||
.text
|
||||
#define TEST(size) \
|
||||
ldr r0, =input0_f_ ## size; \
|
||||
vld1. ## size {q0}, [r0]; \
|
||||
ldr r0, =input1_f_ ## size; \
|
||||
vld1. ## size {q1}, [r0]; \
|
||||
vadd.f ## size q2, q0, q1; \
|
||||
ldr r0, =output; \
|
||||
vst1. ## size {q2}, [r0]; \
|
||||
ASSERT_MEMCMP(output, expect_f_ ## size, 0x10)
|
||||
|
||||
/* 4x 32-bit. */
|
||||
TEST(32)
|
||||
#if 0
|
||||
/* vadd.f64: 2x 64-bit float add: appears not possible.
|
||||
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
|
||||
*
|
||||
* Fails with:
|
||||
* bad type in Neon instruction -- `vadd.f64 q2,q0,q1'
|
||||
*/
|
||||
TEST(64)
|
||||
#endif
|
||||
#undef TEST
|
||||
EXIT
|
||||
@@ -23,7 +23,9 @@ ENTRY
|
||||
movups %xmm0, output; \
|
||||
ASSERT_MEMCMP(output, addp ## size ## _expect, $0x10)
|
||||
|
||||
/* 4x 32-bit */
|
||||
TEST(s)
|
||||
/* 2x 64-bit */
|
||||
TEST(d)
|
||||
#undef TEST
|
||||
EXIT
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd
|
||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
|
||||
*
|
||||
* Add a bunch of integers in one go.
|
||||
*
|
||||
@@ -25,9 +25,14 @@ ENTRY
|
||||
movups %xmm0, output; \
|
||||
ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10)
|
||||
|
||||
/* 16x 8-bit */
|
||||
TEST(b)
|
||||
/* 8x 16-bit */
|
||||
TEST(w)
|
||||
/* 4x 32-bit */
|
||||
/* 4x long */
|
||||
TEST(d)
|
||||
/* 2x 64-bit */
|
||||
TEST(q)
|
||||
#undef TEST
|
||||
EXIT
|
||||
|
||||
Reference in New Issue
Block a user