assembly SIMD add: make uniform for all ISAs, mark as entry point to learning SIMD

This commit is contained in:
Ciro Santilli 六四事件 法轮功
2019-05-12 00:00:04 +00:00
parent b14f68f9bf
commit a13e99ec1c
8 changed files with 162 additions and 202 deletions

View File

@@ -11622,6 +11622,23 @@ Other infrastructure sanity checks that you might want to look into include:
* `ASSERT_MEMCMP` tests
** link:userland/arch/x86_64/lkmc_assert_memcmp_fail.S[]
=== Assembly SIMD
Just as with the plain non-SIMD ADD, start learning SIMD by looking at the integer and floating point SIMD ADD instructions of each ISA:
* x86
** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD`
** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB`
* arm
** link:userland/arch/arm/vadd.S[]
* aarch64
** link:userland/arch/aarch64/add_vector.S[]
** link:userland/arch/aarch64/fadd_vector.S[]
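These can all be run with the usual LKMC userland invocation, along the lines of (sketch based on the run commands used elsewhere in this README, not re-verified for this exact commit):
./run --arch aarch64 --userland userland/arch/aarch64/add_vector.S
and analogously for the other ISAs by swapping `--arch` and the file path.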
Then it is just a huge copy-paste of endless boring details:
* <<x86-simd>>
=== User vs system assembly
By "userland assembly", we mean "the parts of the ISA which can be freely used from userland".
@@ -11854,8 +11871,6 @@ History:
* link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: 1997
* link:https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions[SSE]: Streaming SIMD Extensions. 1999. 128-bit XMM registers.
* link:https://en.wikipedia.org/wiki/SSE2[SSE2]: 2000
** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD`
** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB`
* link:https://en.wikipedia.org/wiki/SSE3[SSE3]: 2004
* link:https://en.wikipedia.org/wiki/SSE4[SSE4]: 2006
* link:https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX]: Advanced Vector Extensions. 2011. 256-bit YMM registers. Extension of XMM.

View File

@@ -0,0 +1,32 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
*
* Add a bunch of integers in one go.
*/
#include "common.h"
ENTRY
.data
input0: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4
input1: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515
expect_4s: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09
expect_2d: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A
.bss
output: .skip 16
.text
#define TEST(size) \
adr x0, input0; \
ld1 {v0. ## size}, [x0]; \
adr x1, input1; \
ld1 {v1. ## size}, [x1]; \
add v2. ## size, v0. ## size, v1. ## size; \
adr x0, output; \
st1 {v2. ## size}, [x0]; \
ASSERT_MEMCMP(output, expect_ ## size, 0x10)
/* 4x 32-bit */
TEST(4s)
/* 2x 64-bit */
TEST(2d)
#undef TEST
EXIT
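Since the TEST macro builds its operands with C preprocessor token pasting (`v0. ## size` pastes into `v0.4s`), the result can be hard to visualize. TEST(4s) expands to (manual expansion, shown only for illustration):
adr x0, input0
ld1 {v0.4s}, [x0]
adr x1, input1
ld1 {v1.4s}, [x1]
add v2.4s, v0.4s, v1.4s
adr x0, output
st1 {v2.4s}, [x0]
ASSERT_MEMCMP(output, expect_4s, 0x10)
The two expected arrays differ only where a carry crosses a 32-bit boundary: 0xF1F1F1F1 + 0x12121212 = 0x104040403, which a 4s lane truncates to 0x04040403, while in the 2d case the carry out of the low 32 bits propagates into the upper half, e.g. 0xF2F2F2F2 + 0x13131313 + 1 = 0x06060606.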

View File

@@ -0,0 +1,34 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
*
* Add a bunch of floating point numbers in one go.
*/
#include "common.h"
ENTRY
.data
input0_4s: .float 1.5, 2.5, 3.5, 4.5
input1_4s: .float 5.5, 6.5, 7.5, 8.5
expect_4s: .float 7.0, 9.0, 11.0, 13.0
input0_2d: .double 1.5, 2.5
input1_2d: .double 5.5, 6.5
expect_2d: .double 7.0, 9.0
.bss
output: .skip 16
.text
#define TEST(size) \
adr x0, input0_ ## size; \
ld1 {v0. ## size}, [x0]; \
adr x1, input1_ ## size; \
ld1 {v1. ## size}, [x1]; \
fadd v2. ## size, v0. ## size, v1. ## size; \
adr x0, output; \
st1 {v2. ## size}, [x0]; \
ASSERT_MEMCMP(output, expect_ ## size, 0x10)
/* 4x 32-bit */
TEST(4s)
/* 2x 64-bit */
TEST(2d)
#undef TEST
EXIT
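Analogously, TEST(2d) here expands to (manual expansion):
adr x0, input0_2d
ld1 {v0.2d}, [x0]
adr x1, input1_2d
ld1 {v1.2d}, [x1]
fadd v2.2d, v0.2d, v1.2d
adr x0, output
st1 {v2.2d}, [x0]
ASSERT_MEMCMP(output, expect_2d, 0x10)
Unlike the integer version, this one needs separate input arrays per element size, because the IEEE 754 single and double encodings of the same value have different bit patterns.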

View File

@@ -1,86 +0,0 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
#include "common.h"
ENTRY
/* 4x 32-bit integer add.
*
* s stands for single == 32 bits.
*
* 1 in ld1 means to load just one register, see:
* https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving
*/
.data
u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
.bss
u32_sum: .skip 16
.text
adr x0, u32_0
ld1 {v0.4s}, [x0]
adr x1, u32_1
ld1 {v1.4s}, [x1]
add v2.4s, v0.4s, v1.4s
adr x0, u32_sum
st1 {v2.4s}, [x0]
ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
/* 2x 64-bit integer add.
*
* d stands for double == 64 bits.
*/
.data
u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222
u64_1: .quad 0x1555555515555555, 0x1666666616666666
u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
.bss
u64_sum: .skip 16
.text
adr x0, u64_0
ld1 {v0.2d}, [x0]
adr x1, u64_1
ld1 {v1.2d}, [x1]
add v2.2d, v0.2d, v1.2d
adr x0, u64_sum
st1 {v2.2d}, [x0]
ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
/* 4x 32-bit float add.
*
* The only difference from the integer version
* is that we use fadd instead of add.
*/
.data
f32_0: .float 1.5, 2.5, 3.5, 4.5
f32_1: .float 5.5, 6.5, 7.5, 8.5
f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
.bss
f32_sum: .skip 16
.text
adr x0, f32_0
ld1 {v0.4s}, [x0]
adr x1, f32_1
ld1 {v1.4s}, [x1]
fadd v2.4s, v0.4s, v1.4s
adr x0, f32_sum
st1 {v2.4s}, [x0]
ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
/* 2x 64-bit float add. */
.data
f64_0: .double 1.5, 2.5
f64_1: .double 5.5, 6.5
f64_sum_expect: .double 7.0, 9.0
.bss
f64_sum: .skip 16
.text
adr x0, f64_0
ld1 {v0.2d}, [x0]
adr x1, f64_1
ld1 {v1.2d}, [x1]
fadd v2.2d, v0.2d, v1.2d
adr x0, f64_sum
st1 {v2.2d}, [x0]
ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
EXIT

View File

@@ -1,113 +0,0 @@
/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */
#include "common.h"
ENTRY
/* vadd.u32
*
* Add 4x 32-bit unsigned integers in one go.
*
* q means 128-bits.
*
* u32 means that we treat memory as uint32_t types.
*
* 4 is deduced: in 128 bits you can fit 4 u32.
*
* Observe how the carry is propagated within u32 integers,
* but not across them.
*/
.data
u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444
u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888
u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC
.bss
u32_sum: .skip 0x10
.text
ldr r0, =u32_0
vld1.32 {q0}, [r0]
ldr r0, =u32_1
vld1.32 {q1}, [r0]
vadd.u32 q2, q0, q1
ldr r0, =u32_sum
vst1.u32 {q2}, [r0]
ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10)
/* vadd.u64: 2x 64-bit unsigned integer add. */
.data
u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222
u64_1: .quad 0x1555555515555555, 0x1666666616666666
u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888
.bss
u64_sum: .skip 0x10
.text
ldr r0, =u64_0
vld1.64 {q0}, [r0]
ldr r0, =u64_1
vld1.64 {q1}, [r0]
vadd.u64 q2, q0, q1
ldr r0, =u64_sum
vst1.u64 {q2}, [r0]
ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10)
/* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate
* it from unsigned? Signed and unsigned addition produce identical bit
* patterns in two's complement; the difference only shows up in the
* overflow / carry detection flags. But how do flags work when many
* values are added at once?
*/
.data
s64_0: .quad -1, -2
s64_1: .quad -1, -2
s64_sum_expect: .quad -2, -4
.bss
s64_sum: .skip 0x10
.text
ldr r0, =s64_0
vld1.64 {q0}, [r0]
ldr r0, =s64_1
vld1.64 {q1}, [r0]
vadd.s64 q2, q0, q1
ldr r0, =s64_sum
vst1.s64 {q2}, [r0]
ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10)
/* vadd.f32: 4x 32-bit float add. */
.data
f32_0: .float 1.5, 2.5, 3.5, 4.5
f32_1: .float 5.5, 6.5, 7.5, 8.5
f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0
.bss
f32_sum: .skip 0x10
.text
ldr r0, =f32_0
vld1.32 {q0}, [r0]
ldr r0, =f32_1
vld1.32 {q1}, [r0]
vadd.f32 q2, q0, q1
ldr r0, =f32_sum
vst1.32 {q2}, [r0]
ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10)
/* vadd.f64: 2x 64-bit float add: appears not possible.
*
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
*/
.data
f64_0: .double 1.5, 2.5
f64_1: .double 5.5, 6.5
f64_sum_expect: .double 7.0, 9.0
.bss
f64_sum: .skip 0x10
.text
ldr r0, =f64_0
vld1.64 {q0}, [r0]
ldr r0, =f64_1
vld1.64 {q1}, [r0]
#if 0
/* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */
vadd.f64 q2, q0, q1
ldr r0, =f64_sum
vst1.64 {q2}, [r0]
ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10)
#endif
EXIT

userland/arch/arm/vadd.S Normal file
View File

@@ -0,0 +1,71 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd */
#include "common.h"
.bss
output: .skip 16
ENTRY
/* Integer. */
.data
input0_u: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4
input1_u: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515
expect_u_32: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09
expect_u_64: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A
.text
#define TEST(size) \
ldr r0, =input0_u; \
vld1. ## size {q0}, [r0]; \
ldr r0, =input1_u; \
vld1. ## size {q1}, [r0]; \
vadd.u ## size q2, q0, q1; \
ldr r0, =output; \
vst1.u ## size {q2}, [r0]; \
ASSERT_MEMCMP(output, expect_u_ ## size, 0x10)
/* vadd.u32
*
* Add 4x 32-bit unsigned integers in one go.
*
* q means quad (128-bits)
*
* u32 means that we treat memory as uint32_t types.
*
* 4 is deduced: in 128 bits you can fit 4x u32.
*/
TEST(32)
/* 2x 64-bit */
TEST(64)
#undef TEST
/* Floating point. */
.data
input0_f_32: .float 1.5, 2.5, 3.5, 4.5
input1_f_32: .float 5.5, 6.5, 7.5, 8.5
expect_f_32: .float 7.0, 9.0, 11.0, 13.0
input0_f_64: .double 1.5, 2.5
input1_f_64: .double 5.5, 6.5
expect_f_64: .double 7.0, 9.0
.text
#define TEST(size) \
ldr r0, =input0_f_ ## size; \
vld1. ## size {q0}, [r0]; \
ldr r0, =input1_f_ ## size; \
vld1. ## size {q1}, [r0]; \
vadd.f ## size q2, q0, q1; \
ldr r0, =output; \
vst1. ## size {q2}, [r0]; \
ASSERT_MEMCMP(output, expect_f_ ## size, 0x10)
/* 4x 32-bit. */
TEST(32)
#if 0
/* vadd.f64: 2x 64-bit float add: appears not possible.
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers
*
* Fails with:
* bad type in Neon instruction -- `vadd.f64 q2,q0,q1'
*/
TEST(64)
#endif
#undef TEST
EXIT
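For reference, the first TEST(32) above expands to (manual expansion):
ldr r0, =input0_u
vld1.32 {q0}, [r0]
ldr r0, =input1_u
vld1.32 {q1}, [r0]
vadd.u32 q2, q0, q1
ldr r0, =output
vst1.u32 {q2}, [r0]
ASSERT_MEMCMP(output, expect_u_32, 0x10)
Note how ARMv7 NEON puts the element type as a suffix on the instruction (vadd.u32) applied to untyped q registers, rather than on each operand as in aarch64 (v2.4s). On the f64 limitation: the scalar VFP instruction vadd.f64 d2, d0, d1 does exist, but it adds a single pair of doubles, so it is not SIMD.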

View File

@@ -23,7 +23,9 @@ ENTRY
movups %xmm0, output; \
ASSERT_MEMCMP(output, addp ## size ## _expect, $0x10)
/* 4x 32-bit */
TEST(s)
/* 2x 64-bit */
TEST(d)
#undef TEST
EXIT
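The TEST macro here generates ADDPS and ADDPD; the core pattern is roughly the following (standalone sketch; the input label names are hypothetical, the real ones live in the macro body above this hunk):
movups addps_input0, %xmm0  /* unaligned 128-bit load */
movups addps_input1, %xmm1
addps %xmm1, %xmm0          /* 4x 32-bit float lane-wise add; addpd does 2x 64-bit */
movups %xmm0, output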

View File

@@ -1,4 +1,4 @@
/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd
*
* Add a bunch of integers in one go.
*
@@ -25,9 +25,14 @@ ENTRY
movups %xmm0, output; \
ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10)
/* 16x 8-bit */
TEST(b)
/* 8x 16-bit */
TEST(w)
/* 4x 32-bit */
TEST(d)
/* 2x 64-bit */
TEST(q)
#undef TEST
EXIT
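The integer TEST macro differs from the float one only in the element width at which carries are cut off; the generated pattern is roughly (sketch, hypothetical label names):
movups paddq_input0, %xmm0
movups paddq_input1, %xmm1
paddd %xmm1, %xmm0   /* 4x 32-bit lanes; paddb / paddw / paddq give 8-, 16- and 64-bit lanes */
movups %xmm0, output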