diff --git a/README.adoc b/README.adoc index bd09887..efeaa75 100644 --- a/README.adoc +++ b/README.adoc @@ -11622,6 +11622,23 @@ Other infrastructure sanity checks that you might want to look into include: * `ASSERT_MEMCMP` tests ** link:userland/arch/x86_64/lkmc_assert_memcmp_fail.S[] +=== Assembly SIMD + +Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA: + +* x86 +** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD` +** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDL`, `PADDW`, `PADDB` +* arm +** link:userland/arch/arm/vadd.S[] +* aarch64 +** link:userland/arch/aarch64/add_vector.S[] +** link:userland/arch/aarch64/fadd_vector.S[] + +Then it is just a huge copy paste of infinite boring details: + +* <> + === User vs system assembly By "userland assembly", we mean "the parts of the ISA which can be freely used from userland". @@ -11854,8 +11871,6 @@ History: * link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: 1997 * link:https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions[SSE]: Streaming SIMD Extensions. 1999. 128-bit XMM registers. * link:https://en.wikipedia.org/wiki/SSE2[SSE2]: 2004 -** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD` -** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDL`, `PADDW`, `PADDB` * link:https://en.wikipedia.org/wiki/SSE3[SSE3]: 2006 * link:https://en.wikipedia.org/wiki/SSE4[SSE4]: 2006 * link:https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX]: Advanced Vector Extensions. 2011. 256-bit YMM registers. Extension of XMM. diff --git a/userland/arch/aarch64/add_vector.S b/userland/arch/aarch64/add_vector.S new file mode 100644 index 0000000..25c8662 --- /dev/null +++ b/userland/arch/aarch64/add_vector.S @@ -0,0 +1,32 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd + * + * Add a bunch of integers in one go. 
+ */ + +#include "common.h" + +ENTRY +.data + input0: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4 + input1: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515 + expect_4s: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09 + expect_2d: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A +.bss + output: .skip 16 +.text +#define TEST(size) \ + adr x0, input0; \ + ld1 {v0. ## size}, [x0]; \ + adr x1, input1; \ + ld1 {v1. ## size}, [x1]; \ + add v2. ## size, v0. ## size, v1. ## size; \ + adr x0, output; \ + st1 {v2. ## size}, [x0]; \ + ASSERT_MEMCMP(output, expect_ ## size, 0x10) + + /* 4x 32-bit */ + TEST(4s) + /* 2x 64-bit */ + TEST(2d) +#undef TEST +EXIT diff --git a/userland/arch/aarch64/fadd_vector.S b/userland/arch/aarch64/fadd_vector.S new file mode 100644 index 0000000..0c507bd --- /dev/null +++ b/userland/arch/aarch64/fadd_vector.S @@ -0,0 +1,34 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd + * + * Add a bunch of floating point numbers in one go. + */ + +#include "common.h" + +ENTRY +.data + input0_4s: .float 1.5, 2.5, 3.5, 4.5 + input1_4s: .float 5.5, 6.5, 7.5, 8.5 + expect_4s: .float 7.0, 9.0, 11.0, 13.0 + input0_2d: .double 1.5, 2.5 + input1_2d: .double 5.5, 6.5 + expect_2d: .double 7.0, 9.0 +.bss + output: .skip 16 +.text +#define TEST(size) \ + adr x0, input0_ ## size; \ + ld1 {v0. ## size}, [x0]; \ + adr x1, input1_ ## size; \ + ld1 {v1. ## size}, [x1]; \ + fadd v2. ## size, v0. ## size, v1. ## size; \ + adr x0, output; \ + st1 {v2. ## size}, [x0]; \ + ASSERT_MEMCMP(output, expect_ ## size, 0x10) + + /* 4x 32-bit */ + TEST(4s) + /* 2x 64-bit */ + TEST(2d) +#undef TEST +EXIT diff --git a/userland/arch/aarch64/simd.S b/userland/arch/aarch64/simd.S deleted file mode 100644 index 4e306eb..0000000 --- a/userland/arch/aarch64/simd.S +++ /dev/null @@ -1,86 +0,0 @@ -/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */ - -#include "common.h" - -ENTRY - /* 4x 32-bit integer add. 
- * - * s stands for single == 32 bits. - * - * 1 in ld1 means to load just one register, see: - * https://github.com/cirosantilli/arm-assembly-cheat#simd-interleaving - */ -.data - u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444 - u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888 - u32_sum_expect: .word 0x06670666, 0x08890889, 0x0AAB0AAA, 0x0CCD0CCC -.bss - u32_sum: .skip 16 -.text - adr x0, u32_0 - ld1 {v0.4s}, [x0] - adr x1, u32_1 - ld1 {v1.4s}, [x1] - add v2.4s, v0.4s, v1.4s - adr x0, u32_sum - st1 {v2.4s}, [x0] - ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10) - - /* 2x 64-bit integer add. - * - * d stands for double == 64 bits. - */ -.data - u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222 - u64_1: .quad 0x1555555515555555, 0x1666666616666666 - u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888 -.bss - u64_sum: .skip 16 -.text - adr x0, u64_0 - ld1 {v0.2d}, [x0] - adr x1, u64_1 - ld1 {v1.2d}, [x1] - add v2.2d, v0.2d, v1.2d - adr x0, u64_sum - st1 {v2.2d}, [x0] - ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10) - - /* 4x 32-bit float add. - * - * The only difference between the integer point version - * is that we use fadd instead of add. - */ -.data - f32_0: .float 1.5, 2.5, 3.5, 4.5 - f32_1: .float 5.5, 6.5, 7.5, 8.5 - f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0 -.bss - f32_sum: .skip 16 -.text - adr x0, f32_0 - ld1 {v0.4s}, [x0] - adr x1, f32_1 - ld1 {v1.4s}, [x1] - fadd v2.4s, v0.4s, v1.4s - adr x0, f32_sum - st1 {v2.4s}, [x0] - ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10) - - /* 2x 64-bit float add. 
*/ -.data - f64_0: .double 1.5, 2.5 - f64_1: .double 5.5, 6.5 - f64_sum_expect: .double 7.0, 9.0 -.bss - f64_sum: .skip 16 -.text - adr x0, f64_0 - ld1 {v0.2d}, [x0] - adr x1, f64_1 - ld1 {v1.2d}, [x1] - fadd v2.2d, v0.2d, v1.2d - adr x0, f64_sum - st1 {v2.2d}, [x0] - ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10) -EXIT diff --git a/userland/arch/arm/simd.S b/userland/arch/arm/simd.S deleted file mode 100644 index ddec03d..0000000 --- a/userland/arch/arm/simd.S +++ /dev/null @@ -1,113 +0,0 @@ -/* https://github.com/cirosantilli/arm-assembly-cheat#advanced-simd-instructions */ - -#include "common.h" - -ENTRY - /* vadd.u32 - * - * Add 4x 32-bit unsigned integers in one go. - * - * q means 128-bits. - * - * u32 means that we treat memory as uint32_t types. - * - * 4 is deduced: in 128 bits you can fit 4 u32. - * - * Observe how the carry is propagated within u32 integers, - * but not across them. - */ -.data - u32_0: .word 0xF111F111, 0xF222F222, 0xF333F333, 0xF444F444 - u32_1: .word 0x15551555, 0x16661666, 0x17771777, 0x18881888 - u32_sum_expect: .word 0x06670666, 0x08890888, 0x0AAB0AAA, 0x0CCD0CCC -.bss - u32_sum: .skip 0x10 -.text - ldr r0, =u32_0 - vld1.32 {q0}, [r0] - ldr r0, =u32_1 - vld1.32 {q1}, [r0] - vadd.u32 q2, q0, q1 - ldr r0, =u32_sum - vst1.u32 {q2}, [r0] - ASSERT_MEMCMP(u32_sum, u32_sum_expect, 0x10) - - /* vadd.u64: 2x 64-bit unsigned integer add. */ -.data - u64_0: .quad 0xF1111111F1111111, 0xF2222222F2222222 - u64_1: .quad 0x1555555515555555, 0x1666666616666666 - u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888 -.bss - u64_sum: .skip 0x10 -.text - ldr r0, =u64_0 - vld1.64 {q0}, [r0] - ldr r0, =u64_1 - vld1.64 {q1}, [r0] - vadd.u64 q2, q0, q1 - ldr r0, =u64_sum - vst1.u64 {q2}, [r0] - ASSERT_MEMCMP(u64_sum, u64_sum_expect, 0x10) - - /* vadd.s64: 2x 64-bit signed integer add. TODO: how to differentiate - * it from signed? 
I think signed and unsigned addition are identical - * in two's complement, the only difference is overflow / carry detection - * flags. But how do flags work when there are many values being added - * at once? - */ -.data - s64_0: .quad -1, -2 - s64_1: .quad -1, -2 - s64_sum_expect: .quad -2, -4 -.bss - s64_sum: .skip 0x10 -.text - ldr r0, =s64_0 - vld1.64 {q0}, [r0] - ldr r0, =s64_1 - vld1.64 {q1}, [r0] - vadd.s64 q2, q0, q1 - ldr r0, =s64_sum - vst1.s64 {q2}, [r0] - ASSERT_MEMCMP(s64_sum, s64_sum_expect, 0x10) - - /* vadd.f32: 4x 32-bit float add. */ -.data - f32_0: .float 1.5, 2.5, 3.5, 4.5 - f32_1: .float 5.5, 6.5, 7.5, 8.5 - f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0 -.bss - f32_sum: .skip 0x10 -.text - ldr r0, =f32_0 - vld1.32 {q0}, [r0] - ldr r0, =f32_1 - vld1.32 {q1}, [r0] - vadd.f32 q2, q0, q1 - ldr r0, =f32_sum - vst1.32 {q2}, [r0] - ASSERT_MEMCMP(f32_sum, f32_sum_expect, 0x10) - - /* vadd.f64: 2x 64-bit float add: appears not possible. - * - * https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers - */ -.data - f64_0: .double 1.5, 2.5 - f64_1: .double 5.5, 6.5 - f64_sum_expect: .double 7.0, 9.0 -.bss - f64_sum: .skip 0x10 -.text - ldr r0, =f64_0 - vld1.64 {q0}, [r0] - ldr r0, =f64_1 - vld1.64 {q1}, [r0] -#if 0 - /* bad type in Neon instruction -- `vadd.f64 q2,q0,q1' */ - vadd.f64 q2, q0, q1 - ldr r0, =f64_sum - vst1.64 {q2}, [r0] - ASSERT_MEMCMP(f64_sum, f64_sum_expect, 0x10) -#endif -EXIT diff --git a/userland/arch/arm/vadd.S b/userland/arch/arm/vadd.S new file mode 100644 index 0000000..07fced4 --- /dev/null +++ b/userland/arch/arm/vadd.S @@ -0,0 +1,71 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd */ + +#include "common.h" + +.bss + output: .skip 16 +ENTRY + /* Integer. 
*/ +.data + input0_u: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4 + input1_u: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515 + expect_u_32: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09 + expect_u_64: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A +.text +#define TEST(size) \ + ldr r0, =input0_u; \ + vld1. ## size {q0}, [r0]; \ + ldr r0, =input1_u; \ + vld1. ## size {q1}, [r0]; \ + vadd.u ## size q2, q0, q1; \ + ldr r0, =output; \ + vst1.u ## size {q2}, [r0]; \ + ASSERT_MEMCMP(output, expect_u_ ## size, 0x10) + + /* vadd.u32 + * + * Add 4x 32-bit unsigned integers in one go. + * + * q means quad (128-bits) + * + * u32 means that we treat memory as uint32_t types. + * + * 4 is deduced: in 128 bits you can fit 4x u32. + */ + TEST(32) + /* 2x 64-bit */ + TEST(64) +#undef TEST + + /* Floating point. */ +.data + input0_f_32: .float 1.5, 2.5, 3.5, 4.5 + input1_f_32: .float 5.5, 6.5, 7.5, 8.5 + expect_f_32: .float 7.0, 9.0, 11.0, 13.0 + input0_f_64: .double 1.5, 2.5 + input1_f_64: .double 5.5, 6.5 + expect_f_64: .double 7.0, 9.0 +.text +#define TEST(size) \ + ldr r0, =input0_f_ ## size; \ + vld1. ## size {q0}, [r0]; \ + ldr r0, =input1_f_ ## size; \ + vld1. ## size {q1}, [r0]; \ + vadd.f ## size q2, q0, q1; \ + ldr r0, =output; \ + vst1. ## size {q2}, [r0]; \ + ASSERT_MEMCMP(output, expect_f_ ## size, 0x10) + + /* 4x 32-bit. */ + TEST(32) +#if 0 + /* vadd.f64: 2x 64-bit float add: appears not possible. 
* https://stackoverflow.com/questions/36052564/does-arm-support-simd-operations-for-64-bit-floating-point-numbers + * + * Fails with: + * bad type in Neon instruction -- `vadd.f64 q2,q0,q1' + */ + TEST(64) +#endif +#undef TEST +EXIT diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S index 2e9b32b..eefdfd2 100644 --- a/userland/arch/x86_64/addpd.S +++ b/userland/arch/x86_64/addpd.S @@ -23,7 +23,9 @@ ENTRY movups %xmm0, output; \ ASSERT_MEMCMP(output, addp ## size ## _expect, $0x10) + /* 4x 32-bit */ TEST(s) + /* 2x 64-bit */ TEST(d) #undef TEST EXIT diff --git a/userland/arch/x86_64/paddq.S b/userland/arch/x86_64/paddq.S index 9da3f46..07b4cbc 100644 --- a/userland/arch/x86_64/paddq.S +++ b/userland/arch/x86_64/paddq.S @@ -1,4 +1,4 @@ -/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd +/* https://github.com/cirosantilli/linux-kernel-module-cheat#assembly-simd * * Add a bunch of integers in one go. * @@ -25,9 +25,14 @@ ENTRY movups %xmm0, output; \ ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10) + /* 16x 8-bit */ TEST(b) + /* 8x 16-bit */ TEST(w) + /* 4x 32-bit */ + /* 4x long */ TEST(d) + /* 2x 64-bit */ TEST(q) #undef TEST EXIT