From 7cf3c20a40a985055bbf5d1edacabd6e129ca315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Sun, 12 May 2019 00:00:01 +0000 Subject: [PATCH] x86 assembly: addpd --- README.adoc | 10 ++++++++-- userland/arch/aarch64/simd.S | 4 ++-- userland/arch/x86_64/addpd.S | 32 ++++++++++++++++++++++++++++++++ userland/arch/x86_64/paddq.S | 8 +++++--- 4 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 userland/arch/x86_64/addpd.S diff --git a/README.adoc b/README.adoc index 55d5436..2d5052d 100644 --- a/README.adoc +++ b/README.adoc @@ -11578,7 +11578,12 @@ Sources: * link:userland/arch/arm/add.S[] * link:userland/arch/aarch64/add.S[] -This verifies that the venerable `add` instruction and our setup are working. +These examples use the venerable ADD instruction to: + +* introduce the basics of how a given assembly instruction works: how many inputs / outputs it has, which operands are inputs and which are outputs, whether it can use memory or just registers, etc. ++ +It is then a big copy paste for most other data instructions. +* verify that the venerable `add` instruction and our assertions are working Then, modify that program to make the assertion fail: @@ -11849,7 +11854,8 @@ History: * link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: 1997 * link:https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions[SSE]: Streaming SIMD Extensions. 1999. 128-bit XMM registers. * link:https://en.wikipedia.org/wiki/SSE2[SSE2]: 2004 -** link:userland/arch/x86_64/paddq.S[] +** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD` +** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB` * link:https://en.wikipedia.org/wiki/SSE3[SSE3]: 2006 * link:https://en.wikipedia.org/wiki/SSE4[SSE4]: 2006 * link:https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX]: Advanced Vector Extensions. 2011. 256-bit YMM registers. Extension of XMM.
diff --git a/userland/arch/aarch64/simd.S b/userland/arch/aarch64/simd.S index 4ff9b39..4e306eb 100644 --- a/userland/arch/aarch64/simd.S +++ b/userland/arch/aarch64/simd.S @@ -35,7 +35,7 @@ ENTRY u64_1: .quad 0x1555555515555555, 0x1666666616666666 u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888 .bss - u64_sum: .skip 16 + u64_sum: .skip 16 .text adr x0, u64_0 ld1 {v0.2d}, [x0] @@ -56,7 +56,7 @@ ENTRY f32_1: .float 5.5, 6.5, 7.5, 8.5 f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0 .bss - f32_sum: .skip 16 + f32_sum: .skip 16 .text adr x0, f32_0 ld1 {v0.4s}, [x0] diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S new file mode 100644 index 0000000..3fa3a5d --- /dev/null +++ b/userland/arch/x86_64/addpd.S @@ -0,0 +1,30 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd + * + * Add a bunch of floating point numbers in one go. + */ + +#include "common.h" + +ENTRY +.bss + output: .skip 16 + +.data + addps_input0: .float 1.5, 2.5, 3.5, 4.5 + addps_input1: .float 5.5, 6.5, 7.5, 8.5 + addps_expect: .float 7.0, 9.0, 11.0, 13.0 + addpd_input0: .double 1.5, 2.5 + addpd_input1: .double 5.5, 6.5 + addpd_expect: .double 7.0, 9.0 .text +#define TEST(size) \ + movups addp ## size ## _input0, %xmm0; \ + movups addp ## size ## _input1, %xmm1; \ + addp ## size %xmm1, %xmm0; \ + movups %xmm0, output; \ + ASSERT_MEMCMP(output, addp ## size ## _expect, $0x10) + + TEST(s) + TEST(d) +#undef TEST +EXIT diff --git a/userland/arch/x86_64/paddq.S b/userland/arch/x86_64/paddq.S index 9519bbd..9da3f46 100644 --- a/userland/arch/x86_64/paddq.S +++ b/userland/arch/x86_64/paddq.S @@ -1,6 +1,8 @@ /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd * * Add a bunch of integers in one go. + * + * The element size of each variant determines whether carries propagate across element boundaries or are discarded.
*/ #include "common.h" @@ -14,14 +16,14 @@ ENTRY paddd_expect: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09 paddq_expect: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A .bss - result: .skip 16 + output: .skip 16 .text movups input1, %xmm1 #define TEST(size) \ movups input0, %xmm0; \ padd ## size %xmm1, %xmm0; \ - movups %xmm0, result; \ - ASSERT_MEMCMP(result, padd ## size ## _expect, $0x10) + movups %xmm0, output; \ + ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10) TEST(b) TEST(w)