From 7cf3c20a40a985055bbf5d1edacabd6e129ca315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Sun, 12 May 2019 00:00:01 +0000 Subject: [PATCH] x86 assembly: addpd --- README.adoc | 10 ++++++++-- userland/arch/aarch64/simd.S | 4 ++-- userland/arch/x86_64/addpd.S | 32 ++++++++++++++++++++++++++++++++ userland/arch/x86_64/paddq.S | 8 +++++--- 4 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 userland/arch/x86_64/addpd.S diff --git a/README.adoc b/README.adoc index 55d5436..2d5052d 100644 --- a/README.adoc +++ b/README.adoc @@ -11578,7 +11578,12 @@ Sources: * link:userland/arch/arm/add.S[] * link:userland/arch/aarch64/add.S[] -This verifies that the venerable `add` instruction and our setup are working. +These examples use the venerable ADD instruction to: + +* introduce the basics of how a given assembly instruction works: how many inputs / outputs it has, which operands are inputs and which are outputs, whether it can use memory or just registers, etc. ++ +It is then a big copy paste for most other data instructions. +* verify that the venerable `add` instruction and our assertions are working Then, modify that program to make the assertion fail: @@ -11849,7 +11854,8 @@ History: * link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: 1997 * link:https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions[SSE]: Streaming SIMD Extensions. 1999. 128-bit XMM registers. * link:https://en.wikipedia.org/wiki/SSE2[SSE2]: 2004 -** link:userland/arch/x86_64/paddq.S[] +** link:userland/arch/x86_64/addpd.S[]: `ADDPS`, `ADDPD` +** link:userland/arch/x86_64/paddq.S[]: `PADDQ`, `PADDD`, `PADDW`, `PADDB` * link:https://en.wikipedia.org/wiki/SSE3[SSE3]: 2006 * link:https://en.wikipedia.org/wiki/SSE4[SSE4]: 2006 * link:https://en.wikipedia.org/wiki/Advanced_Vector_Extensions[AVX]: Advanced Vector Extensions. 2011. 256-bit YMM registers. Extension of XMM.
diff --git a/userland/arch/aarch64/simd.S b/userland/arch/aarch64/simd.S index 4ff9b39..4e306eb 100644 --- a/userland/arch/aarch64/simd.S +++ b/userland/arch/aarch64/simd.S @@ -35,7 +35,7 @@ ENTRY u64_1: .quad 0x1555555515555555, 0x1666666616666666 u64_sum_expect: .quad 0x0666666706666666, 0x0888888908888888 .bss - u64_sum: .skip 16 + u64_sum: .skip 16 .text adr x0, u64_0 ld1 {v0.2d}, [x0] @@ -56,7 +56,7 @@ ENTRY f32_1: .float 5.5, 6.5, 7.5, 8.5 f32_sum_expect: .float 7.0, 9.0, 11.0, 13.0 .bss - f32_sum: .skip 16 + f32_sum: .skip 16 .text adr x0, f32_0 ld1 {v0.4s}, [x0] diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S new file mode 100644 index 0000000..3fa3a5d --- /dev/null +++ b/userland/arch/x86_64/addpd.S @@ -0,0 +1,30 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd + * + * Add a bunch of floating point numbers in one go. + */ + +#include "common.h" + +ENTRY +.bss + output: .skip 16 + +.data + addps_input0: .float 1.5, 2.5, 3.5, 4.5 + addps_input1: .float 5.5, 6.5, 7.5, 8.5 + addps_expect: .float 7.0, 9.0, 11.0, 13.0 + addpd_input0: .double 1.5, 2.5 + addpd_input1: .double 5.5, 6.5 + addpd_expect: .double 7.0, 9.0 .text +#define TEST(size) \ + movups addp ## size ## _input0, %xmm0; \ + movups addp ## size ## _input1, %xmm1; \ + addp ## size %xmm1, %xmm0; \ + movups %xmm0, output; \ + ASSERT_MEMCMP(output, addp ## size ## _expect, $0x10) + + TEST(s) + TEST(d) +#undef TEST +EXIT diff --git a/userland/arch/x86_64/paddq.S b/userland/arch/x86_64/paddq.S index 9519bbd..9da3f46 100644 --- a/userland/arch/x86_64/paddq.S +++ b/userland/arch/x86_64/paddq.S @@ -1,6 +1,8 @@ /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-simd * * Add a bunch of integers in one go. + * + * The element size of each variant determines whether carries propagate across element boundaries or are discarded.
*/ #include "common.h" @@ -14,14 +16,14 @@ ENTRY paddd_expect: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09 paddq_expect: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A .bss - result: .skip 16 + output: .skip 16 .text movups input1, %xmm1 #define TEST(size) \ movups input0, %xmm0; \ padd ## size %xmm1, %xmm0; \ - movups %xmm0, result; \ - ASSERT_MEMCMP(result, padd ## size ## _expect, $0x10) + movups %xmm0, output; \ + ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10) TEST(b) TEST(w)