From d62070d9344da84917fb46bd997cb7a4309ff576 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= <ciro.santilli@gmail.com>
Date: Sun, 23 Jun 2019 00:00:02 +0000
Subject: [PATCH] x86 asm: move the rest of SIMD from x86-assembly-cheat

---
 README.adoc                        | 67 +++++++++++++++++++++++++++---
 userland/arch/x86_64/addpd.S       |  4 +-
 userland/arch/x86_64/cvttss2si.S   | 20 +++++++++
 userland/arch/x86_64/movaps.S      | 18 ++++++++
 userland/arch/x86_64/movss.S       | 14 +++++++
 userland/arch/x86_64/movups.S      | 16 +++++++
 userland/arch/x86_64/vfmadd132pd.S | 23 ++++++++++
 7 files changed, 154 insertions(+), 8 deletions(-)
 create mode 100644 userland/arch/x86_64/cvttss2si.S
 create mode 100644 userland/arch/x86_64/movaps.S
 create mode 100644 userland/arch/x86_64/movss.S
 create mode 100644 userland/arch/x86_64/movups.S
 create mode 100644 userland/arch/x86_64/vfmadd132pd.S
diff --git a/README.adoc b/README.adoc
index cb1e328..7ba7724 100644
--- a/README.adoc
+++ b/README.adoc
@@ -11927,7 +11927,7 @@ Let's start as usual with floating point addition + register file:
 Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA:
 
 * x86
-** <<x86-addpd-instruction>>
+** <<x86-sse-packed-arithmetic-instructions,ADDPD>>
 ** <<x86-paddq-instruction>>
 * arm
 ** <<arm-vadd-instruction>>
@@ -11959,6 +11959,28 @@ as mentioned at:
 
 Bibliography: https://stackoverflow.com/questions/1389712/getting-started-with-intel-x86-sse-simd-instructions/56409539#56409539
 
+==== FMA instruction
+
+Fused multiply add:
+
+* x86: <<x86-fma>>
+
+Bibliography:
+
+* https://en.wikipedia.org/wiki/Multiply–accumulate_operation
+* https://en.wikipedia.org/wiki/FMA_instruction_set
+
+Particularly important numerical analysis instruction, which is used in particular for:
+
+* Dot product
+* Matrix multiplication
+
+FMA is so important that IEEE 754 specifies it with a single rounding step (a single precision drop), compared to the two roundings of a separate multiply followed by an add!
+
+Micro-op fun: http://stackoverflow.com/questions/28630864/how-is-fma-implemented
+
+Historically, FMA instructions have been added relatively late to instruction sets.
+
 === User vs system assembly
 
 By "userland assembly", we mean "the parts of the ISA which can be freely used from userland".
@@ -12858,6 +12880,8 @@ In GCC, you can choose between them with `-mfpmath=`.
 
 === x86 SIMD
 
+Parent section: <<simd-assembly>>
+
 History:
 
 * link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: MultiMedia eXtension (unofficial name). 1997. MM0-MM7 64-bit registers.
@@ -12869,15 +12893,33 @@ History:
 * AVX2:2013
 * AVX-512: 2016. 512-bit ZMM registers. Extension of YMM.
 
+==== x86 SSE instructions
+
+<<intel-manual-1>> 5.5 "SSE INSTRUCTIONS"
+
+===== x86 SSE data transfer instructions
+
+<<intel-manual-1>> 5.5.1.1 "SSE Data Transfer Instructions"
+
+* link:userland/arch/x86_64/movaps.S[]: MOVAPS: move 4 x 32-bits between two XMM registers or XMM registers and 16-byte aligned memory
+* link:userland/arch/x86_64/movups.S[]: MOVUPS: like MOVAPS but also works for unaligned memory
+* link:userland/arch/x86_64/movss.S[]: MOVSS: move 32-bits between two XMM registers or XMM registers and memory
+
+===== x86 SSE packed arithmetic instructions
+
+<<intel-manual-1>> 5.5.1.2 "SSE Packed Arithmetic Instructions"
+
+* link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD: good first instruction to learn SIMD: <<simd-assembly>>
+
+===== x86 SSE conversion instructions
+
+<<intel-manual-1>> 5.5.1.6 "SSE Conversion Instructions"
+
 ==== x86 SSE2 instructions
 
 <<intel-manual-1>> 5.6 "SSE2 INSTRUCTIONS"
 
-===== x86 ADDPD instruction
-
-link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD
-
-Good first instruction to learn SIMD: <<simd-assembly>>
+* link:userland/arch/x86_64/cvttss2si.S[]: CVTTSS2SI: convert 32-bit floating point to 32-bit integer, store the result in a general purpose register. Round towards 0.
 
 ===== x86 PADDQ instruction
 
@@ -12885,6 +12927,17 @@ link:userland/arch/x86_64/paddq.S[]: PADDQ, PADDL, PADDW, PADDB
 
 Good first instruction to learn SIMD: <<simd-assembly>>
 
+[[x86-fma]]
+==== x86 fused multiply add (FMA)
+
+<<intel-manual-1>> 5.15 "FUSED-MULTIPLY-ADD (FMA)"
+
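+* link:userland/arch/x86_64/vfmadd132pd.S[]: VFMADD132PD: "Multiply packed double-precision floating-point values from xmm1 and xmm3/mem, add to xmm2 and put result in xmm1." Note that the manual uses Intel syntax (destination first), while our example uses GNU GAS AT&T syntax, which reverses the operand order: the manual's xmm1 (first factor and destination) is the last AT&T operand. This is why in `vfmadd132pd %xmm0, %xmm1, %xmm2` the result is stored in XMM2, as observed experimentally on <<p51>> Ubuntu 19.04 host.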
+
+These instructions were not part of any SSEn set: they actually have a dedicated CPUID flag for them! It appears under `/proc/cpuinfo` as `fma`. They were introduced into AVX512F however.
+
+They are also unusual for x86 instructions in that they take 3 operands, as you would intuitively expect from the definition of FMA.
+
 === x86 system instructions
 
 <<intel-manual-1>> 5.20 "SYSTEM INSTRUCTIONS"
@@ -13630,6 +13683,8 @@ Why GNU GAS 2.29 does not have a mnemonic for it in A64 because it is very recen
 
 === ARM SIMD
 
+Parent section: <<simd-assembly>>
+
 ==== ARM VFP
 
 The name for the ARMv7 and AArch32 floating point and SIMD instructions / registers.
diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S
index e00ac79..2698e99 100644
--- a/userland/arch/x86_64/addpd.S
+++ b/userland/arch/x86_64/addpd.S
@@ -1,6 +1,6 @@
-/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-addpd-instruction
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-packed-arithmetic-instructions
  *
- * Add a few floating point numbers in one go (P == packaged)
+ * Add a few floating point numbers in one go (P == packed).
  */
 
 #include <lkmc.h>
diff --git a/userland/arch/x86_64/cvttss2si.S b/userland/arch/x86_64/cvttss2si.S
new file mode 100644
index 0000000..0b95a5f
--- /dev/null
+++ b/userland/arch/x86_64/cvttss2si.S
@@ -0,0 +1,20 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-conversion-instructions */
+
+#include <lkmc.h>
+
+LKMC_PROLOGUE
+.data
+    .balign 16
+    float_pos_2_5: .float 2.5
+    float_neg_2_5: .float -2.5
+.text
+    /* 2.5 is truncated toward zero: we get 2, not 3. */
+    movss float_pos_2_5, %xmm0
+    cvttss2si %xmm0, %eax
+    LKMC_ASSERT_EQ_32(%eax, $2)
+
+    /* -2.5 is truncated toward zero: we get -2, not -3. */
+    movss float_neg_2_5, %xmm0
+    cvttss2si %xmm0, %eax
+    LKMC_ASSERT_EQ_32(%eax, $-2)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/movaps.S b/userland/arch/x86_64/movaps.S
new file mode 100644
index 0000000..92cd0d6
--- /dev/null
+++ b/userland/arch/x86_64/movaps.S
@@ -0,0 +1,18 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
+
+#include <lkmc.h>
+
+LKMC_PROLOGUE
+.data
+    /* MOVAPS memory operands must sit on a 16-byte boundary. */
+    .balign 16
+    src_floats: .float 1.5, 2.5, 3.5, 4.5
+.bss
+    .balign 16
+    dst_floats: .space 16
+.text
+    movaps src_floats, %xmm0    /* Aligned memory -> register. */
+    movaps %xmm0, %xmm1         /* Register -> register. */
+    movaps %xmm1, dst_floats    /* Register -> aligned memory. */
+    LKMC_ASSERT_MEMCMP(src_floats, dst_floats, $16)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/movss.S b/userland/arch/x86_64/movss.S
new file mode 100644
index 0000000..9c02c4e
--- /dev/null
+++ b/userland/arch/x86_64/movss.S
@@ -0,0 +1,14 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
+
+#include <lkmc.h>
+
+.data
+    src_float: .float 1.5
+.bss
+    dst_float: .space 4
+LKMC_PROLOGUE /* NOTE(review): no explicit .text here; presumably the macro switches section -- confirm in lkmc.h. */
+    movss src_float, %xmm0    /* Memory -> register (single 32-bit float). */
+    movss %xmm0, %xmm1        /* Register -> register. */
+    movss %xmm1, dst_float    /* Register -> memory. */
+    LKMC_ASSERT_MEMCMP(src_float, dst_float, $4)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/movups.S b/userland/arch/x86_64/movups.S
new file mode 100644
index 0000000..850dd34
--- /dev/null
+++ b/userland/arch/x86_64/movups.S
@@ -0,0 +1,16 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
+
+#include <lkmc.h>
+
+LKMC_PROLOGUE
+.data
+    .balign 16; .byte 0 /* Deliberately push input 1 byte past a 16-byte boundary. */
+    input: .float 1.5, 2.5, 3.5, 4.5 /* Really unaligned: unlike MOVAPS, MOVUPS accepts this. */
+.bss
+    output: .skip 16 /* No alignment requested: MOVUPS does not need any. */
+.text
+    movups input, %xmm0     /* Unaligned memory -> register. */
+    movups %xmm0, %xmm1     /* Register -> register. */
+    movups %xmm1, output    /* Register -> memory. */
+    LKMC_ASSERT_MEMCMP(input, output, $16)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/vfmadd132pd.S b/userland/arch/x86_64/vfmadd132pd.S
new file mode 100644
index 0000000..a2dbc79
--- /dev/null
+++ b/userland/arch/x86_64/vfmadd132pd.S
@@ -0,0 +1,23 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-fma */
+
+#include <lkmc.h>
+
+LKMC_PROLOGUE
+.data
+    .balign 16
+    addend:   .double 1.5,  2.5
+    factor_a: .double 2.0,  4.0
+    factor_b: .double 2.5,  3.5
+    expect:   .double 6.5, 16.5
+.bss
+    .balign 16
+    fma_out: .space 16
+.text
+    movaps factor_b, %xmm2
+    movaps addend, %xmm1
+    movaps factor_a, %xmm0
+    /* AT&T order, destination last: xmm2 = (xmm2 * xmm0) + xmm1, per 64-bit lane. */
+    vfmadd132pd %xmm0, %xmm1, %xmm2
+    movaps %xmm2, fma_out
+    LKMC_ASSERT_MEMCMP(fma_out, expect, $0x10)
+LKMC_EPILOGUE