From d62070d9344da84917fb46bd997cb7a4309ff576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Sun, 23 Jun 2019 00:00:02 +0000 Subject: [PATCH] x86 asm: move the rest of SIMD from x86-assembly-cheat --- README.adoc | 67 +++++++++++++++++++++++++++--- userland/arch/x86_64/addpd.S | 4 +- userland/arch/x86_64/cvttss2si.S | 20 +++++++++ userland/arch/x86_64/movaps.S | 18 ++++++++ userland/arch/x86_64/movss.S | 14 +++++++ userland/arch/x86_64/movups.S | 16 +++++++ userland/arch/x86_64/vfmadd132pd.S | 23 ++++++++++ 7 files changed, 154 insertions(+), 8 deletions(-) create mode 100644 userland/arch/x86_64/cvttss2si.S create mode 100644 userland/arch/x86_64/movaps.S create mode 100644 userland/arch/x86_64/movss.S create mode 100644 userland/arch/x86_64/movups.S create mode 100644 userland/arch/x86_64/vfmadd132pd.S diff --git a/README.adoc b/README.adoc index cb1e328..7ba7724 100644 --- a/README.adoc +++ b/README.adoc @@ -11927,7 +11927,7 @@ Let's start as usual with floating point addition + register file: Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA: * x86 -** <> +** <> ** <> * arm ** <> @@ -11959,6 +11959,28 @@ as mentioned at: Bibliography: https://stackoverflow.com/questions/1389712/getting-started-with-intel-x86-sse-simd-instructions/56409539#56409539 +==== FMA instruction + +Fused multiply add: + +* x86: <> + +Bibliography: + +* https://en.wikipedia.org/wiki/Multiply–accumulate_operation +* https://en.wikipedia.org/wiki/FMA_instruction_set + +Particularly important numerical analysis instruction, which is used in particular for: + +* Dot product +* Matrix multiplication + +FMA is so important that IEEE 754 specifies it with a single rounding (one precision drop), compared to the two roundings of a separate multiply and add!
+ +Micro-op fun: http://stackoverflow.com/questions/28630864/how-is-fma-implemented + +Historically, FMA instructions have been added relatively late to instruction sets. + === User vs system assembly By "userland assembly", we mean "the parts of the ISA which can be freely used from userland". @@ -12858,6 +12880,8 @@ In GCC, you can choose between them with `-mfpmath=`. === x86 SIMD +Parent section: <> + History: * link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: MultiMedia eXtension (unofficial name). 1997. MM0-MM7 64-bit registers. @@ -12869,15 +12893,33 @@ History: * AVX2:2013 * AVX-512: 2016. 512-bit ZMM registers. Extension of YMM. +==== x86 SSE instructions + +<> 5.5 "SSE INSTRUCTIONS" + +===== x86 SSE data transfer instructions + +<> 5.5.1.1 "SSE Data Transfer Instructions" + +* link:userland/arch/x86_64/movaps.S[]: MOVAPS: move 4 x 32-bits between two XMM registers, or between an XMM register and 16-byte aligned memory +* link:userland/arch/x86_64/movups.S[]: MOVUPS: like MOVAPS but also works for unaligned memory +* link:userland/arch/x86_64/movss.S[]: MOVSS: move 32-bits between two XMM registers, or between an XMM register and memory + +===== x86 SSE packed arithmetic instructions + +<> 5.5.1.2 "SSE Packed Arithmetic Instructions" + +* link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD: good first instruction to learn SIMD: <> + +===== x86 SSE conversion instructions + +<> 5.5.1.6 "SSE Conversion Instructions" + ==== x86 SSE2 instructions <> 5.6 "SSE2 INSTRUCTIONS" -===== x86 ADDPD instruction - -link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD - -Good first instruction to learn SIMD: <> +* link:userland/arch/x86_64/cvttss2si.S[]: CVTTSS2SI: convert 32-bit floating point to 32-bit integer, store the result in a general purpose register. Round towards 0.
===== x86 PADDQ instruction @@ -12885,6 +12927,17 @@ link:userland/arch/x86_64/paddq.S[]: PADDQ, PADDL, PADDW, PADDB Good first instruction to learn SIMD: <> +[[x86-fma]] +==== x86 fused multiply add (FMA) + +<> 5.15 "FUSED-MULTIPLY-ADD (FMA)" + +* link:userland/arch/x86_64/vfmadd132pd.S[]: VFMADD132PD: "Multiply packed double-precision floating-point values from xmm1 and xmm3/mem, add to xmm2 and put result in xmm1." TODO: but I don't understand the manual, experimentally on <> Ubuntu 19.04 host the result is stored in XMM2! Likely explanation: the manual lists operands in Intel order (destination first), while the example is written in GNU GAS AT&T syntax (destination last), so the manual's "xmm1" is the last operand of the example, which is `%xmm2`. + +These instructions were not part of any SSEn set: they actually have a dedicated CPUID flag of their own! The flag appears under `/proc/cpuinfo` as `fma`. They were introduced into AVX512F however. + +They are also unusual for x86 instructions in that they take 3 operands, as you would intuitively expect from the definition of FMA. + === x86 system instructions <> 5.20 "SYSTEM INSTRUCTIONS" @@ -13630,6 +13683,8 @@ Why GNU GAS 2.29 does not have a mnemonic for it in A64 because it is very recen === ARM SIMD +Parent section: <> + ==== ARM VFP The name for the ARMv7 and AArch32 floating point and SIMD instructions / registers. diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S index e00ac79..2698e99 100644 --- a/userland/arch/x86_64/addpd.S +++ b/userland/arch/x86_64/addpd.S @@ -1,6 +1,6 @@ -/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-addpd-instruction +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-packed-arithmetic-instructions * - * Add a few floating point numbers in one go (P == packaged) + * Add a few floating point numbers in one go (P == packed).
*/ #include diff --git a/userland/arch/x86_64/cvttss2si.S b/userland/arch/x86_64/cvttss2si.S new file mode 100644 index 0000000..0b95a5f --- /dev/null +++ b/userland/arch/x86_64/cvttss2si.S @@ -0,0 +1,20 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-conversion-instructions */ + +#include + +LKMC_PROLOGUE +.data + .align 16 + input_2_5: .float 2.5 + input_minus_2_5: .float -2.5 +.text + /* Positive input: 2.5 is truncated towards zero to 2. */ + movss input_2_5, %xmm0 + cvttss2si %xmm0, %eax + LKMC_ASSERT_EQ_32(%eax, $2) + + /* Negative input: -2.5 is truncated towards zero to -2, not -3. */ + movss input_minus_2_5, %xmm0 + cvttss2si %xmm0, %eax + LKMC_ASSERT_EQ_32(%eax, $-2) +LKMC_EPILOGUE diff --git a/userland/arch/x86_64/movaps.S b/userland/arch/x86_64/movaps.S new file mode 100644 index 0000000..92cd0d6 --- /dev/null +++ b/userland/arch/x86_64/movaps.S @@ -0,0 +1,18 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */ + +#include + +LKMC_PROLOGUE +.data + /* Ensure that the memory is 16-byte aligned: MOVAPS needs this for its memory operands.
*/ + .align 16 + input: .float 1.5, 2.5, 3.5, 4.5 +.bss + .align 16 + output: .skip 16 +.text + movaps input, %xmm0 + movaps %xmm0, %xmm1 + movaps %xmm1, output + LKMC_ASSERT_MEMCMP(input, output, $16) +LKMC_EPILOGUE diff --git a/userland/arch/x86_64/movss.S b/userland/arch/x86_64/movss.S new file mode 100644 index 0000000..9c02c4e --- /dev/null +++ b/userland/arch/x86_64/movss.S @@ -0,0 +1,14 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */ + +#include + +.data + input: .float 1.5 +.bss + output: .skip 4 +LKMC_PROLOGUE + movss input, %xmm0 + movss %xmm0, %xmm1 + movss %xmm1, output + LKMC_ASSERT_MEMCMP(input, output, $4) +LKMC_EPILOGUE diff --git a/userland/arch/x86_64/movups.S b/userland/arch/x86_64/movups.S new file mode 100644 index 0000000..850dd34 --- /dev/null +++ b/userland/arch/x86_64/movups.S @@ -0,0 +1,16 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */ + +#include + +LKMC_PROLOGUE +.data + /* Unlike MOVAPS, we don't need to align memory here. */ + input: .float 1.5, 2.5, 3.5, 4.5 +.bss + output: .skip 16 +.text + movups input, %xmm0 + movups %xmm0, %xmm1 + movups %xmm1, output + LKMC_ASSERT_MEMCMP(input, output, $16) +LKMC_EPILOGUE diff --git a/userland/arch/x86_64/vfmadd132pd.S b/userland/arch/x86_64/vfmadd132pd.S new file mode 100644 index 0000000..a2dbc79 --- /dev/null +++ b/userland/arch/x86_64/vfmadd132pd.S @@ -0,0 +1,23 @@ +/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-fma */ + +#include + +LKMC_PROLOGUE +.data + .align 16 + input0: .double 1.5, 2.5 + input1: .double 2.0, 4.0 + input2: .double 2.5, 3.5 + expect: .double 6.5, 16.5 +.bss + .align 16 + output: .skip 16 +.text + movaps input1, %xmm0 + movaps input0, %xmm1 + movaps input2, %xmm2 + /* xmm2 = xmm1 + (xmm0 * xmm2): AT&T operand order, so the destination is the last operand. Per lane: 1.5 + 2.0 * 2.5 = 6.5 and 2.5 + 4.0 * 3.5 = 16.5. */ + vfmadd132pd %xmm0, %xmm1, %xmm2 + movaps %xmm2, output + LKMC_ASSERT_MEMCMP(output, expect, $0x10) +LKMC_EPILOGUE