mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-23 02:05:57 +01:00
x86 asm: move the rest of SIMD from x86-assembly-cheat
This commit is contained in:
67
README.adoc
67
README.adoc
@@ -11927,7 +11927,7 @@ Let's start as usual with floating point addition + register file:
|
|||||||
Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA:
|
Much like ADD for non-SIMD, start learning SIMD instructions by looking at the integer and floating point SIMD ADD instructions of each ISA:
|
||||||
|
|
||||||
* x86
|
* x86
|
||||||
** <<x86-addpd-instruction>>
|
** <<x86-sse-packed-arithmetic-instructions,ADDPD>>
|
||||||
** <<x86-paddq-instruction>>
|
** <<x86-paddq-instruction>>
|
||||||
* arm
|
* arm
|
||||||
** <<arm-vadd-instruction>>
|
** <<arm-vadd-instruction>>
|
||||||
@@ -11959,6 +11959,28 @@ as mentioned at:
|
|||||||
|
|
||||||
Bibliography: https://stackoverflow.com/questions/1389712/getting-started-with-intel-x86-sse-simd-instructions/56409539#56409539
|
Bibliography: https://stackoverflow.com/questions/1389712/getting-started-with-intel-x86-sse-simd-instructions/56409539#56409539
|
||||||
|
|
||||||
|
==== FMA instruction
|
||||||
|
|
||||||
|
Fused multiply add:
|
||||||
|
|
||||||
|
* x86: <<x86-fma>>
|
||||||
|
|
||||||
|
Bibliography:
|
||||||
|
|
||||||
|
* https://en.wikipedia.org/wiki/Multiply–accumulate_operation
|
||||||
|
* https://en.wikipedia.org/wiki/FMA_instruction_set
|
||||||
|
|
||||||
|
Particularly important numerical analysis instruction, that is used in particular for:
|
||||||
|
|
||||||
|
* Dot product
|
||||||
|
* Matrix multiplication
|
||||||
|
|
||||||
|
FMA is so important that IEEE 754 specifies it with a single rounding step, which loses less precision than a separate multiply followed by an add!
|
||||||
|
|
||||||
|
Micro-op fun: http://stackoverflow.com/questions/28630864/how-is-fma-implemented
|
||||||
|
|
||||||
|
Historically, FMA instructions have been added relatively late to instruction sets.
|
||||||
|
|
||||||
=== User vs system assembly
|
=== User vs system assembly
|
||||||
|
|
||||||
By "userland assembly", we mean "the parts of the ISA which can be freely used from userland".
|
By "userland assembly", we mean "the parts of the ISA which can be freely used from userland".
|
||||||
@@ -12858,6 +12880,8 @@ In GCC, you can choose between them with `-mfpmath=`.
|
|||||||
|
|
||||||
=== x86 SIMD
|
=== x86 SIMD
|
||||||
|
|
||||||
|
Parent section: <<simd-assembly>>
|
||||||
|
|
||||||
History:
|
History:
|
||||||
|
|
||||||
* link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: MultiMedia eXtension (unofficial name). 1997. MM0-MM7 64-bit registers.
|
* link:https://en.wikipedia.org/wiki/MMX_(instruction_set)[MMX]: MultiMedia eXtension (unofficial name). 1997. MM0-MM7 64-bit registers.
|
||||||
@@ -12869,15 +12893,33 @@ History:
|
|||||||
* AVX2:2013
|
* AVX2:2013
|
||||||
* AVX-512: 2016. 512-bit ZMM registers. Extension of YMM.
|
* AVX-512: 2016. 512-bit ZMM registers. Extension of YMM.
|
||||||
|
|
||||||
|
==== x86 SSE instructions
|
||||||
|
|
||||||
|
<<intel-manual-1>> 5.5 "SSE INSTRUCTIONS"
|
||||||
|
|
||||||
|
===== x86 SSE data transfer instructions
|
||||||
|
|
||||||
|
<<intel-manual-1>> 5.5.1.1 "SSE Data Transfer Instructions"
|
||||||
|
|
||||||
|
* link:userland/arch/x86_64/movaps.S[]: MOVAPS: move 4 x 32-bits between two XMM registers or XMM registers and 16-byte aligned memory
|
||||||
|
* link:userland/arch/x86_64/movups.S[]: MOVUPS: like MOVAPS but also works for unaligned memory
|
||||||
|
* link:userland/arch/x86_64/movss.S[]: MOVSS: move 32-bits between two XMM registers or XMM registers and memory
|
||||||
|
|
||||||
|
===== x86 SSE packed arithmetic instructions
|
||||||
|
|
||||||
|
<<intel-manual-1>> 5.5.1.2 "SSE Packed Arithmetic Instructions"
|
||||||
|
|
||||||
|
* link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD: good first instruction to learn SIMD: <<simd-assembly>>
|
||||||
|
|
||||||
|
===== x86 SSE conversion instructions
|
||||||
|
|
||||||
|
<<intel-manual-1>> 5.5.1.6 "SSE Conversion Instructions"
|
||||||
|
|
||||||
==== x86 SSE2 instructions
|
==== x86 SSE2 instructions
|
||||||
|
|
||||||
<<intel-manual-1>> 5.6 "SSE2 INSTRUCTIONS"
|
<<intel-manual-1>> 5.6 "SSE2 INSTRUCTIONS"
|
||||||
|
|
||||||
===== x86 ADDPD instruction
|
* link:userland/arch/x86_64/cvttss2si.S[]: CVTTSS2SI: convert 32-bit floating point to 32-bit integer, store the result in a general purpose register. Round towards 0.
|
||||||
|
|
||||||
link:userland/arch/x86_64/addpd.S[]: ADDPS, ADDPD
|
|
||||||
|
|
||||||
Good first instruction to learn SIMD: <<simd-assembly>>
|
|
||||||
|
|
||||||
===== x86 PADDQ instruction
|
===== x86 PADDQ instruction
|
||||||
|
|
||||||
@@ -12885,6 +12927,17 @@ link:userland/arch/x86_64/paddq.S[]: PADDQ, PADDL, PADDW, PADDB
|
|||||||
|
|
||||||
Good first instruction to learn SIMD: <<simd-assembly>>
|
Good first instruction to learn SIMD: <<simd-assembly>>
|
||||||
|
|
||||||
|
[[x86-fma]]
|
||||||
|
==== x86 fused multiply add (FMA)
|
||||||
|
|
||||||
|
<<intel-manual-1>> 5.15 "FUSED-MULTIPLY-ADD (FMA)"
|
||||||
|
|
||||||
|
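* link:userland/arch/x86_64/vfmadd132pd.S[]: VFMADD132PD: "Multiply packed double-precision floating-point values from xmm1 and xmm3/mem, add to xmm2 and put result in xmm1." Note that the manual uses Intel syntax, where the destination is the first operand, while our example uses GNU GAS AT&T syntax, where the operand order is reversed and the destination comes last. This is why, experimentally on <<p51>> Ubuntu 19.04 host, the result of our example is stored in XMM2: it is the last AT&T operand, and therefore the manual's "xmm1".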
|
||||||
|
|
||||||
|
These instructions were not part of any SSEn set: they actually have a dedicated CPUID flag of their own! It appears under `/proc/cpuinfo` as `fma`. They were introduced into AVX512F however.
|
||||||
|
|
||||||
|
They are also unusual for x86 instructions in that they take 3 operands, as you would intuitively expect from the definition of FMA.
|
||||||
|
|
||||||
=== x86 system instructions
|
=== x86 system instructions
|
||||||
|
|
||||||
<<intel-manual-1>> 5.20 "SYSTEM INSTRUCTIONS"
|
<<intel-manual-1>> 5.20 "SYSTEM INSTRUCTIONS"
|
||||||
@@ -13630,6 +13683,8 @@ Why GNU GAS 2.29 does not have a mnemonic for it in A64 because it is very recen
|
|||||||
|
|
||||||
=== ARM SIMD
|
=== ARM SIMD
|
||||||
|
|
||||||
|
Parent section: <<simd-assembly>>
|
||||||
|
|
||||||
==== ARM VFP
|
==== ARM VFP
|
||||||
|
|
||||||
The name for the ARMv7 and AArch32 floating point and SIMD instructions / registers.
|
The name for the ARMv7 and AArch32 floating point and SIMD instructions / registers.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-addpd-instruction
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-packed-arithmetic-instructions
|
||||||
*
|
*
|
||||||
* Add a few floating point numbers in one go (P == packaged)
|
* Add a few floating point numbers in one go (P == packed).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <lkmc.h>
|
#include <lkmc.h>
|
||||||
|
|||||||
20
userland/arch/x86_64/cvttss2si.S
Normal file
20
userland/arch/x86_64/cvttss2si.S
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-conversion-instructions */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
.data
|
||||||
|
.align 16
|
||||||
|
input_2_5: .float 2.5
|
||||||
|
input_minus_2_5: .float -2.5
|
||||||
|
.text
|
||||||
|
/* Positive input. */
|
||||||
|
movss input_2_5, %xmm0
|
||||||
|
cvttss2si %xmm0, %eax
|
||||||
|
LKMC_ASSERT_EQ_32(%eax, $2)
|
||||||
|
|
||||||
|
/* Negative input. */
|
||||||
|
movss input_minus_2_5, %xmm0
|
||||||
|
cvttss2si %xmm0, %eax
|
||||||
|
LKMC_ASSERT_EQ_32(%eax, $-2)
|
||||||
|
LKMC_EPILOGUE
|
||||||
18
userland/arch/x86_64/movaps.S
Normal file
18
userland/arch/x86_64/movaps.S
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
.data
|
||||||
|
/* Ensure that the memory is 16-byte aligned. */
|
||||||
|
.align 16
|
||||||
|
input: .float 1.5, 2.5, 3.5, 4.5
|
||||||
|
.bss
|
||||||
|
.align 16
|
||||||
|
output: .skip 16
|
||||||
|
.text
|
||||||
|
movaps input, %xmm0
|
||||||
|
movaps %xmm0, %xmm1
|
||||||
|
movaps %xmm1, output
|
||||||
|
LKMC_ASSERT_MEMCMP(input, output, $16)
|
||||||
|
LKMC_EPILOGUE
|
||||||
14
userland/arch/x86_64/movss.S
Normal file
14
userland/arch/x86_64/movss.S
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
.data
|
||||||
|
input: .float 1.5
|
||||||
|
.bss
|
||||||
|
output: .skip 4
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
movss input, %xmm0
|
||||||
|
movss %xmm0, %xmm1
|
||||||
|
movss %xmm1, output
|
||||||
|
LKMC_ASSERT_MEMCMP(input, output, $4)
|
||||||
|
LKMC_EPILOGUE
|
||||||
16
userland/arch/x86_64/movups.S
Normal file
16
userland/arch/x86_64/movups.S
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-sse-data-transfer-instructions */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
.data
|
||||||
|
/* Unlike MOVAPS, we don't need to align memory here. */
|
||||||
|
input: .float 1.5, 2.5, 3.5, 4.5
|
||||||
|
.bss
|
||||||
|
output: .skip 16
|
||||||
|
.text
|
||||||
|
movups input, %xmm0
|
||||||
|
movups %xmm0, %xmm1
|
||||||
|
movups %xmm1, output
|
||||||
|
LKMC_ASSERT_MEMCMP(input, output, $16)
|
||||||
|
LKMC_EPILOGUE
|
||||||
23
userland/arch/x86_64/vfmadd132pd.S
Normal file
23
userland/arch/x86_64/vfmadd132pd.S
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-fma */
|
||||||
|
|
||||||
|
#include <lkmc.h>
|
||||||
|
|
||||||
|
LKMC_PROLOGUE
|
||||||
|
.data
|
||||||
|
.align 16
|
||||||
|
input0: .double 1.5, 2.5
|
||||||
|
input1: .double 2.0, 4.0
|
||||||
|
input2: .double 2.5, 3.5
|
||||||
|
expect: .double 6.5, 16.5
|
||||||
|
.bss
|
||||||
|
.align 16
|
||||||
|
output: .skip 16
|
||||||
|
.text
|
||||||
|
movaps input1, %xmm0
|
||||||
|
movaps input0, %xmm1
|
||||||
|
movaps input2, %xmm2
|
||||||
|
/* xmm2 = xmm1 + (xmm0 * xmm2) */
|
||||||
|
vfmadd132pd %xmm0, %xmm1, %xmm2
|
||||||
|
movaps %xmm2, output
|
||||||
|
LKMC_ASSERT_MEMCMP(output, expect, $0x10)
|
||||||
|
LKMC_EPILOGUE
|
||||||
Reference in New Issue
Block a user