From 082901414a9f8b5dda040bc07b7d1e3cf74ebf0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= <ciro.santilli@gmail.com>
Date: Sat, 22 Jun 2019 00:00:01 +0000
Subject: [PATCH] x86 asm: move x87 FPU instructions from x86-assembly-cheat

---
 README.adoc                         | 52 +++++++++++++++++++++++++++++
 userland/arch/x86_64/fabs.S         | 24 +++++++++++++
 userland/arch/x86_64/fadd.S         | 40 ++++++++++++++++++++++
 userland/arch/x86_64/faddp.S        | 36 ++++++++++++++++++++
 userland/arch/x86_64/fchs.S         | 24 +++++++++++++
 userland/arch/x86_64/fild.S         | 16 +++++++++
 userland/arch/x86_64/fld1.S         | 12 +++++++
 userland/arch/x86_64/fldl_literal.S | 18 ++++++++++
 userland/arch/x86_64/fldz.S         | 12 +++++++
 userland/arch/x86_64/fscale.S       | 34 +++++++++++++++++++
 userland/arch/x86_64/fsqrt.S        | 26 +++++++++++++++
 userland/arch/x86_64/fxch.S         | 45 +++++++++++++++++++++++++
 12 files changed, 339 insertions(+)
 create mode 100644 userland/arch/x86_64/fabs.S
 create mode 100644 userland/arch/x86_64/fadd.S
 create mode 100644 userland/arch/x86_64/faddp.S
 create mode 100644 userland/arch/x86_64/fchs.S
 create mode 100644 userland/arch/x86_64/fild.S
 create mode 100644 userland/arch/x86_64/fld1.S
 create mode 100644 userland/arch/x86_64/fldl_literal.S
 create mode 100644 userland/arch/x86_64/fldz.S
 create mode 100644 userland/arch/x86_64/fscale.S
 create mode 100644 userland/arch/x86_64/fsqrt.S
 create mode 100644 userland/arch/x86_64/fxch.S
diff --git a/README.adoc b/README.adoc
index 0d86e36..85e2652 100644
--- a/README.adoc
+++ b/README.adoc
@@ -11939,6 +11939,23 @@ Then it is just a huge copy paste of infinite boring details:
 * <<x86-simd>>
 * <<arm-simd>>
 
+To debug these instructions, you can see the register values in GDB with:
+
+....
+info registers float
+....
+
+or alternatively with register names (here the ARMv8 V0 register):
+
+....
+print $v0
+....
+
+as mentioned at:
+
+* https://stackoverflow.com/questions/5429137/how-to-print-register-values-in-gdb/38036152#38036152
+* https://reverseengineering.stackexchange.com/questions/8992/floating-point-registers-on-arm/20623#20623
+
 Bibliography: https://stackoverflow.com/questions/1389712/getting-started-with-intel-x86-sse-simd-instructions/56409539#56409539
 
 === User vs system assembly
@@ -11995,6 +12012,7 @@ Examples under `arch/<arch>/c/` directories show to how use inline assembly from
 * x86_64
 ** link:userland/arch/x86_64/inline_asm/inc.c[]
 ** link:userland/arch/x86_64/inline_asm/add.c[]
+** link:userland/arch/x86_64/inline_asm/sqrt_x87.c[] Shows how to use the <<x86-x87-fpu-instructions>> from inline assembly. Bibliography: https://stackoverflow.com/questions/6514537/how-do-i-specify-immediate-floating-point-numbers-with-inline-assembly/52906126#52906126
 * arm
 ** link:userland/arch/arm/inline_asm/inc.c[]
 ** link:userland/arch/arm/inline_asm/inc_memory.c[]
@@ -12395,6 +12413,7 @@ Common combo with idiv 32-bit, which takes the input from `edx:eax`: so you need
 
 Has some Intel vs AT&T name overload hell:
 
+* https://stackoverflow.com/questions/6555094/what-does-cltq-do-in-assembly/45386217#45386217
 * https://stackoverflow.com/questions/17170388/trying-to-understand-the-assembly-instruction-cltd-on-x86/50315201#50315201
 * https://sourceware.org/binutils/docs/as/i386_002dMnemonics.html
 
@@ -12703,6 +12722,39 @@ There is also the `cpuinfo` command line tool that parses the CPUID instruction
 
 Old floating point unit that you should likely not use anymore, prefer instead the newer <<x86-simd>> instructions.
 
+* FPU basic examples, start here
+** link:userland/arch/x86_64/fadd.S[] FADD. The x87 FPU works on a stack of floating point numbers.
+** link:userland/arch/x86_64/faddp.S[] FADDP. Instructions with the P suffix also Pop the stack. This is often what you want for most computations, where the intermediate results don't matter.
+** link:userland/arch/x86_64/fldl_literal.S[] FLDL literal. As discussed at https://stackoverflow.com/questions/6514537/how-do-i-specify-immediate-floating-point-numbers-with-inline-assembly it does not seem possible to either:
+*** load floating point immediates into x86 x87 FPU registers
+*** encode floating point literals in x86 instructions, including MOV
+* Bulk instructions
+** link:userland/arch/x86_64/fabs.S[] FABS: absolute value: `ST0 = |ST0|`
+** link:userland/arch/x86_64/fchs.S[] FCHS: change sign: `ST0 = -ST0`
+** link:userland/arch/x86_64/fild.S[] FILD: Integer Load. Convert integer to float.
+** link:userland/arch/x86_64/fld1.S[] FLD1: Push 1.0 to ST0. CISC!
+** link:userland/arch/x86_64/fldz.S[] FLDZ: Push 0.0 to ST0.
+** link:userland/arch/x86_64/fscale.S[] FSCALE: `ST0 = ST0 * 2 ^ RoundTowardZero(ST1)`
+** link:userland/arch/x86_64/fsqrt.S[] FSQRT: square root
+** link:userland/arch/x86_64/fxch.S[] FXCH: swap ST0 and another register
+
+==== x86 x87 FPU vs SIMD
+
+http://stackoverflow.com/questions/1844669/benefits-of-x87-over-sse
+
+Modern x86 has two main ways of doing floating point operations:
+
+* <<x86-x87-fpu-instructions>>
+* <<x86-simd>>
+
+Advantages of FPU:
+
+* present in old CPUs, while SSE2 is only required in x86-64
+* contains some instructions not present in SSE, e.g. trigonometric
+* higher precision: the FPU works with the 80-bit Intel extended precision format, while SSE2 only does up to 64-bit operations despite having 128-bit registers
+
+In GCC, you can choose between them with `-mfpmath=`.
+
 === x86 SIMD
 
 History:
diff --git a/userland/arch/x86_64/fabs.S b/userland/arch/x86_64/fabs.S
new file mode 100644
index 0000000..5e0b339
--- /dev/null
+++ b/userland/arch/x86_64/fabs.S
@@ -0,0 +1,24 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_0:       .double  1.0
+    double_minus_1_0: .double -1.0
+LKMC_PROLOGUE
+    /* |-1| == 1: FABS must drop the sign of a negative ST0. */
+    fldl double_minus_1_0 /* ST0 == -1.0 */
+    fabs                  /* ST0 == 1.0 */
+    fldl double_1_0       /* ST0 == 1.0 (expected), ST1 == 1.0 (FABS result) */
+    fcomip %st(1)         /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+    finit                 /* Reinitialize the FPU so that the next check starts from an empty stack. */
+
+    /* |1| == 1: FABS must leave a positive ST0 unchanged. */
+    fldl double_1_0       /* ST0 == 1.0 */
+    fabs                  /* ST0 == 1.0 */
+    fldl double_1_0       /* ST0 == 1.0 (expected), ST1 == 1.0 (FABS result) */
+    fcomip %st(1)         /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+    finit                 /* Reinitialize the FPU again before leaving. */
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fadd.S b/userland/arch/x86_64/fadd.S
new file mode 100644
index 0000000..8daddc2
--- /dev/null
+++ b/userland/arch/x86_64/fadd.S
@@ -0,0 +1,40 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_5: .double 1.5
+    double_2_5: .double 2.5
+    double_4_0: .double 4.0
+LKMC_PROLOGUE
+    /* FLDL pushes a 64-bit floating point value
+     * from memory onto the FPU register stack. */
+    fldl double_1_5
+    /* FPU stack after operation:
+     * ST0 == 1.5 */
+
+    fldl double_2_5
+    /* FPU stack after operation:
+     * ST0 == 2.5
+     * ST1 == 1.5 */
+
+    /* ST1 = ST1 + ST0: in AT&T syntax the destination is the last operand, so ST0 is left untouched. */
+    fadd %st, %st(1)
+    /* FPU stack after operation:
+     * ST0 == 2.5
+     * ST1 == 4.0 */
+
+    fldl double_4_0
+    /* FPU stack after operation:
+     * ST0 == 4.0
+     * ST1 == 2.5
+     * ST2 == 4.0 */
+
+    /* Compare ST0 (expected) with ST2 (the sum) into EFLAGS. FCOMI has no P suffix, so nothing is popped. */
+    fcomi %st(2)
+    /* FPU stack after operation:
+     * ST0 == 4.0
+     * ST1 == 2.5
+     * ST2 == 4.0 */
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/faddp.S b/userland/arch/x86_64/faddp.S
new file mode 100644
index 0000000..7c127a7
--- /dev/null
+++ b/userland/arch/x86_64/faddp.S
@@ -0,0 +1,36 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_5: .double 1.5
+    double_2_5: .double 2.5
+    double_4_0: .double 4.0
+LKMC_PROLOGUE
+    fldl double_1_5
+    /* FPU stack after operation:
+     * ST0 == 1.5 */
+
+    fldl double_2_5
+    /* FPU stack after operation:
+     * ST0 == 2.5
+     * ST1 == 1.5 */
+
+    /* ST1 = ST1 + ST0 (AT&T syntax: the destination is the last operand).
+     * Then pop ST0, so that the sum becomes the new ST0. */
+    faddp %st, %st(1)
+    /* FPU stack after operation:
+     * ST0 == 4.0 */
+
+    fldl double_4_0
+    /* FPU stack after operation:
+     * ST0 == 4.0
+     * ST1 == 4.0 */
+
+    /* Compare ST0 (expected) with ST1 (the sum) into EFLAGS.
+     * Then pop ST0. */
+    fcomip %st(1)
+    /* FPU stack after operation:
+     * ST0 == 4.0 */
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fchs.S b/userland/arch/x86_64/fchs.S
new file mode 100644
index 0000000..77f0219
--- /dev/null
+++ b/userland/arch/x86_64/fchs.S
@@ -0,0 +1,24 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1:       .double  1.0
+    double_minus_1: .double -1.0
+LKMC_PROLOGUE
+    /* -(1) == -1: FCHS must turn a positive ST0 negative. */
+    fldl double_1       /* ST0 == 1.0 */
+    fchs                /* ST0 == -1.0 */
+    fldl double_minus_1 /* ST0 == -1.0 (expected), ST1 == -1.0 (FCHS result) */
+    fcomip %st(1)       /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+    finit               /* Reinitialize the FPU so that the next check starts from an empty stack. */
+
+    /* -(-1) == 1: FCHS must turn a negative ST0 positive. */
+    fldl double_minus_1 /* ST0 == -1.0 */
+    fchs                /* ST0 == 1.0 */
+    fldl double_1       /* ST0 == 1.0 (expected), ST1 == 1.0 (FCHS result) */
+    fcomip %st(1)       /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+    finit               /* Reinitialize the FPU again before leaving. */
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fild.S b/userland/arch/x86_64/fild.S
new file mode 100644
index 0000000..655a4ac
--- /dev/null
+++ b/userland/arch/x86_64/fild.S
@@ -0,0 +1,16 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_10_0: .double 10.0 /* Expected result of the integer to float conversion. */
+.bss
+    double_10_0_2: .skip 8 /* NOTE(review): despite the name this holds an integer, and only 4 of the 8 bytes are accessed below. */
+LKMC_PROLOGUE
+    movl $10, double_10_0_2 /* Store the 32-bit integer 10 in memory. */
+    fildl double_10_0_2     /* Push that integer converted to floating point: ST0 == 10.0 */
+    fldl double_10_0        /* ST0 == 10.0 (expected), ST1 == 10.0 (FILD result) */
+    fcomip %st(1)           /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+    finit                   /* Reinitialize the FPU before leaving. */
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fld1.S b/userland/arch/x86_64/fld1.S
new file mode 100644
index 0000000..0c69cc0
--- /dev/null
+++ b/userland/arch/x86_64/fld1.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_0: .double 1.0 /* Reference value that FLD1 is checked against. */
+LKMC_PROLOGUE
+    fld1            /* Push the constant 1.0 without any memory operand: ST0 == 1.0 */
+    fldl double_1_0 /* ST0 == 1.0 (expected, from memory), ST1 == 1.0 (FLD1 result) */
+    fcomip %st(1)   /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fldl_literal.S b/userland/arch/x86_64/fldl_literal.S
new file mode 100644
index 0000000..4037acb
--- /dev/null
+++ b/userland/arch/x86_64/fldl_literal.S
@@ -0,0 +1,18 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_5: .double 1.5 /* Only referenced by the disabled code below. */
+.bss
+    double_1_5_2: .skip 8 /* Would receive the 1.5 literal if the MOVQ below could be assembled. */
+LKMC_PROLOGUE
+#if 0 /* Disabled: kept only to document what does not work. */
+    /* The floating point immediate is rejected at assembly time with: Error: junk `.5' after expression */
+    movq $1.5, double_1_5_2
+    fldl double_1_5
+    fldl double_1_5_2
+    fcomi %st(1)
+    LKMC_ASSERT(je)
+#endif
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fldz.S b/userland/arch/x86_64/fldz.S
new file mode 100644
index 0000000..75e63fc
--- /dev/null
+++ b/userland/arch/x86_64/fldz.S
@@ -0,0 +1,12 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_0_0: .double 0.0 /* Reference value that FLDZ is checked against. */
+LKMC_PROLOGUE
+    fldz            /* Push the constant 0.0 without any memory operand: ST0 == 0.0 */
+    fldl double_0_0 /* ST0 == 0.0 (expected, from memory), ST1 == 0.0 (FLDZ result) */
+    fcomip %st(1)   /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fscale.S b/userland/arch/x86_64/fscale.S
new file mode 100644
index 0000000..4348b4f
--- /dev/null
+++ b/userland/arch/x86_64/fscale.S
@@ -0,0 +1,34 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_0: .double 1.0
+    double_2_5: .double 2.5
+    double_4_0: .double 4.0
+LKMC_PROLOGUE
+    fldl double_4_0
+    # ST0 = 4.0 (expected result, it ends up in ST2 after the next two pushes)
+
+    fldl double_2_5
+    # ST0 = 2.5 (scale exponent, it ends up in ST1)
+    # ST1 = 4.0
+
+    fldl double_1_0
+    # ST0 = 1.0 (value to be scaled)
+    # ST1 = 2.5
+    # ST2 = 4.0
+
+    # ST0 = 1 * 2 ^ (RoundTowardZero(2.5))
+    #     = 1 * 2 ^ 2
+    #     = 4
+    fscale
+    # ST0 = 4.0
+    # ST1 = 2.5
+    # ST2 = 4.0
+
+    fcomip %st(2)
+    # ST0 = 2.5 (FCOMIP compared the old ST0 with ST2 and then popped it)
+    # ST1 = 4.0
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fsqrt.S b/userland/arch/x86_64/fsqrt.S
new file mode 100644
index 0000000..5d8504a
--- /dev/null
+++ b/userland/arch/x86_64/fsqrt.S
@@ -0,0 +1,26 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_1_41: .double 1.41
+    double_1_42: .double 1.42
+    double_2_0: .double 2.0
+    double_4_0: .double 4.0
+LKMC_PROLOGUE
+    /* sqrt(4) == 2 */
+    fldl double_4_0 /* ST0 == 4.0 */
+    fsqrt           /* ST0 == 2.0 */
+    fldl double_2_0 /* ST0 == 2.0 (expected), ST1 == 2.0 (FSQRT result) */
+    fcomip %st(1)   /* Compare ST0 with ST1 into EFLAGS, then pop ST0: the FSQRT result is ST0 again. */
+    LKMC_ASSERT(je)
+
+    /* 1.41 < sqrt(2) < 1.42, reusing the 2.0 left in ST0 by the previous check as the input. */
+    fsqrt            /* ST0 == sqrt(2.0) */
+    fldl double_1_41 /* ST0 == 1.41 (lower bound), ST1 == sqrt(2.0) */
+    fcomip %st(1)    /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(jbe) /* 1.41 <= sqrt(2.0) */
+    fldl double_1_42 /* ST0 == 1.42 (upper bound), ST1 == sqrt(2.0) */
+    fcomip %st(1)    /* Compare ST0 with ST1 into EFLAGS, then pop ST0. */
+    LKMC_ASSERT(jae) /* 1.42 >= sqrt(2.0) */
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/fxch.S b/userland/arch/x86_64/fxch.S
new file mode 100644
index 0000000..53dc417
--- /dev/null
+++ b/userland/arch/x86_64/fxch.S
@@ -0,0 +1,45 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-x87-fpu-instructions */
+
+#include <lkmc.h>
+
+.data
+    double_0_0: .double 0.0 /* NOTE(review): not referenced in this file, FLDZ is used instead. */
+    double_1_0: .double 1.0 /* NOTE(review): not referenced in this file, FLD1 is used instead. */
+LKMC_PROLOGUE
+    fldz
+    # ST0 = 0.0
+
+    fld1
+    # ST0 = 1.0
+    # ST1 = 0.0
+
+    # Swap ST0 and ST1: the 0.0 that was pushed first comes back to the top.
+    fxch %st(1)
+    # ST0 = 0.0
+    # ST1 = 1.0
+
+    fldz
+    # ST0 = 0.0 (expected value of the swapped top)
+    # ST1 = 0.0
+    # ST2 = 1.0
+
+    fcomip %st(1)
+    # ST0 = 0.0 (the expected value was compared with ST1 and then popped)
+    # ST1 = 1.0
+    LKMC_ASSERT(je)
+
+    # Swap ST0 and ST1 again, which restores the order from before the first swap.
+    fxch %st(1)
+    # ST0 = 1.0
+    # ST1 = 0.0
+
+    fld1
+    # ST0 = 1.0 (expected value of the swapped top)
+    # ST1 = 1.0
+    # ST2 = 0.0
+
+    fcomip %st(1)
+    # ST0 = 1.0 (the expected value was compared with ST1 and then popped)
+    # ST1 = 0.0
+    LKMC_ASSERT(je)
+LKMC_EPILOGUE