From 6a9299599e781b29abfce64e4923ab0af3ef731d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?=
 =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= <ciro.santilli@gmail.com>
Date: Wed, 19 Jun 2019 00:00:00 +0000
Subject: [PATCH] x86 asm: move string instructions from x86-assembly-cheat

---
 README.adoc                 | 131 ++++++++++++++++++++++++------------
 userland/arch/x86_64/cmps.S |  24 +++++++
 userland/arch/x86_64/lods.S |  16 +++++
 userland/arch/x86_64/movs.S |  22 ++++++
 userland/arch/x86_64/nop.S  |   2 +-
 userland/arch/x86_64/rep.S  |  73 ++++++++++++++++++++
 userland/arch/x86_64/scas.S |  25 +++++++
 userland/arch/x86_64/stos.S |  62 +++++++++++++++++
 8 files changed, 312 insertions(+), 43 deletions(-)
 create mode 100644 userland/arch/x86_64/cmps.S
 create mode 100644 userland/arch/x86_64/lods.S
 create mode 100644 userland/arch/x86_64/movs.S
 create mode 100644 userland/arch/x86_64/rep.S
 create mode 100644 userland/arch/x86_64/scas.S
 create mode 100644 userland/arch/x86_64/stos.S
diff --git a/README.adoc b/README.adoc
index a60b6e1..02e9866 100644
--- a/README.adoc
+++ b/README.adoc
@@ -12311,6 +12311,17 @@ When reading disassembly, many instructions have either a `.n` or `.w` suffix.
 
 Bibliography: https://stackoverflow.com/questions/27147043/n-suffix-to-branch-instruction
 
+=== NOP instructions
+
+* x86: link:userland/arch/x86_64/nop.S[NOP]
+* ARM: <<arm-nop-instruction>>
+
+No OPeration.
+
+Does nothing except take up one processor cycle and occupy some instruction memory.
+
+Applications: http://stackoverflow.com/questions/234906/whats-the-purpose-of-the-nop-opcode
+
 == x86 userland assembly
 
 Arch agnostic infrastructure getting started at: <<userland-assembly>>.
@@ -12354,29 +12365,29 @@ Bibliography:
 
 <<intel-manual-1>> 5.1.2 "Binary Arithmetic Instructions":
 
-* link:userland/arch/x86_64/add.S[ADD]
-** link:userland/arch/x86_64/inc.S[INC]
-** link:userland/arch/x86_64/adc.S[ADC]
-* link:userland/arch/x86_64/sub.S[SUB]
-** link:userland/arch/x86_64/dec.S[DEC]
-** link:userland/arch/x86_64/sbb.S[SBB]
-* link:userland/arch/x86_64/mul.S[MUL]
-** link:userland/arch/x86_64/neg.S[NEG]
-** link:userland/arch/x86_64/imul.S[IMUL]
-* link:userland/arch/x86_64/div.S[DIV]
-** link:userland/arch/x86_64/div_overflow.S[DIV overflow]
-** link:userland/arch/x86_64/div_zero.S[DIV zero]
-** link:userland/arch/x86_64/idiv.S[IDIV]
-* link:userland/arch/x86_64/cmp.S[CMP]
+* link:userland/arch/x86_64/add.S[]: ADD
+** link:userland/arch/x86_64/inc.S[]: INC
+** link:userland/arch/x86_64/adc.S[]: ADC
+* link:userland/arch/x86_64/sub.S[]: SUB
+** link:userland/arch/x86_64/dec.S[]: DEC
+** link:userland/arch/x86_64/sbb.S[]: SBB
+* link:userland/arch/x86_64/mul.S[]: MUL
+** link:userland/arch/x86_64/neg.S[]: NEG
+** link:userland/arch/x86_64/imul.S[]: IMUL
+* link:userland/arch/x86_64/div.S[]: DIV
+** link:userland/arch/x86_64/div_overflow.S[]: DIV overflow
+** link:userland/arch/x86_64/div_zero.S[]: DIV zero
+** link:userland/arch/x86_64/idiv.S[]: IDIV
+* link:userland/arch/x86_64/cmp.S[]: CMP
 
 === x86 logical instructions
 
 <<intel-manual-1>> 5.1.4 "Logical Instructions"
 
-* link:userland/arch/x86_64/and.S[AND]
-* link:userland/arch/x86_64/not.S[NOT]
-* link:userland/arch/x86_64/or.S[OR]
-* link:userland/arch/x86_64/xor.S[XOR]
+* link:userland/arch/x86_64/and.S[]: AND
+* link:userland/arch/x86_64/not.S[]: NOT
+* link:userland/arch/x86_64/or.S[]: OR
+* link:userland/arch/x86_64/xor.S[]: XOR
 
 === x86 shift and rotate instructions
 
@@ -12400,10 +12411,10 @@ Keeps the same sign on right shift.
 Not directly exposed in C, for which signed shift is undetermined behavior, but does exist in Java via the `>>>` operator. C compilers can omit it however.
 +
 SHL and SAL are exactly the same and have the same encoding: https://stackoverflow.com/questions/8373415/difference-between-shl-and-sal-in-80x86/56621271#56621271
-* link:userland/arch/x86_64/rol.S[ROL and ROR]
+* link:userland/arch/x86_64/rol.S[]: ROL and ROR
 +
 Rotates the bit that is going out around to the other side.
-* link:userland/arch/x86_64/rol.S[RCL and RCR]
+* link:userland/arch/x86_64/rol.S[]: RCL and RCR
 +
 Like ROL and ROR, but insert the carry bit instead, which effectively generates a rotation of 8 + 1 bits. TODO application.
 
@@ -12411,26 +12422,28 @@ Like ROL and ROR, but insert the carry bit instead, which effectively generates
 
 <<intel-manual-1>> 5.1.6 "Bit and Byte Instructions"
 
-* link:userland/arch/x86_64/bt.S[BT]
+* link:userland/arch/x86_64/bt.S[]: BT
 +
 Bit test: test if the Nth bit a bit of a register is set and store the result in the CF FLAG.
 +
 ....
 CF = reg[N]
 ....
-* link:userland/arch/x86_64/btr.S[BTR]
+* link:userland/arch/x86_64/btr.S[]: BTR
 +
 Do a BT and then set the bit to 0.
-* link:userland/arch/x86_64/btc.S[BTC]
+* link:userland/arch/x86_64/btc.S[]: BTC
 +
 Do a BT and then swap the value of the tested bit.
-* link:userland/arch/x86_64/setcc.S[SETcc]
+* link:userland/arch/x86_64/setcc.S[]: SETcc
 +
-Set a a byte of a register to 0 or 1 depending on the cc condition.
-* link:userland/arch/x86_64/popcnt.S[POPCNT]
+Set a byte of a register to 0 or 1 depending on the cc condition.
++
+Bibliography: https://stackoverflow.com/questions/1406783/how-to-read-and-write-x86-flags-registers-directly/30952577#30952577
+* link:userland/arch/x86_64/popcnt.S[]: POPCNT
 +
 Count the number of 1 bits.
-* link:userland/arch/x86_64/test.S[TEST]
+* link:userland/arch/x86_64/test.S[]: TEST
 +
 Like <<x86-binary-arithmetic-instructions,CMP>> but does AND instead of SUB:
 +
@@ -12442,12 +12455,12 @@ ZF = (!(X && Y)) ? 1 : 0
 
 <<intel-manual-1>> 5.1.7 "Control Transfer Instructions"
 
-* link:userland/arch/x86_64/jmp.S[JMP]
-** link:userland/arch/x86_64/jmp_indirect.S[JMP indirect]
+* link:userland/arch/x86_64/jmp.S[]: JMP
+** link:userland/arch/x86_64/jmp_indirect.S[]: JMP indirect
 
 ==== x86 Jcc instructions
 
-link:userland/arch/x86_64/jcc.S[Jcc]
+link:userland/arch/x86_64/jcc.S[]
 
 Jump if certain conditions of the flags register are met.
 
@@ -12472,29 +12485,61 @@ JG vs JA and JL vs JB:
 
 ==== x86 LOOP instruction
 
-link:userland/arch/x86_64/loop.S[LOOP]
+link:userland/arch/x86_64/loop.S[]
 
 Vs <<x86-jcc-instructions,Jcc>>: https://stackoverflow.com/questions/6805692/x86-assembly-programming-loops-with-ecx-and-loop-instruction-versus-jmp-jcond Holy CISC!
 
+==== x86 string instructions
+
+<<intel-manual-1>> 5.1.8 "String Instructions"
+
+These instructions do some operation on an array item, and automatically update the index to the next item:
+
+* First example explained in more detail
+** link:userland/arch/x86_64/stos.S[]: STOS: STOre String: store register to memory. STOSD is called STOSL in GNU GAS as usual: https://stackoverflow.com/questions/6211629/gcc-inline-assembly-error-no-such-instruction-stosd
+* Further examples
+** link:userland/arch/x86_64/cmps.S[]: CMPS: CoMPare Strings: compare two values in memory with addresses given by RSI and RDI. Could be used to implement `memcmp`. Store the result in the FLAGS register as usual, so that it can be checked e.g. with JZ.
+** link:userland/arch/x86_64/lods.S[]: LODS: LOaD String: load from memory to register.
+** link:userland/arch/x86_64/movs.S[]: MOVS: MOV String: move from one memory to another with addresses given by RSI and RDI. Could be used to implement `memcpy`.
+** link:userland/arch/x86_64/scas.S[]: SCAS: SCan String: compare memory to the value in a register. Could be used to implement `strchr`.
+
+The RSI and RDI registers are actually named after these instructions! S is the source of string instructions, D is the destination of string instructions.
+
+The direction of the index increment depends on the direction flag of the FLAGS register: 0 means forward and 1 means backward: https://stackoverflow.com/questions/9636691/what-are-cld-and-std-for-in-x86-assembly-language-what-does-df-do
+
+These instructions were originally developed to speed up "string" operations such as those present in the `<string.h>` header of the C standard library.
+
+However, as computer architecture evolved, those instructions might not offer considerable speedups anymore, and modern glibc such as 2.29 just uses <<x86-simd>> operations instead. See also: https://stackoverflow.com/questions/33480999/how-can-the-rep-stosb-instruction-execute-faster-than-the-equivalent-loop
+
+===== x86 REP prefix
+
+Example: link:userland/arch/x86_64/rep.S[]
+
+Repeat a string instruction RCX times.
+
+As the repetitions happen:
+
+* RCX decreases, until it reaches 0
+* RDI and RSI increase if the direction flag is 0, or decrease if it is 1
+
+The variants: REPZ, REPNZ (alias REPE, REPNE) repeat a given instruction until something happens.
+
+REPZ and REPNZ additionally stop early depending on the result of the comparison they repeat: REPZ stops when the compared values differ, and REPNZ stops when they are equal.
+
+* REP: INS, OUTS, MOVS, LODS, and STOS
+* REPZ: CMPS and SCAS
+
 === x86 miscellaneous instructions
 
 <<intel-manual-1>> 5.1.13 "Miscellaneous Instructions"
 
-==== x86 NOP instruction
-
-link:userland/arch/x86_64/nop.S[NOP]
-
-No OPeration.
-
-Does nothing except take up one processor cycle and occupy some instruction memory.
-
-Applications: http://stackoverflow.com/questions/234906/whats-the-purpose-of-the-nop-opcode
+NOP: <<nop-instructions>>
 
 === x86 random number generator instructions
 
 <<intel-manual-1>> 5.1.15 Random Number Generator Instructions
 
-Example: link:userland/arch/x86_64/rdrand.S[RDRAND]
+Example: link:userland/arch/x86_64/rdrand.S[]: RDRAND
 
 If you run that executable multiple times, it prints a random number every time to stdout.
 
@@ -12508,7 +12553,7 @@ RDRAND sets the carry flag when data is ready so we must loop if the carry flag
 
 ==== x86 CPUID instruction
 
-Example: link:userland/arch/x86_64/cpuid.S[CPUID]
+Example: link:userland/arch/x86_64/cpuid.S[]
 
 Fills EAX, EBX, ECX and EDX with CPU information.
 
@@ -13299,6 +13344,8 @@ See: <<arm-adr-instruction>>.
 
 ==== ARM NOP instruction
 
+Parent section: <<nop-instructions>>
+
 There are a few different ways to encode NOP, notably MOV a register into itself, and a dedicated miscellaneous instruction.
 
 Example: link:userland/arch/arm/nop.S[]
diff --git a/userland/arch/x86_64/cmps.S b/userland/arch/x86_64/cmps.S
new file mode 100644
index 0000000..b89ca8d
--- /dev/null
+++ b/userland/arch/x86_64/cmps.S
@@ -0,0 +1,24 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+/* Compare two quad arrays element by element with CMPSQ. */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array_1: .quad 1, 2
+    my_quad_array_2: .quad 1, 3
+LKMC_PROLOGUE
+    cld
+    lea my_quad_array_2(%rip), %rdi
+    lea my_quad_array_1(%rip), %rsi
+    xor %r12d, %r12d
+    xor %r13d, %r13d
+    cmpsq
+    sete %r12b
+    cmpsq
+    sete %r13b
+    /* First elements: 1 == 1, so ZF was set. */
+    LKMC_ASSERT_EQ(%r12, $1)
+    /* Second elements: 2 != 3, so ZF was clear. */
+    LKMC_ASSERT_EQ(%r13, $0)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/lods.S b/userland/arch/x86_64/lods.S
new file mode 100644
index 0000000..d2b12f0
--- /dev/null
+++ b/userland/arch/x86_64/lods.S
@@ -0,0 +1,16 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array: .quad 1, 2
+LKMC_PROLOGUE
+    cld                               /* DF = 0: LODSQ moves RSI forward. */
+    leaq my_quad_array(%rip), %rsi    /* RSI = address to load from. */
+    lodsq                             /* RAX = my_quad_array[0] */
+    movq %rax, %r12
+    lodsq                             /* RAX = my_quad_array[1] */
+    movq %rax, %r13
+    LKMC_ASSERT_EQ(%r12, $1)
+    LKMC_ASSERT_EQ(%r13, $2)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/movs.S b/userland/arch/x86_64/movs.S
new file mode 100644
index 0000000..74ab624
--- /dev/null
+++ b/userland/arch/x86_64/movs.S
@@ -0,0 +1,22 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+/* MOVSQ copies one quad from the address in RSI to the address in RDI,
+ * and then moves both RSI and RDI to the next quad (forward, since CLD
+ * clears DF). Two MOVSQ in a row therefore copy the 16 bytes of src to dest.
+ * C-style comments are used because this file goes through the C preprocessor. */
+
+#include <lkmc.h>
+
+.section .rodata
+    src: .quad 1, 2
+.bss
+    dest: .skip 16
+LKMC_PROLOGUE
+    cld
+    lea src(%rip), %rsi
+    lea dest(%rip), %rdi
+    movsq
+    movsq
+    LKMC_ASSERT_EQ(dest + 0, $1)
+    LKMC_ASSERT_EQ(dest + 8, $2)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/nop.S b/userland/arch/x86_64/nop.S
index 16a1960..c0da11d 100644
--- a/userland/arch/x86_64/nop.S
+++ b/userland/arch/x86_64/nop.S
@@ -1,4 +1,4 @@
-/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-nop-instruction */
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#nop-instructions */
 
 #include <lkmc.h>
 
diff --git a/userland/arch/x86_64/rep.S b/userland/arch/x86_64/rep.S
new file mode 100644
index 0000000..5bb9e18
--- /dev/null
+++ b/userland/arch/x86_64/rep.S
@@ -0,0 +1,73 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-rep-prefix */
+
+#include <lkmc.h>
+
+.bss
+    src: .skip 16
+    dst: .skip 16
+LKMC_PROLOGUE
+
+    /* memset: REP STOSQ */
+    cld
+    lea dst(%rip), %rdi
+    /* 2 elements. */
+    mov $2, %rcx
+    /* Set every element to 42. */
+    mov $0x2A, %rax
+    rep stosq
+    /* RCX was decremented down to zero. */
+    LKMC_ASSERT_EQ(%rcx, $0)
+    /* And the memory was set. */
+    LKMC_ASSERT_EQ(dst + 0, $0x2A)
+    LKMC_ASSERT_EQ(dst + 8, $0x2A)
+
+    /* memcpy: REP MOVSQ */
+    cld
+    movq $2, src + 0
+    movq $3, src + 8
+    lea src(%rip), %rsi
+    lea dst(%rip), %rdi
+    mov $2, %rcx
+    rep movsq
+    LKMC_ASSERT_EQ(dst + 0, $2)
+    LKMC_ASSERT_EQ(dst + 8, $3)
+
+    /* memcmp: REPZ CMPSL on four 4-byte elements at offsets 0x0, 0x4, 0x8 and 0xC. */
+
+        /* Setup src. */
+        movl $2, src + 0x0
+        movl $3, src + 0x4
+        movl $4, src + 0x8
+        movl $5, src + 0xC
+
+        /* Equal. */
+        movl $2, dst + 0x0
+        movl $3, dst + 0x4
+        movl $4, dst + 0x8
+        movl $5, dst + 0xC
+        cld
+        lea src(%rip), %rsi
+        lea dst(%rip), %rdi
+        mov $4, %rcx
+        repz cmpsl
+        mov %rcx, %r12
+        /* Last flag was equal (the MOV above does not modify the flags). */
+        LKMC_ASSERT(jz)
+        /* RCX was decreased all the way to zero. */
+        LKMC_ASSERT_EQ(%r12, $0)
+
+        /* Different: only the third element (offset 0x8) does not match. */
+        movl $2, dst + 0x0
+        movl $3, dst + 0x4
+        movl $2, dst + 0x8
+        movl $5, dst + 0xC
+        lea src(%rip), %rsi
+        lea dst(%rip), %rdi
+        mov $4, %rcx
+        repz cmpsl
+        mov %rcx, %r12
+        LKMC_ASSERT(jnz)
+        /* We stopped at the third comparison, with 1 comparison missing. */
+        LKMC_ASSERT_EQ(%r12, $1)
+
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/scas.S b/userland/arch/x86_64/scas.S
new file mode 100644
index 0000000..69b268a
--- /dev/null
+++ b/userland/arch/x86_64/scas.S
@@ -0,0 +1,25 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array: .quad 1, 2
+LKMC_PROLOGUE
+    /* DF = 0: SCASQ moves RDI forward after each comparison. */
+    cld
+    /* RDI points at the memory to scan. */
+    lea my_quad_array(%rip), %rdi
+    xor %r12d, %r12d
+    xor %r13d, %r13d
+    /* First element: RAX = 1 is compared to my_quad_array[0] = 1. */
+    mov $1, %eax
+    scasq
+    sete %r12b
+    /* Second element: RAX = 3 is compared to my_quad_array[1] = 2. */
+    mov $3, %eax
+    scasq
+    sete %r13b
+    /* ZF was set by the first comparison (1 == 1) but not the second (3 != 2). */
+    LKMC_ASSERT_EQ(%r12, $1)
+    LKMC_ASSERT_EQ(%r13, $0)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/stos.S b/userland/arch/x86_64/stos.S
new file mode 100644
index 0000000..03aa556
--- /dev/null
+++ b/userland/arch/x86_64/stos.S
@@ -0,0 +1,62 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+#include <lkmc.h>
+
+.data
+    my_quad_array: .quad 0, 0
+    my_quad_array_expect_forward: .quad 1, 2
+    my_quad_array_expect_backwards: .quad 4, 3
+LKMC_PROLOGUE
+
+    /* Clear the direction flag: move forward. */
+    cld
+
+    /* The target address is stored in RDI. */
+    lea my_quad_array(%rip), %rdi
+
+    /* my_quad_array[0] = 1 */
+    mov $1, %rax
+    /* RAX is automatically used as the source. */
+    stosq
+
+    /* my_quad_array[1] = 2 */
+    mov $2, %rax
+    stosq
+
+    /* RDI moved 2x 8 bytes forward. */
+    sub $my_quad_array, %rdi
+    LKMC_ASSERT_EQ(%rdi, $0x10)
+
+    /* The memory was modified. */
+    LKMC_ASSERT_MEMCMP(
+        my_quad_array,
+        my_quad_array_expect_forward,
+        $0x10
+    )
+
+    /* Now with backwards direction. */
+    std
+    /* The target address is stored in RDI: start at the last element. */
+    lea (my_quad_array + 8)(%rip), %rdi
+    /* my_quad_array[1] = 3 */
+    mov $3, %rax
+    stosq
+    /* my_quad_array[0] = 4 */
+    mov $4, %rax
+    stosq
+
+    /* Restore DF = 0: the System V ABI requires it on function call and return. */
+    cld
+
+    /* RDI moved 2x 8 bytes backwards. */
+    sub $my_quad_array, %rdi
+    LKMC_ASSERT_EQ(%rdi, $-0x8)
+
+    /* The memory was modified. */
+    LKMC_ASSERT_MEMCMP(
+        my_quad_array,
+        my_quad_array_expect_backwards,
+        $0x10
+    )
+
+LKMC_EPILOGUE