From 6a9299599e781b29abfce64e4923ab0af3ef731d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Wed, 19 Jun 2019 00:00:00 +0000 Subject: [PATCH] x86 asm: move string instructions from x86-assembly-cheat --- README.adoc | 131 ++++++++++++++++++++++++------------ userland/arch/x86_64/cmps.S | 24 +++++++ userland/arch/x86_64/lods.S | 16 +++++ userland/arch/x86_64/movs.S | 22 ++++++ userland/arch/x86_64/nop.S | 2 +- userland/arch/x86_64/rep.S | 73 ++++++++++++++++++++ userland/arch/x86_64/scas.S | 25 +++++++ userland/arch/x86_64/stos.S | 62 +++++++++++++++++ 8 files changed, 312 insertions(+), 43 deletions(-) create mode 100644 userland/arch/x86_64/cmps.S create mode 100644 userland/arch/x86_64/lods.S create mode 100644 userland/arch/x86_64/movs.S create mode 100644 userland/arch/x86_64/rep.S create mode 100644 userland/arch/x86_64/scas.S create mode 100644 userland/arch/x86_64/stos.S diff --git a/README.adoc b/README.adoc index a60b6e1..02e9866 100644 --- a/README.adoc +++ b/README.adoc @@ -12311,6 +12311,17 @@ When reading disassembly, many instructions have either a `.n` or `.w` suffix. Bibliography: https://stackoverflow.com/questions/27147043/n-suffix-to-branch-instruction +=== NOP instructions + +* x86: link:userland/arch/x86_64/nop.S[NOP] +* ARM: <> + +No OPeration. + +Does nothing except take up one processor cycle and occupy some instruction memory. + +Applications: http://stackoverflow.com/questions/234906/whats-the-purpose-of-the-nop-opcode + == x86 userland assembly Arch agnostic infrastructure getting started at: <>. 
@@ -12354,29 +12365,29 @@ Bibliography: <> 5.1.2 "Binary Arithmetic Instructions": -* link:userland/arch/x86_64/add.S[ADD] -** link:userland/arch/x86_64/inc.S[INC] -** link:userland/arch/x86_64/adc.S[ADC] -* link:userland/arch/x86_64/sub.S[SUB] -** link:userland/arch/x86_64/dec.S[DEC] -** link:userland/arch/x86_64/sbb.S[SBB] -* link:userland/arch/x86_64/mul.S[MUL] -** link:userland/arch/x86_64/neg.S[NEG] -** link:userland/arch/x86_64/imul.S[IMUL] -* link:userland/arch/x86_64/div.S[DIV] -** link:userland/arch/x86_64/div_overflow.S[DIV overflow] -** link:userland/arch/x86_64/div_zero.S[DIV zero] -** link:userland/arch/x86_64/idiv.S[IDIV] -* link:userland/arch/x86_64/cmp.S[CMP] +* link:userland/arch/x86_64/add.S[]: ADD +** link:userland/arch/x86_64/inc.S[]: INC +** link:userland/arch/x86_64/adc.S[]: ADC +* link:userland/arch/x86_64/sub.S[]: SUB +** link:userland/arch/x86_64/dec.S[]: DEC +** link:userland/arch/x86_64/sbb.S[]: SBB +* link:userland/arch/x86_64/mul.S[]: MUL +** link:userland/arch/x86_64/neg.S[]: NEG +** link:userland/arch/x86_64/imul.S[]: IMUL +* link:userland/arch/x86_64/div.S[]: DIV +** link:userland/arch/x86_64/div_overflow.S[]: DIV overflow +** link:userland/arch/x86_64/div_zero.S[]: DIV zero +** link:userland/arch/x86_64/idiv.S[]: IDIV +* link:userland/arch/x86_64/cmp.S[]: CMP === x86 logical instructions <> 5.1.4 "Logical Instructions" -* link:userland/arch/x86_64/and.S[AND] -* link:userland/arch/x86_64/not.S[NOT] -* link:userland/arch/x86_64/or.S[OR] -* link:userland/arch/x86_64/xor.S[XOR] +* link:userland/arch/x86_64/and.S[]: AND +* link:userland/arch/x86_64/not.S[]: NOT +* link:userland/arch/x86_64/or.S[]: OR +* link:userland/arch/x86_64/xor.S[]: XOR === x86 shift and rotate instructions @@ -12400,10 +12411,10 @@ Keeps the same sign on right shift. Not directly exposed in C, for which signed shift is undetermined behavior, but does exist in Java via the `>>>` operator. C compilers can omit it however. 
+ SHL and SAL are exactly the same and have the same encoding: https://stackoverflow.com/questions/8373415/difference-between-shl-and-sal-in-80x86/56621271#56621271 -* link:userland/arch/x86_64/rol.S[ROL and ROR] +* link:userland/arch/x86_64/rol.S[]: ROL and ROR + Rotates the bit that is going out around to the other side. -* link:userland/arch/x86_64/rol.S[RCL and RCR] +* link:userland/arch/x86_64/rol.S[]: RCL and RCR + Like ROL and ROR, but insert the carry bit instead, which effectively generates a rotation of 8 + 1 bits. TODO application. @@ -12411,26 +12422,28 @@ Like ROL and ROR, but insert the carry bit instead, which effectively generates <> 5.1.6 "Bit and Byte Instructions" -* link:userland/arch/x86_64/bt.S[BT] +* link:userland/arch/x86_64/bt.S[]: BT + Bit test: test if the Nth bit a bit of a register is set and store the result in the CF FLAG. + .... CF = reg[N] .... -* link:userland/arch/x86_64/btr.S[BTR] +* link:userland/arch/x86_64/btr.S[]: BTR + Do a BT and then set the bit to 0. -* link:userland/arch/x86_64/btc.S[BTC] +* link:userland/arch/x86_64/btc.S[]: BTC + Do a BT and then swap the value of the tested bit. -* link:userland/arch/x86_64/setcc.S[SETcc] +* link:userland/arch/x86_64/setcc.S[]: SETcc + -Set a a byte of a register to 0 or 1 depending on the cc condition. -* link:userland/arch/x86_64/popcnt.S[POPCNT] +Set a byte of a register to 0 or 1 depending on the cc condition. ++ +Bibliography: https://stackoverflow.com/questions/1406783/how-to-read-and-write-x86-flags-registers-directly/30952577#30952577 +* link:userland/arch/x86_64/popcnt.S[]: POPCNT + Count the number of 1 bits. -* link:userland/arch/x86_64/test.S[TEST] +* link:userland/arch/x86_64/test.S[]: TEST + Like <> but does AND instead of SUB: + @@ -12442,12 +12455,12 @@ ZF = (!(X && Y)) ? 
1 : 0 <> 5.1.7 "Control Transfer Instructions" -* link:userland/arch/x86_64/jmp.S[JMP] -** link:userland/arch/x86_64/jmp_indirect.S[JMP indirect] +* link:userland/arch/x86_64/jmp.S[]: JMP +** link:userland/arch/x86_64/jmp_indirect.S[]: JMP indirect ==== x86 Jcc instructions -link:userland/arch/x86_64/jcc.S[Jcc] +link:userland/arch/x86_64/jcc.S[] Jump if certain conditions of the flags register are met. @@ -12472,29 +12485,61 @@ JG vs JA and JL vs JB: ==== x86 LOOP instruction -link:userland/arch/x86_64/loop.S[LOOP] +link:userland/arch/x86_64/loop.S[] Vs <>: https://stackoverflow.com/questions/6805692/x86-assembly-programming-loops-with-ecx-and-loop-instruction-versus-jmp-jcond Holy CISC! +==== x86 string instructions + +<> 5.1.8 "String Instructions" + +These instructions do some operation on an array item, and automatically update the index to the next item: + +* First example explained in more detail +** link:userland/arch/x86_64/stos.S[]: STOS: STOre String: store register to memory. STOSD is called STOSL in GNU GAS as usual: https://stackoverflow.com/questions/6211629/gcc-inline-assembly-error-no-such-instruction-stosd +* Further examples +** link:userland/arch/x86_64/cmps.S[]: CMPS: CoMPare Strings: compare two values in memory with addresses given by RSI and RDI. Could be used to implement `memcmp`. The result is stored in ZF as usual. +** link:userland/arch/x86_64/lods.S[]: LODS: LOaD String: load from memory to register. +** link:userland/arch/x86_64/movs.S[]: MOVS: MOV String: move from one memory to another with addresses given by RSI and RDI. Could be used to implement `memmove`. +** link:userland/arch/x86_64/scas.S[]: SCAS: SCan String: compare memory to the value in a register. Could be used to implement `strchr`. + +The RSI and RDI registers are actually named after these instructions! S is the source of string instructions, D is the destination of string instructions. 
+ +The direction of the index increment depends on the direction flag of the FLAGS register: 0 means forward and 1 means backward: https://stackoverflow.com/questions/9636691/what-are-cld-and-std-for-in-x86-assembly-language-what-does-df-do + +These instructions were originally developed to speed up "string" operations such as those present in the `<string.h>` header of the C standard library. + +However, as computer architecture evolved, those instructions might not offer considerable speedups anymore, and modern glibc such as 2.29 just uses <> operations instead, see also: https://stackoverflow.com/questions/33480999/how-can-the-rep-stosb-instruction-execute-faster-than-the-equivalent-loop + +===== x86 REP prefix + +Example: link:userland/arch/x86_64/rep.S[] + +Repeat a string instruction RCX times. + +As the repetitions happen: + +* RCX decreases, until it reaches 0 +* RDI and RSI increase, or decrease if the direction flag is set + +The variants: REPZ, REPNZ (alias REPE, REPNE) repeat a given instruction until something happens. + +REPZ and REPNZ additionally stop early depending on the comparison they repeat: REPZ stops as soon as a comparison gives not equal (ZF = 0), and REPNZ as soon as one gives equal (ZF = 1). + +* REP: INS, OUTS, MOVS, LODS, and STOS +* REPZ and REPNZ: CMPS and SCAS + === x86 miscellaneous instructions <> 5.1.13 "Miscellaneous Instructions" -==== x86 NOP instruction - -link:userland/arch/x86_64/nop.S[NOP] - -No OPeration. - -Does nothing except take up one processor cycle and occupy some instruction memory. - -Applications: http://stackoverflow.com/questions/234906/whats-the-purpose-of-the-nop-opcode +NOP: <<nop-instructions>> === x86 random number generator instructions <> 5.1.15 Random Number Generator Instructions -Example: link:userland/arch/x86_64/rdrand.S[RDRAND] +Example: link:userland/arch/x86_64/rdrand.S[]: RDRAND If you run that executable multiple times, it prints a random number every time to stdout. 
@@ -12508,7 +12553,7 @@ RDRAND sets the carry flag when data is ready so we must loop if the carry flag
 
 ==== x86 CPUID instruction
 
-Example: link:userland/arch/x86_64/cpuid.S[CPUID]
+Example: link:userland/arch/x86_64/cpuid.S[]
 
 Fills EAX, EBX, ECX and EDX with CPU information.
 
@@ -13299,6 +13344,8 @@ See: <>.
 
 ==== ARM NOP instruction
 
+Parent section: <<nop-instructions>>
+
 There are a few different ways to encode NOP, notably MOV a register into itself, and a dedicated miscellaneous instruction.
 
 Example: link:userland/arch/arm/nop.S[]
diff --git a/userland/arch/x86_64/cmps.S b/userland/arch/x86_64/cmps.S
new file mode 100644
index 0000000..b89ca8d
--- /dev/null
+++ b/userland/arch/x86_64/cmps.S
@@ -0,0 +1,27 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+/* CMPSQ: compare the quad at RSI with the quad at RDI, one element per instruction. */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array_1: .quad 1, 2
+    my_quad_array_2: .quad 1, 3
+LKMC_PROLOGUE
+    mov $0, %r12
+    mov $0, %r13
+    /* DF = 0: RSI and RDI advance by 8 after each CMPSQ. */
+    cld
+    lea my_quad_array_1(%rip), %rsi
+    lea my_quad_array_2(%rip), %rdi
+    /* Element 0: ZF is set when both quads are equal. */
+    cmpsq
+    setz %r12b
+    /* Element 1. */
+    cmpsq
+    setz %r13b
+    /* 1 == 1 */
+    LKMC_ASSERT_EQ(%r12, $1)
+    /* 2 != 3 */
+    LKMC_ASSERT_EQ(%r13, $0)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/lods.S b/userland/arch/x86_64/lods.S
new file mode 100644
index 0000000..d2b12f0
--- /dev/null
+++ b/userland/arch/x86_64/lods.S
@@ -0,0 +1,21 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+/* LODSQ: load the quad at RSI into RAX, one element per instruction. */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array: .quad 1, 2
+LKMC_PROLOGUE
+    lea my_quad_array(%rip), %rsi
+    /* DF = 0: RSI advances by 8 after each LODSQ. */
+    cld
+    /* RAX = my_quad_array[0] */
+    lodsq
+    mov %rax, %r12
+    /* RAX = my_quad_array[1] */
+    lodsq
+    mov %rax, %r13
+    LKMC_ASSERT_EQ(%r12, $1)
+    LKMC_ASSERT_EQ(%r13, $2)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/movs.S b/userland/arch/x86_64/movs.S
new file mode 100644
index 0000000..74ab624
--- /dev/null
+++ b/userland/arch/x86_64/movs.S
@@ -0,0 +1,22 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+/* MOVSQ: copy the quad pointed to by RSI (source) to the address in RDI (destination). */
+
+#include <lkmc.h>
+
+.section .rodata
+    src: .quad 1, 2
+.bss
+    dest: .skip 16
+LKMC_PROLOGUE
+    /* DF = 0: RSI and RDI advance by 8 after each MOVSQ. */
+    cld
+    lea src(%rip), %rsi
+    lea dest(%rip), %rdi
+    /* dest[0] = src[0] */
+    movsq
+    /* dest[1] = src[1] */
+    movsq
+    LKMC_ASSERT_EQ(dest + 0, $1)
+    LKMC_ASSERT_EQ(dest + 8, $2)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/nop.S b/userland/arch/x86_64/nop.S
index 16a1960..c0da11d 100644
--- a/userland/arch/x86_64/nop.S
+++ b/userland/arch/x86_64/nop.S
@@ -1,4 +1,4 @@
-/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-nop-instruction */
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#nop-instructions */
 
 #include <lkmc.h>
 
diff --git a/userland/arch/x86_64/rep.S b/userland/arch/x86_64/rep.S
new file mode 100644
index 0000000..5bb9e18
--- /dev/null
+++ b/userland/arch/x86_64/rep.S
@@ -0,0 +1,74 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-rep-prefix */
+
+#include <lkmc.h>
+
+.bss
+    src: .skip 16
+    dst: .skip 16
+LKMC_PROLOGUE
+
+    /* memset: REP STOSQ */
+    cld
+    lea dst(%rip), %rdi
+    /* 2 elements. */
+    mov $2, %rcx
+    /* Set every element to 42. */
+    mov $0x2A, %rax
+    rep stosq
+    /* RCX was decremented down to zero. */
+    LKMC_ASSERT_EQ(%rcx, $0)
+    /* And the memory was set. */
+    LKMC_ASSERT_EQ(dst + 0, $0x2A)
+    LKMC_ASSERT_EQ(dst + 8, $0x2A)
+
+    /* memcpy: REP MOVSQ */
+    cld
+    movq $2, src + 0
+    movq $3, src + 8
+    lea src(%rip), %rsi
+    lea dst(%rip), %rdi
+    mov $2, %rcx
+    rep movsq
+    LKMC_ASSERT_EQ(dst + 0, $2)
+    LKMC_ASSERT_EQ(dst + 8, $3)
+
+    /* memcmp: REPZ CMPSL */
+
+    /* Setup src as four consecutive 4-byte elements: 2, 3, 4, 5. */
+    movl $2, src + 0x0
+    movl $3, src + 0x4
+    movl $4, src + 0x8
+    movl $5, src + 0xC
+
+    /* Equal. */
+    movl $2, dst + 0x0
+    movl $3, dst + 0x4
+    movl $4, dst + 0x8
+    movl $5, dst + 0xC
+    cld
+    lea src(%rip), %rsi
+    lea dst(%rip), %rdi
+    mov $4, %rcx
+    repz cmpsl
+    /* MOV does not modify the flags, so ZF still holds the last comparison. */
+    mov %rcx, %r12
+    /* Last flag was equal. */
+    LKMC_ASSERT(jz)
+    /* RCX was decreased all the way to zero. */
+    LKMC_ASSERT_EQ(%r12, $0)
+
+    /* Different: the third element does not match. */
+    movl $2, dst + 0x0
+    movl $3, dst + 0x4
+    movl $2, dst + 0x8
+    movl $5, dst + 0xC
+    lea src(%rip), %rsi
+    lea dst(%rip), %rdi
+    mov $4, %rcx
+    repz cmpsl
+    mov %rcx, %r12
+    LKMC_ASSERT(jnz)
+    /* We stopped at the third element, with 1 comparison missing. */
+    LKMC_ASSERT_EQ(%r12, $1)
+
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/scas.S b/userland/arch/x86_64/scas.S
new file mode 100644
index 0000000..69b268a
--- /dev/null
+++ b/userland/arch/x86_64/scas.S
@@ -0,0 +1,25 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+#include <lkmc.h>
+
+.section .rodata
+    my_quad_array: .quad 1, 2
+LKMC_PROLOGUE
+    mov $0, %r12
+    mov $0, %r13
+    /* RDI holds the address. */
+    lea my_quad_array(%rip), %rdi
+    cld
+    mov $1, %rax
+    /* Compare RAX to *RDI: 1 == 1, so ZF is set. */
+    scasq
+    setz %r12b
+    mov $3, %rax
+    /* Compare RAX to *RDI: 3 != 2, so ZF is cleared. */
+    scasq
+    setz %r13b
+    /* 1 == 1 */
+    LKMC_ASSERT_EQ(%r12, $1)
+    /* 3 != 2 */
+    LKMC_ASSERT_EQ(%r13, $0)
+LKMC_EPILOGUE
diff --git a/userland/arch/x86_64/stos.S b/userland/arch/x86_64/stos.S
new file mode 100644
index 0000000..03aa556
--- /dev/null
+++ b/userland/arch/x86_64/stos.S
@@ -0,0 +1,65 @@
+/* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-string-instructions */
+
+#include <lkmc.h>
+
+.data
+    my_quad_array: .quad 0, 0
+    my_quad_array_expect_forward: .quad 1, 2
+    my_quad_array_expect_backwards: .quad 4, 3
+LKMC_PROLOGUE
+
+    /* Clear the direction flag: move forward. */
+    cld
+
+    /* The target address is stored in RDI. */
+    lea my_quad_array(%rip), %rdi
+
+    /* my_quad_array[0] = 1 */
+    mov $1, %rax
+    /* RAX is automatically used as the source. */
+    stosq
+
+    /* my_quad_array[1] = 2 */
+    mov $2, %rax
+    stosq
+
+    /* RDI moved 2x 8 bytes forward. */
+    sub $my_quad_array, %rdi
+    LKMC_ASSERT_EQ(%rdi, $0x10)
+
+    /* The memory was modified. */
+    LKMC_ASSERT_MEMCMP(
+        my_quad_array,
+        my_quad_array_expect_forward,
+        $0x10
+    )
+
+    /* Now with backwards direction. */
+    std
+
+    /* Start at the last element: RDI goes down by 8 after each STOSQ. */
+    lea (my_quad_array + 8)(%rip), %rdi
+
+    /* my_quad_array[1] = 3 */
+    mov $3, %rax
+    stosq
+
+    /* my_quad_array[0] = 4 */
+    mov $4, %rax
+    stosq
+
+    /* Clear DF again: the System V AMD64 ABI requires DF = 0 on function call and return. */
+    cld
+
+    /* RDI moved 2x 8 bytes backwards. */
+    sub $my_quad_array, %rdi
+    LKMC_ASSERT_EQ(%rdi, $-0x8)
+
+    /* The memory was modified. */
+    LKMC_ASSERT_MEMCMP(
+        my_quad_array,
+        my_quad_array_expect_backwards,
+        $0x10
+    )
+
+LKMC_EPILOGUE