x86 asm: move binary arithmetic instructions from x86-assembly-cheat

2026-01-28 04:24:26 +01:00 · 2019-06-11 00:00:00 +00:00
parent 90925e7e06
commit 20990604fb
9 changed files with 324 additions and 6 deletions
--- a/README.adoc
+++ b/README.adoc
@@ -10208,13 +10208,13 @@ There are not yet enabled, but it should be easy to so, see: <<add-new-buildroot
 https://stackoverflow.com/questions/6147242/heap-vs-binary-search-tree-bst/29548834#29548834
-Usage:
+First we build it with <<m5ops-instructions>> enabled, and then we extract the stats:
 ....
 ./build-userland \
  --arch aarch64 \
-  --ccflagg='-DLKMC_M5OPS_ENABLE=1' \
+  --ccflags='-DLKMC_M5OPS_ENABLE=1' \
-  --force-build cpp/bst_vs_heap \
+  --force-rebuild cpp/bst_vs_heap \
  --static \
 ;
 ./run \
@@ -10910,7 +10910,7 @@ To use that file, first rebuild `m5ops.out` with the m5ops instructions enabled
 ./build-userland \
  --arch aarch64 \
  --ccflags='-DLKMC_M5OPS_ENABLE=1' \
-  --force-build c/m5ops \
+  --force-rebuild c/m5ops \
  --static \
 ;
 ./build-buildroot --arch aarch64
@@ -10941,7 +10941,7 @@ In theory, the cleanest way to add m5ops to your benchmarks would be to do exact
 However, I think it is usually not worth the trouble of hacking up the build system of the benchmark to do this, and I recommend just hardcoding in a few raw instructions here and there, and managing it with version control + `sed`.
-Bibliography:x
+Bibliography:
 * https://stackoverflow.com/questions/56506154/how-to-analyze-only-interest-area-in-source-code-by-using-gem5/56506419#56506419
 * https://www.mail-archive.com/gem5-users@gem5.org/msg15418.html
@@ -12353,7 +12353,10 @@ Bibliography:
 * link:userland/arch/x86_64/add.S[ADD]
 * link:userland/arch/x86_64/dec.S[DEC]
 * link:userland/arch/x86_64/imul.S[IMUL]
 * link:userland/arch/x86_64/inc.S[INC]
 * link:userland/arch/x86_64/mul.S[MUL]
 * link:userland/arch/x86_64/neg.S[NEG]
 * link:userland/arch/x86_64/sub.S[SUB]
 === x86 SIMD
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ https://github.com/cirosantilli/linux-kernel-module-cheat-regression#gem5-unit-t
                    'git', LF,
                    '-C', self.env['gem5_default_source_dir'], LF,
                    'worktree', 'add', LF,
-                    '-b', os.path.join('wt', self.env['gem5_build_id']), LF,
+                    '-b', os.path.join('wt', self.env['gem5_worktree']), LF,
                    self.env['gem5_source_dir'], LF,
                ])
            else:
--- a/userland/arch/x86_64/adc.S
+++ b/userland/arch/x86_64/adc.S
@@ -0,0 +1,16 @@
 /* ADC: add with carry.
 *
 * ADC adds the source, the destination and the carry flag (CF),
 * which allows chaining additions that are wider than one register:
 *
 * ....
 * edx:eax += ebx:ecx
 * ....
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* edx:eax = 0x0:0x80000000 and ebx:ecx = 0x0:0x80000000. */
    movl $0x80000000, %eax
    movl $0x80000000, %ecx
    movl $0, %ebx
    movl $0, %edx
    /* Low halves: 0x80000000 + 0x80000000 = 0x1_00000000,
     * so eax wraps around to 0 and CF is set.
     */
    addl %ecx, %eax
    /* High halves plus the carry from the low halves: 0 + 0 + CF = 1. */
    adcl %ebx, %edx
    /* Move to callee saved registers to persist after our asserts.
     * NOTE(review): this assumes that the assert macros do not preserve
     * caller saved registers such as edx, confirm against lkmc.h.
     */
    movl %eax, %r12d
    movl %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0)
    LKMC_ASSERT_EQ_32(%r13d, $1)
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/div.S
+++ b/userland/arch/x86_64/div.S
@@ -0,0 +1,92 @@
 /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-binary-arithmetic-instructions
 *
 * Unsigned integer division, interface similar to MUL:
 *
 * ....
 * rax = rdx:rax / SRC
 * rdx = rdx:rax % SRC
 * ....
 *
 * DIV can be used to calculate modulus, but GCC does not use it because it is slow,
 * and chooses alternative techniques instead:
 * http://stackoverflow.com/questions/4361979/how-does-the-gcc-implementation-of-module-work-and-why-does-it-not-use-the
 *
 * There is no immediate version:
 * http://stackoverflow.com/questions/4529260/mul-instruction-doesnt-support-an-immediate-value
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* 64-bit hello world:
     *
     * 5 / 2 = 2 with leftover of 1.
     *
     * rdx is the high half of the dividend, so it must be set explicitly.
     */
    mov $0, %rdx
    mov $5, %rax
    mov $2, %rbx
    div %rbx
    /* Move to callee saved registers to persist after our asserts. */
    mov %rax, %r12
    mov %rdx, %r13
    LKMC_ASSERT_EQ(%r12, $2)
    LKMC_ASSERT_EQ(%r13, $1)
    /* Now with a simple carry:
     *
     * rdx:rax / rbx = 0x1:0x0000000000000002 / 2
     *               = 0x8000000000000001 with leftover of 0.
     */
    mov $1, %rdx
    mov $2, %rax
    mov $2, %rbx
    div %rbx
    mov %rax, %r12
    mov %rdx, %r13
    LKMC_ASSERT_EQ(%r12, $0x8000000000000001)
    LKMC_ASSERT_EQ(%r13, $0)
 #if 0
    /* TODO SIGFPE example: the quotient does not fit into rax.
     *
     * rdx:rax / rbx = 0x2:0x0 / 2 = 2^64, which needs 65 bits.
     *
     * This must stay disabled: the resulting divide error
     * would kill the program before it reaches the epilogue.
     */
    mov $2, %rdx
    mov $0, %rax
    mov $2, %rbx
    div %rbx
 #endif
 #if 0
    /* 32 bit:
     *
     * edx:eax / ecx = 0x1:0x00000001 / 2 = 0x80000000 with leftover of 1.
     */
    mov $1, %eax
    mov $1, %edx
    mov $2, %ecx
    div %ecx
    mov %eax, %r12d
    mov %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0x80000000)
    LKMC_ASSERT_EQ_32(%r13d, $1)
    /* Division by zero and division overflow.
     *
     * If either:
     *
     * - divisor == 0
     * - result > output register size
     *
     * a divide error exception occurs.
     *
     * It then gets handled by the interrupt service 0.
     *
     * Both 0 division and overflow are treated exactly the same!
     *
     * Linux treats this by sending a signal to the process and killing it.
     *
     * Minimal 16-bit example of handling the interrupt:
     * https://github.com/cirosantilli/x86-bare-metal-examples/blob/9e58c1dc656dab54aa69daa38f84eb8c0aa6151e/idt_zero_divide.S
     *
     * Output does not fit into eax:
     *
     * ....
     * mov $0, %eax
     * mov $1, %edx
     * mov $1, %ecx
     * div %ecx
     * ....
     *
     * Division by zero:
     *
     * ....
     * mov $1, %eax
     * mov $0, %edx
     * mov $0, %ecx
     * div %ecx
     * ....
     */
 #endif
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/idiv.S
+++ b/userland/arch/x86_64/idiv.S
@@ -0,0 +1,34 @@
 /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-binary-arithmetic-instructions
 *
 * Signed integer division.
 *
 * Much like MUL vs IMUL.
 *
 * ....
 * eax = edx:eax / SRC
 * edx = edx:eax % SRC
 * ....
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* -5 / -2 = 2 with leftover of -1. */
    movl $-5, %eax
    /* Do not forget this! CLTD sign extends eax into edx:eax,
     * and edx is the high half of the dividend.
     */
    cltd
    movl $-2, %ecx
    idivl %ecx
    /* Move to callee saved registers to persist after our asserts.
     * NOTE(review): this assumes that the assert macros do not preserve
     * caller saved registers such as edx, confirm against lkmc.h.
     */
    movl %eax, %r12d
    movl %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $2)
    LKMC_ASSERT_EQ_32(%r13d, $-1)
    /* With a non-zero high half:
     *
     * edx:eax / ecx = 0x1:0x00000001 / 4 = 0x40000000 with leftover of 1.
     */
    movl $1, %eax
    movl $1, %edx
    movl $4, %ecx
    idivl %ecx
    movl %eax, %r12d
    movl %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0x40000000)
    LKMC_ASSERT_EQ_32(%r13d, $1)
    /* RUNTIME ERROR: result must fit into signed dword:
     *
     * ....
     * movl $1, %eax
     * movl $1, %edx
     * movl $2, %ecx
     * idivl %ecx
     * ....
     *
     * since 0x1:0x00000001 / 2 = 0x80000000 is larger than
     * the largest signed dword 0x7FFFFFFF.
     *
     * TODO division by zero
     */
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/imul.S
+++ b/userland/arch/x86_64/imul.S
@@ -0,0 +1,42 @@
 /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-binary-arithmetic-instructions
 *
 * IMUL: signed multiply.
 *
 * Unlike MUL, it also comes in forms that take an immediate,
 * and in forms with up to three operands.
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* One operand form: works just like MUL, the product
     * is spread over rdx:rax.
     *
     * rdx : rax =  rax * rbx
     *           =   -1 *   2
     *           =  -2
     *           =  0xFFFFFFFFFFFFFFFF : 0xFFFFFFFFFFFFFFFE
     *
     * rdx starts with a dummy value to show that it gets overwritten.
     */
    movq $42, %rdx
    movq $2, %rbx
    movq $-1, %rax
    imulq %rbx
    /* Results go to callee saved registers so that they outlive the asserts. */
    movq %rdx, %r13
    movq %rax, %r12
    LKMC_ASSERT_EQ(%r12, $0xFFFFFFFFFFFFFFFE)
    LKMC_ASSERT_EQ(%r13, $0xFFFFFFFFFFFFFFFF)
    /* Multi-operand forms: the product is truncated to the destination
     * and rdx is left alone, which is more convenient in many cases.
     *
     * rax = rbx * 3
     *     =  -2 * 3
     *     =  -6
     */
    movq $42, %rdx
    movq $-2, %rbx
    movq $42, %rax
    imulq $3, %rbx, %rax
    movq %rdx, %r13
    movq %rax, %r12
    LKMC_ASSERT_EQ(%r12, $-6)
    /* rdx still holds the dummy value. */
    LKMC_ASSERT_EQ(%r13, $42)
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/mul.S
+++ b/userland/arch/x86_64/mul.S
@@ -0,0 +1,98 @@
 /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-binary-arithmetic-instructions
 *
 * Unsigned multiply.
 *
 * The result is spread across two registers that depend on the operand size:
 *
 * ....
 * 64-bit: rdx:rax = rax * SRC
 * 32-bit: edx:eax = eax * SRC
 * 16-bit:   dx:ax =  ax * SRC
 *  8-bit:   ah:al =  al * SRC
 * ....
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* 64-bit hello world:
     *
     * rdx : rax = rax * rbx
     * 0x0 : 4   =   2 *   2
     */
    mov $2, %rax
    mov $2, %rbx
    mul %rbx
    /* Move to callee saved registers to persist after our asserts. */
    mov %rax, %r12
    mov %rdx, %r13
    mov %rbx, %r14
    LKMC_ASSERT_EQ(%r12, $4)
    LKMC_ASSERT_EQ(%r13, $0)
    /* rbx is untouched. */
    LKMC_ASSERT_EQ(%r14, $2)
    /* 64-bit with a carry:
     *
     * rdx :                rax = rax                * rbx
     * 0x1 : 0x0000000000000002 = 0x8000000000000001 *   2
     */
    mov $0x8000000000000001, %rax
    mov $2, %rbx
    mul %rbx
    mov %rax, %r12
    mov %rdx, %r13
    LKMC_ASSERT_EQ(%r12, $2)
    LKMC_ASSERT_EQ(%r13, $1)
    /* 8-bit is special: does not use dx for output:
     *
     *   ah :   al = al *   bl
     * 0x01 : 0x00 =  2 * 0x80
     *
     * eax and edx are zeroed first so that the full registers can be checked.
     */
    mov $0, %eax
    mov $0, %edx
    mov $2, %al
    mov $0x80, %bl
    mul %bl
    mov %eax, %r12d
    mov %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0x100)
    /* dx is untouched. */
    LKMC_ASSERT_EQ_32(%r13d, $0)
    /* 16-bit
     *
     *  dx :     ax = ax *     bx
     * 0x1 : 0x0000 =  2 * 0x8000
     *
     * The 16-bit form only writes ax and dx, so the upper halves
     * of eax and edx are zeroed first to allow 32-bit asserts.
     */
    mov $0, %eax
    mov $0, %edx
    mov $2, %ax
    mov $0x8000, %bx
    mul %bx
    mov %eax, %r12d
    mov %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0)
    LKMC_ASSERT_EQ_32(%r13d, $1)
    /* 32-bit
     *
     * edx :        eax = eax *        ebx
     * 0x1 : 0x00000000 =   2 * 0x80000000
     */
    mov $2, %eax
    mov $0x80000000, %ebx
    mov $0, %edx
    mul %ebx
    mov %eax, %r12d
    mov %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0)
    LKMC_ASSERT_EQ_32(%r13d, $1)
 #if 0
    /* No immediate form, although imul has one:
     * http://stackoverflow.com/questions/20499141/is-it-possible-to-multiply-by-and-immediate-with-mul-in-x86-assembly/33202309#33202309
     *
     * Error: operand type mismatch for `mul'
     */
    mul $2
 #endif
    /* Memory version.
     *
     * RIP-relative addressing is used so that the example
     * also links as a position independent executable.
     */
 .data
    mylong: .long 0x11111111
 .text
    movl $2, %eax
    mull mylong(%rip)
    LKMC_ASSERT_EQ_32(%eax, $0x22222222)
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/neg.S
+++ b/userland/arch/x86_64/neg.S
@@ -0,0 +1,14 @@
 /* https://github.com/cirosantilli/linux-kernel-module-cheat#x86-binary-arithmetic-instructions
 *
 * Negate: i *= -1.
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* The value lives in a callee saved register so that it persists
     * across the first assert and can be negated back afterwards.
     * NOTE(review): this assumes that the assert macros do not preserve
     * caller saved registers such as rax, confirm against lkmc.h.
     */
    mov $2, %r12
    neg %r12
    LKMC_ASSERT_EQ(%r12, $-2)
    /* Negating twice gives back the original value. The operand size
     * is 64-bit both times to match the 64-bit asserts.
     */
    neg %r12
    LKMC_ASSERT_EQ(%r12, $2)
 LKMC_EPILOGUE
--- a/userland/arch/x86_64/sbb.S
+++ b/userland/arch/x86_64/sbb.S
@@ -0,0 +1,19 @@
 /* SBB: subtract with borrow.
 *
 * SBB subtracts the source and the carry flag (CF) from the destination,
 * which allows chaining subtractions that are wider than one register:
 *
 * ....
 * edx:eax -= ebx:ecx
 * ....
 */
 #include <lkmc.h>
 LKMC_PROLOGUE
    /* edx:eax = 0x1:0x00000000 and ebx:ecx = 0x0:0x80000000. */
    movl $0, %eax
    movl $0, %ebx
    movl $0x80000000, %ecx
    movl $1, %edx
    /* Low halves: 0 - 0x80000000 wraps around to 0x80000000
     * and sets CF to indicate the borrow.
     */
    subl %ecx, %eax
    /* High halves minus the borrow from the low halves: 1 - 0 - CF = 0. */
    sbbl %ebx, %edx
    /* Move to callee saved registers to persist after our asserts.
     * NOTE(review): this assumes that the assert macros do not preserve
     * caller saved registers such as edx, confirm against lkmc.h.
     */
    movl %eax, %r12d
    movl %edx, %r13d
    LKMC_ASSERT_EQ_32(%r12d, $0x80000000)
    LKMC_ASSERT_EQ_32(%r13d, $0)
 LKMC_EPILOGUE