From ddae0d52f2cdd9c3807537ee20b90cd2b98b7f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciro=20Santilli=20=E5=85=AD=E5=9B=9B=E4=BA=8B=E4=BB=B6=20?= =?UTF-8?q?=E6=B3=95=E8=BD=AE=E5=8A=9F?= Date: Sat, 1 Jun 2019 00:00:02 +0000 Subject: [PATCH] x86 asm: intrinsics from memory --- README.adoc | 5 +++-- userland/arch/x86_64/addpd.S | 6 ++++-- userland/arch/x86_64/intrinsics/addpd.c | 15 ++++++++++++++- userland/arch/x86_64/intrinsics/paddq.c | 15 +++++++++++++++ userland/arch/x86_64/paddq.S | 8 +++++--- 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/README.adoc b/README.adoc index 0e8417e..6afb65d 100644 --- a/README.adoc +++ b/README.adoc @@ -11933,8 +11933,9 @@ where: * ``: data type: ** `ps`: 4 floats (Packed Single) ** `pd`: 2 doubles (Packed Double) -** `ss`: 1 float (Single Single) +** `ss`: 1 float (Single Single), often the lowest order one ** `sd`: 1 double (Single Double) +** `si128`: 128-bits of integers of any size ** `ep` integer types, e.g.: *** `epi32`: 32 bit signed integers *** `epu16`: 16 bit unsigned integers @@ -11967,7 +11968,7 @@ Present in `gcc-7_3_0-release` tree at: `gcc/config/i386/x86intrin.h`. Bibliography: * https://www.cs.virginia.edu/~cr4bd/3330/S2018/simdref.html -* https://www.cs.virginia.edu/~cr4bd/3330/S2018/simdref.html +* https://software.intel.com/en-us/articles/how-to-use-intrinsics ==== GCC inline assembly register variables diff --git a/userland/arch/x86_64/addpd.S b/userland/arch/x86_64/addpd.S index 9822248..cdaffb9 100644 --- a/userland/arch/x86_64/addpd.S +++ b/userland/arch/x86_64/addpd.S @@ -6,15 +6,17 @@ #include LKMC_PROLOGUE -.bss - output: .skip 16 .data + .align 16 addps_input0: .float 1.5, 2.5, 3.5, 4.5 addps_input1: .float 5.5, 6.5, 7.5, 8.5 addps_expect: .float 7.0, 9.0, 11.0, 13.0 addpd_input0: .double 1.5, 2.5 addpd_input1: .double 5.5, 6.5 addpd_expect: .double 7.0, 9.0 +.bss + .align 16 + output: .skip 16 .text #define TEST(size) \ movups addp ## size ## _input0, %xmm0; \ diff --git a/userland/arch/x86_64/intrinsics/addpd.c b/userland/arch/x86_64/intrinsics/addpd.c index 315f666..bc6eb82 100644 --- a/userland/arch/x86_64/intrinsics/addpd.c +++ b/userland/arch/x86_64/intrinsics/addpd.c @@ -1,11 +1,16 @@ /* https://github.com/cirosantilli/linux-kernel-module-cheat#gcc-intrinsics */ #include +#include #include -int main(void) { +float global_input0[] __attribute__((aligned(16))) = {1.5f, 2.5f, 3.5f, 4.5f}; +float global_input1[] __attribute__((aligned(16))) = {5.5f, 6.5f, 7.5f, 8.5f}; +float global_output[4] __attribute__((aligned(16))); +float global_expected[] __attribute__((aligned(16))) = {7.0f, 9.0f, 11.0f, 13.0f}; +int main(void) { /* 32-bit add (addps). */ { __m128 input0 = _mm_set_ps(1.5f, 2.5f, 3.5f, 4.5f); @@ -33,6 +38,14 @@ int main(void) { assert(_mm_cvtss_f32(_mm_shuffle_ps(output, output, 3)) == 7.0f); } + /* Now from memory. */ + { + __m128 *input0 = (__m128 *)global_input0; + __m128 *input1 = (__m128 *)global_input1; + _mm_store_ps(global_output, _mm_add_ps(*input0, *input1)); + assert(!memcmp(global_output, global_expected, sizeof(global_output))); + } + /* 64-bit add (addpd). */ { __m128d input0 = _mm_set_pd(1.5, 2.5); diff --git a/userland/arch/x86_64/intrinsics/paddq.c b/userland/arch/x86_64/intrinsics/paddq.c index 89cc2f0..505238d 100644 --- a/userland/arch/x86_64/intrinsics/paddq.c +++ b/userland/arch/x86_64/intrinsics/paddq.c @@ -1,9 +1,16 @@ /* https://github.com/cirosantilli/linux-kernel-module-cheat#gcc-intrinsics */ #include +#include +#include #include +uint32_t global_input0[] __attribute__((aligned(16))) = {1, 2, 3, 4}; +uint32_t global_input1[] __attribute__((aligned(16))) = {5, 6, 7, 8}; +uint32_t global_output[4] __attribute__((aligned(16))); +uint32_t global_expected[] __attribute__((aligned(16))) = {6, 8, 10, 12}; + int main(void) { /* 32-bit add hello world. */ @@ -19,6 +26,14 @@ int main(void) { assert(_mm_extract_epi32(output, 0) == 12); } + /* Now from memory. */ + { + __m128i *input0 = (__m128i *)global_input0; + __m128i *input1 = (__m128i *)global_input1; + _mm_store_si128((__m128i *)global_output, _mm_add_epi32(*input0, *input1)); + assert(!memcmp(global_output, global_expected, sizeof(global_output))); + } + /* Now a bunch of other sizes. */ { __m128i input0 = _mm_set_epi32(0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4); diff --git a/userland/arch/x86_64/paddq.S b/userland/arch/x86_64/paddq.S index 50c8ef3..879607b 100644 --- a/userland/arch/x86_64/paddq.S +++ b/userland/arch/x86_64/paddq.S @@ -10,6 +10,7 @@ LKMC_PROLOGUE .data + .align 16 input0: .long 0xF1F1F1F1, 0xF2F2F2F2, 0xF3F3F3F3, 0xF4F4F4F4 input1: .long 0x12121212, 0x13131313, 0x14141414, 0x15151515 paddb_expect: .long 0x03030303, 0x05050505, 0x07070707, 0x09090909 @@ -17,13 +18,14 @@ LKMC_PROLOGUE paddd_expect: .long 0x04040403, 0x06060605, 0x08080807, 0x0A0A0A09 paddq_expect: .long 0x04040403, 0x06060606, 0x08080807, 0x0A0A0A0A .bss + .align 16 output: .skip 16 .text - movups input1, %xmm1 + movaps input1, %xmm1 #define TEST(size) \ - movups input0, %xmm0; \ + movaps input0, %xmm0; \ padd ## size %xmm1, %xmm0; \ - movups %xmm0, output; \ + movaps %xmm0, output; \ LKMC_ASSERT_MEMCMP(output, padd ## size ## _expect, $0x10) /* 16x 8-bit */